diff --git a/.agents/skills/guidelines.md b/.agents/skills/guidelines.md new file mode 100644 index 00000000..d0d0b59d --- /dev/null +++ b/.agents/skills/guidelines.md @@ -0,0 +1,64 @@ +--- +name: karpathy-guidelines +description: Behavioral guidelines to reduce common LLM coding mistakes. Use when writing, reviewing, or refactoring code to avoid overcomplication, make surgical changes, surface assumptions, and define verifiable success criteria. +license: MIT +--- + +Tradeoff: These guidelines bias toward caution over speed. For trivial tasks, use judgment. + +1. Think Before Coding + +Don't assume. Don't hide confusion. Surface tradeoffs. + +Before implementing: + + State your assumptions explicitly. If uncertain, ask. + If multiple interpretations exist, present them - don't pick silently. + If a simpler approach exists, say so. Push back when warranted. + If something is unclear, stop. Name what's confusing. Ask. + +2. Simplicity First + +Minimum code that solves the problem. Nothing speculative. + + No features beyond what was asked. + No abstractions for single-use code. + No "flexibility" or "configurability" that wasn't requested. + No error handling for impossible scenarios. + If you write 200 lines and it could be 50, rewrite it. + +Ask yourself: "Would a senior engineer say this is overcomplicated?" If yes, simplify. +3. Surgical Changes + +Touch only what you must. Clean up only your own mess. + +When editing existing code: + + Don't "improve" adjacent code, comments, or formatting. + Don't refactor things that aren't broken. + Match existing style, even if you'd do it differently. + If you notice unrelated dead code, mention it - don't delete it. + +When your changes create orphans: + + Remove imports/variables/functions that YOUR changes made unused. + Don't remove pre-existing dead code unless asked. + +The test: Every changed line should trace directly to the user's request. +4. Goal-Driven Execution + +Define success criteria. Loop until verified. 
+ +Transform tasks into verifiable goals: + + "Add validation" → "Write tests for invalid inputs, then make them pass" + "Fix the bug" → "Write a test that reproduces it, then make it pass" + "Refactor X" → "Ensure tests pass before and after" + +For multi-step tasks, state a brief plan: + +1. [Step] → verify: [check] +2. [Step] → verify: [check] +3. [Step] → verify: [check] + +Strong success criteria let you loop independently. Weak criteria ("make it work") require constant clarification. diff --git a/.gitea/scripts/resolve-release-version.sh b/.gitea/scripts/resolve-release-version.sh new file mode 100755 index 00000000..297aafe3 --- /dev/null +++ b/.gitea/scripts/resolve-release-version.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash +# Resolve the release version for a per-crate release workflow. +# +# Usage: +# resolve-release-version.sh [manual-version] +# +# Inputs are positional so callers can plug it from any CI without +# environment-variable contracts: +# - tag-prefix: e.g. "harmony-fleet-operator-" (NO trailing v) +# - ref-name: e.g. "harmony-fleet-operator-v0.1.0" (push-tag case) +# - manual-version: optional; takes precedence over ref parsing +# (workflow_dispatch case) +# +# Prints the resolved version (e.g. "v0.1.0") to stdout; exits non-zero +# with a message to stderr if neither input yields one. +# +# Interim: this should eventually live in a harmony Rust binary that +# understands git refs natively. See PR discussion on +# .gitea/workflows/harmony-fleet-operator.yaml. 
+ +set -euo pipefail + +PREFIX="${1:?usage: resolve-release-version.sh [manual-version]}" +REF="${2:?usage: resolve-release-version.sh [manual-version]}" +MANUAL="${3-}" + +if [ -n "$MANUAL" ]; then + VERSION="$MANUAL" +else + VERSION="${REF#${PREFIX}}" +fi + +if [ -z "$VERSION" ] || [ "$VERSION" = "$REF" ]; then + echo "could not resolve version from ref '$REF' (prefix '$PREFIX', manual '$MANUAL')" >&2 + exit 1 +fi + +echo "$VERSION" diff --git a/.gitea/workflows/harmony-fleet-operator.yaml b/.gitea/workflows/harmony-fleet-operator.yaml index 140d8c83..d9dbbc95 100644 --- a/.gitea/workflows/harmony-fleet-operator.yaml +++ b/.gitea/workflows/harmony-fleet-operator.yaml @@ -1,12 +1,19 @@ -name: Build and push harmony-fleet-operator image +name: Release harmony-fleet-operator (image + chart) on: push: - branches: - - master + tags: + # Per-crate release tag. One tag → one image + one chart, both + # at the same version. Format: `harmony-fleet-operator-v0.1.0`. + - 'harmony-fleet-operator-v*' workflow_dispatch: + inputs: + version: + description: 'Version tag to release (e.g. v0.1.0). Required for manual runs.' + required: true + type: string jobs: - build_and_push: + release: container: image: hub.nationtech.io/harmony/harmony_composer:latest runs-on: dind @@ -14,7 +21,7 @@ jobs: - name: Checkout code uses: actions/checkout@v4 - - name: Log in to hub.nationtech.io + - name: Log in to hub.nationtech.io (docker) uses: docker/login-action@v3 with: registry: hub.nationtech.io @@ -24,21 +31,39 @@ jobs: - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 - # Build context is the workspace root because the operator's - # Cargo.toml has `path = "../../harmony"` deps. The multi-stage - # Dockerfile runs `cargo build` itself inside a pinned rust - # image, so no host-side cargo step is needed. + # helm is not in harmony_composer:latest at time of writing; pull + # the official installer. One-shot, no apt source needed. # - # TODO: add buildx layer caching. 
Each run currently recompiles - # the whole `harmony` workspace from scratch in the builder - # stage. Add `cache-from: type=gha` + `cache-to: type=gha,mode=max` - # below once build time becomes the bottleneck. If layer cache - # alone isn't enough, consider splitting the Dockerfile with - # cargo-chef (no other crate in this repo does that yet). - - name: Build and push - uses: docker/build-push-action@v6 - with: - context: . - file: fleet/harmony-fleet-operator/Dockerfile - push: true - tags: hub.nationtech.io/harmony/harmony-fleet-operator:latest + # TODO: bake helm into harmony_composer so this step disappears. + - name: Install helm + run: | + curl -fsSL https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash + + - name: Log in to hub.nationtech.io (helm OCI) + run: | + echo "${{ secrets.HUB_BOT_PASSWORD }}" \ + | helm registry login hub.nationtech.io \ + --username "${{ secrets.HUB_BOT_USER }}" \ + --password-stdin + + - name: Resolve version + id: ver + run: | + VERSION=$(.gitea/scripts/resolve-release-version.sh \ + "harmony-fleet-operator-" "$GITHUB_REF_NAME" "${{ inputs.version }}") + echo "version=$VERSION" >> "$GITHUB_OUTPUT" + + # Same script a developer would run from their laptop in an + # outage. All build logic lives in Rust under + # fleet/harmony-fleet-deploy; CI is just a thin trigger. + # + # TODO (carried over from the previous workflow): add buildx + # layer caching. Each run currently recompiles the whole + # `harmony` workspace from scratch in the Dockerfile's builder + # stage. cargo-chef + `cache-from: type=gha` would help once + # build time becomes the bottleneck. 
+ - name: Build and push image + chart + run: | + ./fleet/harmony-fleet-operator/release.sh \ + hub.nationtech.io \ + "${{ steps.ver.outputs.version }}" diff --git a/AGENTS.md b/AGENTS.md index 43847e1f..3eae161d 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -2,6 +2,10 @@ This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. +## Read First — Engineering Guidelines + +**[.agents/skills/guidelines.md](.agents/skills/guidelines.md) is mandatory reading before any code change.** It encodes the bar this codebase holds — Think Before Coding, Simplicity First, Surgical Changes, Goal-Driven Execution. The cardinal sins flagged in past reviews: rebuilding modules that already exist instead of composing with them, per-component scaffolding when a function or macro would do, inlined bash in YAML workflows, and comment density that drowns the code. If you write 200 lines and a senior would call it overcomplicated, rewrite it. + ## Build & Test Commands ```bash diff --git a/Cargo.lock b/Cargo.lock index b27c5d2b..97d0345a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4107,6 +4107,7 @@ dependencies = [ "tokio", "toml", "tracing", + "tracing-subscriber", ] [[package]] diff --git a/ROADMAP/fleet_platform/dashboard_ingress.md b/ROADMAP/fleet_platform/dashboard_ingress.md new file mode 100644 index 00000000..eb278e14 --- /dev/null +++ b/ROADMAP/fleet_platform/dashboard_ingress.md @@ -0,0 +1,92 @@ +# Fleet operator dashboard — make shippable and expose via Ingress + +## Context + +The operator binary has a server-side dashboard (axum + Maud + HTMX +under `fleet/harmony-fleet-operator/src/frontend/`), but it is **not +shippable today**. The k3d smoke-test of the release pipeline made +this concrete: the chart correctly omits any `Service` or `Ingress` +because there is no production-ready dashboard endpoint to point them +at. Three blockers, in order of dependency. + +## Work to be done + +### 1. 
Build the production image with the dashboard included + +- [ ] Update `fleet/harmony-fleet-operator/Dockerfile` to build with + `--features web-frontend` (currently + `cargo build --release --locked -p harmony-fleet-operator`, + no features). +- [ ] Confirm Tailwind CSS is embedded at build time inside the + builder stage. The crate doc says the CSS is embedded when + `tailwindcss` is on PATH at build time, otherwise the bundle is + empty and `--css-from` must be passed at runtime. Decide: ship + with embedded CSS (install `tailwindcss` in the builder stage) + or document the empty-bundle path. +- [ ] Confirm the build still satisfies the cross-compile gating + added in PR #291 (`ci: fix Windows cross-compile by gating + unix-only harmony code`) — the `web-frontend` feature must not + pull in unix-only code on Windows targets if Windows is still a + CI target. + +### 2. Replace the mock-only `serve-web` with a real implementation + +- [ ] Implement `FleetService` against the real NATS + Kubernetes + backend (the operator currently uses + `MockFleetService::default()` and bails when `--mock` is + not passed: `main.rs:125` — `"serve-web without --mock is not + implemented yet (real FleetService impl pending)"`). +- [ ] Decide the runtime topology: does the controller and the web + server share a Pod and a process? Two containers in one Pod? + Two separate Deployments? Current code suggests "same process, + different subcommand"; the chart will need to be updated + whichever way it goes. +- [ ] Wire the Zitadel auth env vars (`FLEET_AUTH_*` from `dev.sh`) + through the chart's Pod env. These are + operator-environment-specific (like the existing + `FLEET_OPERATOR_CREDENTIALS_TOML` Secret) and should likely + stay out of the redistributable chart, mounted by the deploy + pipeline. +- [ ] Decide on the `FLEET_OPERATOR_COOKIE_KEY_B64` lifecycle: + operator-generated on first boot? Deploy-time secret? Document. + +### 3. 
Expose the dashboard via Service + Ingress in the chart + +- [ ] Add a `Service` resource to `chart.rs` (ClusterIP, target port + 18080 to match the default `serve-web --addr`). +- [ ] Add an `Ingress` resource. Open questions: + - Ingress class: assume `traefik` (k3d default)? Make it + configurable via `ChartOptions`? + - Host: configurable via `ChartOptions` (e.g., + `fleet.my-cluster.example.com`); no sensible default. + - TLS: cert-manager `ClusterIssuer` reference, or expect TLS to be + terminated upstream? Probably a `ChartOptions.tls_issuer: + Option` knob — `None` means "no TLS section on the + Ingress." +- [ ] Decide whether the Ingress is in scope for the chart at all, + or whether it should live in a separate `*-ingress` chart that + the deploy layer composes. The first path is simpler; + the second matches "small composable Scores" from ADR-023. +- [ ] Smoke-test on k3d: install the chart, `curl` the dashboard + through the k3d LoadBalancer, confirm HTTP 200 and the page + renders. + +## Out of scope here + +- Decisions about who hosts the dashboard's auth (Zitadel-only or + multi-IdP) — that's a product question, not a chart question. +- Operator HA. The current chart is `replicas: 1`. Multi-replica + needs leader election in the controller, which is its own work. +- Dashboard observability (metrics endpoint, structured access + logs) — fold in when adding the Service. + +## Why this lives in its own roadmap + +These three items are dependency-chained (1 → 2 → 3) and each is +non-trivial. Bundling them with the CI release pipeline would couple +unrelated risks and make the PR un-reviewable. Keeping this file +unnumbered (per +[`ROADMAP/fleet_platform/v0_1_plan.md`](v0_1_plan.md) and +[`v0_2_plan.md`](v0_2_plan.md) — numbered files are versioned +milestones) signals that this is a free-floating workstream that +slots into whichever milestone picks it up. 
diff --git a/docs/adr/drafts/012-1-release-architecture.md b/docs/adr/drafts/012-1-release-architecture.md new file mode 100644 index 00000000..5cec8292 --- /dev/null +++ b/docs/adr/drafts/012-1-release-architecture.md @@ -0,0 +1,390 @@ +# ADR-012 follow-up: Release Architecture Mechanism (Capability-Driven Build, Package, Push) + +Initial Author: Jean-Gabriel Gill-Couture (drafted with Claude) + +Initial Date: 2026-05-27 + +Last Updated Date: 2026-05-27 + +## Status + +**Draft. Clarification + mechanism for ADR-012, not a separate +decision.** ADR-012 (Project Delivery Automation, 2025-06-04) +locked the *intent*: application-level Scores (LAMPScore-style) +drive an opinionated pipeline — *empty-check → package&publish → +deploy staging → sanity-check → deploy production → sanity-check* — +with CD tools (Argo / Flux) activated by default and called via +API; the same `harmony` binary runs the project's module locally +and in CI. + +This document supplements that intent with the concrete *mechanism*: +how steps 2, 3, 5 (package&publish, deploy staging, deploy +production) actually compile and execute against today's +Score-Topology-Capability primitives, and what to delete from the +two interim attempts (`modules/application` + the per-component +`harmony-fleet-release` binary) when the mechanism lands. + +Companion references: +- **ADR-023 (Deploy Architecture, Accepted)** — the `DeployScore` + proposed here plugs into the `harmony_cli::run` flow and + `*-deploy` crate pattern ADR-023 mandates; the smoke-test + contract (ADR-023 principle 4) is how steps 4 and 6 of ADR-012 + block. +- **ADR-024 draft (Fleet Platform Capability Decomposition)** — + same decomposition shape, applied to release: framework-level + capability traits, not per-app methods. +- **ADR-003** — capability traits represent industry concepts, not + tools. Every new capability here passes the swap-out test. +- **ADR-018** — template hydration. 
`ChartSource::Builder` keeps + the chart fully typed up to package time. + +Open questions remain in the *Open questions* section below; they +are deliberately left open for follow-up clarifications to bolt on +the same way this one bolts on to ADR-012. + +## Context + +ADR-012's intent never got a concrete mechanism beyond `LAMPScore` +and the (later-abandoned) `modules/application` parallel hierarchy. +As a result, today every app harmony hosts re-implements its own +release pipeline: + +- A ~270-line per-component release binary (`fleet_release.rs`). +- A per-component `release.sh` wrapper script. +- A per-component `.gitea/workflows/-.yaml`. +- Hand-written chart hydration (`fleet/harmony-fleet-deploy/src/operator/chart.rs`). + +PR review (#301) flagged this directly: *"we should not have to +rebuild a new cli for every component of every app using harmony"* +and *"the burden of designing the process and cli should not be on +the user. Fleet is the equivalent of a user."* The framework owns +deploy (ADR-023); release is the missing half of ADR-012. + +A naive fix — "add a `define_release!` macro so each component is +one declarative invocation" — solves the duplication but leaves the +mechanism wrong. Building, packaging, and pushing are not app +concerns. They are *capabilities* that an environment provides (a +docker daemon, an OCI registry, a CI runner), the same way DNS, +LoadBalancer, or PostgreSQL are. + +This ADR pulls release into the Score-Topology-Capability pattern +that already governs deploy. + +## Two prior attempts and what each got right + +### Attempt 1 — `modules/application` (the `RustWebapp` example, `ApplicationScore`, `ApplicationFeature`) + +`examples/rust/src/main.rs` shows what good DX looks like: + +```rust +let application = Arc::new(RustWebapp { + name: "harmony-example-rust-webapp".to_string(), + project_root: PathBuf::from("./webapp"), + framework: Some(RustWebFramework::Leptos), + service_port: 3000, + .. 
+}); + +let app = ApplicationScore { + features: vec![ + Box::new(PackagingDeployment { application: application.clone() }), + Box::new(Monitoring { application: application.clone(), alert_receiver: vec![...] }), + ], + application, +}; +``` + +**What it got right:** + +| Property | Why it's right | +|---|---| +| Declarative app spec at the call site | One struct, ~15 lines, describes the whole thing | +| Per-app type implements capability traits (`OCICompliant`, `HelmPackage`, `Webapp`) | Compiler rejects nonsensical combos — features state their bounds (`impl ApplicationFeature for PackagingDeployment`) | +| Features compose orthogonally | `vec![PackagingDeployment, Monitoring, …]` extensible without touching the app | +| One feature does one thing | Small, testable, replaceable | + +**What it got wrong:** + +| Anti-pattern | Symptom | +|---|---| +| Parallel hierarchy | `Application` / `ApplicationFeature` / `ApplicationInterpret` / `ApplicationScore` live alongside `Score` / `Interpret` — two ways to express the same thing | +| App-owned capabilities | `OCICompliant::build_push_oci_image()` is a method on the app — the app *does* the build. Should be: topology *provides* a builder, score *uses* it | +| `MultiTargetTopology` + `DeploymentTarget::LocalDev / Production` | Inline `match topology.current_target() { LocalDev => …, _ => Argo… }` in `PackagingDeployment::ensure_installed`. The author's own comment: *"It still does not feel right though."* | +| ArgoCD hardcoded as the production mechanism | Should be a `ContinuousDelivery` capability; OPNsense vs CoreDNS distinction (ADR-003) applies — Argo today, Flux or something else tomorrow | + +### Attempt 2 — `harmony-fleet-release` binary + `release.sh` (the current branch) + +After abandoning `modules/application`'s parallel hierarchy, fleet +went the other direction: hand-rolled per-component binaries, no +framework-level abstraction. 
Solved the parallel-hierarchy problem +by removing the hierarchy entirely; created a 270-line per-component +duplication problem in its place. + +**Net diagnosis:** attempt 1 had the right *shape* (declarative +app + composable features + capability bounds), wrong *mechanism* +(parallel layer, app-side capabilities). Attempt 2 had the right +mechanism (use the existing Score/Topology primitives) but no +abstraction. The fix is to keep attempt 1's shape and express it +in attempt 2's mechanism. + +## Mechanism (proposed) + +**A release is a Score driven by Topology capabilities, exactly like +a deploy.** No parallel hierarchy. No per-component CLI binaries. No +app-side build/push methods. ADR-012's opinionated lifecycle pipeline +becomes a composed `Vec<Box<dyn Score<T>>>` — each pipeline step is +one Score, harmony executes them in order, each blocks on the +smoke-test contract (ADR-023, step 4). + +### Capabilities (topology-side) + +New capability traits, alongside `DnsServer`, `HelmCommand`, etc.: + +```rust +pub trait ContainerBuilder { + async fn build(&self, ctx: &BuildContext, image_ref: &str) -> Result<(), Error>; +} + +pub trait OciRegistry { + async fn push_image(&self, image_ref: &str) -> Result<(), Error>; + fn registry_url(&self) -> &str; +} + +pub trait HelmRegistry { + async fn push_chart(&self, tgz: &Path, project: &str) -> Result<(), Error>; +} + +pub trait ContinuousDelivery { + async fn sync_to(&self, chart_ref: &ChartRef, target: &DeployTarget) -> Result<(), Error>; +} +``` + +Each capability is an industry concept (per ADR-003 rule). 
Adapters: + +| Capability | LocalDev provider | CI provider | Production provider | +|---|---|---|---| +| `ContainerBuilder` | local docker daemon | gitea runner docker | remote buildkit | +| `OciRegistry` | local registry, or noop (k3d image-import) | hub.nationtech.io | hub.nationtech.io | +| `HelmRegistry` | noop (local helm) | hub.nationtech.io | hub.nationtech.io | +| `ContinuousDelivery` | direct helm install | direct helm install | ArgoCD `ArgoHelmScore` | + +`K8sAnywhereTopology` composes these capabilities the way it already +composes `K8sclient + HelmCommand + Ingress + …`. A CI-runner +topology variant composes the same caps with CI-flavored adapters. + +### Application spec (data only) + +```rust +pub struct AppSpec { + pub name: &'static str, + pub project_root: PathBuf, + pub image: ImageSource, // Dockerfile path + build args, OR a `Buildable` impl that emits one + pub chart: ChartSource, // PathBuf to a directory, OR a `ChartBuilder` closure / impl returning HelmChart +} +``` + +No methods that *do* work. The spec is pure data. Methods (`build`, +`push`, `package`) live on the topology capabilities. This is the +shift that fixes attempt 1: `RustWebapp` no longer implements +`OCICompliant::build_push_oci_image()`; instead, `AppSpec` describes +*what* to build, and `ContainerBuilder` (on the topology) does it. + +### Scores + +```rust +pub struct ReleaseScore { app: Arc, version: String } +pub struct DeployScore { app: Arc, version: String, target: DeployTarget } + +impl Score for ReleaseScore { ... } +impl Score for DeployScore { ... } +``` + +The trait bounds *are* the spec of what a topology must provide. +Compile-time guarantee: you cannot construct `ReleaseScore` against +a topology that can't release. + +### The opinionated pipeline (ADR-012 realized) + +ADR-012's pipeline maps directly onto a Score sequence: + +| ADR-012 step | Mechanism | +|---|---| +| 1. Empty check | Out of scope for harmony (matches ADR-012). 
Project's own CI step runs ahead of `harmony-` invocation. | +| 2. Package & publish | `ReleaseScore` — builds image, hydrates chart, pushes both. | +| 3. Deploy to staging | `DeployScore { target: DeployTarget::Staging }` — talks to the topology's `ContinuousDelivery` (Argo by default per ADR-012). | +| 4. Sanity check staging | Smoke-test companion on the `DeployScore` (ADR-023 principle 4). | +| 5. Deploy to production | Same `DeployScore` with `DeployTarget::Production`. CI gates approval; harmony just runs the score. | +| 6. Sanity check production | Same smoke-test companion against production. | + +### Call site (after the migration) + +```rust +let app = Arc::new(AppSpec { + name: "harmony-fleet-operator", + project_root: ".".into(), + image: ImageSource::Dockerfile("fleet/harmony-fleet-operator/Dockerfile".into()), + chart: ChartSource::Builder(Box::new(operator::chart::build)), +}); + +let scores: Vec>> = vec![ + Box::new(ReleaseScore { app: app.clone(), version: from_tag(ref_name)? }), + Box::new(DeployScore { app: app.clone(), version, target: DeployTarget::Staging }), + Box::new(DeployScore { app: app.clone(), version, target: DeployTarget::Production }), +]; + +harmony_cli::run(Inventory::autoload(), CITopology::from_env(), scores, None).await? +``` + +**That's the user-facing surface for any app.** It matches ADR-012's +"run the same command anywhere" promise — same binary, local laptop +or CI runner; topology adapters differ, Scores don't. No per-component +release binary. No `release.sh` per component. Adding a component is +one new `AppSpec` value. + +### CI integration + +Workflow yaml shrinks to: + +```yaml +- name: Release + run: | + harmony-fleet --release --from-tag "$GITHUB_REF_NAME" +``` + +The `--from-tag` flag is implemented inside `harmony_release` (Rust, +not bash). It parses `--v` against the +registered `AppSpec`s and constructs `ReleaseScore { app, version }` +for the matching one. 
Tag malformed → fail at CI start, not three +minutes into the docker build. + +`.gitea/scripts/resolve-release-version.sh` (just shipped) is the +interim form; the script disappears once `--from-tag` lands. + +### What replaces the `modules/application` layer + +`Application` / `ApplicationFeature` / `ApplicationInterpret` / +`ApplicationScore` are deleted. Migration path: + +| Old | New | +|---|---| +| `RustWebapp` (struct implementing capabilities) | `AppSpec { name, project_root, image: ImageSource::Buildable(Box::new(RustLeptosBuilder)), chart: … }` | +| `OCICompliant::build_push_oci_image` | `ContainerBuilder::build` on the topology (Score calls it) | +| `HelmPackage::build_push_helm_package` | `HelmRegistry::push_chart` on the topology (Score calls it) | +| `PackagingDeployment` (ApplicationFeature) | `ReleaseScore + DeployScore` (regular Scores) | +| `Monitoring` (ApplicationFeature) | `MonitoringScore` (regular Score) | +| `ApplicationScore` wrapping features | `Vec<Box<dyn Score<T>>>` passed to `harmony_cli::run` | + +The `RustWebapp` Leptos Dockerfile generator survives — it becomes a +`Buildable` impl that emits a `Dockerfile` for `ImageSource::Buildable` +to use. Same code, different home. + +## Alternatives considered + +### Macro per component — `define_release!(Component::Operator { … })` + +Solves the duplication. Still leaves build/push as app-side work, +still gives every app its own CLI surface. Doesn't address the +capability gap. **Rejected.** + +### Keep `modules/application` as-is, add a release feature + +Adds a third release variant to the existing parallel hierarchy. +Doubles down on the wrong mechanism. **Rejected.** + +### Plugin-discovery `harmony` top-level binary (ADR-023 "tomorrow C") + +Orthogonal to this clarification. Plugin discovery is *how the +binaries are invoked*; this is *what the binaries do*. Compatible: +each app ships a `harmony-<app>` plugin binary that registers its +`AppSpec`s with `harmony_release::cli`. 
+ +### Promote to a fresh ADR-025 + +The shape proposed here doesn't disagree with ADR-012 — it +*implements* it. A new ADR would invite reading them as competing +decisions and split the conversation in two. Keeping this as +ADR-012's first clarification (012-1) keeps the lineage explicit. +**Rejected for now;** revisit if a future decision genuinely +diverges from ADR-012's intent. + +## Consequences + +### Positive + +- ADR-012 intent finally has a mechanism. The opinionated pipeline + becomes a composed `Vec<Box<dyn Score<T>>>` — concrete, testable, + identical local and in CI. +- One framework, one mechanism. Release joins deploy in the + Score-Topology-Capability pattern. No second concept to learn. +- Capability-driven swap-out. Move from docker→podman, Harbor→ECR, + helm→ko, Argo→Flux without touching any app's `AppSpec`. ADR-003 + rule held. +- ~250 lines deleted from each app (per-component bin + release.sh + + workflow yaml duplication). +- Adding a component is one `AppSpec` value, not a new binary. +- LocalDev vs CI vs Production is *topology selection at runtime* + (ADR-023 principle 6), not branching inside a Score. + +### Negative / costs + +- Migration of `modules/application` and the `RustWebapp` example to + the new shape is a non-trivial PR (preserve the Leptos Dockerfile + generation as a `Buildable` impl). +- New crate `harmony_release` (or new module inside `harmony` core). + Sizing TBD during implementation; target the same ~ADR-023 scope. +- Adapter coverage: at least three `ContainerBuilder` adapters + (local docker, CI docker, k3d-image-import) and two `OciRegistry` + / `HelmRegistry` adapters (hub.nationtech.io, noop) need to land + for the first migration to be useful. + +### Risks to watch + +- **Capability surface creep.** Resist adding a fourth or fifth + capability for every minor variant (e.g. `MultiArchContainerBuilder`, + `SignedOciRegistry`). Extend existing capabilities first; create + new ones only when the swap-out test (ADR-003) holds. 
+- **Build context size.** `BuildContext` could absorb every flag + ever needed (`build_args`, `secrets`, `platforms`, `targets`, + `cache_from`, …). Start small; grow with documented justification. +- **`Buildable` trait explosion.** Per-language Dockerfile generators + (`RustLeptos`, `Python`, `Node`, …) belong as `Buildable` impls, + not as variants on `AppSpec`. Resist adding them to core until at + least two callers need each one. + +## Implementation order (separate PRs, each shippable) + +1. **`harmony_release` crate** — `AppSpec`, `ImageSource`, + `ChartSource`, `ReleaseScore`, capability trait stubs, `--from-tag` + parsing. No adapters yet. +2. **Local + CI capability adapters** — `LocalDockerBuilder`, + `CiDockerBuilder`, `HarborOciRegistry`, `HarborHelmRegistry`. + Compose into `K8sAnywhereTopology` and a new `CiTopology`. +3. **Migrate fleet operator release** to `AppSpec` + `ReleaseScore`. + Delete `harmony-fleet-release` binary, `release.sh`, + `.gitea/scripts/resolve-release-version.sh`. Workflow yaml goes to + the ~15-line form. Validates the framework on a real app. +4. **Add fleet agent + callout** as `AppSpec` entries when their + pipelines land. One-line additions, no scaffolding. +5. **Migrate `modules/application` `RustWebapp` example** to the new + shape. Preserves the Leptos Dockerfile generator. Deletes the + parallel `Application` / `ApplicationFeature` hierarchy. +6. **(Stretch) `ContinuousDelivery` capability** as a real abstraction + above the current `ArgoHelmScore`. Enables Flux / direct-helm as + peer providers. Out of v1 scope; ADR remains compatible. + +## Open questions + +- **Chart hydration genericism.** Should `ChartSource::FromScore` exist? + The deploy Score's resources are a near-superset of what the chart + contains. If we get this right, `ChartSource::Builder` becomes the + exception, not the rule. Worth scoping in #5 or later. +- **`DeployTarget` shape.** Today fleet uses `(namespace, image_tag)`. 
+ A typed enum (`InCluster { namespace, … }`, `MultiCluster { … }`, + `EdgeDevice { device_id, … }`) may capture more. Defer to the + agent-CD work (manual upgrades per device) where this gets exercised. +- **Image identity.** The image's full ref (`//:`) + is currently built by string concatenation in three places. Should + there be an `ImageRef` type with parse/display impls? Probably; add + in #1 if cost is small. +- **Buildable for non-Rust languages.** Out of scope until a non-Rust + app needs it, then add by demand. diff --git a/examples/fleet_e2e_demo/src/lib.rs b/examples/fleet_e2e_demo/src/lib.rs index 81b4199c..71790221 100644 --- a/examples/fleet_e2e_demo/src/lib.rs +++ b/examples/fleet_e2e_demo/src/lib.rs @@ -675,10 +675,10 @@ key_json = """ output_dir: PathBuf::new(), // unused on this code path image: OPERATOR_IMAGE_TAG.to_string(), image_pull_policy: "IfNotPresent".to_string(), - namespace: OPERATOR_NAMESPACE.to_string(), nats_url: format!("nats://{NATS_RELEASE}.{NATS_NAMESPACE}.svc.cluster.local:4222"), log_level: "info,kube_runtime=warn".to_string(), credentials: Some(OperatorCredentials { credentials_toml }), + chart_version: None, }; // CRDs first — the operator watches them on startup. @@ -693,7 +693,7 @@ key_json = """ // RBAC. 
K8sResourceScore::single( - build_service_account(&opts), + build_service_account(), Some(OPERATOR_NAMESPACE.to_string()), ) .interpret(&Inventory::autoload(), topology) @@ -705,7 +705,7 @@ key_json = """ .await .context("operator ClusterRole apply")?; - K8sResourceScore::single(build_cluster_role_binding(&opts), None) + K8sResourceScore::single(build_cluster_role_binding(OPERATOR_NAMESPACE), None) .interpret(&Inventory::autoload(), topology) .await .context("operator ClusterRoleBinding apply")?; diff --git a/fleet/harmony-fleet-deploy/Cargo.toml b/fleet/harmony-fleet-deploy/Cargo.toml index da44b0c6..2ed8ffe8 100644 --- a/fleet/harmony-fleet-deploy/Cargo.toml +++ b/fleet/harmony-fleet-deploy/Cargo.toml @@ -16,6 +16,16 @@ path = "src/lib.rs" name = "harmony-fleet-deploy" path = "src/main.rs" +# Release tool: builds and pushes the image + hydrated helm chart for +# one fleet component (operator today; agent and nats-callout will +# join as their pipelines land). Driven by the per-component +# release.sh wrappers and the .gitea CI workflows. App-scoped (not +# component-scoped) so a single binary covers every fleet component +# behind a `--component` flag. +[[bin]] +name = "harmony-fleet-release" +path = "src/bin/fleet_release.rs" + [dependencies] harmony = { path = "../../harmony", features = ["podman"] } harmony_cli = { path = "../../harmony_cli" } @@ -40,3 +50,4 @@ thiserror = { workspace = true } tokio = { workspace = true, features = ["full"] } toml = { workspace = true } tracing = { workspace = true } +tracing-subscriber = { workspace = true } diff --git a/fleet/harmony-fleet-deploy/src/argo/mod.rs b/fleet/harmony-fleet-deploy/src/argo/mod.rs new file mode 100644 index 00000000..cf830191 --- /dev/null +++ b/fleet/harmony-fleet-deploy/src/argo/mod.rs @@ -0,0 +1,71 @@ +//! Argo `Application` definition for the operator chart. Composed +//! into [`ArgoHelmScore::argo_apps`] by main.rs. 
+ +use harmony::modules::application::features::{ + ArgoApplication, Automated, Backoff, Helm, Retry, Source, SyncPolicy, +}; + +pub const CHART_NAME: &str = "harmony-fleet-operator"; + +/// Build an Argo Application syncing +/// `oci://<registry>/<project>/harmony-fleet-operator:<version>` +/// into `target_namespace`. +pub fn operator_application( + target_namespace: &str, + registry: &str, + project: &str, + chart_version: &str, +) -> ArgoApplication { + // Helm chart versions are SemVer; the publish pipeline strips the + // leading `v`. Accept either form from the caller. + let version = chart_version + .strip_prefix('v') + .unwrap_or(chart_version) + .to_string(); + ArgoApplication { + name: CHART_NAME.to_string(), + namespace: None, + project: "default".to_string(), + source: Source { + // OCI urls intentionally have no scheme; see Source::repo_url. + repo_url: format!("{registry}/{project}"), + target_revision: Some(version), + chart: CHART_NAME.to_string(), + path: String::new(), + helm: Helm { + pass_credentials: None, + parameters: vec![], + file_parameters: vec![], + release_name: Some(CHART_NAME.to_string()), + value_files: vec![], + ignore_missing_value_files: None, + values: None, + values_object: None, + skip_crds: None, + skip_schema_validation: None, + version: None, + kube_version: None, + api_versions: vec![], + namespace: Some(target_namespace.to_string()), + }, + }, + sync_policy: SyncPolicy { + automated: Automated { + prune: true, + self_heal: true, + allow_empty: false, + }, + sync_options: vec!["CreateNamespace=true".to_string()], + retry: Retry { + limit: 5, + backoff: Backoff { + duration: "5s".to_string(), + factor: 2, + max_duration: "3m".to_string(), + }, + }, + }, + revision_history_limit: 10, + destination_namespace: Some(target_namespace.to_string()), + } +} diff --git a/fleet/harmony-fleet-deploy/src/bin/fleet_release.rs b/fleet/harmony-fleet-deploy/src/bin/fleet_release.rs new file mode 100644 index 00000000..fa8ef3e0 --- /dev/null +++ 
b/fleet/harmony-fleet-deploy/src/bin/fleet_release.rs @@ -0,0 +1,270 @@ +//! `harmony-fleet-release` — build + push the image + helm chart for +//! one fleet component at a tagged version. +//! +//! Invoked by the per-component `release.sh` wrappers (which prefill +//! `--component`) and by the `.gitea/workflows/harmony-fleet-*.yaml` +//! CI jobs. The same binary is the developer-laptop fallback during +//! outages. +//! +//! Steps, in order, for the selected component: +//! +//! 1. `docker build` the canonical multi-stage Dockerfile against the +//! workspace root, tagged +//! `<registry>/<project>/<image>:<version>`. +//! 2. `docker push` that image. +//! 3. Hydrate the helm chart for the component, with the pushed image +//! reference baked into the manifest and `chart_version` set to +//! `<version>` so the OCI chart artifact lands at the matching tag. +//! 4. `helm package` the chart directory into a tgz. +//! 5. `helm push` the tgz to `oci://<registry>/<project>`. +//! +//! `docker` (not `podman`) because the existing build scripts and the +//! gitea `dind` runner both use it. `docker login <registry>` and +//! `helm registry login <registry>` are expected to have been run by +//! the caller (CI's `docker/login-action`, dev's manual login). +//! +//! Adding a new component is a new variant on [`Component`] plus a +//! match arm in [`Component::spec`] — no new binary, no new CLI for +//! users to learn. + +use std::path::{Path, PathBuf}; +use std::process::{Command, Stdio}; + +use anyhow::{Context, Result, bail}; +use clap::{Parser, ValueEnum}; +use harmony_fleet_deploy::operator::chart::{ChartOptions, build_chart}; +use tracing::info; + +#[derive(Parser, Debug)] +#[command( + name = "harmony-fleet-release", + about = "Build and push a fleet component's image + helm chart for a tagged release" +)] +struct Cli { + /// Which fleet component to release. Only `operator` is wired up + /// today; `agent` and `nats-callout` are reserved for their + /// upcoming pipelines and will bail with an unimplemented error + /// until then. 
+ #[arg(long, value_enum)] + component: Component, + + /// Registry host, e.g. `hub.nationtech.io`. + #[arg(long)] + registry: String, + + /// Version tag for both image and chart, e.g. `v0.1.0`. A leading + /// `v` is stripped from the chart-version (helm rejects non-semver + /// chart versions, and the OCI tag stays whatever was passed). + #[arg(long)] + version: String, + + /// Project/namespace under the registry. Both image and chart + /// land under this path. + #[arg(long, default_value = "harmony")] + project: String, + + /// Build the image and package the chart but skip both pushes. + /// Useful for local smoke-tests on k3d (sideload the image, helm + /// install the local tgz) without polluting the production + /// registry. CI never sets this. + #[arg(long)] + no_push: bool, +} + +/// The set of fleet components that a release can target. App-scoped +/// CLI per ADR-023: one binary covers every component behind this +/// flag, rather than a binary per component. +#[derive(Copy, Clone, Debug, ValueEnum)] +enum Component { + Operator, + /// Reserved — agent release pipeline not wired up yet. + Agent, + /// Reserved — nats-callout release pipeline not wired up yet. + #[value(name = "nats-callout")] + NatsCallout, +} + +/// Per-component release recipe: where the image's Dockerfile lives +/// (relative to the workspace root) and what to name the published +/// image. Chart hydration is component-specific too; see +/// [`hydrate_chart`]. 
+struct ComponentSpec { + image_name: &'static str, + dockerfile: &'static str, +} + +impl Component { + fn spec(self) -> Result { + match self { + Component::Operator => Ok(ComponentSpec { + image_name: "harmony-fleet-operator", + dockerfile: "fleet/harmony-fleet-operator/Dockerfile", + }), + Component::Agent => bail!( + "agent release pipeline is not wired up yet — see \ + ROADMAP/fleet_platform/ci_cd_setup.md" + ), + Component::NatsCallout => bail!( + "nats-callout release pipeline is not wired up yet — see \ + ROADMAP/fleet_platform/ci_cd_setup.md" + ), + } + } +} + +fn main() -> Result<()> { + // Default to info so the release progress shows up without + // requiring RUST_LOG to be set; explicit RUST_LOG overrides. + tracing_subscriber::fmt() + .with_env_filter( + tracing_subscriber::EnvFilter::try_from_default_env() + .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info")), + ) + .with_writer(std::io::stderr) + .init(); + + let cli = Cli::parse(); + let spec = cli.component.spec()?; + + let workspace = workspace_root(); + let image_ref = format!( + "{}/{}/{}:{}", + cli.registry, cli.project, spec.image_name, cli.version + ); + let oci_repo = format!("oci://{}/{}", cli.registry, cli.project); + + info!("docker build {image_ref}"); + docker_build(&workspace, spec.dockerfile, &image_ref)?; + + if cli.no_push { + info!("skipping docker push (--no-push)"); + } else { + info!("docker push {image_ref}"); + docker_push(&image_ref)?; + } + + info!("generate chart (image={image_ref})"); + // Keep the tempdir alive for the lifetime of main so the tgz path + // is still on disk when --no-push prints it for the caller. 
+ let tmp = tempfile::tempdir().context("creating chart tempdir")?; + let chart_dir = hydrate_chart(cli.component, &image_ref, &cli.version, tmp.path())?; + + info!("helm package {}", chart_dir.display()); + let tgz = helm_package(&chart_dir, tmp.path())?; + + if cli.no_push { + // Move the tgz out of the tempdir so it survives this process + // — otherwise the tempdir drop deletes it before the caller + // can `helm install` from it. + let dst = std::env::current_dir()? + .join(tgz.file_name().context("packaged chart has no filename")?); + std::fs::copy(&tgz, &dst).context("copying packaged chart to CWD")?; + info!(image = %image_ref, chart = %dst.display(), "built (no push)"); + } else { + info!("helm push {} {oci_repo}", tgz.display()); + helm_push(&tgz, &oci_repo)?; + info!( + image = %image_ref, + chart = %format!("{oci_repo}/{}:{}", spec.image_name, chart_version(&cli.version)), + "released" + ); + } + Ok(()) +} + +/// Component-specific chart hydration. Each component owns its own +/// `build_chart`-style entry point in `harmony_fleet_deploy`; this +/// match keeps the release binary the only place that needs to know +/// "which component → which chart builder". +fn hydrate_chart( + component: Component, + image_ref: &str, + version: &str, + output_dir: &Path, +) -> Result { + match component { + Component::Operator => build_chart(&ChartOptions { + output_dir: output_dir.to_path_buf(), + image: image_ref.to_string(), + image_pull_policy: "IfNotPresent".to_string(), + chart_version: Some(chart_version(version)), + ..ChartOptions::default() + }) + .context("building operator chart"), + // Agent and nats-callout already bailed in Component::spec(); + // reaching here would mean a bug. 
+ Component::Agent | Component::NatsCallout => { + unreachable!("Component::spec() returns Err for unwired components") + } + } +} + +fn docker_build(workspace: &Path, dockerfile: &str, image_ref: &str) -> Result<()> { + run(Command::new("docker") + .args(["build", "-f", dockerfile, "-t", image_ref, "."]) + .current_dir(workspace)) +} + +fn docker_push(image_ref: &str) -> Result<()> { + run(Command::new("docker").args(["push", image_ref])) +} + +fn helm_package(chart_dir: &Path, out_dir: &Path) -> Result { + let output = Command::new("helm") + .args([ + "package", + chart_dir.to_str().context("chart_dir path not utf-8")?, + "-d", + out_dir.to_str().context("out_dir path not utf-8")?, + ]) + .stderr(Stdio::inherit()) + .output() + .context("spawning helm package")?; + if !output.status.success() { + bail!("helm package failed (status {})", output.status); + } + // helm prints the absolute path of the produced tgz on stdout + // ("Successfully packaged chart and saved it to: /path/to.tgz"). + // The final whitespace-separated token is that path. + let stdout = String::from_utf8(output.stdout).context("helm package stdout not utf-8")?; + let tgz = stdout + .split_whitespace() + .last() + .map(PathBuf::from) + .context("helm package produced no path on stdout")?; + Ok(tgz) +} + +fn helm_push(tgz: &Path, oci_repo: &str) -> Result<()> { + run(Command::new("helm").args([ + "push", + tgz.to_str().context("tgz path not utf-8")?, + oci_repo, + ])) +} + +fn run(cmd: &mut Command) -> Result<()> { + let status = cmd + .stdout(Stdio::inherit()) + .stderr(Stdio::inherit()) + .status() + .with_context(|| format!("spawning {:?}", cmd.get_program()))?; + if !status.success() { + bail!("{:?} failed (status {})", cmd.get_program(), status); + } + Ok(()) +} + +fn workspace_root() -> PathBuf { + // CARGO_MANIFEST_DIR is fleet/harmony-fleet-deploy → workspace is two up. 
+ let root = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("..") + .join(".."); + root.canonicalize().unwrap_or(root) +} + +// Helm chart versions must be SemVer (no leading `v`). The OCI image +// tag is left untouched — Harbor accepts arbitrary tag strings. +fn chart_version(tag: &str) -> String { + tag.strip_prefix('v').unwrap_or(tag).to_string() +} diff --git a/fleet/harmony-fleet-deploy/src/lib.rs b/fleet/harmony-fleet-deploy/src/lib.rs index 48de4e20..0ab0d307 100644 --- a/fleet/harmony-fleet-deploy/src/lib.rs +++ b/fleet/harmony-fleet-deploy/src/lib.rs @@ -28,6 +28,7 @@ //! the role of the smoke test for now. pub mod agent; +pub mod argo; pub mod companion; pub mod nats; pub mod operator; diff --git a/fleet/harmony-fleet-deploy/src/main.rs b/fleet/harmony-fleet-deploy/src/main.rs index b07090b6..2994fc74 100644 --- a/fleet/harmony-fleet-deploy/src/main.rs +++ b/fleet/harmony-fleet-deploy/src/main.rs @@ -1,29 +1,16 @@ //! `harmony-fleet-deploy` — deploy the fleet stack to a cluster. -//! -//! Built on `harmony_cli::run` like the rest of the workspace's -//! deploy binaries (`harmony_agent_deploy`, …). The CLI offers a -//! minimal env-driven config and hands off to `harmony_cli`, which -//! provides the standard `--filter` / `--all` / `--list` selection -//! surface; pick a single component by filter or run them all. -//! -//! Topology default is [`K8sAnywhereTopology::from_env`] — local k3d -//! when `HARMONY_USE_LOCAL_K3D` flips, otherwise whatever cluster -//! `KUBECONFIG` points at. Per ADR-023 the binary's full set of -//! topologies is compile-time; for the moment only `K8sAnywhere` is -//! wired in, with `K8sBareTopology` planned for the next iteration. -//! -//! What the binary owns: assembling the Scores from environment -//! input. What it does **not** own: any handrolled k8s manifests, -//! any imperative bring-up loops, any auth secret rendering — that -//! all sits inside the `*Score` impls in [`harmony_fleet_deploy`]. 
use anyhow::{Context, Result}; use clap::Parser; use harmony::inventory::Inventory; +use harmony::modules::application::features::ArgoHelmScore; use harmony::score::Score; use harmony::topology::K8sAnywhereTopology; +use harmony_cli::Args as HarmonyCliArgs; use harmony_fleet_deploy::nats::UserPassCredentials; -use harmony_fleet_deploy::{FleetAgentScore, FleetNatsScore, FleetOperatorScore, agent::PodTarget}; +use harmony_fleet_deploy::{ + FleetAgentScore, FleetNatsScore, FleetOperatorScore, agent::PodTarget, argo, +}; #[derive(Parser, Debug)] #[command( @@ -88,6 +75,41 @@ struct CliConfig { /// NATS device password. Required. #[arg(long, env = "HARMONY_FLEET_NATS_DEVICE_PASS")] nats_device_pass: Option, + + /// Deploy the operator via Argo CD instead of harmony-direct + /// helm. Re-run with a different `--operator-chart-version` to + /// upgrade or roll back. + #[arg(long, env = "HARMONY_FLEET_USE_ARGO")] + use_argo: bool, + + #[arg( + long, + env = "HARMONY_FLEET_OPERATOR_CHART_REGISTRY", + default_value = "hub.nationtech.io" + )] + operator_chart_registry: String, + + #[arg( + long, + env = "HARMONY_FLEET_OPERATOR_CHART_PROJECT", + default_value = "harmony" + )] + operator_chart_project: String, + + #[arg( + long, + env = "HARMONY_FLEET_OPERATOR_CHART_VERSION", + default_value = "0.0.1" + )] + operator_chart_version: String, + + #[arg(long, env = "HARMONY_FLEET_ARGO_NAMESPACE", default_value = "argocd")] + argo_namespace: String, + + // Flattened so a single argv parse covers both this CLI and + // harmony_cli's `--yes` / `--filter` / `--all` / etc. 
+ #[command(flatten)] + harmony_cli: HarmonyCliArgs, } impl CliConfig { @@ -124,10 +146,6 @@ async fn main() -> Result<()> { let device_user = creds.device_user.clone(); let device_pass = creds.device_pass.clone(); let nats = FleetNatsScore::user_pass(cli.namespace.clone(), cli.nats_node_port, creds); - let operator = FleetOperatorScore::new() - .namespace(cli.namespace.clone()) - .image(cli.operator_image.clone()) - .nats_url(nats.in_cluster_url()); let agent = FleetAgentScore::pod( cli.namespace.clone(), PodTarget::user_pass( @@ -139,14 +157,34 @@ async fn main() -> Result<()> { ), ); - let scores: Vec>> = - vec![Box::new(nats), Box::new(operator), Box::new(agent)]; + // `--use-argo` swaps the operator path only. NATS + agent stay + // direct in v1. + let scores: Vec>> = if cli.use_argo { + let argo = ArgoHelmScore { + namespace: cli.argo_namespace.clone(), + openshift: false, + ingress_class_name: None, + argo_apps: vec![argo::operator_application( + &cli.namespace, + &cli.operator_chart_registry, + &cli.operator_chart_project, + &cli.operator_chart_version, + )], + }; + vec![Box::new(nats), Box::new(argo), Box::new(agent)] + } else { + let operator = FleetOperatorScore::new() + .namespace(cli.namespace.clone()) + .image(cli.operator_image.clone()) + .nats_url(nats.in_cluster_url()); + vec![Box::new(nats), Box::new(operator), Box::new(agent)] + }; harmony_cli::run( Inventory::autoload(), K8sAnywhereTopology::from_env(), scores, - None, + Some(cli.harmony_cli), ) .await .map_err(|e| anyhow::anyhow!("{e}")) diff --git a/fleet/harmony-fleet-deploy/src/operator/chart.rs b/fleet/harmony-fleet-deploy/src/operator/chart.rs index ca9cf73d..779c246e 100644 --- a/fleet/harmony-fleet-deploy/src/operator/chart.rs +++ b/fleet/harmony-fleet-deploy/src/operator/chart.rs @@ -51,11 +51,6 @@ pub struct ChartOptions { /// sideloaded k3d images, `Never` if the image must already be /// present. pub image_pull_policy: String, - /// Namespace the operator Deployment runs in. 
`helm install - /// --create-namespace` creates it if absent; the chart itself - /// doesn't include a Namespace resource so the chart stays - /// reusable across namespaces. - pub namespace: String, /// NATS URL the operator connects to. For in-cluster NATS at /// `fleet-nats.fleet-system` the default `nats://fleet-nats.fleet-system:4222` /// works with no config. @@ -67,6 +62,12 @@ pub struct ChartOptions { /// Secret entirely and lets the operator connect to NATS without /// auth — only sensible when there's no callout in front of NATS. pub credentials: Option, + /// Chart-level version written into `Chart.yaml`. `None` falls back + /// to the deploy crate's `CARGO_PKG_VERSION` — fine for in-process + /// uses (e2e harness, runtime operator Score). The release binary + /// sets this to the released tag so the OCI chart artifact lands + /// at `…/harmony-fleet-operator:` matching the image tag. + pub chart_version: Option, } /// What the operator pod needs to authenticate to NATS via the auth @@ -107,10 +108,14 @@ impl Default for ChartOptions { output_dir: PathBuf::from("/tmp/fleet-load-test/chart"), image: "localhost/harmony-fleet-operator:latest".to_string(), image_pull_policy: "IfNotPresent".to_string(), - namespace: "fleet-system".to_string(), - nats_url: "nats://fleet-nats.fleet-system:4222".to_string(), + // Deliberately uses a non-fleet-specific in-cluster DNS + // assuming NATS sits in the same namespace as the operator; + // the e2e harness and production overrides set this + // explicitly when their NATS lives elsewhere. 
+ nats_url: "nats://fleet-nats:4222".to_string(), log_level: "info,kube_runtime=warn".to_string(), credentials: None, + chart_version: None, } } } @@ -131,10 +136,17 @@ pub fn build_chart(opts: &ChartOptions) -> Result { std::fs::create_dir_all(&opts.output_dir) .with_context(|| format!("creating {:?}", opts.output_dir))?; - let mut chart = HelmChart::new( - RELEASE_NAME.to_string(), - env!("CARGO_PKG_VERSION").to_string(), - ); + // `HelmChart::new(name, app_version)` only sets appVersion — the + // chart-level `version` field defaults to `"0.1.0"` and has to be + // assigned directly. For a release artifact we want both to track + // the released tag (one tag → one image + chart at the same + // version), so set both. + let chart_version = opts + .chart_version + .clone() + .unwrap_or_else(|| env!("CARGO_PKG_VERSION").to_string()); + let mut chart = HelmChart::new(RELEASE_NAME.to_string(), chart_version.clone()); + chart.version = chart_version; chart.description = "IoT operator — Deployment CRD → NATS KV".to_string(); chart.add_resource(HelmResourceKind::Crd(crd_with_keep_annotation( @@ -144,12 +156,18 @@ pub fn build_chart(opts: &ChartOptions) -> Result { Device::crd(), ))); - chart.add_resource(HelmResourceKind::ServiceAccount(service_account( - &opts.namespace, - ))); + chart.add_resource(HelmResourceKind::ServiceAccount(service_account())); chart.add_resource(HelmResourceKind::ClusterRole(cluster_role())); + // The CRB's subject must reference the ServiceAccount's namespace. + // Since the chart itself is namespace-neutral (helm assigns the + // release namespace to the SA + Deployment at install time), we + // emit a literal helm template token so helm substitutes the + // release namespace at the same moment. This is the one chart + // resource that can't be made namespace-neutral by simply omitting + // the field — `subjects[].namespace` is part of the resource + // identity and must point somewhere concrete after rendering. 
chart.add_resource(HelmResourceKind::ClusterRoleBinding(cluster_role_binding( - &opts.namespace, + "{{ .Release.Namespace }}", ))); // Secret intentionally NOT included in the on-disk helm chart — // credentials are operator-environment-specific and out of scope @@ -175,10 +193,13 @@ pub fn operator_secret(opts: &ChartOptions) -> Option { SECRET_KEY_CREDENTIALS_TOML.to_string(), ByteString(creds.credentials_toml.as_bytes().to_vec()), ); + // Namespace deliberately omitted — the caller passes the target + // namespace to `K8sResourceScore::single`, which injects it at + // apply time. Keeps the Secret manifest reusable across + // environments without baking a namespace into source. Some(Secret { metadata: ObjectMeta { name: Some(SECRET_NAME.to_string()), - namespace: Some(opts.namespace.clone()), ..Default::default() }, data: Some(data), @@ -201,11 +222,13 @@ fn crd_with_keep_annotation(mut crd: CustomResourceDefinition) -> CustomResource crd } -fn service_account(namespace: &str) -> ServiceAccount { +// Namespace-neutral: helm fills in the release namespace at install +// time, and the direct-apply path (`K8sResourceScore::single(sa, +// Some(ns))`) injects the namespace through its second argument. +fn service_account() -> ServiceAccount { ServiceAccount { metadata: ObjectMeta { name: Some(SERVICE_ACCOUNT.to_string()), - namespace: Some(namespace.to_string()), ..Default::default() }, ..Default::default() @@ -325,10 +348,14 @@ fn operator_deployment(opts: &ChartOptions) -> K8sDeployment { }); } + // Namespace deliberately omitted — same rationale as the + // ServiceAccount: helm fills in the release namespace at install + // time, and the direct-apply path injects it via + // `K8sResourceScore::single(.., Some(ns))`. Keeps the chart + // reusable without baking a namespace into source. 
K8sDeployment { metadata: ObjectMeta { name: Some(RELEASE_NAME.to_string()), - namespace: Some(opts.namespace.clone()), labels: Some(match_labels.clone()), ..Default::default() }, @@ -364,14 +391,20 @@ fn operator_deployment(opts: &ChartOptions) -> K8sDeployment { // Re-export the manifest builders so the e2e bring-up can apply the // operator inline (Score-style) without re-implementing the manifests. -pub fn build_service_account(opts: &ChartOptions) -> ServiceAccount { - service_account(&opts.namespace) +// +// The SA + Deployment helpers return namespace-neutral manifests; +// callers inject the target namespace through `K8sResourceScore::single`. +// The CRB takes the SA's namespace as an explicit argument because the +// CRB subject must reference a concrete namespace — there's no +// kube-side "current namespace" for cluster-scoped resources. +pub fn build_service_account() -> ServiceAccount { + service_account() } pub fn build_cluster_role() -> ClusterRole { cluster_role() } -pub fn build_cluster_role_binding(opts: &ChartOptions) -> ClusterRoleBinding { - cluster_role_binding(&opts.namespace) +pub fn build_cluster_role_binding(subject_namespace: &str) -> ClusterRoleBinding { + cluster_role_binding(subject_namespace) } pub fn build_operator_deployment(opts: &ChartOptions) -> K8sDeployment { operator_deployment(opts) diff --git a/fleet/harmony-fleet-deploy/src/operator/score.rs b/fleet/harmony-fleet-deploy/src/operator/score.rs index 44434e70..c57da2fe 100644 --- a/fleet/harmony-fleet-deploy/src/operator/score.rs +++ b/fleet/harmony-fleet-deploy/src/operator/score.rs @@ -65,7 +65,9 @@ impl FleetOperatorScore { pub fn new() -> Self { let defaults = ChartOptions::default(); Self { - namespace: defaults.namespace, + // FleetOperatorScore's own default; the chart itself is + // namespace-neutral. Callers override via `.namespace(..)`. 
+ namespace: "fleet-system".to_string(), release_name: "harmony-fleet-operator".to_string(), image: defaults.image, image_pull_policy: defaults.image_pull_policy, @@ -146,10 +148,10 @@ impl Interpret for FleetOperatorInterp output_dir: tmp.path().to_path_buf(), image: self.score.image.clone(), image_pull_policy: self.score.image_pull_policy.clone(), - namespace: self.score.namespace.clone(), nats_url: self.score.nats_url.clone(), log_level: self.score.log_level.clone(), credentials: self.score.credentials.clone(), + chart_version: None, }; // Apply the credentials Secret BEFORE the helm install. The diff --git a/fleet/harmony-fleet-operator/release.sh b/fleet/harmony-fleet-operator/release.sh new file mode 100755 index 00000000..ddffa459 --- /dev/null +++ b/fleet/harmony-fleet-operator/release.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash +# Build + push the harmony-fleet-operator image and helm chart at one +# matching version. Invoked locally and by .gitea CI. +# +# ./fleet/harmony-fleet-operator/release.sh <registry> <version> +# ./fleet/harmony-fleet-operator/release.sh hub.nationtech.io v0.1.0 +# +# Expects `docker login <registry>` and `helm registry login <registry>` +# to have already been run; both are cheap one-liners and let CI use the +# same script unchanged. +# +# This is the operator-specific 1-line wrapper around the app-scoped +# `harmony-fleet-release` binary. The wrapper exists so a tag like +# `harmony-fleet-operator-v0.1.0` routes straight to the right +# `--component` without the caller having to remember the flag. Agent +# and nats-callout will get sibling `release.sh` scripts the same way. +# +# All heavy lifting (docker build/push, chart hydration, helm +# package/push) is in the binary; this script just selects the +# component. + +set -euo pipefail + +REGISTRY="${1:?usage: release.sh <registry> <version>}" +VERSION="${2:?usage: release.sh <registry> <version>}" + +cd "$(dirname "$0")/../.." 
+ +exec cargo run --release -p harmony-fleet-deploy \ + --bin harmony-fleet-release -- \ + --component operator \ + --registry "$REGISTRY" --version "$VERSION" diff --git a/harmony/src/modules/application/features/argo_types.rs b/harmony/src/modules/application/features/argo_types.rs index 7e88e34b..072cb8e6 100644 --- a/harmony/src/modules/application/features/argo_types.rs +++ b/harmony/src/modules/application/features/argo_types.rs @@ -80,6 +80,11 @@ pub struct ArgoApplication { pub source: Source, pub sync_policy: SyncPolicy, pub revision_history_limit: u32, + /// Cluster namespace the synced resources land in. `None` collapses + /// to the same namespace as the Application CR — fine when both are + /// `argocd`, wrong when the Application controls a chart in a + /// different namespace. + pub destination_namespace: Option, } impl Default for ArgoApplication { @@ -127,6 +132,7 @@ impl Default for ArgoApplication { }, }, revision_history_limit: 10, + destination_namespace: None, } } } @@ -186,6 +192,7 @@ impl ArgoApplication { let default_ns = "argocd".to_string(); let namespace: &str = target_namespace.unwrap_or(self.namespace.as_ref().unwrap_or(&default_ns)); + let destination = self.destination_namespace.as_deref().unwrap_or(namespace); let project = &self.project; let yaml_str = format!( @@ -194,21 +201,12 @@ apiVersion: argoproj.io/v1alpha1 kind: Application metadata: name: {name} - # You'll usually want to add your resources to the argocd namespace. namespace: {namespace} spec: - # The project the application belongs to. 
project: {project} - - # Destination cluster and namespace to deploy the application destination: - # cluster API URL server: https://kubernetes.default.svc - # or cluster name - # name: in-cluster - # The namespace will only be set for namespace-scoped resources that have not set a value for .metadata.namespace - namespace: {namespace} - + namespace: {destination} "# ); @@ -305,6 +303,7 @@ mod tests { }, }, revision_history_limit: 10, + destination_namespace: None, }; let expected_yaml_output = r#"apiVersion: argoproj.io/v1alpha1 diff --git a/harmony/src/modules/application/features/helm_argocd_score.rs b/harmony/src/modules/application/features/helm_argocd_score.rs index 6f02a221..7ac5d6bd 100644 --- a/harmony/src/modules/application/features/helm_argocd_score.rs +++ b/harmony/src/modules/application/features/helm_argocd_score.rs @@ -26,6 +26,8 @@ pub struct ArgoHelmScore { pub namespace: String, // TODO: remove and rely on topology (it now knows the flavor) pub openshift: bool, + /// IngressClass for Argo's server Ingress. `None` → cluster default. + pub ingress_class_name: Option, pub argo_apps: Vec, } @@ -109,7 +111,12 @@ impl Interpret for ArgoInter info!("ArgoCD will be installed : {must_install} . Current argocd status : {current:?} "); if must_install { - let helm_score = argo_helm_chart_score(&desired_ns, self.score.openshift, &domain); + let helm_score = argo_helm_chart_score( + &desired_ns, + self.score.openshift, + &domain, + self.score.ingress_class_name.as_deref(), + ); info!( "Installing Argo CD via Helm into namespace '{}' ...", desired_ns @@ -167,7 +174,17 @@ impl Interpret for ArgoInter } } -pub fn argo_helm_chart_score(namespace: &str, openshift: bool, domain: &str) -> HelmChartScore { +pub fn argo_helm_chart_score( + namespace: &str, + openshift: bool, + domain: &str, + ingress_class_name: Option<&str>, +) -> HelmChartScore { + // Empty IngressClass → cluster default kicks in (k3d/traefik, etc). 
+ let ingress_class_name = ingress_class_name.unwrap_or(""); + // `runAsUser: null` is for OKD's restricted-v2 SCC; on vanilla k8s + // it makes redis CrashLoop ("image will run as root"). + let security_context_override = if openshift { "runAsUser: null" } else { "{}" }; let values = format!( r#" # -- Create aggregated roles that extend existing cluster roles to interact with argo-cd resources @@ -198,8 +215,7 @@ global: ## Used for ingresses, certificates, SSO, notifications, etc. domain: {domain} - securityContext: - runAsUser: null + securityContext: {security_context_override} # -- Runtime class name for all components runtimeClassName: "" @@ -515,8 +531,7 @@ redis: serviceAccount: create: true - securityContext: - runAsUser: null + securityContext: {security_context_override} ## Redis image @@ -752,7 +767,7 @@ server: # nginx.ingress.kubernetes.io/ssl-passthrough: "true" # -- Defines which ingress controller will implement the resource - ingressClassName: "openshift-default" + ingressClassName: "{ingress_class_name}" # -- Argo CD server hostname # @default -- `""` (defaults to global.domain) diff --git a/harmony/src/modules/application/features/packaging_deployment.rs b/harmony/src/modules/application/features/packaging_deployment.rs index 4bea8da9..c6703d0b 100644 --- a/harmony/src/modules/application/features/packaging_deployment.rs +++ b/harmony/src/modules/application/features/packaging_deployment.rs @@ -185,6 +185,10 @@ impl< let score = ArgoHelmScore { namespace: self.application.name().to_string(), openshift: true, + // Pre-existing call site is OpenShift-only (see + // the surrounding match arm); keep the OKD + // ingress class wired in to preserve behavior. + ingress_class_name: Some("openshift-default".to_string()), argo_apps: vec![ArgoApplication::from(CDApplicationConfig { version: Version::from("0.2.1").unwrap(), helm_chart_repo_url: "hub.nationtech.io/harmony".to_string(),