From 9b94fc12a9e41dd4af05ce0f2d1f6985f0931b1c Mon Sep 17 00:00:00 2001 From: Jean-Gabriel Gill-Couture Date: Mon, 1 Jun 2026 17:42:14 -0400 Subject: [PATCH 01/11] feat(fleet-operator): real dashboard data from kube CRs + NATS KV MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit RealFleetService implements FleetService against the same sources the reconcile loop owns, read-only: - Device/Deployment CRs (kube) for the registry, desired intent, and the aggregator-maintained .status.aggregate (target/healthy/failing/ pending counts, deployment status). - device-heartbeat KV → last ping + device status (Stale after 90s). - device-state KV → per-device phase → Failing/Pending, primary deployment. Status, dashboard counts, and alerts (one critical per failing deployment, one warning per stale device; acks held in-memory) are all derived from live state. Deployment version is the first service's image tag. blacklist_device patches a label on the Device CR; run_command stays a seam (needs agent-side transport). serve_web now connects NATS + kube and builds RealFleetService when not --mock (the bail is gone); --mock still uses the seeded MockFleetService for offline UI work. Reads are on-demand per request — fine at staging scale, a cache can follow. Unit tests cover status derivation, primary-deployment selection, version parsing, and alert derivation. --- fleet/harmony-fleet-operator/src/main.rs | 26 +- .../harmony-fleet-operator/src/service/mod.rs | 1 + .../src/service/real.rs | 605 ++++++++++++++++++ 3 files changed, 626 insertions(+), 6 deletions(-) create mode 100644 fleet/harmony-fleet-operator/src/service/real.rs diff --git a/fleet/harmony-fleet-operator/src/main.rs b/fleet/harmony-fleet-operator/src/main.rs index 781fbc2d..e6a1f0de 100644 --- a/fleet/harmony-fleet-operator/src/main.rs +++ b/fleet/harmony-fleet-operator/src/main.rs @@ -105,7 +105,17 @@ async fn main() -> Result<()> { addr, css_from, live_reload, - } => serve_web(mock, addr, css_from, live_reload).await, + } => { + serve_web( + mock, + addr, + css_from, + live_reload, + &cli.nats_url, + &cli.credentials_toml, + ) + .await + } } } @@ -115,20 +125,24 @@ async fn serve_web( addr: std::net::SocketAddr, css_from: Option, live_reload: bool, + nats_url: &str, + credentials_toml: &str, ) -> Result<()> { use std::sync::Arc; use std::time::Duration; use frontend::server::{AppState, Config}; - use service::{FleetService, mock::MockFleetService}; + use service::{FleetService, mock::MockFleetService, real::RealFleetService}; let fleet: Arc = if mock { Arc::new(MockFleetService::default()) } else { - anyhow::bail!( - "serve-web without --mock is not implemented yet (real FleetService impl pending). \ - Pass --mock for the dev workflow." - ); + // Same NATS + kube sources the reconcile loop reads, projected + // read-only into the dashboard. + let nats = connect_with_retry(nats_url, credentials_toml).await?; + let js = jetstream::new(nats); + let kube = Client::try_default().await?; + Arc::new(RealFleetService::new(kube, js).await?) }; let cookie_key = harmony_zitadel_auth::cookie_key_from_env(); diff --git a/fleet/harmony-fleet-operator/src/service/mod.rs b/fleet/harmony-fleet-operator/src/service/mod.rs index 48bc78ac..fb60b2e4 100644 --- a/fleet/harmony-fleet-operator/src/service/mod.rs +++ b/fleet/harmony-fleet-operator/src/service/mod.rs @@ -1,4 +1,5 @@ pub mod mock; +pub mod real; use async_trait::async_trait; use chrono::{DateTime, Utc}; diff --git a/fleet/harmony-fleet-operator/src/service/real.rs b/fleet/harmony-fleet-operator/src/service/real.rs new file mode 100644 index 00000000..2433a0a5 --- /dev/null +++ b/fleet/harmony-fleet-operator/src/service/real.rs @@ -0,0 +1,605 @@ +//! Live [`FleetService`] backed by Kubernetes CRs + NATS KV. +//! +//! Read-only projections of the same sources the reconcile loop owns: +//! - `Device` / `Deployment` CRs (kube) — registry + desired intent +//! and the aggregator-maintained `.status.aggregate`. +//! - `device-heartbeat` KV — liveness, drives device status + last-ping. +//! - `device-state` KV — per-(device,deployment) phase, drives the +//! device's primary deployment + failing/pending derivation. +//! +//! Reads are on-demand per request (fine at staging scale); a cache can +//! follow if device counts grow. Nothing here writes desired state — +//! the only mutation is the blacklist label patch. + +use std::collections::{BTreeMap, HashMap, HashSet}; +use std::sync::Mutex; + +use anyhow::Context as _; +use async_nats::jetstream::{self, kv::Store}; +use async_trait::async_trait; +use chrono::{DateTime, Utc}; +use futures_util::StreamExt; +use kube::api::{Api, ListParams, Patch, PatchParams}; +use kube::{Client, ResourceExt}; +use serde_json::json; + +use harmony::modules::fleet::operator::{Deployment as DeploymentCr, Device as DeviceCr}; +use harmony::modules::podman::ReconcileScore; +use harmony_fleet_operator::fleet_aggregator::selector_matches; +use harmony_reconciler_contracts::{ + BUCKET_DEVICE_HEARTBEAT, BUCKET_DEVICE_STATE, DeploymentState, HeartbeatPayload, Phase, +}; + +use super::{ + Alert, AlertSeverity, DashboardDetail, DeploymentDetail, DeploymentStatus, DeviceDetail, + DeviceStatus, FleetService, +}; + +/// Label the operator UI sets to quarantine a device. Read back here to +/// surface `Blacklisted` status; the reconcile-side exclusion is a +/// follow-up. +const BLACKLIST_LABEL: &str = "fleet.nationtech.io/blacklisted"; +/// Label key the agent uses for a device's region, if it sets one. +const REGION_LABEL: &str = "region"; +/// A device that hasn't pinged within this window is `Stale`. Agents +/// heartbeat every 30 s, so this tolerates ~2 missed pings. +const STALE_AFTER_SECS: i64 = 90; + +pub struct RealFleetService { + kube: Client, + state_kv: Store, + heartbeat_kv: Store, + /// In-memory ack set. Alerts are derived from live state and have + /// no store of their own yet, so acks don't survive a restart. + acked_alerts: Mutex>, +} + +impl RealFleetService { + pub async fn new(kube: Client, js: jetstream::Context) -> anyhow::Result { + let state_kv = open_kv(&js, BUCKET_DEVICE_STATE).await?; + let heartbeat_kv = open_kv(&js, BUCKET_DEVICE_HEARTBEAT).await?; + Ok(Self { + kube, + state_kv, + heartbeat_kv, + acked_alerts: Mutex::new(HashSet::new()), + }) + } + + async fn device_crs(&self) -> anyhow::Result> { + let api: Api = Api::all(self.kube.clone()); + Ok(api.list(&ListParams::default()).await?.items) + } + + async fn deployment_crs(&self) -> anyhow::Result> { + let api: Api = Api::all(self.kube.clone()); + Ok(api.list(&ListParams::default()).await?.items) + } + + /// device_id → last heartbeat timestamp. + async fn heartbeats(&self) -> HashMap> { + let mut out = HashMap::new(); + let Ok(mut keys) = self.heartbeat_kv.keys().await else { + return out; + }; + while let Some(Ok(key)) = keys.next().await { + if let Ok(Some(bytes)) = self.heartbeat_kv.get(&key).await + && let Ok(hb) = serde_json::from_slice::(&bytes) + { + out.insert(hb.device_id.to_string(), hb.at); + } + } + out + } + + /// device_id → its per-deployment states. + async fn states_by_device(&self) -> HashMap> { + let mut out: HashMap> = HashMap::new(); + let Ok(mut keys) = self.state_kv.keys().await else { + return out; + }; + while let Some(Ok(key)) = keys.next().await { + if let Ok(Some(bytes)) = self.state_kv.get(&key).await + && let Ok(ds) = serde_json::from_slice::(&bytes) + { + out.entry(ds.device_id.to_string()).or_default().push(ds); + } + } + out + } + + /// Assemble every device with its heartbeat + state joined in. + async fn devices(&self) -> anyhow::Result> { + let crs = self.device_crs().await?; + let heartbeats = self.heartbeats().await; + let states = self.states_by_device().await; + let now = Utc::now(); + let mut devices: Vec = crs + .iter() + .map(|cr| { + let id = cr.name_any(); + map_device( + cr, + heartbeats.get(&id).copied(), + states.get(&id).map(Vec::as_slice).unwrap_or(&[]), + now, + ) + }) + .collect(); + devices.sort_by(|a, b| a.id.cmp(&b.id)); + Ok(devices) + } + + async fn deployments(&self) -> anyhow::Result> { + let mut deployments: Vec = self + .deployment_crs() + .await? + .iter() + .map(map_deployment) + .collect(); + deployments.sort_by(|a, b| a.name.cmp(&b.name)); + Ok(deployments) + } +} + +async fn open_kv(js: &jetstream::Context, bucket: &str) -> anyhow::Result { + js.create_key_value(jetstream::kv::Config { + bucket: bucket.to_string(), + ..Default::default() + }) + .await + .with_context(|| format!("opening KV bucket {bucket}")) +} + +fn map_device( + cr: &DeviceCr, + heartbeat: Option>, + states: &[DeploymentState], + now: DateTime, +) -> DeviceDetail { + let id = cr.name_any(); + let labels = cr.metadata.labels.clone().unwrap_or_default(); + let blacklisted = labels.get(BLACKLIST_LABEL).map(String::as_str) == Some("true"); + let last_seen = heartbeat + .or_else(|| cr.creation_timestamp().map(|t| t.0)) + .unwrap_or(now); + + DeviceDetail { + id, + status: derive_status(blacklisted, heartbeat, states, now), + last_seen, + minutes_ago: (now - last_seen).num_minutes().max(0), + deployment: primary_deployment(states), + region: labels + .get(REGION_LABEL) + .cloned() + .unwrap_or_else(|| "\u{2014}".to_string()), + tags: tags_from_labels(&labels), + inventory: cr.spec.inventory.clone(), + } +} + +fn derive_status( + blacklisted: bool, + heartbeat: Option>, + states: &[DeploymentState], + now: DateTime, +) -> DeviceStatus { + if blacklisted { + return DeviceStatus::Blacklisted; + } + let Some(at) = heartbeat else { + return DeviceStatus::Unknown; + }; + if (now - at).num_seconds() > STALE_AFTER_SECS { + return DeviceStatus::Stale; + } + if states.iter().any(|s| s.phase == Phase::Failed) { + return DeviceStatus::Failing; + } + if states.iter().any(|s| s.phase == Phase::Pending) { + return DeviceStatus::Pending; + } + DeviceStatus::Healthy +} + +/// The device's primary deployment for the list column: a failing one +/// if any (most worth surfacing), else the first by name. +fn primary_deployment(states: &[DeploymentState]) -> Option { + let mut sorted: Vec<&DeploymentState> = states.iter().collect(); + sorted.sort_by(|a, b| a.deployment.as_str().cmp(b.deployment.as_str())); + sorted + .iter() + .find(|s| s.phase == Phase::Failed) + .or_else(|| sorted.first()) + .map(|s| s.deployment.to_string()) +} + +/// Routing labels rendered as `k=v` chips, minus the internal +/// blacklist marker. +fn tags_from_labels(labels: &BTreeMap) -> Vec { + labels + .iter() + .filter(|(k, _)| k.as_str() != BLACKLIST_LABEL && k.as_str() != REGION_LABEL) + .map(|(k, v)| format!("{k}={v}")) + .collect() +} + +fn map_deployment(cr: &DeploymentCr) -> DeploymentDetail { + let agg = cr + .status + .as_ref() + .and_then(|s| s.aggregate.clone()) + .unwrap_or_default(); + let status = if agg.failed > 0 { + DeploymentStatus::Failing + } else if agg.pending > 0 { + DeploymentStatus::Rolling + } else { + DeploymentStatus::Active + }; + DeploymentDetail { + name: cr.name_any(), + version: deployment_version(&cr.spec.score), + status, + target: agg.matched_device_count, + healthy: agg.succeeded, + failing: agg.failed, + pending: agg.pending, + updated_at: cr + .creation_timestamp() + .map(|t| t.0.format("%Y-%m-%d %H:%M").to_string()) + .unwrap_or_else(|| "\u{2014}".to_string()), + } +} + +/// No first-class version on the CR; use the first service's image tag +/// as the human-facing proxy (`nginx:1.25` → `1.25`). +fn deployment_version(score: &ReconcileScore) -> String { + let ReconcileScore::PodmanV0(s) = score; + s.services + .first() + .map(|svc| { + svc.image + .rsplit_once(':') + .map(|(_, tag)| tag.to_string()) + .unwrap_or_else(|| svc.image.clone()) + }) + .unwrap_or_else(|| "\u{2014}".to_string()) +} + +/// Alerts derived from current state (no alert store yet): one critical +/// per failing deployment, one warning per stale device. +fn derive_alerts( + devices: &[DeviceDetail], + deployments: &[DeploymentDetail], + acked: &HashSet, +) -> Vec { + let mut alerts = Vec::new(); + for d in deployments { + if d.failing > 0 { + let id = format!("dep:{}:failing", d.name); + alerts.push(Alert { + acked: acked.contains(&id), + id, + severity: AlertSeverity::Critical, + title: format!("{} has {} failing device(s)", d.name, d.failing), + deployment: Some(d.name.clone()), + device: None, + at: String::new(), + }); + } + } + for dev in devices { + if dev.status == DeviceStatus::Stale { + let id = format!("device:{}:stale", dev.id); + alerts.push(Alert { + acked: acked.contains(&id), + id, + severity: AlertSeverity::Warning, + title: format!("{} last pinged {}m ago", dev.id, dev.minutes_ago), + deployment: dev.deployment.clone(), + device: Some(dev.id.clone()), + at: String::new(), + }); + } + } + alerts +} + +#[async_trait] +impl FleetService for RealFleetService { + async fn dashboard_detail(&self) -> anyhow::Result { + let devices = self.devices().await?; + let mut deployments = self.deployments().await?; + let acked = self.acked_alerts.lock().unwrap().clone(); + let active_alerts = derive_alerts(&devices, &deployments, &acked) + .into_iter() + .filter(|a| !a.acked) + .collect(); + + let mut d = DashboardDetail { + devices_total: devices.len() as u32, + devices_healthy: 0, + devices_pending: 0, + devices_failing: 0, + devices_stale: 0, + devices_blacklisted: 0, + devices_unknown: 0, + deployments_total: deployments.len(), + health_pct: 0, + attention_devices: devices + .iter() + .filter(|d| { + matches!( + d.status, + DeviceStatus::Failing | DeviceStatus::Stale | DeviceStatus::Pending + ) + }) + .take(12) + .cloned() + .collect(), + top_deployments: Vec::new(), + active_alerts, + rolling_count: 0, + failing_count: 0, + }; + for dev in &devices { + match dev.status { + DeviceStatus::Healthy => d.devices_healthy += 1, + DeviceStatus::Pending => d.devices_pending += 1, + DeviceStatus::Stale => d.devices_stale += 1, + DeviceStatus::Failing => d.devices_failing += 1, + DeviceStatus::Blacklisted => d.devices_blacklisted += 1, + DeviceStatus::Unknown => d.devices_unknown += 1, + } + } + if d.devices_total > 0 { + d.health_pct = + ((d.devices_healthy as f64 / d.devices_total as f64) * 100.0).round() as u32; + } + for dep in &deployments { + match dep.status { + DeploymentStatus::Rolling => d.rolling_count += 1, + DeploymentStatus::Failing => d.failing_count += 1, + _ => {} + } + } + deployments.truncate(4); + d.top_deployments = deployments; + Ok(d) + } + + async fn list_devices(&self) -> anyhow::Result> { + self.devices().await + } + + async fn get_device(&self, id: &str) -> anyhow::Result> { + Ok(self.devices().await?.into_iter().find(|d| d.id == id)) + } + + async fn list_deployments(&self) -> anyhow::Result> { + self.deployments().await + } + + async fn get_deployment(&self, name: &str) -> anyhow::Result> { + Ok(self + .deployments() + .await? + .into_iter() + .find(|d| d.name == name)) + } + + async fn get_deployment_devices(&self, name: &str) -> anyhow::Result> { + let Some(cr) = self + .deployment_crs() + .await? + .into_iter() + .find(|c| c.name_any() == name) + else { + return Ok(Vec::new()); + }; + let selector = cr.spec.target_selector; + Ok(self + .devices() + .await? + .into_iter() + .filter(|d| { + // Re-derive the label set from tags (`k=v`) to match the + // CR selector — the aggregator's authoritative path. + let labels = labels_from_tags(&d.tags, &d.region); + selector_matches(&selector, &labels) + }) + .collect()) + } + + async fn blacklist_device(&self, id: &str) -> anyhow::Result { + let api: Api = Api::all(self.kube.clone()); + let patch = json!({ "metadata": { "labels": { BLACKLIST_LABEL: "true" } } }); + api.patch(id, &PatchParams::default(), &Patch::Merge(&patch)) + .await + .with_context(|| format!("patching blacklist label on device {id}"))?; + self.get_device(id) + .await? + .ok_or_else(|| anyhow::anyhow!("device {id} not found after blacklist")) + } + + async fn list_alerts(&self) -> anyhow::Result> { + let devices = self.devices().await?; + let deployments = self.deployments().await?; + let acked = self.acked_alerts.lock().unwrap().clone(); + Ok(derive_alerts(&devices, &deployments, &acked)) + } + + async fn ack_alert(&self, id: &str) -> anyhow::Result { + Ok(self.acked_alerts.lock().unwrap().insert(id.to_string())) + } + + async fn run_command(&self, _device_id: &str, command: &str) -> anyhow::Result { + // Seam only: the device round-trip (publish to the agent over + // NATS, stream stdout/stderr back) is a follow-up that needs + // agent-side support. + Ok(format!( + "$ {command}\n[device command transport not implemented yet]" + )) + } + + async fn filtered_devices( + &self, + status: Option, + deployment: Option, + region: Option, + search: Option, + ) -> anyhow::Result> { + let search = search.map(|s| s.to_lowercase()); + Ok(self + .devices() + .await? + .into_iter() + .filter(|d| { + status.is_none_or(|s| d.status == s) + && deployment + .as_deref() + .is_none_or(|dep| d.deployment.as_deref() == Some(dep)) + && region.as_deref().is_none_or(|r| d.region == r) + && search.as_deref().is_none_or(|q| { + d.id.to_lowercase().contains(q) + || d.tags.iter().any(|t| t.to_lowercase().contains(q)) + || d.deployment + .as_deref() + .is_some_and(|dep| dep.to_lowercase().contains(q)) + }) + }) + .collect()) + } +} + +/// Inverse of [`tags_from_labels`] + the region label, so a +/// `DeviceDetail` can be re-matched against a CR `LabelSelector`. +fn labels_from_tags(tags: &[String], region: &str) -> BTreeMap { + let mut labels: BTreeMap = tags + .iter() + .filter_map(|t| { + t.split_once('=') + .map(|(k, v)| (k.to_string(), v.to_string())) + }) + .collect(); + if region != "\u{2014}" { + labels.insert(REGION_LABEL.to_string(), region.to_string()); + } + labels +} + +#[cfg(test)] +mod tests { + use super::*; + use harmony::modules::podman::{PodmanService, PodmanV0Score}; + use harmony_reconciler_contracts::{DeploymentName, Id}; + + fn ds(deployment: &str, phase: Phase) -> DeploymentState { + DeploymentState { + device_id: Id::from("pi-01".to_string()), + deployment: DeploymentName::try_new(deployment).unwrap(), + phase, + last_event_at: Utc::now(), + last_error: None, + } + } + + #[test] + fn status_blacklist_beats_everything() { + assert_eq!( + derive_status( + true, + Some(Utc::now()), + &[ds("a", Phase::Running)], + Utc::now() + ), + DeviceStatus::Blacklisted + ); + } + + #[test] + fn status_no_heartbeat_is_unknown() { + assert_eq!( + derive_status(false, None, &[], Utc::now()), + DeviceStatus::Unknown + ); + } + + #[test] + fn status_stale_when_heartbeat_old() { + let old = Utc::now() - chrono::Duration::seconds(STALE_AFTER_SECS + 10); + assert_eq!( + derive_status(false, Some(old), &[ds("a", Phase::Running)], Utc::now()), + DeviceStatus::Stale + ); + } + + #[test] + fn status_failing_then_pending_then_healthy() { + let now = Utc::now(); + let fresh = Some(now); + assert_eq!( + derive_status(false, fresh, &[ds("a", Phase::Failed)], now), + DeviceStatus::Failing + ); + assert_eq!( + derive_status(false, fresh, &[ds("a", Phase::Pending)], now), + DeviceStatus::Pending + ); + assert_eq!( + derive_status(false, fresh, &[ds("a", Phase::Running)], now), + DeviceStatus::Healthy + ); + } + + #[test] + fn primary_deployment_prefers_failing() { + let states = [ds("alpha", Phase::Running), ds("beta", Phase::Failed)]; + assert_eq!(primary_deployment(&states).as_deref(), Some("beta")); + } + + #[test] + fn version_from_image_tag() { + let score = ReconcileScore::PodmanV0(PodmanV0Score { + services: vec![PodmanService { + name: "web".into(), + image: "nginx:1.25".into(), + ports: vec![], + env: vec![], + volumes: vec![], + restart_policy: Default::default(), + }], + }); + assert_eq!(deployment_version(&score), "1.25"); + } + + #[test] + fn alerts_from_failing_and_stale() { + let devices = vec![DeviceDetail { + id: "pi-09".into(), + status: DeviceStatus::Stale, + last_seen: Utc::now(), + minutes_ago: 14, + deployment: None, + region: "\u{2014}".into(), + tags: vec![], + inventory: None, + }]; + let deployments = vec![DeploymentDetail { + name: "edge".into(), + version: "1.0".into(), + status: DeploymentStatus::Failing, + target: 3, + healthy: 1, + failing: 2, + pending: 0, + updated_at: "\u{2014}".into(), + }]; + let alerts = derive_alerts(&devices, &deployments, &HashSet::new()); + assert_eq!(alerts.len(), 2); + assert!(alerts.iter().any(|a| a.severity == AlertSeverity::Critical)); + assert!(alerts.iter().any(|a| a.severity == AlertSeverity::Warning)); + } +} -- 2.39.5 From 2ed0eccb45908522361e887082b7ebc573d60c6d Mon Sep 17 00:00:00 2001 From: Jean-Gabriel Gill-Couture Date: Mon, 1 Jun 2026 19:47:54 -0400 Subject: [PATCH 02/11] =?UTF-8?q?refactor(fleet-operator):=20CQRS=20dashbo?= =?UTF-8?q?ard=20=E2=80=94=20operator=20owns=20liveness,=20UI=20reads=20CR?= =?UTF-8?q?s?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reworks the real-data path so the operator is the single write side and the dashboard is a thin read projection over Kubernetes CRs, removing the duplicated derivation the on-demand version carried. Operator (write side): - Device gains a status subresource (DeviceStatus: lastHeartbeat + Reachability). A new device_status reconciler watches device-heartbeat KV and reflects liveness onto Device.status on a tick — the home the CRD doc designated for "device conditions from heartbeat staleness". The staleness threshold now lives in exactly one place. Dashboard (read side): - RealFleetService reads only kube: Device CRs (labels/inventory/status) + Deployment CRs (.status.aggregate). No NATS, no KV scanning, no staleness re-derivation. get_deployment_devices and the per-device deployment column filter Device CRs with the canonical selector_matches over real labels — the lossy label→tag→label round-trip is gone. Deployment: - The operator now serves the dashboard in-process beside the reconcile loop (best-effort; CR-only, so the web side needs no NATS creds and its failure never tears down the controller). The image builds with --features web-frontend so the pod actually serves the UI — it didn't before. serve-web stays for offline (--mock) UI dev. Device-level Failing/Pending move to the deployment view (accurate aggregate counts); per-device status is liveness + blacklist. Unit tests cover liveness reflection, status mapping, version parsing, alerts. --- fleet/harmony-fleet-operator/Dockerfile | 4 +- .../src/device_status.rs | 173 ++++++++++ fleet/harmony-fleet-operator/src/lib.rs | 1 + fleet/harmony-fleet-operator/src/main.rs | 75 +++-- .../src/service/real.rs | 302 +++++------------- harmony/src/modules/fleet/operator/crd.rs | 31 +- harmony/src/modules/fleet/operator/mod.rs | 2 +- 7 files changed, 339 insertions(+), 249 deletions(-) create mode 100644 fleet/harmony-fleet-operator/src/device_status.rs diff --git a/fleet/harmony-fleet-operator/Dockerfile b/fleet/harmony-fleet-operator/Dockerfile index e1ba15be..56e57e19 100644 --- a/fleet/harmony-fleet-operator/Dockerfile +++ b/fleet/harmony-fleet-operator/Dockerfile @@ -31,7 +31,9 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ WORKDIR /app COPY . . -RUN cargo build --release --locked -p harmony-fleet-operator +# `web-frontend` bundles the dashboard the operator serves in-process +# alongside the reconcile loop. +RUN cargo build --release --locked -p harmony-fleet-operator --features web-frontend FROM docker.io/library/debian:bookworm-slim diff --git a/fleet/harmony-fleet-operator/src/device_status.rs b/fleet/harmony-fleet-operator/src/device_status.rs new file mode 100644 index 00000000..052c9e0b --- /dev/null +++ b/fleet/harmony-fleet-operator/src/device_status.rs @@ -0,0 +1,173 @@ +//! Device liveness reconciler: NATS `device-heartbeat` KV → `Device.status`. +//! +//! Agents ping `heartbeat.` every ~30 s. This reconciler +//! keeps the latest ping per device in memory (from a KV watch) and, +//! on a tick, reflects freshness onto `Device.status` — +//! `lastHeartbeat` + a coarse [`Reachability`]. That makes liveness +//! visible to `kubectl get devices` and lets the dashboard read it +//! from the CR instead of re-deriving staleness from NATS. +//! +//! Mirrors the aggregator's "watch fills a cache, a tick patches the +//! CR" shape. The staleness threshold lives here only — it's a device +//! concept the operator owns, not something every reader re-decides. + +use std::collections::HashMap; +use std::time::Duration; + +use anyhow::Result; +use async_nats::jetstream::kv::{Operation, Store}; +use chrono::{DateTime, Utc}; +use futures_util::StreamExt; +use harmony_reconciler_contracts::{BUCKET_DEVICE_HEARTBEAT, HeartbeatPayload}; +use kube::api::{Api, Patch, PatchParams}; +use kube::{Client, ResourceExt}; +use serde_json::json; +use tokio::sync::Mutex; + +use harmony::modules::fleet::operator::{Device, Reachability}; + +/// A device with no heartbeat within this window is `Stale`. Agents +/// ping every 30 s, so this tolerates ~2 missed pings. +const STALE_AFTER: Duration = Duration::from_secs(90); +/// How often to re-evaluate freshness and patch changed devices. +const TICK: Duration = Duration::from_secs(30); + +pub async fn run(client: Client, js: async_nats::jetstream::Context) -> Result<()> { + let bucket = js + .create_key_value(async_nats::jetstream::kv::Config { + bucket: BUCKET_DEVICE_HEARTBEAT.to_string(), + ..Default::default() + }) + .await?; + + let heartbeats: Mutex>> = Mutex::new(HashMap::new()); + let devices: Api = Api::all(client); + + tokio::try_join!( + watch_heartbeats(&bucket, &heartbeats), + patch_loop(&devices, &heartbeats), + )?; + Ok(()) +} + +async fn watch_heartbeats( + bucket: &Store, + heartbeats: &Mutex>>, +) -> Result<()> { + let mut watch = bucket.watch_with_history(">").await?; + tracing::info!("device-status: watching device-heartbeat KV"); + while let Some(entry_res) = watch.next().await { + let entry = match entry_res { + Ok(e) => e, + Err(e) => { + tracing::warn!(error = %e, "device-status: watch delivery error"); + continue; + } + }; + match entry.operation { + Operation::Put => { + if let Ok(hb) = serde_json::from_slice::(&entry.value) { + heartbeats + .lock() + .await + .insert(hb.device_id.to_string(), hb.at); + } + } + Operation::Delete | Operation::Purge => { + if let Some(id) = entry.key.strip_prefix("heartbeat.") { + heartbeats.lock().await.remove(id); + } + } + } + } + Ok(()) +} + +async fn patch_loop( + devices: &Api, + heartbeats: &Mutex>>, +) -> Result<()> { + // Last status written per device, to skip no-op patches. + let mut applied: HashMap)> = HashMap::new(); + let mut ticker = tokio::time::interval(TICK); + loop { + ticker.tick().await; + let snapshot: Vec<(String, DateTime)> = heartbeats + .lock() + .await + .iter() + .map(|(k, v)| (k.clone(), *v)) + .collect(); + let now = Utc::now(); + for (id, at) in snapshot { + let reachability = reachability(at, now); + if applied.get(&id) == Some(&(reachability, at)) { + continue; + } + if patch_status(devices, &id, reachability, at).await { + applied.insert(id, (reachability, at)); + } + } + } +} + +fn reachability(last_heartbeat: DateTime, now: DateTime) -> Reachability { + if (now - last_heartbeat) + .to_std() + .is_ok_and(|d| d <= STALE_AFTER) + { + Reachability::Reachable + } else { + Reachability::Stale + } +} + +/// Returns whether the patch succeeded (so we only cache applied state +/// on success and retry next tick otherwise). +async fn patch_status( + devices: &Api, + id: &str, + reachability: Reachability, + last_heartbeat: DateTime, +) -> bool { + let status = json!({ + "status": { + "lastHeartbeat": last_heartbeat.to_rfc3339(), + "reachability": reachability, + } + }); + match devices + .patch_status(id, &PatchParams::default(), &Patch::Merge(&status)) + .await + { + Ok(d) => { + tracing::debug!(device = %d.name_any(), ?reachability, "device-status: patched"); + true + } + // A heartbeat can outrace the Device CR's creation by the + // device-reconciler; skip this tick and retry on the next. + Err(kube::Error::Api(ae)) if ae.code == 404 => false, + Err(e) => { + tracing::warn!(%id, error = %e, "device-status: patch failed"); + false + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn reachable_within_window_stale_after() { + let now = Utc::now(); + assert_eq!( + reachability(now - chrono::Duration::seconds(10), now), + Reachability::Reachable + ); + assert_eq!( + reachability(now - chrono::Duration::seconds(120), now), + Reachability::Stale + ); + } +} diff --git a/fleet/harmony-fleet-operator/src/lib.rs b/fleet/harmony-fleet-operator/src/lib.rs index 8c293a4e..da2f81d6 100644 --- a/fleet/harmony-fleet-operator/src/lib.rs +++ b/fleet/harmony-fleet-operator/src/lib.rs @@ -14,4 +14,5 @@ pub mod commands; pub mod device_reconciler; +pub mod device_status; pub mod fleet_aggregator; diff --git a/fleet/harmony-fleet-operator/src/main.rs b/fleet/harmony-fleet-operator/src/main.rs index e6a1f0de..283d61b8 100644 --- a/fleet/harmony-fleet-operator/src/main.rs +++ b/fleet/harmony-fleet-operator/src/main.rs @@ -5,7 +5,7 @@ mod frontend; #[cfg(feature = "web-frontend")] mod service; -use harmony_fleet_operator::{device_reconciler, fleet_aggregator}; +use harmony_fleet_operator::{device_reconciler, device_status, fleet_aggregator}; use anyhow::{Context, Result}; use async_nats::jetstream; @@ -105,45 +105,45 @@ async fn main() -> Result<()> { addr, css_from, live_reload, - } => { - serve_web( - mock, - addr, - css_from, - live_reload, - &cli.nats_url, - &cli.credentials_toml, - ) - .await - } + } => serve_web(mock, addr, css_from, live_reload).await, } } +/// `serve-web` subcommand: dashboard on its own (mock data, or the live +/// CR-reading service). The deployed operator instead serves the +/// dashboard alongside the reconcile loop — see [`spawn_dashboard`]. #[cfg(feature = "web-frontend")] async fn serve_web( mock: bool, addr: std::net::SocketAddr, css_from: Option, live_reload: bool, - nats_url: &str, - credentials_toml: &str, ) -> Result<()> { use std::sync::Arc; - use std::time::Duration; - use frontend::server::{AppState, Config}; use service::{FleetService, mock::MockFleetService, real::RealFleetService}; let fleet: Arc = if mock { Arc::new(MockFleetService::default()) } else { - // Same NATS + kube sources the reconcile loop reads, projected - // read-only into the dashboard. - let nats = connect_with_retry(nats_url, credentials_toml).await?; - let js = jetstream::new(nats); - let kube = Client::try_default().await?; - Arc::new(RealFleetService::new(kube, js).await?) + Arc::new(RealFleetService::new(Client::try_default().await?)) }; + serve_dashboard(fleet, addr, css_from, live_reload).await +} + +/// Build the Zitadel-authenticated web server around a [`FleetService`] +/// and serve until it exits. Shared by the `serve-web` subcommand and +/// the in-process dashboard the reconcile loop spawns. +#[cfg(feature = "web-frontend")] +async fn serve_dashboard( + fleet: std::sync::Arc, + addr: std::net::SocketAddr, + css_from: Option, + live_reload: bool, +) -> Result<()> { + use std::time::Duration; + + use frontend::server::{AppState, Config}; let cookie_key = harmony_zitadel_auth::cookie_key_from_env(); let config = harmony_zitadel_auth::config_from_env(); @@ -169,6 +169,25 @@ async fn serve_web( .await } +/// Serve the dashboard in-process alongside the reconcile loop. Failure +/// (e.g. Zitadel not yet reachable for JWKS) is logged but never tears +/// down reconcile — the read UI is best-effort, the controller is not. +#[cfg(feature = "web-frontend")] +fn spawn_dashboard(client: Client) { + use std::net::SocketAddr; + use std::sync::Arc; + + use service::real::RealFleetService; + + let addr = SocketAddr::from(([0, 0, 0, 0], frontend::server::DEFAULT_PORT)); + tokio::spawn(async move { + let fleet = Arc::new(RealFleetService::new(client)); + if let Err(e) = serve_dashboard(fleet, addr, None, false).await { + tracing::error!(error = %e, "dashboard server exited; reconcile continues"); + } + }); +} + async fn run(nats_url: &str, bucket: &str, credentials_toml: &str) -> Result<()> { let nats = connect_with_retry(nats_url, credentials_toml).await?; tracing::info!(url = %nats_url, "connected to NATS"); @@ -183,9 +202,16 @@ async fn run(nats_url: &str, bucket: &str, credentials_toml: &str) -> Result<()> let client = Client::try_default().await?; - // Three concurrent tasks: + // Serve the read-only dashboard in the same process (best-effort; + // it reads CRs only, no NATS). Built only with the web-frontend + // feature; absent from the lean reconcile-only image. + #[cfg(feature = "web-frontend")] + spawn_dashboard(client.clone()); + + // Concurrent tasks: // controller — CR validation + finalizer-cleanup // device_reconciler — NATS device-info → Device CR + // device_status — NATS device-heartbeat → Device.status liveness // fleet_aggregator — watches Deployments + Devices + states, // writes desired-state KV, patches CR status // Any failing tears the process down; kube-rs Controller swallows @@ -193,9 +219,12 @@ async fn run(nats_url: &str, bucket: &str, credentials_toml: &str) -> Result<()> let ctl_client = client.clone(); let dr_client = client.clone(); let dr_js = js.clone(); + let ds_client = client.clone(); + let ds_js = js.clone(); tokio::select! { r = controller::run(ctl_client, desired_state_kv) => r, r = device_reconciler::run(dr_client, dr_js) => r, + r = device_status::run(ds_client, ds_js) => r, r = fleet_aggregator::run(client, js) => r, } } diff --git a/fleet/harmony-fleet-operator/src/service/real.rs b/fleet/harmony-fleet-operator/src/service/real.rs index 2433a0a5..5e48f59f 100644 --- a/fleet/harmony-fleet-operator/src/service/real.rs +++ b/fleet/harmony-fleet-operator/src/service/real.rs @@ -1,69 +1,51 @@ -//! Live [`FleetService`] backed by Kubernetes CRs + NATS KV. +//! Live [`FleetService`] as a read-only projection of Kubernetes CRs. //! -//! Read-only projections of the same sources the reconcile loop owns: -//! - `Device` / `Deployment` CRs (kube) — registry + desired intent -//! and the aggregator-maintained `.status.aggregate`. -//! - `device-heartbeat` KV — liveness, drives device status + last-ping. -//! - `device-state` KV — per-(device,deployment) phase, drives the -//! device's primary deployment + failing/pending derivation. -//! -//! Reads are on-demand per request (fine at staging scale); a cache can -//! follow if device counts grow. Nothing here writes desired state — -//! the only mutation is the blacklist label patch. +//! The operator is the write side: `device_reconciler` materializes +//! `Device` CRs (labels + inventory), `device_status` reflects liveness +//! onto `Device.status`, and `fleet_aggregator` writes +//! `Deployment.status.aggregate`. This dashboard is the read side — it +//! only reads those CRs and projects them to view DTOs. No NATS: the +//! CR is the single contract between the two sides. -use std::collections::{BTreeMap, HashMap, HashSet}; +use std::collections::{BTreeMap, HashSet}; use std::sync::Mutex; use anyhow::Context as _; -use async_nats::jetstream::{self, kv::Store}; use async_trait::async_trait; use chrono::{DateTime, Utc}; -use futures_util::StreamExt; use kube::api::{Api, ListParams, Patch, PatchParams}; use kube::{Client, ResourceExt}; use serde_json::json; -use harmony::modules::fleet::operator::{Deployment as DeploymentCr, Device as DeviceCr}; +use harmony::modules::fleet::operator::{ + Deployment as DeploymentCr, Device as DeviceCr, DeviceStatus as DeviceLiveness, Reachability, +}; use harmony::modules::podman::ReconcileScore; use harmony_fleet_operator::fleet_aggregator::selector_matches; -use harmony_reconciler_contracts::{ - BUCKET_DEVICE_HEARTBEAT, BUCKET_DEVICE_STATE, DeploymentState, HeartbeatPayload, Phase, -}; use super::{ Alert, AlertSeverity, DashboardDetail, DeploymentDetail, DeploymentStatus, DeviceDetail, DeviceStatus, FleetService, }; -/// Label the operator UI sets to quarantine a device. Read back here to -/// surface `Blacklisted` status; the reconcile-side exclusion is a -/// follow-up. +/// Label the operator UI sets to quarantine a device. const BLACKLIST_LABEL: &str = "fleet.nationtech.io/blacklisted"; /// Label key the agent uses for a device's region, if it sets one. const REGION_LABEL: &str = "region"; -/// A device that hasn't pinged within this window is `Stale`. Agents -/// heartbeat every 30 s, so this tolerates ~2 missed pings. -const STALE_AFTER_SECS: i64 = 90; pub struct RealFleetService { kube: Client, - state_kv: Store, - heartbeat_kv: Store, - /// In-memory ack set. Alerts are derived from live state and have - /// no store of their own yet, so acks don't survive a restart. + /// In-memory ack set. Alerts are derived from live CR state and + /// have no store of their own, so acks don't survive a restart. acked_alerts: Mutex>, } impl RealFleetService { - pub async fn new(kube: Client, js: jetstream::Context) -> anyhow::Result { - let state_kv = open_kv(&js, BUCKET_DEVICE_STATE).await?; - let heartbeat_kv = open_kv(&js, BUCKET_DEVICE_HEARTBEAT).await?; - Ok(Self { + pub fn new(kube: Client) -> Self { + Self { kube, - state_kv, - heartbeat_kv, acked_alerts: Mutex::new(HashSet::new()), - }) + } } async fn device_crs(&self) -> anyhow::Result> { @@ -76,55 +58,14 @@ impl RealFleetService { Ok(api.list(&ListParams::default()).await?.items) } - /// device_id → last heartbeat timestamp. - async fn heartbeats(&self) -> HashMap> { - let mut out = HashMap::new(); - let Ok(mut keys) = self.heartbeat_kv.keys().await else { - return out; - }; - while let Some(Ok(key)) = keys.next().await { - if let Ok(Some(bytes)) = self.heartbeat_kv.get(&key).await - && let Ok(hb) = serde_json::from_slice::(&bytes) - { - out.insert(hb.device_id.to_string(), hb.at); - } - } - out - } - - /// device_id → its per-deployment states. - async fn states_by_device(&self) -> HashMap> { - let mut out: HashMap> = HashMap::new(); - let Ok(mut keys) = self.state_kv.keys().await else { - return out; - }; - while let Some(Ok(key)) = keys.next().await { - if let Ok(Some(bytes)) = self.state_kv.get(&key).await - && let Ok(ds) = serde_json::from_slice::(&bytes) - { - out.entry(ds.device_id.to_string()).or_default().push(ds); - } - } - out - } - - /// Assemble every device with its heartbeat + state joined in. async fn devices(&self) -> anyhow::Result> { - let crs = self.device_crs().await?; - let heartbeats = self.heartbeats().await; - let states = self.states_by_device().await; + let deployments = self.deployment_crs().await?; let now = Utc::now(); - let mut devices: Vec = crs + let mut devices: Vec = self + .device_crs() + .await? .iter() - .map(|cr| { - let id = cr.name_any(); - map_device( - cr, - heartbeats.get(&id).copied(), - states.get(&id).map(Vec::as_slice).unwrap_or(&[]), - now, - ) - }) + .map(|cr| map_device(cr, &deployments, now)) .collect(); devices.sort_by(|a, b| a.id.cmp(&b.id)); Ok(devices) @@ -142,34 +83,24 @@ impl RealFleetService { } } -async fn open_kv(js: &jetstream::Context, bucket: &str) -> anyhow::Result { - js.create_key_value(jetstream::kv::Config { - bucket: bucket.to_string(), - ..Default::default() - }) - .await - .with_context(|| format!("opening KV bucket {bucket}")) -} - -fn map_device( - cr: &DeviceCr, - heartbeat: Option>, - states: &[DeploymentState], - now: DateTime, -) -> DeviceDetail { - let id = cr.name_any(); +fn map_device(cr: &DeviceCr, deployments: &[DeploymentCr], now: DateTime) -> DeviceDetail { let labels = cr.metadata.labels.clone().unwrap_or_default(); let blacklisted = labels.get(BLACKLIST_LABEL).map(String::as_str) == Some("true"); - let last_seen = heartbeat + let last_seen = cr + .status + .as_ref() + .and_then(|s| s.last_heartbeat.as_deref()) + .and_then(|s| DateTime::parse_from_rfc3339(s).ok()) + .map(|dt| dt.with_timezone(&Utc)) .or_else(|| cr.creation_timestamp().map(|t| t.0)) .unwrap_or(now); DeviceDetail { - id, - status: derive_status(blacklisted, heartbeat, states, now), + id: cr.name_any(), + status: device_status(blacklisted, cr.status.as_ref()), last_seen, minutes_ago: (now - last_seen).num_minutes().max(0), - deployment: primary_deployment(states), + deployment: primary_deployment(&labels, deployments), region: labels .get(REGION_LABEL) .cloned() @@ -179,44 +110,37 @@ fn map_device( } } -fn derive_status( - blacklisted: bool, - heartbeat: Option>, - states: &[DeploymentState], - now: DateTime, -) -> DeviceStatus { +/// UI status from the device's quarantine label + operator-written +/// liveness. Failing/Pending are deployment-level (the Deployments +/// view), not device liveness, so they don't appear here. +fn device_status(blacklisted: bool, liveness: Option<&DeviceLiveness>) -> DeviceStatus { if blacklisted { return DeviceStatus::Blacklisted; } - let Some(at) = heartbeat else { - return DeviceStatus::Unknown; - }; - if (now - at).num_seconds() > STALE_AFTER_SECS { - return DeviceStatus::Stale; + match liveness.map(|s| s.reachability) { + Some(Reachability::Reachable) => DeviceStatus::Healthy, + Some(Reachability::Stale) => DeviceStatus::Stale, + Some(Reachability::Unknown) | None => DeviceStatus::Unknown, } - if states.iter().any(|s| s.phase == Phase::Failed) { - return DeviceStatus::Failing; - } - if states.iter().any(|s| s.phase == Phase::Pending) { - return DeviceStatus::Pending; - } - DeviceStatus::Healthy } -/// The device's primary deployment for the list column: a failing one -/// if any (most worth surfacing), else the first by name. -fn primary_deployment(states: &[DeploymentState]) -> Option { - let mut sorted: Vec<&DeploymentState> = states.iter().collect(); - sorted.sort_by(|a, b| a.deployment.as_str().cmp(b.deployment.as_str())); - sorted +/// First deployment (by name) whose selector matches the device — the +/// canonical [`selector_matches`] over CR labels, the same matcher the +/// aggregator uses. No reconstruction. +fn primary_deployment( + labels: &BTreeMap, + deployments: &[DeploymentCr], +) -> Option { + let mut matched: Vec = deployments .iter() - .find(|s| s.phase == Phase::Failed) - .or_else(|| sorted.first()) - .map(|s| s.deployment.to_string()) + .filter(|d| selector_matches(&d.spec.target_selector, labels)) + .map(ResourceExt::name_any) + .collect(); + matched.sort(); + matched.into_iter().next() } -/// Routing labels rendered as `k=v` chips, minus the internal -/// blacklist marker. +/// Routing labels rendered as `k=v` chips, minus internal keys. fn tags_from_labels(labels: &BTreeMap) -> Vec { labels .iter() @@ -268,8 +192,8 @@ fn deployment_version(score: &ReconcileScore) -> String { .unwrap_or_else(|| "\u{2014}".to_string()) } -/// Alerts derived from current state (no alert store yet): one critical -/// per failing deployment, one warning per stale device. +/// Alerts derived from current CR state (no alert store yet): one +/// critical per failing deployment, one warning per stale device. fn derive_alerts( devices: &[DeviceDetail], deployments: &[DeploymentDetail], @@ -330,12 +254,7 @@ impl FleetService for RealFleetService { health_pct: 0, attention_devices: devices .iter() - .filter(|d| { - matches!( - d.status, - DeviceStatus::Failing | DeviceStatus::Stale | DeviceStatus::Pending - ) - }) + .filter(|d| matches!(d.status, DeviceStatus::Stale | DeviceStatus::Unknown)) .take(12) .cloned() .collect(), @@ -391,25 +310,20 @@ impl FleetService for RealFleetService { } async fn get_deployment_devices(&self, name: &str) -> anyhow::Result> { - let Some(cr) = self - .deployment_crs() - .await? - .into_iter() - .find(|c| c.name_any() == name) - else { + let deployments = self.deployment_crs().await?; + let Some(cr) = deployments.iter().find(|c| c.name_any() == name) else { return Ok(Vec::new()); }; - let selector = cr.spec.target_selector; + let selector = cr.spec.target_selector.clone(); + let now = Utc::now(); Ok(self - .devices() + .device_crs() .await? - .into_iter() - .filter(|d| { - // Re-derive the label set from tags (`k=v`) to match the - // CR selector — the aggregator's authoritative path. - let labels = labels_from_tags(&d.tags, &d.region); - selector_matches(&selector, &labels) + .iter() + .filter(|dev| { + selector_matches(&selector, &dev.metadata.labels.clone().unwrap_or_default()) }) + .map(|dev| map_device(dev, &deployments, now)) .collect()) } @@ -437,8 +351,7 @@ impl FleetService for RealFleetService { async fn run_command(&self, _device_id: &str, command: &str) -> anyhow::Result { // Seam only: the device round-trip (publish to the agent over - // NATS, stream stdout/stderr back) is a follow-up that needs - // agent-side support. + // NATS, stream stdout/stderr back) needs agent-side support. Ok(format!( "$ {command}\n[device command transport not implemented yet]" )) @@ -474,90 +387,33 @@ impl FleetService for RealFleetService { } } -/// Inverse of [`tags_from_labels`] + the region label, so a -/// `DeviceDetail` can be re-matched against a CR `LabelSelector`. -fn labels_from_tags(tags: &[String], region: &str) -> BTreeMap { - let mut labels: BTreeMap = tags - .iter() - .filter_map(|t| { - t.split_once('=') - .map(|(k, v)| (k.to_string(), v.to_string())) - }) - .collect(); - if region != "\u{2014}" { - labels.insert(REGION_LABEL.to_string(), region.to_string()); - } - labels -} - #[cfg(test)] mod tests { use super::*; use harmony::modules::podman::{PodmanService, PodmanV0Score}; - use harmony_reconciler_contracts::{DeploymentName, Id}; - fn ds(deployment: &str, phase: Phase) -> DeploymentState { - DeploymentState { - device_id: Id::from("pi-01".to_string()), - deployment: DeploymentName::try_new(deployment).unwrap(), - phase, - last_event_at: Utc::now(), - last_error: None, + fn liveness(r: Reachability) -> DeviceLiveness { + DeviceLiveness { + last_heartbeat: None, + reachability: r, } } #[test] - fn status_blacklist_beats_everything() { + fn device_status_maps_liveness_and_blacklist() { assert_eq!( - derive_status( - true, - Some(Utc::now()), - &[ds("a", Phase::Running)], - Utc::now() - ), + device_status(true, Some(&liveness(Reachability::Reachable))), DeviceStatus::Blacklisted ); - } - - #[test] - fn status_no_heartbeat_is_unknown() { assert_eq!( - derive_status(false, None, &[], Utc::now()), - DeviceStatus::Unknown - ); - } - - #[test] - fn status_stale_when_heartbeat_old() { - let old = Utc::now() - chrono::Duration::seconds(STALE_AFTER_SECS + 10); - assert_eq!( - derive_status(false, Some(old), &[ds("a", Phase::Running)], Utc::now()), - DeviceStatus::Stale - ); - } - - #[test] - fn status_failing_then_pending_then_healthy() { - let now = Utc::now(); - let fresh = Some(now); - assert_eq!( - derive_status(false, fresh, &[ds("a", Phase::Failed)], now), - DeviceStatus::Failing - ); - assert_eq!( - derive_status(false, fresh, &[ds("a", Phase::Pending)], now), - DeviceStatus::Pending - ); - assert_eq!( - derive_status(false, fresh, &[ds("a", Phase::Running)], now), + device_status(false, Some(&liveness(Reachability::Reachable))), DeviceStatus::Healthy ); - } - - #[test] - fn primary_deployment_prefers_failing() { - let states = [ds("alpha", Phase::Running), ds("beta", Phase::Failed)]; - assert_eq!(primary_deployment(&states).as_deref(), Some("beta")); + assert_eq!( + device_status(false, Some(&liveness(Reachability::Stale))), + DeviceStatus::Stale + ); + assert_eq!(device_status(false, None), DeviceStatus::Unknown); } #[test] diff --git a/harmony/src/modules/fleet/operator/crd.rs b/harmony/src/modules/fleet/operator/crd.rs index b1e33cf4..bdf027cd 100644 --- a/harmony/src/modules/fleet/operator/crd.rs +++ b/harmony/src/modules/fleet/operator/crd.rs @@ -95,7 +95,8 @@ pub struct AggregateLastError { version = "v1alpha1", kind = "Device", plural = "devices", - shortname = "fleetdev" + shortname = "fleetdev", + status = "DeviceStatus" )] #[serde(rename_all = "camelCase")] pub struct DeviceSpec { @@ -104,3 +105,31 @@ pub struct DeviceSpec { #[serde(skip_serializing_if = "Option::is_none")] pub inventory: Option, } + +/// Operator-maintained liveness reflection of the NATS +/// `device-heartbeat` bucket onto the CR, so `kubectl get devices` and +/// the dashboard see reachability without reading NATS. Written by the +/// device-status reconciler. +#[derive(Serialize, Deserialize, Clone, Debug, Default, JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct DeviceStatus { + /// RFC 3339 timestamp of the last heartbeat seen. `None` until the + /// device has pinged at least once. + #[serde(skip_serializing_if = "Option::is_none")] + pub last_heartbeat: Option, + pub reachability: Reachability, +} + +/// Coarse liveness derived from heartbeat freshness. Failing/Pending +/// are deployment-level concerns (see [`DeploymentAggregate`]), not +/// device liveness. +#[derive(Serialize, Deserialize, Clone, Copy, Debug, Default, PartialEq, Eq, JsonSchema)] +pub enum Reachability { + /// No heartbeat seen yet. + #[default] + Unknown, + /// Heartbeat within the freshness window. + Reachable, + /// Last heartbeat is older than the freshness window. + Stale, +} diff --git a/harmony/src/modules/fleet/operator/mod.rs b/harmony/src/modules/fleet/operator/mod.rs index 210835c0..0c89a456 100644 --- a/harmony/src/modules/fleet/operator/mod.rs +++ b/harmony/src/modules/fleet/operator/mod.rs @@ -13,5 +13,5 @@ pub mod crd; pub use crd::{ AggregateLastError, Deployment, DeploymentAggregate, DeploymentSpec, DeploymentStatus, Device, - DeviceSpec, Rollout, RolloutStrategy, + DeviceSpec, DeviceStatus, Reachability, Rollout, RolloutStrategy, }; -- 2.39.5 From 17c56d29975ccb6c18919b63d90c4263c85466c8 Mon Sep 17 00:00:00 2001 From: Jean-Gabriel Gill-Couture Date: Mon, 1 Jun 2026 21:09:28 -0400 Subject: [PATCH 03/11] feat(fleet): one-command dev build+deploy loop for the operator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds fleet/scripts/dev-deploy-operator.sh: a unique semver-dev version drives harmony-fleet-publish (docker-build the web-frontend image, generate + push the chart) then harmony-fleet-deploy (helm upgrade --install onto staging with the dashboard ingress + Service + cert). Skips the git-tag → CI → release ceremony for fast iteration; a unique version per run sidesteps mutable-:dev image/chart cache traps. Dockerfile: BuildKit cache mounts for the cargo registry + target/, so the iterate loop recompiles only changed crates instead of the whole workspace. Build-time only — image is identical, cold CI just rebuilds. chart test: the empty-Secret removal (662ef395) left chart_includes_credentials_secret_and_env_var asserting a file that no longer exists. Reframed to assert the hydrated chart omits the Secret while the Deployment still references the out-of-band one. --- .../src/operator/chart.rs | 19 +++-- fleet/harmony-fleet-operator/Dockerfile | 17 +++- fleet/scripts/dev-deploy-operator.sh | 77 +++++++++++++++++++ 3 files changed, 106 insertions(+), 7 deletions(-) create mode 100755 fleet/scripts/dev-deploy-operator.sh diff --git a/fleet/harmony-fleet-deploy/src/operator/chart.rs b/fleet/harmony-fleet-deploy/src/operator/chart.rs index f870a21b..ad3ac2c1 100644 --- a/fleet/harmony-fleet-deploy/src/operator/chart.rs +++ b/fleet/harmony-fleet-deploy/src/operator/chart.rs @@ -532,7 +532,7 @@ mod tests { } #[test] - fn chart_includes_credentials_secret_and_env_var() { + fn chart_omits_secret_but_deployment_references_it() { let tmp = tempfile::tempdir().unwrap(); let chart_path = build_chart(&ChartOptions { output_dir: tmp.path().to_path_buf(), @@ -540,11 +540,20 @@ mod tests { }) .unwrap(); - let secret_yaml = - std::fs::read_to_string(chart_path.join("templates/secret-credentials.yaml")) - .expect("secret-credentials.yaml must exist in chart"); - assert!(secret_yaml.contains(SECRET_NAME)); + // The chart is hydrated (no templating), so it must NOT carry the + // credentials Secret — the deploy applies it out-of-band. A Secret + // in the chart fights Helm ownership against the deploy-applied one + // and re-blanks it on every upgrade. + assert!( + !chart_path + .join("templates/secret-credentials.yaml") + .exists(), + "hydrated chart must not contain a Secret" + ); + // The Deployment still wires the env var from that out-of-band + // Secret via secretKeyRef (optional, so the Pod starts before it + // lands). let deployment_yaml = std::fs::read_to_string(chart_path.join("templates/deployment.yaml")) .expect("deployment.yaml must exist in chart"); assert!(deployment_yaml.contains(OPERATOR_CREDENTIALS_ENV_VAR)); diff --git a/fleet/harmony-fleet-operator/Dockerfile b/fleet/harmony-fleet-operator/Dockerfile index 56e57e19..26f1bdec 100644 --- a/fleet/harmony-fleet-operator/Dockerfile +++ b/fleet/harmony-fleet-operator/Dockerfile @@ -1,3 +1,4 @@ +# syntax=docker/dockerfile:1.7 # Multi-stage container build for harmony-fleet-operator. # # Build context is the workspace root (the operator's Cargo.toml has @@ -33,7 +34,19 @@ COPY . . # `web-frontend` bundles the dashboard the operator serves in-process # alongside the reconcile loop. -RUN cargo build --release --locked -p harmony-fleet-operator --features web-frontend +# +# BuildKit cache mounts persist the cargo registry + `target/` across +# builds, so an iterate-build-deploy loop recompiles only changed crates +# (seconds) instead of the whole workspace (minutes). The mounts are +# build-time only — the resulting image is identical, and a cold CI +# runner just rebuilds from scratch. The binary is copied out of the +# cache-mounted target within the same RUN, since cache mounts aren't +# part of the produced layer. +RUN --mount=type=cache,target=/usr/local/cargo/registry \ + --mount=type=cache,target=/usr/local/cargo/git \ + --mount=type=cache,target=/app/target \ + cargo build --release --locked -p harmony-fleet-operator --features web-frontend \ + && cp /app/target/release/harmony-fleet-operator /harmony-fleet-operator FROM docker.io/library/debian:bookworm-slim @@ -43,7 +56,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ ca-certificates \ && rm -rf /var/lib/apt/lists/* -COPY --from=builder /app/target/release/harmony-fleet-operator /usr/local/bin/harmony-fleet-operator +COPY --from=builder /harmony-fleet-operator /usr/local/bin/harmony-fleet-operator # Non-root runtime. Pairs with the Pod's `securityContext. # runAsNonRoot: true` in the helm chart — k8s admission rejects pods diff --git a/fleet/scripts/dev-deploy-operator.sh b/fleet/scripts/dev-deploy-operator.sh new file mode 100755 index 00000000..776a0fd1 --- /dev/null +++ b/fleet/scripts/dev-deploy-operator.sh @@ -0,0 +1,77 @@ +#!/usr/bin/env bash +# Build + push + deploy the fleet operator to staging in one shot — the +# fast inner loop that skips the git-tag → CI → release ceremony. +# +# It runs the SAME two binaries the release/CD path uses, just with a +# throwaway dev version: +# 1. harmony-fleet-publish — docker-build the operator image (with the +# web dashboard), generate the helm chart, push BOTH to the OCI +# registry. +# 2. harmony-fleet-deploy — helm upgrade --install that chart onto +# staging, applying the dashboard ingress + Service + cert-manager +# cert (host/issuer come from FleetDeployConfig). +# +# Each run gets a unique semver-dev version, so the node always pulls a +# fresh image and helm rolls a new ReplicaSet — no `:dev`-tag cache traps. +# The first build is a full compile; thereafter the Dockerfile's BuildKit +# cache mounts recompile only what changed (seconds). +# +# Usage: +# ./fleet/scripts/dev-deploy-operator.sh +# REGISTRY=hub.nationtech.io PROJECT=harmony ./fleet/scripts/dev-deploy-operator.sh +# PUBLISH_ONLY=1 ./fleet/scripts/dev-deploy-operator.sh # build+push, skip deploy +# +# Prerequisites (same as the release path): +# - docker (BuildKit) + helm on PATH, logged in to $REGISTRY. +# - The deploy reads FleetDeployConfig + FleetDeploySecrets for +# $CONFIG_NAMESPACE (Env → OpenBao). Provide them however you already +# do for `harmony-fleet-deploy` — e.g. a sourced `.envrc` with +# OPENBAO_URL / OPENBAO_TOKEN / HARMONY_CONFIG_NAMESPACE, and a +# KUBECONFIG pointed at staging (or a kubeconfig seeded in OpenBao). + +set -euo pipefail + +REGISTRY="${REGISTRY:-hub.nationtech.io}" +PROJECT="${PROJECT:-harmony}" +CONFIG_NAMESPACE="${CONFIG_NAMESPACE:-fleet-staging}" +PUBLISH_ONLY="${PUBLISH_ONLY:-0}" + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" +cd "$REPO_ROOT" + +# Pick up a local .envrc (OpenBao creds, KUBECONFIG, …) if present and +# not already loaded by direnv. +if [[ -f "$REPO_ROOT/.envrc" ]]; then + # shellcheck disable=SC1091 + source "$REPO_ROOT/.envrc" +fi + +# Unique + semver-valid: helm rejects a non-semver chart version, and a +# fresh version forces a clean image pull + rollout every iteration. +VERSION="0.0.0-dev.$(date +%s)" +TAG="harmony-fleet-operator-v${VERSION}" # publish parses the version back out + +export DOCKER_BUILDKIT=1 + +echo "==> [1/2] build + push image and chart @ ${VERSION}" +cargo run -q -p harmony-fleet-deploy --bin harmony-fleet-publish -- \ + --from-tag "$TAG" \ + --registry "$REGISTRY" \ + --project "$PROJECT" + +if [[ "$PUBLISH_ONLY" == "1" ]]; then + echo "==> PUBLISH_ONLY=1, skipping deploy. Chart: oci://${REGISTRY}/${PROJECT}/harmony-fleet-operator-chart:${VERSION}" + exit 0 +fi + +echo "==> [2/2] deploy chart @ ${VERSION} to ${CONFIG_NAMESPACE}" +cargo run -q -p harmony-fleet-deploy --bin harmony-fleet-deploy -- \ + --operator-chart-version "$VERSION" \ + --operator-chart-registry "$REGISTRY" \ + --operator-chart-project "$PROJECT" \ + --config-namespace "$CONFIG_NAMESPACE" \ + --yes + +echo +echo "Deployed ${REGISTRY}/${PROJECT}/harmony-fleet-operator:${VERSION} to ${CONFIG_NAMESPACE}." -- 2.39.5 From 08243e218b4a6b3d1771ed72199873473d74e68a Mon Sep 17 00:00:00 2001 From: Jean-Gabriel Gill-Couture Date: Mon, 1 Jun 2026 21:18:13 -0400 Subject: [PATCH 04/11] feat(fleet): harmony-fleet-publish takes a bare --version MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `--from-tag` only exists because CI passes $GITHUB_REF_NAME; making every caller wrap a version in a fake `harmony-fleet-operator-v…` string just to strip it back off was bad UX. Add `--version` (the bare image+chart version), keep `--from-tag` optional for the CI path — symmetric with harmony-fleet-deploy's `--operator-chart-version`/`--from-tag`. The dev script now passes `--version` directly. --- .../src/bin/harmony-fleet-publish.rs | 34 ++++++++++++++----- fleet/scripts/dev-deploy-operator.sh | 3 +- 2 files changed, 27 insertions(+), 10 deletions(-) diff --git a/fleet/harmony-fleet-deploy/src/bin/harmony-fleet-publish.rs b/fleet/harmony-fleet-deploy/src/bin/harmony-fleet-publish.rs index 00298cec..8d4adefb 100644 --- a/fleet/harmony-fleet-deploy/src/bin/harmony-fleet-publish.rs +++ b/fleet/harmony-fleet-deploy/src/bin/harmony-fleet-publish.rs @@ -1,21 +1,29 @@ //! `harmony-fleet-publish` — build + publish the operator image + chart -//! for a tagged release. `docker` / `helm` must be on PATH and logged in -//! to the registry (CI's login actions; dev's manual login). +//! at a version. `docker` / `helm` must be on PATH and logged in to the +//! registry (CI's login actions; dev's manual login). -use anyhow::Result; +use anyhow::{Result, bail}; use clap::Parser; use harmony_fleet_deploy::release::{release_operator, version_from_tag}; #[derive(Parser, Debug)] #[command( name = "harmony-fleet-publish", - about = "Build + publish the operator image + chart for a tagged release" + about = "Build + publish the operator image + chart at a version" )] struct Cli { - /// Git tag, e.g. `harmony-fleet-operator-v0.1.0`. Defaults to - /// `$GITHUB_REF_NAME` so CI passes nothing. + /// The image + chart version, e.g. `0.1.0` or `0.0.0-dev.1730000000` + /// (must be valid semver — helm rejects anything else). Wins over + /// `--from-tag`; the laptop/dev path. + #[arg(long)] + version: Option, + + /// Parse the version out of a release tag like + /// `harmony-fleet-operator-v0.1.0`. Defaults to `$GITHUB_REF_NAME` + /// so the release workflow passes nothing. Used when `--version` + /// is unset. #[arg(long, env = "GITHUB_REF_NAME")] - from_tag: String, + from_tag: Option, #[arg(long, default_value = "hub.nationtech.io")] registry: String, @@ -28,9 +36,19 @@ struct Cli { no_push: bool, } +impl Cli { + fn version(&self) -> Result { + match (&self.version, &self.from_tag) { + (Some(v), _) => Ok(v.clone()), + (None, Some(tag)) => version_from_tag(tag), + (None, None) => bail!("set --version (e.g. 0.1.0) or --from-tag"), + } + } +} + fn main() -> Result<()> { env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info")).init(); let cli = Cli::parse(); - let version = version_from_tag(&cli.from_tag)?; + let version = cli.version()?; release_operator(&version, &cli.registry, &cli.project, !cli.no_push) } diff --git a/fleet/scripts/dev-deploy-operator.sh b/fleet/scripts/dev-deploy-operator.sh index 776a0fd1..44635386 100755 --- a/fleet/scripts/dev-deploy-operator.sh +++ b/fleet/scripts/dev-deploy-operator.sh @@ -50,13 +50,12 @@ fi # Unique + semver-valid: helm rejects a non-semver chart version, and a # fresh version forces a clean image pull + rollout every iteration. VERSION="0.0.0-dev.$(date +%s)" -TAG="harmony-fleet-operator-v${VERSION}" # publish parses the version back out export DOCKER_BUILDKIT=1 echo "==> [1/2] build + push image and chart @ ${VERSION}" cargo run -q -p harmony-fleet-deploy --bin harmony-fleet-publish -- \ - --from-tag "$TAG" \ + --version "$VERSION" \ --registry "$REGISTRY" \ --project "$PROJECT" -- 2.39.5 From 78bb5d77d8e63e452ce5f3972f90202a31520b17 Mon Sep 17 00:00:00 2001 From: Jean-Gabriel Gill-Couture Date: Mon, 1 Jun 2026 22:07:58 -0400 Subject: [PATCH 05/11] feat(fleet-operator): dashboard SSO config via ConfigClient, not env soup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The auth code (Reda's, proven locally) read 7 FLEET_AUTH_* env vars at the pod. Replace that with one typed Config value each, loaded the Harmony way. - harmony_zitadel_auth: ZitadelAuthConfig is now a `Config` (Serialize/ Deserialize/JsonSchema). Add OperatorCookieKey (secret Config) with a base64→Key decode. Drop config_from_env/cookie_key_from_env + the FLEET_AUTH_* consts. - operator: serve_dashboard loads ZitadelAuthConfig + OperatorCookieKey via ConfigClient::for_namespace (EnvSource → OpenBao). No env soup. - deploy: resolves the values (hosts derived from base_domain, client_id + audiences from FleetDeployConfig, cookie key from FleetDeploySecrets) and bakes them into the operator Secret as HARMONY_CONFIG_ JSON. The published chart wires the env→Secret refs at publish time (optional, pod-light); the deploy fills the Secret at deploy time — same pattern as the NATS credentials. A test locks the baked env names to the structs' Config keys. - fleet_staging_install seeds a generated cookie key; dev.sh exports the two HARMONY_CONFIG_* JSON values instead of 7 vars. Dashboard serves once the Zitadel app allows the staging redirect URIs (fleet-stg./auth/callback) — the one remaining non-code step. --- Cargo.lock | 5 ++ examples/fleet_e2e_demo/src/lib.rs | 2 + examples/fleet_staging_install/Cargo.toml | 1 + examples/fleet_staging_install/src/main.rs | 11 +++ fleet/harmony-fleet-deploy/Cargo.toml | 1 + fleet/harmony-fleet-deploy/src/main.rs | 19 +++++- .../src/operator/chart.rs | 68 +++++++++++++++++++ .../src/operator/score.rs | 40 +++++++++++ fleet/harmony-fleet-deploy/src/secrets.rs | 16 +++++ fleet/harmony-fleet-operator/Cargo.toml | 1 + fleet/harmony-fleet-operator/dev.sh | 14 ++-- fleet/harmony-fleet-operator/src/main.rs | 17 ++++- harmony_zitadel_auth/Cargo.toml | 4 +- harmony_zitadel_auth/src/config.rs | 61 +++++++---------- harmony_zitadel_auth/src/lib.rs | 7 +- 15 files changed, 214 insertions(+), 53 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 45330c58..0c7bbac5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3498,6 +3498,7 @@ name = "example_fleet_staging_install" version = "0.1.0" dependencies = [ "anyhow", + "base64 0.22.1", "clap", "harmony", "harmony-fleet-deploy", @@ -4104,6 +4105,7 @@ dependencies = [ "harmony_config", "harmony_macros", "harmony_types", + "harmony_zitadel_auth", "k8s-openapi", "kube", "log", @@ -4164,6 +4166,7 @@ dependencies = [ "harmony", "harmony-fleet-auth", "harmony-reconciler-contracts", + "harmony_config", "harmony_zitadel_auth", "k8s-openapi", "kube", @@ -4533,10 +4536,12 @@ dependencies = [ "axum-extra", "base64 0.22.1", "chrono", + "harmony_config", "jsonwebtoken", "openidconnect", "rand 0.9.4", "reqwest 0.12.28", + "schemars 0.8.22", "serde", "serde_json", "sha2 0.10.9", diff --git a/examples/fleet_e2e_demo/src/lib.rs b/examples/fleet_e2e_demo/src/lib.rs index 2f0e78b4..d615ee1b 100644 --- a/examples/fleet_e2e_demo/src/lib.rs +++ b/examples/fleet_e2e_demo/src/lib.rs @@ -680,6 +680,8 @@ key_json = """ log_level: "info,kube_runtime=warn".to_string(), credentials: Some(OperatorCredentials { credentials_toml }), chart_version: None, + web_auth_config_json: None, + web_cookie_key_json: None, }; // CRDs first — the operator watches them on startup. diff --git a/examples/fleet_staging_install/Cargo.toml b/examples/fleet_staging_install/Cargo.toml index 24bdaa4b..f438eafb 100644 --- a/examples/fleet_staging_install/Cargo.toml +++ b/examples/fleet_staging_install/Cargo.toml @@ -20,6 +20,7 @@ harmony-nats-callout = { path = "../../nats/callout" } harmony-fleet-deploy = { path = "../../fleet/harmony-fleet-deploy" } nkeys = "0.4" rand = "0.9" +base64.workspace = true anyhow.workspace = true clap = { version = "4", features = ["derive", "env"] } tokio.workspace = true diff --git a/examples/fleet_staging_install/src/main.rs b/examples/fleet_staging_install/src/main.rs index a3dfa9d8..a2ffd675 100644 --- a/examples/fleet_staging_install/src/main.rs +++ b/examples/fleet_staging_install/src/main.rs @@ -347,6 +347,7 @@ path "secret/metadata/harmony/*" { capabilities = ["list","read"] }"# .set(&FleetDeploySecrets { operator_credentials_toml: credentials.credentials_toml.clone(), kubeconfig: None, + operator_cookie_key_b64: generate_cookie_key_b64(), }) .await .context("seed FleetDeploySecrets")?; @@ -378,3 +379,13 @@ fn generate_alphanum(len: usize) -> String { .map(|_| CHARSET[rng.random_range(0..CHARSET.len())] as char) .collect() } + +/// Standard-base64 of 64 random bytes — the operator's session-cookie +/// signing key. Generated per install and seeded as a secret. +fn generate_cookie_key_b64() -> String { + use base64::Engine; + use rand::Rng; + let mut bytes = [0u8; 64]; + rand::rng().fill(&mut bytes); + base64::engine::general_purpose::STANDARD.encode(bytes) +} diff --git a/fleet/harmony-fleet-deploy/Cargo.toml b/fleet/harmony-fleet-deploy/Cargo.toml index 5b760da4..130d4bd0 100644 --- a/fleet/harmony-fleet-deploy/Cargo.toml +++ b/fleet/harmony-fleet-deploy/Cargo.toml @@ -29,6 +29,7 @@ harmony_types = { path = "../../harmony_types" } harmony_macros = { path = "../../harmony_macros" } harmony-fleet-auth = { path = "../harmony-fleet-auth" } harmony-reconciler-contracts = { path = "../../harmony-reconciler-contracts" } +harmony_zitadel_auth = { path = "../../harmony_zitadel_auth" } anyhow = { workspace = true } async-trait = { workspace = true } diff --git a/fleet/harmony-fleet-deploy/src/main.rs b/fleet/harmony-fleet-deploy/src/main.rs index 97dfa828..0eb9ff95 100644 --- a/fleet/harmony-fleet-deploy/src/main.rs +++ b/fleet/harmony-fleet-deploy/src/main.rs @@ -18,6 +18,7 @@ use harmony_config::ConfigClient; use harmony_fleet_deploy::{ FleetDeployConfig, FleetDeploySecrets, FleetOperatorScore, version_from_tag, }; +use harmony_zitadel_auth::{OperatorCookieKey, ZitadelAuthConfig}; use tracing::info; #[derive(Parser, Debug)] @@ -114,12 +115,28 @@ async fn main() -> Result<()> { // Coherent with the other staging hosts (sso-stg., secrets-stg.). let ui_host = format!("fleet-stg.{}", config.base_domain); + // Dashboard SSO config: hosts derive from base_domain; client_id + + // audiences come from config, cookie key from secrets. Baked into the + // operator Secret for the pod's ConfigClient to read. + let web_auth = ZitadelAuthConfig { + zitadel_base: format!("https://sso-stg.{}", config.base_domain), + base_url: format!("https://{ui_host}"), + client_id: config.operator_oidc_client_id, + scope: "openid profile email".to_string(), + trusted_audiences: config.operator_trusted_audiences, + logout_redirect_uri: format!("https://{ui_host}/"), + }; + let cookie = OperatorCookieKey { + cookie_key_b64: secrets.operator_cookie_key_b64, + }; + let operator = FleetOperatorScore::new() .namespace(namespace) .nats_url(config.nats_url) .credentials(secrets.operator_credentials_toml) .published_chart(registry, project, version) - .ingress(ui_host, Some(config.cluster_issuer)); + .ingress(ui_host, Some(config.cluster_issuer)) + .web_auth(web_auth, cookie); harmony_cli::run( Inventory::autoload(), diff --git a/fleet/harmony-fleet-deploy/src/operator/chart.rs b/fleet/harmony-fleet-deploy/src/operator/chart.rs index ad3ac2c1..7ce91612 100644 --- a/fleet/harmony-fleet-deploy/src/operator/chart.rs +++ b/fleet/harmony-fleet-deploy/src/operator/chart.rs @@ -70,6 +70,13 @@ pub struct ChartOptions { /// sets this to the released tag so the OCI chart artifact lands /// at `…/harmony-fleet-operator-chart:` matching the image tag. pub chart_version: Option, + /// JSON of the dashboard's `ZitadelAuthConfig`, stored in the + /// operator Secret under [`ENV_WEB_AUTH_CONFIG`]. `None` leaves the + /// dashboard unauthenticated (dev/e2e). + pub web_auth_config_json: Option, + /// JSON of the dashboard's `OperatorCookieKey`, stored under + /// [`ENV_WEB_COOKIE_KEY`]. + pub web_cookie_key_json: Option, } /// What the operator pod needs to authenticate to NATS via the auth @@ -134,6 +141,8 @@ impl Default for ChartOptions { log_level: "info,kube_runtime=warn".to_string(), credentials: None, chart_version: None, + web_auth_config_json: None, + web_cookie_key_json: None, } } } @@ -153,6 +162,15 @@ pub const OPERATOR_HTTP_PORT: u16 = 18080; /// including the inline JSON keyfile under `key_json`. pub const SECRET_KEY_CREDENTIALS_TOML: &str = "credentials.toml"; +/// Env var + operator-Secret key the dashboard's `ConfigClient` reads +/// its `ZitadelAuthConfig` / `OperatorCookieKey` from. The value is the +/// JSON of those structs; `ConfigClient`'s `EnvSource` keys a config +/// type under `HARMONY_CONFIG_`. Locked to the structs' Config +/// keys by a test below — the chart wires the env→Secret reference at +/// publish time, the deploy fills the Secret values at deploy time. +pub const ENV_WEB_AUTH_CONFIG: &str = "HARMONY_CONFIG_ZitadelAuthConfig"; +pub const ENV_WEB_COOKIE_KEY: &str = "HARMONY_CONFIG_OperatorCookieKey"; + /// Build + write the chart to `opts.output_dir`. Returns the full /// path to the generated chart directory (which is what `helm /// install ` wants). @@ -215,6 +233,21 @@ pub fn operator_secret(opts: &ChartOptions) -> Option { SECRET_KEY_CREDENTIALS_TOML.to_string(), ByteString(creds.credentials_toml.as_bytes().to_vec()), ); + // Dashboard auth config + cookie key (when configured) ride in the + // same Secret; the Deployment sources them as HARMONY_CONFIG_* env + // for the operator's ConfigClient. + if let Some(json) = &opts.web_auth_config_json { + data.insert( + ENV_WEB_AUTH_CONFIG.to_string(), + ByteString(json.as_bytes().to_vec()), + ); + } + if let Some(json) = &opts.web_cookie_key_json { + data.insert( + ENV_WEB_COOKIE_KEY.to_string(), + ByteString(json.as_bytes().to_vec()), + ); + } // Namespace deliberately omitted — the caller passes the target // namespace to `K8sResourceScore::single`, which injects it at // apply time. Keeps the Secret manifest reusable across @@ -383,6 +416,22 @@ fn operator_deployment(opts: &ChartOptions) -> K8sDeployment { // pod compatible with OKD's restricted-v2 SCC and the // `harmony_fleet_auth::CredentialsSection` deserializer // handles inline-vs-file from the same TOML shape. + // All three env-from-Secret with optional=true: the Pod starts + // before the deploy applies the Secret, and the dashboard's two + // HARMONY_CONFIG_* values are simply absent on deploys that don't + // configure auth (dev/e2e). + let secret_env = |name: &str| EnvVar { + name: name.to_string(), + value_from: Some(EnvVarSource { + secret_key_ref: Some(SecretKeySelector { + name: SECRET_NAME.to_string(), + key: name.to_string(), + optional: Some(true), + }), + ..Default::default() + }), + ..Default::default() + }; env.push(EnvVar { name: OPERATOR_CREDENTIALS_ENV_VAR.to_string(), value_from: Some(EnvVarSource { @@ -395,6 +444,8 @@ fn operator_deployment(opts: &ChartOptions) -> K8sDeployment { }), ..Default::default() }); + env.push(secret_env(ENV_WEB_AUTH_CONFIG)); + env.push(secret_env(ENV_WEB_COOKIE_KEY)); // Namespace deliberately omitted — same rationale as the // ServiceAccount: helm fills in the release namespace at install @@ -559,4 +610,21 @@ mod tests { assert!(deployment_yaml.contains(OPERATOR_CREDENTIALS_ENV_VAR)); assert!(deployment_yaml.contains(SECRET_NAME)); } + + // The chart bakes these env names at publish time; the operator's + // ConfigClient derives them from the struct names at runtime. Lock + // them together so a rename can't silently desync the two. + #[test] + fn web_auth_env_names_match_config_keys() { + use harmony_config::Config; + use harmony_zitadel_auth::{OperatorCookieKey, ZitadelAuthConfig}; + assert_eq!( + ENV_WEB_AUTH_CONFIG, + format!("HARMONY_CONFIG_{}", ZitadelAuthConfig::KEY) + ); + assert_eq!( + ENV_WEB_COOKIE_KEY, + format!("HARMONY_CONFIG_{}", OperatorCookieKey::KEY) + ); + } } diff --git a/fleet/harmony-fleet-deploy/src/operator/score.rs b/fleet/harmony-fleet-deploy/src/operator/score.rs index 94376016..305fb253 100644 --- a/fleet/harmony-fleet-deploy/src/operator/score.rs +++ b/fleet/harmony-fleet-deploy/src/operator/score.rs @@ -78,6 +78,17 @@ pub struct FleetOperatorScore { /// cert-manager `ClusterIssuer` for the UI Ingress. `None` (or no /// host) serves plain HTTP — the right default on issuer-less k3d. pub cluster_issuer: Option, + /// Dashboard SSO config + cookie key, baked into the operator Secret + /// for the pod's `ConfigClient` to read. `None` leaves the dashboard + /// unauthenticated (dev/e2e). + pub web_auth: Option, +} + +/// The dashboard's auth inputs the operator reads via `ConfigClient`. +#[derive(Debug, Clone, Serialize)] +pub struct WebAuth { + pub config: harmony_zitadel_auth::ZitadelAuthConfig, + pub cookie: harmony_zitadel_auth::OperatorCookieKey, } impl FleetOperatorScore { @@ -98,6 +109,7 @@ impl FleetOperatorScore { published_chart: None, operator_ui_host: None, cluster_issuer: None, + web_auth: None, } } @@ -110,6 +122,17 @@ impl FleetOperatorScore { self } + /// Configure dashboard SSO: the `ZitadelAuthConfig` + cookie key are + /// baked into the operator Secret for the pod's `ConfigClient`. + pub fn web_auth( + mut self, + config: harmony_zitadel_auth::ZitadelAuthConfig, + cookie: harmony_zitadel_auth::OperatorCookieKey, + ) -> Self { + self.web_auth = Some(WebAuth { config, cookie }); + self + } + /// Install the published OCI chart at `version` instead of rendering /// one from local source (the CD `harmony apply` path). pub fn published_chart( @@ -202,9 +225,22 @@ impl Interpret for FleetOperatorInterp // directly, not via the chart — it's environment-specific. The // published-chart CD path runs without credentials today, so // this is a no-op there. + let (web_auth_config_json, web_cookie_key_json) = match &self.score.web_auth { + Some(w) => ( + Some(serde_json::to_string(&w.config).map_err(|e| { + InterpretError::new(format!("serialize ZitadelAuthConfig: {e}")) + })?), + Some(serde_json::to_string(&w.cookie).map_err(|e| { + InterpretError::new(format!("serialize OperatorCookieKey: {e}")) + })?), + ), + None => (None, None), + }; if let Some(creds) = &self.score.credentials && let Some(secret) = operator_secret(&ChartOptions { credentials: Some(creds.clone()), + web_auth_config_json, + web_cookie_key_json, ..ChartOptions::default() }) { @@ -252,6 +288,10 @@ impl Interpret for FleetOperatorInterp log_level: self.score.log_level.clone(), credentials: self.score.credentials.clone(), chart_version: None, + // The auth Secret is applied separately above; the + // rendered chart only needs the Deployment env wiring. + web_auth_config_json: None, + web_cookie_key_json: None, }) .map_err(|e| InterpretError::new(format!("build operator chart: {e}")))?; let chart_path_str = chart_path.to_str().ok_or_else(|| { diff --git a/fleet/harmony-fleet-deploy/src/secrets.rs b/fleet/harmony-fleet-deploy/src/secrets.rs index d86dad5c..74768364 100644 --- a/fleet/harmony-fleet-deploy/src/secrets.rs +++ b/fleet/harmony-fleet-deploy/src/secrets.rs @@ -22,6 +22,12 @@ pub struct FleetDeploySecrets { /// `KUBECONFIG`. #[serde(default)] pub kubeconfig: Option, + + /// Operator dashboard session-cookie signing key (standard-base64 of + /// ≥64 bytes). `#[serde(default)]` so older seeds without it still + /// load — but the dashboard won't authenticate until it's set. + #[serde(default)] + pub operator_cookie_key_b64: String, } /// Non-secret deploy config: k8s namespaces + chart coords. Loaded via @@ -52,6 +58,14 @@ pub struct FleetDeployConfig { /// cert-manager `ClusterIssuer` for the operator UI's TLS cert. pub cluster_issuer: String, + + /// Zitadel OIDC app the operator dashboard authenticates against. + #[serde(default)] + pub operator_oidc_client_id: String, + + /// Token audiences the dashboard accepts (Zitadel project/app ids). + #[serde(default)] + pub operator_trusted_audiences: Vec, } impl Default for FleetDeployConfig { @@ -65,6 +79,8 @@ impl Default for FleetDeployConfig { operator_chart_project: "harmony".to_string(), base_domain: "cb1.nationtech.io".to_string(), cluster_issuer: "letsencrypt-prod".to_string(), + operator_oidc_client_id: String::new(), + operator_trusted_audiences: Vec::new(), } } } diff --git a/fleet/harmony-fleet-operator/Cargo.toml b/fleet/harmony-fleet-operator/Cargo.toml index f272a067..90dc0edb 100644 --- a/fleet/harmony-fleet-operator/Cargo.toml +++ b/fleet/harmony-fleet-operator/Cargo.toml @@ -17,6 +17,7 @@ web-frontend = ["dep:axum", "dep:axum-extra", "dep:maud", "dep:tokio-stream", "h harmony = { path = "../../harmony", features = ["podman"] } harmony-fleet-auth = { path = "../harmony-fleet-auth" } harmony-reconciler-contracts = { path = "../../harmony-reconciler-contracts" } +harmony_config = { path = "../../harmony_config" } harmony_zitadel_auth = { path = "../../harmony_zitadel_auth" } toml = { workspace = true } chrono = { workspace = true, features = ["serde"] } diff --git a/fleet/harmony-fleet-operator/dev.sh b/fleet/harmony-fleet-operator/dev.sh index 812f8272..75173f04 100644 --- a/fleet/harmony-fleet-operator/dev.sh +++ b/fleet/harmony-fleet-operator/dev.sh @@ -1,13 +1,11 @@ #!/bin/bash -export BASE_URL=http://localhost:18080 -export FLEET_AUTH_ZITADEL_BASE=https://sso-stg.cb1.nationtech.io -export FLEET_AUTH_CLIENT_ID=372626218874372917 -export FLEET_AUTH_SCOPE="openid profile email" -export FLEET_AUTH_LOGOUT_REDIRECT_URI="http://localhost:18080/" -export FLEET_OPERATOR_COOKIE_KEY_B64=6eKVpj88jwIcmaJajPfohdaIXhSPlfYCrHaOfymTcIWBAIadvhg7NHpMo5vPSMy90vac3cq2liWe1naSgkbaYg== -export FLEET_AUTH_TRUSTED_AUDIENCES=371639797493596981,371683318111994677,372626218874372917,371639797157987125 -export BASE_URL=http://localhost:18080 +# The operator reads its auth config via ConfigClient — one typed value +# each (EnvSource keys a Config struct under HARMONY_CONFIG_), +# not a fistful of FLEET_AUTH_* vars. base_url is localhost for this local +# serve-web; staging derives it from base_domain in the deploy. +export HARMONY_CONFIG_ZitadelAuthConfig='{"zitadel_base":"https://sso-stg.cb1.nationtech.io","base_url":"http://localhost:18080","client_id":"372626218874372917","scope":"openid profile email","trusted_audiences":["371639797493596981","371683318111994677","372626218874372917","371639797157987125"],"logout_redirect_uri":"http://localhost:18080/"}' +export HARMONY_CONFIG_OperatorCookieKey='{"cookie_key_b64":"6eKVpj88jwIcmaJajPfohdaIXhSPlfYCrHaOfymTcIWBAIadvhg7NHpMo5vPSMy90vac3cq2liWe1naSgkbaYg=="}' export RUST_LOG=debug cargo watch -- cargo run -p harmony-fleet-operator --features web-frontend -- serve-web \ diff --git a/fleet/harmony-fleet-operator/src/main.rs b/fleet/harmony-fleet-operator/src/main.rs index 283d61b8..e6ff215d 100644 --- a/fleet/harmony-fleet-operator/src/main.rs +++ b/fleet/harmony-fleet-operator/src/main.rs @@ -144,9 +144,22 @@ async fn serve_dashboard( use std::time::Duration; use frontend::server::{AppState, Config}; + use harmony_zitadel_auth::{OperatorCookieKey, ZitadelAuthConfig}; - let cookie_key = harmony_zitadel_auth::cookie_key_from_env(); - let config = harmony_zitadel_auth::config_from_env(); + // One typed config value each, resolved EnvSource → OpenBao by + // ConfigClient — not a fistful of FLEET_AUTH_* env vars. Namespace + // only matters for the OpenBao source; EnvSource reads regardless. + let ns = std::env::var("HARMONY_CONFIG_NAMESPACE").unwrap_or_else(|_| "fleet-staging".into()); + let cc = harmony_config::ConfigClient::for_namespace(&ns).await; + let config: ZitadelAuthConfig = cc + .get() + .await + .context("loading ZitadelAuthConfig (HARMONY_CONFIG_ZitadelAuthConfig or OpenBao)")?; + let cookie_key = cc + .get::() + .await + .context("loading OperatorCookieKey")? + .key()?; let http_client = reqwest::Client::new(); let jwks = harmony_zitadel_auth::JwksCache::new(&config.zitadel_base, http_client.clone()) diff --git a/harmony_zitadel_auth/Cargo.toml b/harmony_zitadel_auth/Cargo.toml index 7fef2c30..a0ee0c27 100644 --- a/harmony_zitadel_auth/Cargo.toml +++ b/harmony_zitadel_auth/Cargo.toml @@ -10,12 +10,14 @@ default = [] axum = ["dep:axum", "dep:axum-extra"] [dependencies] +harmony_config = { path = "../harmony_config" } anyhow.workspace = true base64.workspace = true chrono = { workspace = true, features = ["serde"] } rand.workspace = true reqwest.workspace = true -serde.workspace = true +schemars = "0.8" +serde = { workspace = true, features = ["derive"] } serde_json.workspace = true sha2 = "0.10" url.workspace = true diff --git a/harmony_zitadel_auth/src/config.rs b/harmony_zitadel_auth/src/config.rs index 52019426..c09ad04d 100644 --- a/harmony_zitadel_auth/src/config.rs +++ b/harmony_zitadel_auth/src/config.rs @@ -1,4 +1,11 @@ -#[derive(Debug, Clone)] +use harmony_config::Config; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; + +/// Operator web-frontend auth config. Loaded via +/// [`harmony_config::ConfigClient`] (env → OpenBao), so the operator +/// reads one typed value instead of a fistful of `FLEET_AUTH_*` env vars. +#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, Config)] pub struct ZitadelAuthConfig { pub zitadel_base: String, pub base_url: String, @@ -33,43 +40,27 @@ impl ZitadelAuthConfig { } } -pub const ZITADEL_BASE_ENV: &str = "FLEET_AUTH_ZITADEL_BASE"; -pub const BASE_URL_ENV: &str = "BASE_URL"; -pub const CLIENT_ID_ENV: &str = "FLEET_AUTH_CLIENT_ID"; -pub const SCOPE_ENV: &str = "FLEET_AUTH_SCOPE"; -pub const TRUSTED_AUDIENCES_ENV: &str = "FLEET_AUTH_TRUSTED_AUDIENCES"; -pub const LOGOUT_REDIRECT_URI_ENV: &str = "FLEET_AUTH_LOGOUT_REDIRECT_URI"; -pub const COOKIE_KEY_ENV: &str = "FLEET_OPERATOR_COOKIE_KEY_B64"; - -pub fn config_from_env() -> ZitadelAuthConfig { - ZitadelAuthConfig { - zitadel_base: required_env(ZITADEL_BASE_ENV), - base_url: required_env(BASE_URL_ENV), - client_id: required_env(CLIENT_ID_ENV), - scope: required_env(SCOPE_ENV), - trusted_audiences: required_env(TRUSTED_AUDIENCES_ENV) - .split(',') - .map(str::to_string) - .collect(), - logout_redirect_uri: required_env(LOGOUT_REDIRECT_URI_ENV), - } +/// Operator session-cookie signing key: standard-base64 of ≥64 random +/// bytes. Secret-class, so it resolves from OpenBao / a k8s Secret — +/// never cleartext config. Whoever holds it can forge sessions. +#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, Config)] +#[config(secret)] +pub struct OperatorCookieKey { + pub cookie_key_b64: String, } #[cfg(feature = "axum")] -pub fn cookie_key_from_env() -> axum_extra::extract::cookie::Key { - use base64::Engine; - use base64::engine::general_purpose::STANDARD; +impl OperatorCookieKey { + pub fn key(&self) -> anyhow::Result { + use base64::Engine; + use base64::engine::general_purpose::STANDARD; - let encoded = required_env(COOKIE_KEY_ENV); - let bytes = STANDARD - .decode(encoded.trim()) - .unwrap_or_else(|e| panic!("{COOKIE_KEY_ENV} must be standard base64: {e}")); - if bytes.len() < 64 { - panic!("{COOKIE_KEY_ENV} must decode to at least 64 bytes for private cookies"); + let bytes = STANDARD + .decode(self.cookie_key_b64.trim()) + .map_err(|e| anyhow::anyhow!("cookie_key_b64 must be standard base64: {e}"))?; + if bytes.len() < 64 { + anyhow::bail!("cookie_key_b64 must decode to at least 64 bytes for private cookies"); + } + Ok(axum_extra::extract::cookie::Key::from(&bytes)) } - axum_extra::extract::cookie::Key::from(&bytes) -} - -fn required_env(name: &str) -> String { - std::env::var(name).unwrap_or_else(|_| panic!("missing required environment variable {name}")) } diff --git a/harmony_zitadel_auth/src/lib.rs b/harmony_zitadel_auth/src/lib.rs index f5cd0e78..bb4f8af0 100644 --- a/harmony_zitadel_auth/src/lib.rs +++ b/harmony_zitadel_auth/src/lib.rs @@ -5,12 +5,7 @@ pub mod jwks; pub mod login; pub mod session; -#[cfg(feature = "axum")] -pub use config::cookie_key_from_env; -pub use config::{ - BASE_URL_ENV, CLIENT_ID_ENV, COOKIE_KEY_ENV, LOGOUT_REDIRECT_URI_ENV, SCOPE_ENV, - TRUSTED_AUDIENCES_ENV, ZITADEL_BASE_ENV, ZitadelAuthConfig, config_from_env, -}; +pub use config::{OperatorCookieKey, ZitadelAuthConfig}; pub use jwks::JwksCache; -- 2.39.5 From 174d0b430427dcdd33744436adf3395fdd7e7932 Mon Sep 17 00:00:00 2001 From: Jean-Gabriel Gill-Couture Date: Mon, 1 Jun 2026 22:20:42 -0400 Subject: [PATCH 06/11] fix(k8s): build Ingress from typed structs; omit ingressClassName when unset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The json!-based renderer set `ingressClassName` to the literal string `"default"` (quotes included) when no class was given — an invalid IngressClass reference, so the Ingress was never claimed/routed. The fleet operator passes None, so it hit exactly that. Rebuild the Ingress from typed k8s_openapi structs. `None` now omits `ingressClassName` so the cluster's default IngressClass claims the resource (per docs/guides/kubernetes-ingress.md); `Some(x)` passes it through unchanged. cert-manager annotations + the tls block are typed too, dropping the serde_json::Value patching and from_value().unwrap(). Tests cover omit-when-none, pass-through-when-set, and backend/path. --- harmony/src/modules/k8s/ingress.rs | 159 ++++++++++++++++++----------- 1 file changed, 98 insertions(+), 61 deletions(-) diff --git a/harmony/src/modules/k8s/ingress.rs b/harmony/src/modules/k8s/ingress.rs index 48ae9d07..7405c1dc 100644 --- a/harmony/src/modules/k8s/ingress.rs +++ b/harmony/src/modules/k8s/ingress.rs @@ -1,10 +1,15 @@ +use std::collections::BTreeMap; + use async_trait::async_trait; use harmony_macros::ingress_path; use harmony_types::id::Id; -use k8s_openapi::api::networking::v1::Ingress; -use log::{debug, trace}; +use k8s_openapi::api::networking::v1::{ + HTTPIngressPath, HTTPIngressRuleValue, Ingress, IngressBackend, IngressRule, + IngressServiceBackend, IngressSpec, IngressTLS, ServiceBackendPort, +}; +use k8s_openapi::apimachinery::pkg::apis::meta::v1::ObjectMeta; +use log::debug; use serde::Serialize; -use serde_json::json; use crate::{ data::Version, @@ -59,70 +64,71 @@ impl K8sIngressScore { /// Extracted from `create_interpret` so the TLS/annotation shape is /// unit-testable without a topology. fn render_ingress(&self) -> Ingress { - let path = match self.path.clone() { - Some(p) => p, - None => ingress_path!("/"), - }; - - let path_type = match self.path_type.clone() { - Some(p) => p, - None => PathType::Prefix, - }; - - let ingress_class = match self.ingress_class_name.clone() { - Some(ingress_class_name) => ingress_class_name, - None => "\"default\"".to_string(), - }; - - let mut ingress = json!( - { - "metadata": { - "name": self.name.to_string(), - }, - "spec": { - "ingressClassName": ingress_class.as_str(), - "rules": [ - { "host": self.host.to_string(), - "http": { - "paths": [ - { - "path": path, - "pathType": path_type.as_str(), - "backend": { - "service": { - "name": self.backend_service.to_string(), - "port": { - "number": self.port, - } - } - } - } - ] - } - } - ] - } - } - ); + let path = self.path.clone().unwrap_or_else(|| ingress_path!("/")); + let path_type = self.path_type.clone().unwrap_or(PathType::Prefix); + let host = self.host.to_string(); // cert-manager TLS: the annotation issues the cert into // `secretName`, and the `tls` block must name that same Secret // for the Ingress to be portable (a bare `tls` host without a // secretName is rejected by OKD's ingress-to-route translation). - if let Some(issuer) = &self.cluster_issuer { - let secret_name = format!("{}-tls", self.host.to_string().replace('.', "-")); - ingress["metadata"]["annotations"] = json!({ - "cert-manager.io/cluster-issuer": issuer, - "route.openshift.io/termination": "edge", - }); - ingress["spec"]["tls"] = json!([{ - "hosts": [self.host.to_string()], - "secretName": secret_name, - }]); - } + let (annotations, tls) = match &self.cluster_issuer { + Some(issuer) => { + let secret_name = format!("{}-tls", host.replace('.', "-")); + let annotations = BTreeMap::from([ + ("cert-manager.io/cluster-issuer".to_string(), issuer.clone()), + // OKD edge termination — a no-op on non-OpenShift clusters. + ( + "route.openshift.io/termination".to_string(), + "edge".to_string(), + ), + ]); + let tls = vec![IngressTLS { + hosts: Some(vec![host.clone()]), + secret_name: Some(secret_name), + }]; + (Some(annotations), Some(tls)) + } + None => (None, None), + }; + + let ingress = Ingress { + metadata: ObjectMeta { + name: Some(self.name.to_string()), + annotations, + ..Default::default() + }, + spec: Some(IngressSpec { + // Left unset when `None` so the cluster's default + // IngressClass claims the resource — setting it to a + // bogus/empty value instead leaves the Ingress unrouted + // (see docs/guides/kubernetes-ingress.md). + ingress_class_name: self.ingress_class_name.clone(), + rules: Some(vec![IngressRule { + host: Some(host), + http: Some(HTTPIngressRuleValue { + paths: vec![HTTPIngressPath { + path: Some(path), + path_type: path_type.as_str().to_string(), + backend: IngressBackend { + service: Some(IngressServiceBackend { + name: self.backend_service.to_string(), + port: Some(ServiceBackendPort { + name: None, + number: Some(i32::from(self.port)), + }), + }), + resource: None, + }, + }], + }), + }]), + tls, + ..Default::default() + }), + ..Default::default() + }; - trace!("Building ingresss object from Value {ingress:#}"); - let ingress: Ingress = serde_json::from_value(ingress).unwrap(); debug!( "Successfully built Ingress for host {:?}", ingress.metadata.name @@ -231,6 +237,37 @@ mod tests { assert!(ing.spec.unwrap().tls.is_none()); } + #[test] + fn ingress_class_omitted_when_none() { + // Unset (not "default", not "") so the cluster's default + // IngressClass claims the resource. + let ing = score(None).render_ingress(); + assert!(ing.spec.unwrap().ingress_class_name.is_none()); + } + + #[test] + fn ingress_class_passed_through_when_set() { + let mut s = score(None); + s.ingress_class_name = Some("openshift-default".into()); + let ing = s.render_ingress(); + assert_eq!( + ing.spec.unwrap().ingress_class_name.as_deref(), + Some("openshift-default") + ); + } + + #[test] + fn backend_and_path_render() { + let ing = score(None).render_ingress(); + let rule = &ing.spec.unwrap().rules.unwrap()[0]; + assert_eq!(rule.host.as_deref(), Some("fleet-stg.cb1.nationtech.io")); + let p = &rule.http.as_ref().unwrap().paths[0]; + assert_eq!(p.path_type, "Prefix"); + let svc = p.backend.service.as_ref().unwrap(); + assert_eq!(svc.name, "op"); + assert_eq!(svc.port.as_ref().unwrap().number, Some(18080)); + } + #[test] fn cluster_issuer_renders_cert_manager_and_tls() { let ing = score(Some("letsencrypt-prod".into())).render_ingress(); -- 2.39.5 From 2c4fe5c8d63cb7f4edee0f15f973386993cbb19b Mon Sep 17 00:00:00 2001 From: Jean-Gabriel Gill-Couture Date: Mon, 1 Jun 2026 22:31:20 -0400 Subject: [PATCH 07/11] fix(fleet): grant the operator RBAC on devices/status The Device CRD gained a status subresource (liveness reconciler), but RBAC treats `devices/status` as a resource distinct from `devices`, so the operator's patch_status 403'd. Add a ClusterRole rule granting get/update/patch on `devices/status`, mirroring `deployments/status`. A test locks both status subresources in the role. --- .../src/operator/chart.rs | 31 ++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/fleet/harmony-fleet-deploy/src/operator/chart.rs b/fleet/harmony-fleet-deploy/src/operator/chart.rs index 7ce91612..c7fb19ef 100644 --- a/fleet/harmony-fleet-deploy/src/operator/chart.rs +++ b/fleet/harmony-fleet-deploy/src/operator/chart.rs @@ -327,7 +327,7 @@ fn cluster_role() -> ClusterRole { // Devices: reconciler server-side-applies + deletes; // aggregator lists + watches. PolicyRule { - api_groups: Some(vec![group]), + api_groups: Some(vec![group.clone()]), resources: Some(vec!["devices".to_string()]), verbs: vec![ "get", "list", "watch", "create", "update", "patch", "delete", @@ -337,6 +337,17 @@ fn cluster_role() -> ClusterRole { .collect(), ..Default::default() }, + // Device liveness: the device-status reconciler patches the + // status subresource — a distinct RBAC resource from `devices`. + PolicyRule { + api_groups: Some(vec![group]), + resources: Some(vec!["devices/status".to_string()]), + verbs: vec!["get", "update", "patch"] + .into_iter() + .map(String::from) + .collect(), + ..Default::default() + }, ]), ..Default::default() } @@ -611,6 +622,24 @@ mod tests { assert!(deployment_yaml.contains(SECRET_NAME)); } + // The CRDs carry status subresources, which RBAC treats as distinct + // resources — a patch on `*/status` is forbidden without an explicit + // grant. Lock both so adding a status subresource can't silently 403. + #[test] + fn cluster_role_grants_status_subresources() { + let role = cluster_role(); + let grants_patch = |resource: &str| { + role.rules.as_ref().unwrap().iter().any(|r| { + r.resources + .as_ref() + .is_some_and(|res| res.iter().any(|x| x == resource)) + && r.verbs.iter().any(|v| v == "patch") + }) + }; + assert!(grants_patch("deployments/status")); + assert!(grants_patch("devices/status")); + } + // The chart bakes these env names at publish time; the operator's // ConfigClient derives them from the struct names at runtime. Lock // them together so a rename can't silently desync the two. -- 2.39.5 From aab5e82119a2b4d998bf8f515ce5467fd35150c9 Mon Sep 17 00:00:00 2001 From: Jean-Gabriel Gill-Couture Date: Mon, 1 Jun 2026 23:03:55 -0400 Subject: [PATCH 08/11] feat: slight improvement of temp version date format for better human readability --- fleet/scripts/dev-deploy-operator.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fleet/scripts/dev-deploy-operator.sh b/fleet/scripts/dev-deploy-operator.sh index 44635386..a569d461 100755 --- a/fleet/scripts/dev-deploy-operator.sh +++ b/fleet/scripts/dev-deploy-operator.sh @@ -49,7 +49,7 @@ fi # Unique + semver-valid: helm rejects a non-semver chart version, and a # fresh version forces a clean image pull + rollout every iteration. -VERSION="0.0.0-dev.$(date +%s)" +VERSION="0.0.0-dev.$(date -u +'%Y%m%d%H%M%S')" export DOCKER_BUILDKIT=1 -- 2.39.5 From 75aac243c72a8404b043876ed45b051b50e7f8c1 Mon Sep 17 00:00:00 2001 From: Jean-Gabriel Gill-Couture Date: Mon, 1 Jun 2026 23:06:45 -0400 Subject: [PATCH 09/11] docs: operator dashboard SSO (Zitadel) setup guide MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Step-by-step for wiring the operator dashboard's browser SSO: the Zitadel app settings the code requires (Web app, PKCE/no-secret, redirect + post-logout URIs), each config value mapped to its source (ZitadelAuthConfig + cookie key), how to provide them (staging via FleetDeployConfig/Secrets with hosts derived from base_domain; local via HARMONY_CONFIG_* env), the derived endpoints, and the common-failure gotchas (iss/aud/redirect mismatch, no client secret, localhost dev mode, ≥64-byte cookie key). Grounded in harmony_zitadel_auth's login/jwks code. Registered in SUMMARY and cross-linked from web-auth-security. --- docs/SUMMARY.md | 1 + docs/guides/operator-dashboard-sso.md | 144 ++++++++++++++++++++++++++ docs/guides/web-auth-security.md | 2 + 3 files changed, 147 insertions(+) create mode 100644 docs/guides/operator-dashboard-sso.md diff --git a/docs/SUMMARY.md b/docs/SUMMARY.md index 10ae47c4..14798669 100644 --- a/docs/SUMMARY.md +++ b/docs/SUMMARY.md @@ -24,6 +24,7 @@ - [Writing a Topology](./guides/writing-a-topology.md) - [Adding Capabilities](./guides/adding-capabilities.md) - [Web Authentication and CSRF Security](./guides/web-auth-security.md) +- [Operator Dashboard SSO (Zitadel) — setup](./guides/operator-dashboard-sso.md) ## Configuration diff --git a/docs/guides/operator-dashboard-sso.md b/docs/guides/operator-dashboard-sso.md new file mode 100644 index 00000000..6623de5c --- /dev/null +++ b/docs/guides/operator-dashboard-sso.md @@ -0,0 +1,144 @@ +# Operator Dashboard SSO (Zitadel) — setup + +How to configure Zitadel + Harmony so the **fleet operator dashboard** +authenticates users over SSO. This is the *browser* login for the web UI — +distinct from the headless agent/callout auth (JWT-bearer machine keys) +documented in [`fleet-zitadel-faq.md`](./fleet-zitadel-faq.md). The security +rationale behind these controls is [`web-auth-security.md`](./web-auth-security.md). + +The dashboard uses **OIDC Authorization Code + PKCE as a public client** (no +client secret). On callback it validates the ID token and stores it as the +session cookie; every request re-verifies that JWT against Zitadel's JWKS. + +Code: `harmony_zitadel_auth/src/{login,jwks,axum_login_flow,config}.rs`; +the operator wires it in `fleet/harmony-fleet-operator/src/main.rs::serve_dashboard`. + +--- + +## Part A — the Zitadel application + +In the Zitadel console, inside the project that owns the fleet (the same org +as the rest of the fleet setup), create an **Application**: + +| Setting | Value | Why | +|---|---|---| +| Type | **Web** | Server-side app (not a SPA/native). | +| Authentication method | **PKCE** (no client secret) | The code does pure Auth-Code-+-PKCE and sends **no** `client_secret` (`login.rs::exchange_code_for_token`). An app that requires a secret will fail token exchange. | +| Grant types | **Authorization Code** | `response_type=code`, `grant_type=authorization_code`. | +| Redirect URIs | `https://fleet-stg./auth/callback` (+ `http://localhost:18080/auth/callback` for local dev) | Must match `redirect_uri` exactly — it's `{base_url}/auth/callback` and is sent on both the authorize and token calls. | +| Post-logout redirect URIs | `https://fleet-stg./` (+ `http://localhost:18080/` for dev) | `logout_redirect_uri`, sent to `/oidc/v1/end_session`. | +| Development Mode | **On** only if you use an `http://localhost` redirect | Zitadel rejects non-HTTPS redirect URIs unless dev mode is enabled on the app. Leave **off** for the staging HTTPS app. | + +After creating it, copy the **Client ID** — that's `client_id` below. + +**Scopes & audience.** The dashboard requests `openid profile email`. Zitadel +puts the app's own Client ID in the ID token's `aud` claim by default; that's +the minimum `trusted_audiences` entry. If your project asserts extra audiences +(e.g. you added the reserved `urn:zitadel:iam:org:project:id::aud` +scope to also accept the project's other client IDs), list those too. + +> **Authorization gap:** today the dashboard only checks that the user +> authenticated successfully — it does **not** gate on a Zitadel role. Any user +> in the org who can log in gets in. Role-based authz is a known hardening item +> ([`web-auth-security.md`](./web-auth-security.md) §3, +> [ROADMAP/09-sso-config-hardening](../../ROADMAP/09-sso-config-hardening.md)). + +--- + +## Part B — the values Harmony needs + +The operator reads two typed config values via `ConfigClient` +(`harmony_zitadel_auth::{ZitadelAuthConfig, OperatorCookieKey}`): + +| Field | Example | Source | +|---|---|---| +| `zitadel_base` | `https://sso-stg.cb1.nationtech.io` | Your Zitadel issuer URL. **Must equal the `iss` claim exactly** (no trailing slash) — it's used as both the issuer check and the endpoint/JWKS base. | +| `base_url` | `https://fleet-stg.cb1.nationtech.io` | The dashboard's **own** public URL. Drives `redirect_uri = base_url + /auth/callback` and sets `Secure` cookies when it's `https`. | +| `client_id` | `372626218874372917` | The Zitadel app's Client ID (Part A). | +| `scope` | `openid profile email` | Standard OIDC scopes. | +| `trusted_audiences` | `["372626218874372917", …]` | Accepted `aud` values — at minimum the app's `client_id`. | +| `logout_redirect_uri` | `https://fleet-stg.cb1.nationtech.io/` | Where Zitadel sends the browser after logout; must be a registered post-logout URI. | +| `cookie_key_b64` | (64 random bytes, base64) | Session-cookie signing/encryption key. Generate with `openssl rand -base64 64` — it **must decode to ≥64 bytes** or the dashboard refuses to start. | + +--- + +## Part C — providing them + +### Staging (the deploy) + +The deploy **derives** the hosts from `base_domain`, so you only seed the +Zitadel-specific bits. In OpenBao (config namespace `fleet-staging`): + +- `FleetDeployConfig.base_domain` → `cb1.nationtech.io` (default). The deploy then derives: + - `zitadel_base = https://sso-stg.{base_domain}` + - `base_url = https://fleet-stg.{base_domain}` + - `logout_redirect_uri = https://fleet-stg.{base_domain}/` + - `scope = openid profile email` +- `FleetDeployConfig.operator_oidc_client_id` → the app Client ID +- `FleetDeployConfig.operator_trusted_audiences` → the accepted `aud` list +- `FleetDeploySecrets.operator_cookie_key_b64` → `openssl rand -base64 64` + +The deploy bakes the resolved `ZitadelAuthConfig` + cookie key into the operator +Secret as `HARMONY_CONFIG_ZitadelAuthConfig` / `HARMONY_CONFIG_OperatorCookieKey` +(see `fleet/harmony-fleet-deploy/src/operator/{score,chart}.rs`). Then: + +```bash +./fleet/scripts/dev-deploy-operator.sh +``` + +`fleet_staging_install` already generates + seeds a cookie key +(`generate_cookie_key_b64`), so a fresh install needs only the client ID + +audiences. + +### Local dev (`serve-web --mock`) + +`ConfigClient`'s env source keys each struct under `HARMONY_CONFIG_` +(JSON). `fleet/harmony-fleet-operator/dev.sh` sets both — point `base_url` at +`http://localhost:18080` and register that callback in the app (dev mode on): + +```bash +export HARMONY_CONFIG_ZitadelAuthConfig='{"zitadel_base":"https://sso-stg.cb1.nationtech.io","base_url":"http://localhost:18080","client_id":"","scope":"openid profile email","trusted_audiences":[""],"logout_redirect_uri":"http://localhost:18080/"}' +export HARMONY_CONFIG_OperatorCookieKey='{"cookie_key_b64":""}' +``` + +--- + +## Endpoints the dashboard calls + +All derived from `zitadel_base` (so getting that exactly right matters): + +| Purpose | URL | +|---|---| +| Discovery (→ JWKS) | `{zitadel_base}/.well-known/openid-configuration` | +| Authorize | `{zitadel_base}/oauth/v2/authorize` | +| Token | `{zitadel_base}/oauth/v2/token` | +| End session (logout) | `{zitadel_base}/oidc/v1/end_session` | + +--- + +## Gotchas (most failures are one of these) + +- **`iss` mismatch.** `zitadel_base` must equal the issuer in the ID token, + byte-for-byte, no trailing slash. Per-request verification sets the issuer to + `zitadel_base` (`jwks.rs`); a mismatch rejects every session. +- **`aud` mismatch.** If `trusted_audiences` doesn't contain the token's `aud`, + login completes at Zitadel but the callback rejects the token. Start with the + app's own Client ID. +- **Redirect URI mismatch.** The value registered in Zitadel must be exactly + `{base_url}/auth/callback`. A `base_url` with a trailing slash or wrong scheme + breaks it. +- **Client secret set.** The app must use **PKCE** auth (no secret). If Zitadel + issued a secret, token exchange fails — the code never sends one. +- **Localhost without dev mode.** `http://localhost:18080/...` redirect URIs + need Development Mode enabled on the app. +- **Cookie key too short.** `cookie_key_b64` must decode to ≥64 bytes + (`openssl rand -base64 64`); otherwise the dashboard logs + `cookie_key_b64 must decode to at least 64 bytes` and serves nothing + (reconcile keeps running). + +## Verify + +Open `https://fleet-stg./` → redirected to Zitadel → log in → back to the +dashboard. The operator logs `dashboard server exited` only on a config error; +on success there's no such line and `/` renders the SPA. To debug a rejected +token, check the `iss`/`aud`/`exp`/`nonce` against the values above. diff --git a/docs/guides/web-auth-security.md b/docs/guides/web-auth-security.md index 8c52a52e..139189af 100644 --- a/docs/guides/web-auth-security.md +++ b/docs/guides/web-auth-security.md @@ -2,6 +2,8 @@ These guidelines define the baseline for Harmony web frontends and future operator dashboards that use browser-based authentication, cookie sessions, Axum, HTMX, or OIDC providers such as Zitadel. +> Setting up the fleet operator dashboard's SSO concretely (Zitadel app + the exact config values)? See [Operator Dashboard SSO (Zitadel) — setup](./operator-dashboard-sso.md). + ## Goals - Prevent unauthenticated access. -- 2.39.5 From 1c0e9df68289b8ee6cb1f2ee2fa2dcff39d6811f Mon Sep 17 00:00:00 2001 From: Jean-Gabriel Gill-Couture Date: Mon, 1 Jun 2026 23:08:48 -0400 Subject: [PATCH 10/11] docs: trim operator SSO guide to a quickstart-first page Was ~150 lines with the host-derivation repeated three times and reference tables ahead of the happy path. Rewrite as a 4-step staging quickstart (the main content), with the counterintuitive bits demoted to short "when login fails" + "config reference" sections. ~55 lines. --- docs/guides/operator-dashboard-sso.md | 175 +++++++------------------- 1 file changed, 43 insertions(+), 132 deletions(-) diff --git a/docs/guides/operator-dashboard-sso.md b/docs/guides/operator-dashboard-sso.md index 6623de5c..c22eafb2 100644 --- a/docs/guides/operator-dashboard-sso.md +++ b/docs/guides/operator-dashboard-sso.md @@ -1,144 +1,55 @@ # Operator Dashboard SSO (Zitadel) — setup -How to configure Zitadel + Harmony so the **fleet operator dashboard** -authenticates users over SSO. This is the *browser* login for the web UI — -distinct from the headless agent/callout auth (JWT-bearer machine keys) -documented in [`fleet-zitadel-faq.md`](./fleet-zitadel-faq.md). The security -rationale behind these controls is [`web-auth-security.md`](./web-auth-security.md). +Browser SSO for the fleet operator web UI (OIDC Authorization Code + PKCE, +public client). Distinct from the agent/callout machine auth +([fleet-zitadel-faq](./fleet-zitadel-faq.md)); the security rationale is in +[web-auth-security](./web-auth-security.md). Code: `harmony_zitadel_auth/`. -The dashboard uses **OIDC Authorization Code + PKCE as a public client** (no -client secret). On callback it validates the ID token and stores it as the -session cookie; every request re-verifies that JWT against Zitadel's JWKS. +## Quickstart (staging) -Code: `harmony_zitadel_auth/src/{login,jwks,axum_login_flow,config}.rs`; -the operator wires it in `fleet/harmony-fleet-operator/src/main.rs::serve_dashboard`. +1. **Zitadel app** — create a **Web** application, auth method **PKCE** (no client + secret), redirect URI `https://fleet-stg./auth/callback`, post-logout URI + `https://fleet-stg./`. Copy its **Client ID**. +2. **Seed config** in OpenBao (namespace `fleet-staging`) — the deploy derives every + host from `base_domain`, so you set only: + - `FleetDeployConfig.operator_oidc_client_id` = the Client ID + - `FleetDeployConfig.operator_trusted_audiences` = `[""]` + - `FleetDeploySecrets.operator_cookie_key_b64` = `openssl rand -base64 64` +3. **Deploy**: `./fleet/scripts/dev-deploy-operator.sh` +4. Open `https://fleet-stg./` → Zitadel login → back to the dashboard. ---- +`fleet_staging_install` already generates the cookie key, so a fresh install needs +only the Client ID + audiences. -## Part A — the Zitadel application +## Local dev (`serve-web`) -In the Zitadel console, inside the project that owns the fleet (the same org -as the rest of the fleet setup), create an **Application**: +`fleet/harmony-fleet-operator/dev.sh` sets the same config as two +`HARMONY_CONFIG_` env JSON blobs (ConfigClient's env source). Point +`base_url` at `http://localhost:18080`, register that callback in the app, and turn +on the app's **Development Mode** (Zitadel rejects non-HTTPS redirects otherwise). -| Setting | Value | Why | -|---|---|---| -| Type | **Web** | Server-side app (not a SPA/native). | -| Authentication method | **PKCE** (no client secret) | The code does pure Auth-Code-+-PKCE and sends **no** `client_secret` (`login.rs::exchange_code_for_token`). An app that requires a secret will fail token exchange. | -| Grant types | **Authorization Code** | `response_type=code`, `grant_type=authorization_code`. | -| Redirect URIs | `https://fleet-stg./auth/callback` (+ `http://localhost:18080/auth/callback` for local dev) | Must match `redirect_uri` exactly — it's `{base_url}/auth/callback` and is sent on both the authorize and token calls. | -| Post-logout redirect URIs | `https://fleet-stg./` (+ `http://localhost:18080/` for dev) | `logout_redirect_uri`, sent to `/oidc/v1/end_session`. | -| Development Mode | **On** only if you use an `http://localhost` redirect | Zitadel rejects non-HTTPS redirect URIs unless dev mode is enabled on the app. Leave **off** for the staging HTTPS app. | +## When login fails — check these first -After creating it, copy the **Client ID** — that's `client_id` below. +- **`iss` mismatch** — `zitadel_base` must equal the token issuer byte-for-byte, no + trailing slash. +- **`aud` mismatch** — `trusted_audiences` must contain the token's `aud`; Zitadel + puts the app's Client ID there by default. +- **Client secret** — the app must be PKCE-only; the code never sends a secret. +- **Redirect URI** — must be exactly `{base_url}/auth/callback`. +- **Cookie key** — `cookie_key_b64` must decode to ≥64 bytes, else the dashboard + refuses to start (`cookie_key_b64 must decode to at least 64 bytes`; reconcile + keeps running). -**Scopes & audience.** The dashboard requests `openid profile email`. Zitadel -puts the app's own Client ID in the ID token's `aud` claim by default; that's -the minimum `trusted_audiences` entry. If your project asserts extra audiences -(e.g. you added the reserved `urn:zitadel:iam:org:project:id::aud` -scope to also accept the project's other client IDs), list those too. +## Config reference -> **Authorization gap:** today the dashboard only checks that the user -> authenticated successfully — it does **not** gate on a Zitadel role. Any user -> in the org who can log in gets in. Role-based authz is a known hardening item -> ([`web-auth-security.md`](./web-auth-security.md) §3, -> [ROADMAP/09-sso-config-hardening](../../ROADMAP/09-sso-config-hardening.md)). +The operator reads `ZitadelAuthConfig` + `OperatorCookieKey` via ConfigClient. The +deploy derives `zitadel_base` / `base_url` / `logout_redirect_uri` from `base_domain` +(`https://sso-stg.`, `https://fleet-stg.`, `…/`) and fixes +`scope = openid profile email`; you supply `client_id`, `trusted_audiences`, +`cookie_key_b64`. All endpoints derive from `zitadel_base`: +`/.well-known/openid-configuration`, `/oauth/v2/authorize`, `/oauth/v2/token`, +`/oidc/v1/end_session`. ---- - -## Part B — the values Harmony needs - -The operator reads two typed config values via `ConfigClient` -(`harmony_zitadel_auth::{ZitadelAuthConfig, OperatorCookieKey}`): - -| Field | Example | Source | -|---|---|---| -| `zitadel_base` | `https://sso-stg.cb1.nationtech.io` | Your Zitadel issuer URL. **Must equal the `iss` claim exactly** (no trailing slash) — it's used as both the issuer check and the endpoint/JWKS base. | -| `base_url` | `https://fleet-stg.cb1.nationtech.io` | The dashboard's **own** public URL. Drives `redirect_uri = base_url + /auth/callback` and sets `Secure` cookies when it's `https`. | -| `client_id` | `372626218874372917` | The Zitadel app's Client ID (Part A). | -| `scope` | `openid profile email` | Standard OIDC scopes. | -| `trusted_audiences` | `["372626218874372917", …]` | Accepted `aud` values — at minimum the app's `client_id`. | -| `logout_redirect_uri` | `https://fleet-stg.cb1.nationtech.io/` | Where Zitadel sends the browser after logout; must be a registered post-logout URI. | -| `cookie_key_b64` | (64 random bytes, base64) | Session-cookie signing/encryption key. Generate with `openssl rand -base64 64` — it **must decode to ≥64 bytes** or the dashboard refuses to start. | - ---- - -## Part C — providing them - -### Staging (the deploy) - -The deploy **derives** the hosts from `base_domain`, so you only seed the -Zitadel-specific bits. In OpenBao (config namespace `fleet-staging`): - -- `FleetDeployConfig.base_domain` → `cb1.nationtech.io` (default). The deploy then derives: - - `zitadel_base = https://sso-stg.{base_domain}` - - `base_url = https://fleet-stg.{base_domain}` - - `logout_redirect_uri = https://fleet-stg.{base_domain}/` - - `scope = openid profile email` -- `FleetDeployConfig.operator_oidc_client_id` → the app Client ID -- `FleetDeployConfig.operator_trusted_audiences` → the accepted `aud` list -- `FleetDeploySecrets.operator_cookie_key_b64` → `openssl rand -base64 64` - -The deploy bakes the resolved `ZitadelAuthConfig` + cookie key into the operator -Secret as `HARMONY_CONFIG_ZitadelAuthConfig` / `HARMONY_CONFIG_OperatorCookieKey` -(see `fleet/harmony-fleet-deploy/src/operator/{score,chart}.rs`). Then: - -```bash -./fleet/scripts/dev-deploy-operator.sh -``` - -`fleet_staging_install` already generates + seeds a cookie key -(`generate_cookie_key_b64`), so a fresh install needs only the client ID + -audiences. - -### Local dev (`serve-web --mock`) - -`ConfigClient`'s env source keys each struct under `HARMONY_CONFIG_` -(JSON). `fleet/harmony-fleet-operator/dev.sh` sets both — point `base_url` at -`http://localhost:18080` and register that callback in the app (dev mode on): - -```bash -export HARMONY_CONFIG_ZitadelAuthConfig='{"zitadel_base":"https://sso-stg.cb1.nationtech.io","base_url":"http://localhost:18080","client_id":"","scope":"openid profile email","trusted_audiences":[""],"logout_redirect_uri":"http://localhost:18080/"}' -export HARMONY_CONFIG_OperatorCookieKey='{"cookie_key_b64":""}' -``` - ---- - -## Endpoints the dashboard calls - -All derived from `zitadel_base` (so getting that exactly right matters): - -| Purpose | URL | -|---|---| -| Discovery (→ JWKS) | `{zitadel_base}/.well-known/openid-configuration` | -| Authorize | `{zitadel_base}/oauth/v2/authorize` | -| Token | `{zitadel_base}/oauth/v2/token` | -| End session (logout) | `{zitadel_base}/oidc/v1/end_session` | - ---- - -## Gotchas (most failures are one of these) - -- **`iss` mismatch.** `zitadel_base` must equal the issuer in the ID token, - byte-for-byte, no trailing slash. Per-request verification sets the issuer to - `zitadel_base` (`jwks.rs`); a mismatch rejects every session. -- **`aud` mismatch.** If `trusted_audiences` doesn't contain the token's `aud`, - login completes at Zitadel but the callback rejects the token. Start with the - app's own Client ID. -- **Redirect URI mismatch.** The value registered in Zitadel must be exactly - `{base_url}/auth/callback`. A `base_url` with a trailing slash or wrong scheme - breaks it. -- **Client secret set.** The app must use **PKCE** auth (no secret). If Zitadel - issued a secret, token exchange fails — the code never sends one. -- **Localhost without dev mode.** `http://localhost:18080/...` redirect URIs - need Development Mode enabled on the app. -- **Cookie key too short.** `cookie_key_b64` must decode to ≥64 bytes - (`openssl rand -base64 64`); otherwise the dashboard logs - `cookie_key_b64 must decode to at least 64 bytes` and serves nothing - (reconcile keeps running). - -## Verify - -Open `https://fleet-stg./` → redirected to Zitadel → log in → back to the -dashboard. The operator logs `dashboard server exited` only on a config error; -on success there's no such line and `/` renders the SPA. To debug a rejected -token, check the `iss`/`aud`/`exp`/`nonce` against the values above. +> The dashboard only checks that the user authenticated — no role gate yet +> ([web-auth-security](./web-auth-security.md) §3, +> [ROADMAP/09](../../ROADMAP/09-sso-config-hardening.md)). -- 2.39.5 From 9be4f63636f34fecfb0710b02526acfca67a93fa Mon Sep 17 00:00:00 2001 From: Jean-Gabriel Gill-Couture Date: Mon, 1 Jun 2026 23:26:38 -0400 Subject: [PATCH 11/11] fix(fleet-operator): build + embed Tailwind CSS in the container image MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The image shipped empty CSS: build.rs shells out to the tailwindcss v4 CLI and silently falls back to an empty bundle when it's absent — which it was in the rust:slim builder, so /static/tailwind.css served nothing. - Dockerfile: install the pinned Tailwind v4 standalone CLI (curl) in the builder and set TAILWIND_REQUIRED=1. - build.rs: when TAILWIND_REQUIRED is set (container/prod), a missing or failing CLI is now a hard build error instead of empty CSS; dev builds keep the soft fallback for the `serve-web --css-from` workflow. The env is a rerun trigger, so the first required build regenerates rather than reusing a cache-mounted empty bundle. Verified: with the CLI on PATH the embedded bundle is ~26 KB; with TAILWIND_REQUIRED=1 and no CLI the build fails as intended. --- fleet/harmony-fleet-operator/Dockerfile | 15 +++++++- fleet/harmony-fleet-operator/build.rs | 48 +++++++++++++------------ 2 files changed, 40 insertions(+), 23 deletions(-) diff --git a/fleet/harmony-fleet-operator/Dockerfile b/fleet/harmony-fleet-operator/Dockerfile index 26f1bdec..59cb934b 100644 --- a/fleet/harmony-fleet-operator/Dockerfile +++ b/fleet/harmony-fleet-operator/Dockerfile @@ -22,13 +22,26 @@ FROM docker.io/rust:1.94-slim-bookworm AS builder # pkg-config + libssl-dev cover transitive native-tls/openssl-sys deps -# that surface during link; ca-certificates lets cargo fetch over TLS. +# that surface during link; ca-certificates lets cargo fetch over TLS; +# curl downloads the Tailwind CLI below. RUN apt-get update && apt-get install -y --no-install-recommends \ pkg-config \ ca-certificates \ libssl-dev \ + curl \ && rm -rf /var/lib/apt/lists/* +# Tailwind v4 standalone CLI: build.rs shells out to it to generate the +# embedded CSS bundle. Pinned to match the dev host; override with +# --build-arg TAILWIND_VERSION=... TAILWIND_REQUIRED makes build.rs hard +# fail (not silently ship empty CSS) if the CLI is missing — see build.rs. +ARG TAILWIND_VERSION=v4.3.0 +RUN curl -fsSL -o /usr/local/bin/tailwindcss \ + "https://github.com/tailwindlabs/tailwindcss/releases/download/${TAILWIND_VERSION}/tailwindcss-linux-x64" \ + && chmod +x /usr/local/bin/tailwindcss \ + && tailwindcss --help >/dev/null +ENV TAILWIND_REQUIRED=1 + WORKDIR /app COPY . . diff --git a/fleet/harmony-fleet-operator/build.rs b/fleet/harmony-fleet-operator/build.rs index c004905b..bcac15bd 100644 --- a/fleet/harmony-fleet-operator/build.rs +++ b/fleet/harmony-fleet-operator/build.rs @@ -33,34 +33,38 @@ fn main() { println!("cargo:rerun-if-changed=style/input.css"); println!("cargo:rerun-if-changed=src/frontend"); println!("cargo:rerun-if-changed=src/service"); + // Toggling this re-runs the script, so the container build (which sets + // it) regenerates rather than reusing a cache-mounted empty bundle. + println!("cargo:rerun-if-env-changed=TAILWIND_REQUIRED"); - let status = Command::new("tailwindcss") + // Container/production builds set TAILWIND_REQUIRED: a missing or + // failing CLI is a hard error there (never ship empty CSS). Dev builds + // leave it unset and fall back to empty, serving CSS via + // `serve-web --css-from ` against a `tailwindcss --watch` sidecar. + let required = std::env::var_os("TAILWIND_REQUIRED").is_some(); + let fall_back = |reason: String| { + assert!( + !required, + "{reason}\nTAILWIND_REQUIRED is set: refusing to ship a frontend with empty CSS. \ + Install the v4 standalone CLI: https://github.com/tailwindlabs/tailwindcss/releases" + ); + println!( + "cargo:warning={reason}; embedded CSS will be empty \ + (use `serve-web --css-from ` in dev)." + ); + std::fs::write(&output, b"").unwrap(); + }; + + match Command::new("tailwindcss") .arg("--input") .arg(&input) .arg("--output") .arg(&output) .arg("--minify") - .status(); - - match status { + .status() + { Ok(s) if s.success() => {} - Ok(s) => { - println!( - "cargo:warning=tailwindcss exited with status {s}; embedded CSS will be empty. \ - Install the v4 standalone CLI \ - (https://github.com/tailwindlabs/tailwindcss/releases) for production builds, \ - or use `serve-web --css-from ` against a `tailwindcss --watch` sidecar in dev." - ); - std::fs::write(&output, b"").unwrap(); - } - Err(e) => { - println!( - "cargo:warning=tailwindcss not invocable ({e}); embedded CSS will be empty. \ - Install the v4 standalone CLI \ - (https://github.com/tailwindlabs/tailwindcss/releases) for production builds, \ - or use `serve-web --css-from ` against a `tailwindcss --watch` sidecar in dev." - ); - std::fs::write(&output, b"").unwrap(); - } + Ok(s) => fall_back(format!("tailwindcss exited with status {s}")), + Err(e) => fall_back(format!("tailwindcss not invocable ({e})")), } } -- 2.39.5