diff --git a/Cargo.lock b/Cargo.lock index c5cd3715..ef44076a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4735,15 +4735,17 @@ version = "0.1.0" dependencies = [ "anyhow", "async-nats", + "async-trait", "clap", "futures-util", + "harmony", + "harmony-k8s", "harmony-reconciler-contracts", "k8s-openapi", "kube", "schemars 0.8.22", "serde", "serde_json", - "serde_yaml", "thiserror 2.0.18", "tokio", "tracing", diff --git a/ROADMAP/12-code-review-april-2026.md b/ROADMAP/12-code-review-april-2026.md new file mode 100644 index 00000000..7986aa1e --- /dev/null +++ b/ROADMAP/12-code-review-april-2026.md @@ -0,0 +1,116 @@ +# Phase 12: Code Review Items (April 2026) + +Items identified during the `feat/opnsense-codegen` PR review that require further design or cross-cutting work. + +## Completed in this PR + +- **1.1** Remove panic in `haproxy_service_to_harmony` — returns `None` with `warn!()` instead of panicking on invalid bind address +- **1.2** Use `MacAddress` type from `harmony_types` in KVM module — replaced `String` MAC fields in `VmInterface`, `NetworkRef`, `DhcpHost`, and `set_interface_link` +- **1.3** Compare both firewalls in `FirewallPairTopology::list_static_mappings` — warns on mismatch between primary and backup +- **1.4** Remove no-op default for `LoadBalancer::ensure_wan_access` — now a required trait method +- **2.1** Remove `wan_firewall_ports` from `LoadBalancerScore` — callers handle WAN access separately +- **2.2** Add timeout to OKD bootstrap wait — 90min default, configurable via `HARMONY_OKD_BOOTSTRAP_TIMEOUT_MINUTES` + +## Tasks (deferred) + +### 12.1 Phased topology: LinuxHostTopology → KvmHostTopology + +**Priority**: HIGH +**Status**: Not started +**Related**: Phase 6 (KVM E2E tests) + +The `examples/opnsense_vm_integration/setup-libvirt.sh` shell script should be a Score using a phased topology approach. A `LinuxHostTopology` would be "promoted" to a `KvmHostTopology` after KVM packages are installed and libvirtd is running. 
+ +Key design challenges: +- Type-safe phase transitions (how does a topology gain new capabilities at runtime?) +- Package installation as a Score (distro-agnostic or trait-based) +- Service management (systemd enable/start) as a Score primitive + +This is a major architectural feature that enables the full bare-metal-to-VM pipeline without shell scripts. + +### 12.2 KvmHost validated type with compile-time macro + +**Priority**: MEDIUM +**Status**: Not started +**Related**: 12.1 + +`KvmConnectionUri::RemoteSsh { host: String, username: String }` should become a validated `KvmHost` type with: +- A `kvm_host!("root@hypervisor1")` macro for compile-time validation +- Proper SSH URI parsing and validation +- Integration with the phased topology (12.1) + +### 12.3 Unified directory module + +**Priority**: LOW +**Status**: Not started +**Related**: Phase 9 (SSO + Config Hardening) + +Currently four different directory patterns exist: +- `HARMONY_DATA_DIR` in `harmony/src/domain/config/mod.rs` (lazy_static, `BaseDirs`) +- `harmony_config` uses `ProjectDirs::from("io", "NationTech", "Harmony")` +- `harmony_secret` uses `BaseDirs::data_dir().join("harmony")` +- `openbao/setup.rs` has its own `keys_dir()` function + +Unify into a single `harmony_dirs` module providing: `data_dir()`, `cache_dir()`, `secrets_dir()`, `keys_dir(namespace)`. + +### 12.4 OpenBao unseal key storage — bootstrap secret management + +**Priority**: MEDIUM +**Status**: Research needed +**Related**: Phase 9 (SSO + Config Hardening), task 9.8 (auto-unseal) + +The chicken-and-egg problem: OpenBao needs to be initialized before it can be used as a secret store, but its unseal keys need to be stored somewhere. Current approach stores them as a local JSON file with 0600 permissions. 
+ +Industry solutions to evaluate: +- Upstream OpenBao/Vault storing downstream seal keys (transit auto-unseal) +- HSM-backed auto-unseal (cloud KMS or on-prem HSM) +- TPM-based local encryption +- Shamir-split recovery with multiple administrators +- TOTP-based vault (mentioned in review) + +No perfect solution exists. This requires threat modeling specific to the decentralized micro-datacenter use case. + +### 12.5 Use `vaultrs` crate for type-safe OpenBao provisioning + +**Priority**: MEDIUM +**Status**: Not started +**Related**: Phase 9 + +Replace `kubectl exec bao ...` shell commands in `openbao/setup.rs` with typed `vaultrs` API calls. The `vaultrs` 0.7.4 crate (already a dependency in `harmony_secret`) provides full coverage: + +| Current shell command | vaultrs equivalent | +|---|---| +| `bao operator init` | `vaultrs::sys::start_initialization()` | +| `bao operator unseal` | `vaultrs::sys::unseal()` | +| `bao secrets enable kv-v2` | `vaultrs::sys::mount::enable()` | +| `bao auth enable userpass` | `vaultrs::sys::auth::enable()` | +| `bao policy write` | `vaultrs::sys::policy::set()` | +| `bao write auth/userpass/users/...` | `vaultrs::auth::userpass::user::set()` | +| `bao auth enable jwt` | `vaultrs::sys::auth::enable()` | +| JWT config + role | `vaultrs::auth::oidc::config::set()` + `role::set()` | + +**Prerequisite**: Requires port-forward or ingress to OpenBao (currently uses `kubectl exec` into the pod). Consider adding a `K8sPortForward` utility to `harmony-k8s`. + +### 12.6 Topology proliferation — opinionated topologies leaking into narrow use cases + +**Priority**: MEDIUM +**Status**: Not started +**Related**: 12.1 (phased topology), `feat/install-reconcile-operator-score` + +`K8sAnywhereTopology` and `HAClusterTopology` have accumulated opinions — cert-manager install, tenant manager setup, helm probes, TLS passthrough, SSO wiring — that make them unfit for narrow, ad-hoc Score execution. 
Calling `ensure_ready()` on `K8sAnywhereTopology` to apply a single CRD installs a full product stack as a side effect; that's the opposite of what "make me ready" should mean. + +Concrete example: `iot/iot-operator-v0/src/install.rs` needed a topology that satisfies `K8sclient` for a single `K8sResourceScore::<CustomResourceDefinition>` apply. `K8sAnywhereTopology` was wrong (too heavy); `HAClusterTopology` was wrong (bare-metal). Work-around: a 30-line inline `InstallTopology` that wraps a pre-built `K8sClient` and has a noop `ensure_ready`. That file flags the architectural smell in its doc comment and points back to this entry. + +If every narrow Score ends up vendoring its own ad-hoc topology, we get exactly the proliferation this entry is meant to prevent. + +**Design direction (to be refined, not prescribed):** + +- A **minimal ad-hoc topology** in harmony — `K8sBareTopology` or similar — that carries a `K8sClient` and implements `K8sclient` + noop `ensure_ready`. One screen of code. Consumed by any Score that just needs to apply a typed resource against an existing cluster. +- Existing opinionated topologies (`K8sAnywhereTopology`) stay, but grow a clear doctrine: `ensure_ready` is for *their* product setup, callers who don't need that product use the bare topology. +- Longer-term: unbundle the product-setup logic from `K8sAnywhereTopology::ensure_ready` into discrete Scores the product compositions explicitly run — so the distinction between "I'm installing a cluster" and "I'm using a cluster" is a composition choice, not a topology choice. + +**What "good" looks like:** + +- Adding a new ad-hoc Score against k8s doesn't require inventing a new topology. +- `K8sAnywhereTopology` stops being the default reach and starts being a deliberate product choice. +- Test: can we delete the inline `InstallTopology` in `iot/iot-operator-v0/src/install.rs` by replacing it with a one-liner `K8sBareTopology::from_env()`? That's the smoke test for "we fixed the proliferation." 
diff --git a/iot/iot-operator-v0/Cargo.toml b/iot/iot-operator-v0/Cargo.toml index 0ebafed7..bf140170 100644 --- a/iot/iot-operator-v0/Cargo.toml +++ b/iot/iot-operator-v0/Cargo.toml @@ -5,13 +5,15 @@ edition = "2024" rust-version = "1.85" [dependencies] +harmony = { path = "../../harmony" } +harmony-k8s = { path = "../../harmony-k8s" } harmony-reconciler-contracts = { path = "../../harmony-reconciler-contracts" } +async-trait.workspace = true kube = { workspace = true, features = ["runtime", "derive"] } k8s-openapi.workspace = true async-nats = { workspace = true } serde.workspace = true serde_json.workspace = true -serde_yaml.workspace = true schemars = "0.8.22" tokio.workspace = true tracing = { workspace = true } diff --git a/iot/iot-operator-v0/deploy/crd.yaml b/iot/iot-operator-v0/deploy/crd.yaml deleted file mode 100644 index c713cef1..00000000 --- a/iot/iot-operator-v0/deploy/crd.yaml +++ /dev/null @@ -1,71 +0,0 @@ -apiVersion: apiextensions.k8s.io/v1 -kind: CustomResourceDefinition -metadata: - name: deployments.iot.nationtech.io -spec: - group: iot.nationtech.io - names: - categories: [] - kind: Deployment - plural: deployments - shortNames: - - iotdep - singular: deployment - scope: Namespaced - versions: - - additionalPrinterColumns: [] - name: v1alpha1 - schema: - openAPIV3Schema: - description: Auto-generated derived type for DeploymentSpec via `CustomResource` - properties: - spec: - properties: - rollout: - properties: - strategy: - enum: - - Immediate - type: string - required: - - strategy - type: object - score: - properties: - data: - x-kubernetes-preserve-unknown-fields: true - type: - minLength: 1 - type: string - required: - - data - - type - type: object - x-kubernetes-validations: - - message: score.type must be a valid Rust identifier matching the struct name of the score variant (e.g. 
PodmanV0) - rule: self.type.matches('^[A-Za-z_][A-Za-z0-9_]*$') - targetDevices: - items: - type: string - type: array - required: - - rollout - - score - - targetDevices - type: object - status: - nullable: true - properties: - observedScoreString: - nullable: true - type: string - type: object - required: - - spec - title: Deployment - type: object - served: true - storage: true - subresources: - status: {} - diff --git a/iot/iot-operator-v0/deploy/operator.yaml b/iot/iot-operator-v0/deploy/operator.yaml deleted file mode 100644 index aae6f6d3..00000000 --- a/iot/iot-operator-v0/deploy/operator.yaml +++ /dev/null @@ -1,75 +0,0 @@ -apiVersion: v1 -kind: Namespace -metadata: - name: iot-system ---- -apiVersion: v1 -kind: ServiceAccount -metadata: - name: iot-operator - namespace: iot-system ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: iot-operator -rules: - - apiGroups: ["iot.nationtech.io"] - resources: ["deployments"] - verbs: ["get", "list", "watch", "patch", "update"] - - apiGroups: ["iot.nationtech.io"] - resources: ["deployments/status"] - verbs: ["get", "patch", "update"] - - apiGroups: ["iot.nationtech.io"] - resources: ["deployments/finalizers"] - verbs: ["update"] ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: iot-operator -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: iot-operator -subjects: - - kind: ServiceAccount - name: iot-operator - namespace: iot-system ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: iot-operator - namespace: iot-system - labels: - app.kubernetes.io/name: iot-operator -spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/name: iot-operator - template: - metadata: - labels: - app.kubernetes.io/name: iot-operator - spec: - serviceAccountName: iot-operator - containers: - - name: operator - image: ghcr.io/nationtech/iot-operator-v0:latest - imagePullPolicy: IfNotPresent - env: - - name: 
NATS_URL - value: nats://nats.iot-system.svc.cluster.local:4222 - - name: KV_BUCKET - value: desired-state - - name: RUST_LOG - value: info - resources: - requests: - cpu: 50m - memory: 64Mi - limits: - cpu: 500m - memory: 256Mi diff --git a/iot/iot-operator-v0/src/install.rs b/iot/iot-operator-v0/src/install.rs new file mode 100644 index 00000000..48bead63 --- /dev/null +++ b/iot/iot-operator-v0/src/install.rs @@ -0,0 +1,96 @@ +//! Install the operator's CRD into a target Kubernetes cluster +//! via a harmony Score — no yaml generation, no kubectl shell-out. +//! +//! The Score side is just [`K8sResourceScore`] over +//! [`Deployment::crd()`]; what this module owns is a thin +//! [`InstallTopology`] that satisfies `K8sclient` by loading the +//! current `KUBECONFIG` directly. We don't use +//! [`K8sAnywhereTopology`] because its `ensure_ready` does a lot of +//! product-level setup (cert-manager, tenant manager, helm probes) +//! that isn't appropriate for a narrow "apply a CRD" action. + +use std::sync::Arc; + +use anyhow::{Context, Result}; +use async_trait::async_trait; +use harmony::inventory::Inventory; +use harmony::modules::k8s::resource::K8sResourceScore; +use harmony::score::Score; +use harmony::topology::{K8sclient, PreparationOutcome, Topology}; +use harmony_k8s::K8sClient; +use k8s_openapi::apiextensions_apiserver::pkg::apis::apiextensions::v1::CustomResourceDefinition; +use kube::CustomResourceExt; + +use crate::crd::Deployment; + +/// Topology that only knows how to hand out a pre-built `K8sClient`. +/// Used by [`install_crds`] so the Score machinery has something +/// that satisfies `K8sclient` without dragging in the full +/// `K8sAnywhereTopology` bootstrap. 
+/// +/// # Architectural smell — do not copy this pattern without reading the roadmap +/// +/// Vendoring an ad-hoc `Topology` impl in a module that just wants to +/// apply a CRD is a symptom of a bigger problem: the existing +/// opinionated topologies (`K8sAnywhereTopology`, `HAClusterTopology`) +/// have accumulated product-level side effects in their `ensure_ready` +/// — cert-manager install, tenant manager setup, helm probes — that +/// make them unfit for narrow actions. The correct long-term fix is a +/// minimal reusable `K8sBareTopology` in harmony that carries a +/// `K8sClient` and exposes `K8sclient` with a noop `ensure_ready`, so +/// every narrow Score isn't tempted to vendor its own copy. +/// +/// See `ROADMAP/12-code-review-april-2026.md` §12.6 "Topology +/// proliferation". The explicit smoke test for "that roadmap item is +/// done" is: this file can delete `InstallTopology` and replace +/// `topology` construction with a one-liner against the shared type. +struct InstallTopology { + client: Arc, +} + +#[async_trait] +impl Topology for InstallTopology { + fn name(&self) -> &str { + "iot-operator-install" + } + async fn ensure_ready( + &self, + ) -> Result { + Ok(PreparationOutcome::Noop) + } +} + +#[async_trait] +impl K8sclient for InstallTopology { + async fn k8s_client(&self) -> Result, String> { + Ok(self.client.clone()) + } +} + +/// Apply the operator's CRDs to whatever cluster `KUBECONFIG` points +/// at. Returns once the apply call completes — does **not** wait for +/// the apiserver to mark the CRD `Established`; the caller does that +/// (e.g. with `kubectl wait --for=condition=Established`) if it +/// cares. 
+pub async fn install_crds() -> Result<()> { + let kube_client = kube::Client::try_default() + .await + .context("building kube client from KUBECONFIG")?; + let topology = InstallTopology { + client: Arc::new(K8sClient::new(kube_client)), + }; + let inventory = Inventory::empty(); + + let crd: CustomResourceDefinition = Deployment::crd(); + let score = K8sResourceScore::::single(crd, None); + + let interpret = Score::::create_interpret(&score); + let outcome = interpret + .execute(&inventory, &topology) + .await + .map_err(|e| anyhow::anyhow!("install CRD: {e}")) + .context("executing K8sResourceScore for Deployment CRD")?; + + tracing::info!(?outcome, "CRD installed"); + Ok(()) +} diff --git a/iot/iot-operator-v0/src/main.rs b/iot/iot-operator-v0/src/main.rs index 230a639f..f62983e5 100644 --- a/iot/iot-operator-v0/src/main.rs +++ b/iot/iot-operator-v0/src/main.rs @@ -1,13 +1,12 @@ mod controller; mod crd; +mod install; use anyhow::Result; use async_nats::jetstream; use clap::{Parser, Subcommand}; use harmony_reconciler_contracts::BUCKET_DESIRED_STATE; -use kube::{Client, CustomResourceExt}; - -use crate::crd::Deployment; +use kube::Client; #[derive(Parser)] #[command( @@ -39,8 +38,9 @@ struct Cli { enum Command { /// Run the controller (default when no subcommand is given). Run, - /// Print the Deployment CRD as YAML. - GenCrd, + /// Apply the operator's CRD to the cluster `KUBECONFIG` points + /// at. Uses harmony's typed k8s client — no yaml, no kubectl. 
+ Install, } #[tokio::main] @@ -51,10 +51,7 @@ async fn main() -> Result<()> { let cli = Cli::parse(); match cli.command.unwrap_or(Command::Run) { - Command::GenCrd => { - println!("{}", serde_yaml::to_string(&Deployment::crd())?); - Ok(()) - } + Command::Install => install::install_crds().await, Command::Run => run(&cli.nats_url, &cli.kv_bucket).await, } } diff --git a/iot/scripts/smoke-a1.sh b/iot/scripts/smoke-a1.sh index 9b5edbfe..5b8d60f8 100755 --- a/iot/scripts/smoke-a1.sh +++ b/iot/scripts/smoke-a1.sh @@ -130,8 +130,8 @@ KUBECONFIG_FILE="$(mktemp -t iot-smoke-kubeconfig.XXXXXX)" "$K3D_BIN" kubeconfig get "$CLUSTER_NAME" > "$KUBECONFIG_FILE" export KUBECONFIG="$KUBECONFIG_FILE" -log "generate + apply CRD" -( cd "$OPERATOR_DIR" && cargo run -q -- gen-crd ) | kubectl apply -f - >/dev/null +log "install CRD via operator's install subcommand (typed Rust — no yaml, no kubectl apply)" +( cd "$OPERATOR_DIR" && cargo run -q -- install ) >/dev/null kubectl wait --for=condition=Established "crd/deployments.iot.nationtech.io" --timeout=30s >/dev/null kubectl get ns "$DEPLOY_NS" >/dev/null 2>&1 || kubectl create namespace "$DEPLOY_NS" >/dev/null