From 8e6e1fa1bc17ca98497cd325df55218711991756 Mon Sep 17 00:00:00 2001 From: Jean-Gabriel Gill-Couture Date: Wed, 20 May 2026 21:49:51 -0400 Subject: [PATCH 1/3] feat: fleet e2e x86 vm support --- fleet/harmony-fleet-e2e/README.md | 1 + fleet/harmony-fleet-e2e/src/vm/agent_build.rs | 117 +++++++++++++----- fleet/harmony-fleet-e2e/src/vm/mod.rs | 9 +- fleet/harmony-fleet-e2e/src/vm/stack.rs | 33 ++++- .../tests/vm_deploy_lifecycle.rs | 2 +- fleet/harmony-fleet-e2e/tests/vm_isolation.rs | 2 +- fleet/harmony-fleet-e2e/tests/vm_ping.rs | 2 +- 7 files changed, 127 insertions(+), 39 deletions(-) diff --git a/fleet/harmony-fleet-e2e/README.md b/fleet/harmony-fleet-e2e/README.md index 7b17027a..d3e596d9 100644 --- a/fleet/harmony-fleet-e2e/README.md +++ b/fleet/harmony-fleet-e2e/README.md @@ -45,6 +45,7 @@ Every test in this crate is gated so `cargo test --workspace` stays cheap. | `HARMONY_FLEET_E2E=1` | Enable the Pod-target test (`ping.rs`). Needs k3d + podman on PATH. | | `HARMONY_FLEET_VM_E2E=1` | Enable the VM-target tests (`vm_*`). Needs libvirt + qemu + aarch64 cross-toolchain. | | `FLEET_E2E_KEEP=1` | Leave the k8s namespace + libvirt VM in place on test exit (debug). | +| `FLEET_E2E_VM_ARCH=x86_64` | Boot an x86_64 KVM guest instead of an aarch64 TCG guest. Default `aarch64` (production target). x86 runs ~3-4× faster — useful for iteration. | | `RUST_LOG=...` | Standard tracing filter; default is `info`. | ## Running tests diff --git a/fleet/harmony-fleet-e2e/src/vm/agent_build.rs b/fleet/harmony-fleet-e2e/src/vm/agent_build.rs index dcef2254..a075dbaf 100644 --- a/fleet/harmony-fleet-e2e/src/vm/agent_build.rs +++ b/fleet/harmony-fleet-e2e/src/vm/agent_build.rs @@ -1,26 +1,36 @@ -//! Cross-build the fleet agent binary for an aarch64 Linux guest. +//! Build the fleet agent binary for a target VM architecture. //! -//! Mirrors `fleet/scripts/smoke-a3-arm.sh` phase 2 in Rust: ensure -//! the `aarch64-unknown-linux-gnu` rustup target is installed, then -//! 
`cargo build --release --target aarch64-unknown-linux-gnu -p -//! harmony-fleet-agent`. Returns the path to the resulting binary -//! so `FleetDeviceSetupScore` can upload it. +//! Two paths: //! -//! Prereq the harness intentionally does **not** install for the -//! operator: a working aarch64 GNU cross-toolchain on the host -//! (Arch: `aarch64-linux-gnu-gcc`; Debian/Ubuntu: -//! `gcc-aarch64-linux-gnu`). Without it, `cargo build` fails with -//! a link error we surface verbatim. +//! - **aarch64** — cross-build via `cargo build --release --target +//! aarch64-unknown-linux-gnu -p harmony-fleet-agent`. Requires the +//! `aarch64-unknown-linux-gnu` rustup target *and* a GNU cross-linker +//! on the host (Arch: `aarch64-linux-gnu-gcc`; Debian/Ubuntu: +//! `gcc-aarch64-linux-gnu`). Mirrors `fleet/scripts/smoke-a3-arm.sh` +//! phase 2. +//! - **x86_64** — native host build via `cargo build --release -p +//! harmony-fleet-agent`. No `--target`, no rustup add, no +//! cross-linker. The same binary the Pod-target path consumes, +//! reused here for the faster-but-non-Pi VM smoke. +//! +//! The aarch64 path matches the production Raspberry Pi target byte +//! for byte; the x86_64 path is for fast-iteration tests where the +//! arch difference doesn't matter. use std::path::{Path, PathBuf}; use std::process::Stdio; +use harmony::topology::VmArchitecture; use thiserror::Error; use tokio::process::Command; -/// Rust target triple used for the on-VM agent. aarch64-Linux-GNU -/// matches the Ubuntu 24.04 cloud image the harness boots. -pub const AGENT_TARGET_TRIPLE: &str = "aarch64-unknown-linux-gnu"; +/// Rust target triple for the aarch64 cross-build. +pub const AGENT_AARCH64_TARGET_TRIPLE: &str = "aarch64-unknown-linux-gnu"; + +/// Back-compat re-export — older code references the aarch64 triple +/// under this name. 
+#[deprecated(note = "use AGENT_AARCH64_TARGET_TRIPLE")] +pub const AGENT_TARGET_TRIPLE: &str = AGENT_AARCH64_TARGET_TRIPLE; #[derive(Debug, Error)] pub enum AgentBuildError { @@ -30,24 +40,36 @@ pub enum AgentBuildError { #[source] source: std::io::Error, }, - #[error("`rustup target add {AGENT_TARGET_TRIPLE}` failed (rc={rc}): {stderr}")] + #[error("`rustup target add {AGENT_AARCH64_TARGET_TRIPLE}` failed (rc={rc}): {stderr}")] RustupAdd { rc: i32, stderr: String }, #[error( - "`cargo build` for harmony-fleet-agent (target {AGENT_TARGET_TRIPLE}) failed (rc={rc}). \ - The most common cause is a missing aarch64 GNU cross-linker — install one (Arch: \ - `aarch64-linux-gnu-gcc`; Debian/Ubuntu: `gcc-aarch64-linux-gnu`) and re-run." + "`cargo build` for harmony-fleet-agent (target {target}) failed (rc={rc}). \ + For the aarch64 cross-build, the most common cause is a missing GNU cross-linker \ + (Arch: `aarch64-linux-gnu-gcc`; Debian/Ubuntu: `gcc-aarch64-linux-gnu`)." )] - CargoBuild { rc: i32 }, + CargoBuild { target: String, rc: i32 }, #[error("agent binary not produced at expected path {path}")] MissingArtifact { path: String }, } -/// Build (or rebuild, cargo-cached) the aarch64 agent binary and -/// return its on-disk path. Cheap on warm cache; first run is the -/// expensive one. +/// Build the fleet agent for the requested guest architecture and +/// return its on-disk path. Routes to the arch-specific builder. +pub async fn build_agent_for( + arch: VmArchitecture, + workspace_root: &Path, +) -> Result { + match arch { + VmArchitecture::Aarch64 => build_agent_for_aarch64(workspace_root).await, + VmArchitecture::X86_64 => build_agent_for_x86_64(workspace_root).await, + } +} + +/// Cross-build for aarch64-Linux-GNU. The on-disk path lives under +/// `target/aarch64-unknown-linux-gnu/release/` so it doesn't collide +/// with the host's native build. 
pub async fn build_agent_for_aarch64(workspace_root: &Path) -> Result { let rustup = Command::new("rustup") - .args(["target", "add", AGENT_TARGET_TRIPLE]) + .args(["target", "add", AGENT_AARCH64_TARGET_TRIPLE]) .stdout(Stdio::null()) .stderr(Stdio::piped()) .output() @@ -64,22 +86,19 @@ pub async fn build_agent_for_aarch64(workspace_root: &Path) -> Result Result Result { + tracing::info!("cargo build --release -p harmony-fleet-agent (native x86_64)"); + let build = Command::new("cargo") + .args(["build", "--release", "-p", "harmony-fleet-agent"]) + .current_dir(workspace_root) + .stderr(Stdio::inherit()) + .stdout(Stdio::inherit()) + .status() + .await + .map_err(|source| AgentBuildError::Spawn { + cmd: "cargo".to_string(), + source, + })?; + if !build.success() { + return Err(AgentBuildError::CargoBuild { + target: "x86_64-unknown-linux-gnu (native)".to_string(), rc: build.code().unwrap_or(-1), }); } let bin = workspace_root .join("target") - .join(AGENT_TARGET_TRIPLE) .join("release") .join("harmony-fleet-agent"); if !bin.exists() { diff --git a/fleet/harmony-fleet-e2e/src/vm/mod.rs b/fleet/harmony-fleet-e2e/src/vm/mod.rs index efe2646e..28fd03df 100644 --- a/fleet/harmony-fleet-e2e/src/vm/mod.rs +++ b/fleet/harmony-fleet-e2e/src/vm/mod.rs @@ -22,10 +22,13 @@ pub mod device; pub mod network; pub mod stack; -pub use agent_build::{AGENT_TARGET_TRIPLE, AgentBuildError, build_agent_for_aarch64}; +pub use agent_build::{ + AGENT_AARCH64_TARGET_TRIPLE, AgentBuildError, build_agent_for, build_agent_for_aarch64, + build_agent_for_x86_64, +}; pub use device::{VmDevice, VmDeviceError, VmDeviceOptions}; pub use network::{NetworkLookupError, libvirt_default_gateway_ip}; pub use stack::{ - LIBVIRT_NETWORK, LIBVIRT_URI, VM_NAME_PREFIX, VmBringUpError, VmReadyError, VmStack, - VmStackOptions, shared_vm_stack, + ENV_VM_ARCH, LIBVIRT_NETWORK, LIBVIRT_URI, VM_NAME_PREFIX, VmBringUpError, VmReadyError, + VmStack, VmStackOptions, shared_vm_stack, }; diff --git 
a/fleet/harmony-fleet-e2e/src/vm/stack.rs b/fleet/harmony-fleet-e2e/src/vm/stack.rs index e801eda6..d4f98dda 100644 --- a/fleet/harmony-fleet-e2e/src/vm/stack.rs +++ b/fleet/harmony-fleet-e2e/src/vm/stack.rs @@ -27,7 +27,7 @@ use tokio::sync::OnceCell; use uuid::Uuid; use crate::stack::{BringUpError, NATS_NODE_PORT, Stack, StackOptions, shared_stack}; -use crate::vm::agent_build::{AgentBuildError, build_agent_for_aarch64}; +use crate::vm::agent_build::{AgentBuildError, build_agent_for}; use crate::vm::device::{VmDevice, VmDeviceError, VmDeviceOptions}; use crate::vm::network::{NetworkLookupError, libvirt_default_gateway_ip}; @@ -82,6 +82,31 @@ impl Default for VmStackOptions { } } +/// Env var that lets tests pick a guest arch at runtime without a +/// recompile. Accepts `aarch64`/`arm64` or `x86_64`/`x86-64`/`x86`/`amd64` +/// (case-insensitive). Unset = defaults to aarch64 (production target). +pub const ENV_VM_ARCH: &str = "FLEET_E2E_VM_ARCH"; + +impl VmStackOptions { + /// Read env overrides (today: just [`ENV_VM_ARCH`]) and apply + /// them on top of [`Default`]. Returns the canonical "what the + /// test asked for" struct, so tests don't have to re-implement + /// env parsing. Panics if the variable holds an unrecognized value. + pub fn from_env() -> Self { + let mut opts = Self::default(); + if let Ok(raw) = std::env::var(ENV_VM_ARCH) { + match raw.to_ascii_lowercase().as_str() { + "aarch64" | "arm64" => opts.arch = VmArchitecture::Aarch64, + "x86_64" | "x86-64" | "x86" | "amd64" => opts.arch = VmArchitecture::X86_64, + other => panic!( + "{ENV_VM_ARCH}={other:?} not recognized — use aarch64 or x86_64", + ), + } + } + opts + } +} + #[derive(Debug, Error)] pub enum VmBringUpError { #[error("infra bring-up: {0}")] @@ -154,9 +179,11 @@ impl VmStack { // place. let infra = shared_stack(StackOptions::infra_only()).await?; - // 2. Cross-build the aarch64 agent binary once for all VMs. + // 2. Build the agent binary for the requested guest arch. + // aarch64 cross-builds; x86_64 takes the host's native + // output.
let workspace_root = workspace_root_from_env(); - let agent_binary = build_agent_for_aarch64(&workspace_root).await?; + let agent_binary = build_agent_for(opts.arch, &workspace_root).await?; // 3. Discover the libvirt gateway IP so the VM can reach // the host's NATS NodePort. diff --git a/fleet/harmony-fleet-e2e/tests/vm_deploy_lifecycle.rs b/fleet/harmony-fleet-e2e/tests/vm_deploy_lifecycle.rs index f6b7cb36..c27fd552 100644 --- a/fleet/harmony-fleet-e2e/tests/vm_deploy_lifecycle.rs +++ b/fleet/harmony-fleet-e2e/tests/vm_deploy_lifecycle.rs @@ -51,7 +51,7 @@ async fn vm_agent_drives_full_deploy_lifecycle() -> anyhow::Result<()> { ) .try_init(); - let stack = shared_vm_stack(VmStackOptions::default()).await?; + let stack = shared_vm_stack(VmStackOptions::from_env()).await?; stack.print_debug_info(); stack.wait_until_ready(Duration::from_secs(60)).await?; diff --git a/fleet/harmony-fleet-e2e/tests/vm_isolation.rs b/fleet/harmony-fleet-e2e/tests/vm_isolation.rs index 4c3502bd..88154fa1 100644 --- a/fleet/harmony-fleet-e2e/tests/vm_isolation.rs +++ b/fleet/harmony-fleet-e2e/tests/vm_isolation.rs @@ -50,7 +50,7 @@ async fn agent_ignores_other_devices_keys() -> anyhow::Result<()> { ) .try_init(); - let stack = shared_vm_stack(VmStackOptions::default()).await?; + let stack = shared_vm_stack(VmStackOptions::from_env()).await?; stack.print_debug_info(); stack.wait_until_ready(Duration::from_secs(60)).await?; diff --git a/fleet/harmony-fleet-e2e/tests/vm_ping.rs b/fleet/harmony-fleet-e2e/tests/vm_ping.rs index 70c941b9..5a204341 100644 --- a/fleet/harmony-fleet-e2e/tests/vm_ping.rs +++ b/fleet/harmony-fleet-e2e/tests/vm_ping.rs @@ -37,7 +37,7 @@ async fn agent_on_vm_replies_to_ping() -> anyhow::Result<()> { ) .try_init(); - let stack = shared_vm_stack(VmStackOptions::default()).await?; + let stack = shared_vm_stack(VmStackOptions::from_env()).await?; stack.print_debug_info(); // `FleetDeviceSetupScore` returns when the systemd unit is -- 2.39.5 From 
ba685baddbc2f4027572e574d3ed5ac2a7cd313e Mon Sep 17 00:00:00 2001 From: Jean-Gabriel Gill-Couture Date: Wed, 20 May 2026 22:47:52 -0400 Subject: [PATCH 2/3] doc: fleet e2e x86 arch support --- fleet/harmony-fleet-e2e/README.md | 77 ++++++++++++++----- fleet/harmony-fleet-e2e/src/vm/agent_build.rs | 5 -- fleet/harmony-fleet-e2e/src/vm/stack.rs | 6 +- 3 files changed, 61 insertions(+), 27 deletions(-) diff --git a/fleet/harmony-fleet-e2e/README.md b/fleet/harmony-fleet-e2e/README.md index d3e596d9..c2243f86 100644 --- a/fleet/harmony-fleet-e2e/README.md +++ b/fleet/harmony-fleet-e2e/README.md @@ -23,7 +23,7 @@ src/ └── vm/ # VM-target harness ├── stack.rs # VmStack = infra Stack + Vec ├── device.rs # one libvirt VM: ProvisionVmScore + FleetDeviceSetupScore - ├── agent_build.rs # cross-build the agent for aarch64-unknown-linux-gnu + ├── agent_build.rs # build the agent for the requested guest arch (aarch64 cross / x86_64 native) └── network.rs # libvirt default-network gateway IP discovery ``` @@ -32,9 +32,9 @@ Tests in `tests/` map 1:1 to scenarios: | File | What it asserts | Cost | |---|---|---| | `ping.rs` | Pod agent replies to `Verb::Ping` over NATS | ~30 s (k3d + image build) | -| `vm_ping.rs` | VM agent replies to `Verb::Ping` over NATS | aarch64 VM bring-up | -| `vm_isolation.rs` | VM agent does NOT react to another device's KV key | shared VM | -| `vm_deploy_lifecycle.rs` | deploy → upgrade → delete podman deployment, KV phases + `podman ps` ground truth | shared VM + image pulls | +| `vm_ping.rs` | VM agent replies to `Verb::Ping` over NATS | ~75 s (x86 KVM) / ~7 min (aarch64 TCG) | +| `vm_isolation.rs` | VM agent does NOT react to another device's KV key | ~75 s (x86 KVM) / ~8 min (aarch64 TCG) | +| `vm_deploy_lifecycle.rs` | deploy → upgrade → delete podman deployment, KV phases + `podman ps` ground truth | ~90 s (x86 KVM) / ~7-8 min (aarch64 TCG) | ## Env gates @@ -43,7 +43,7 @@ Every test in this crate is gated so `cargo test --workspace` stays cheap. 
| Var | Purpose | |---|---| | `HARMONY_FLEET_E2E=1` | Enable the Pod-target test (`ping.rs`). Needs k3d + podman on PATH. | -| `HARMONY_FLEET_VM_E2E=1` | Enable the VM-target tests (`vm_*`). Needs libvirt + qemu + aarch64 cross-toolchain. | +| `HARMONY_FLEET_VM_E2E=1` | Enable the VM-target tests (`vm_*`). Needs libvirt + qemu (+ aarch64 cross-toolchain when running the default arch). | | `FLEET_E2E_KEEP=1` | Leave the k8s namespace + libvirt VM in place on test exit (debug). | | `FLEET_E2E_VM_ARCH=x86_64` | Boot an x86_64 KVM guest instead of an aarch64 TCG guest. Default `aarch64` (production target). x86 runs ~3-4× faster — useful for iteration. | | `RUST_LOG=...` | Standard tracing filter; default is `info`. | @@ -56,25 +56,69 @@ Every test in this crate is gated so `cargo test --workspace` stays cheap. HARMONY_FLEET_E2E=1 cargo test -p harmony-fleet-e2e --test ping -- --nocapture ``` -### VM-target (expensive, real podman + aarch64 boot) +### VM-target — pick aarch64 (prod parity) or x86_64 (fast iteration) + +The same three tests run against either guest arch — flip +`FLEET_E2E_VM_ARCH`. Defaults to `aarch64` (Raspberry Pi target). + +| Path | Guest CPU | Wall-clock for `vm_ping` (warm caches) | Use when | +|---|---|---|---| +| `FLEET_E2E_VM_ARCH=x86_64` | native KVM | **~75 s** | dev iteration loop | +| (default, `aarch64`) | qemu TCG emulation | **~7 min** | pre-push / CI / arch-drift catch | + +CI **must** run aarch64 — even though x86 covers the logic, a new +crate dep with a broken aarch64 build or a podman call that segfaults +under TCG will only surface on the real target. ```bash -# One scenario at a time. Each test binary brings up its own VM -# (cargo runs each integration test file as a separate binary, so the -# per-binary `shared_vm_stack` OnceCell does not amortize across binaries). 
-HARMONY_FLEET_VM_E2E=1 RUST_LOG=info cargo test -p harmony-fleet-e2e --test vm_ping -- --nocapture -HARMONY_FLEET_VM_E2E=1 RUST_LOG=info cargo test -p harmony-fleet-e2e --test vm_isolation -- --nocapture -HARMONY_FLEET_VM_E2E=1 RUST_LOG=info cargo test -p harmony-fleet-e2e --test vm_deploy_lifecycle -- --nocapture +# ---- dev iteration loop (x86_64 KVM, ~3× faster end-to-end) ---- +HARMONY_FLEET_VM_E2E=1 FLEET_E2E_VM_ARCH=x86_64 RUST_LOG=info \ + cargo test -p harmony-fleet-e2e --test vm_ping -- --nocapture +HARMONY_FLEET_VM_E2E=1 FLEET_E2E_VM_ARCH=x86_64 RUST_LOG=info \ + cargo test -p harmony-fleet-e2e --test vm_isolation -- --nocapture +HARMONY_FLEET_VM_E2E=1 FLEET_E2E_VM_ARCH=x86_64 RUST_LOG=info \ + cargo test -p harmony-fleet-e2e --test vm_deploy_lifecycle -- --nocapture -# All three sequentially: -HARMONY_FLEET_VM_E2E=1 RUST_LOG=info cargo test -p harmony-fleet-e2e \ +# ---- pre-push / CI (aarch64 — production target) ---- +HARMONY_FLEET_VM_E2E=1 RUST_LOG=info \ + cargo test -p harmony-fleet-e2e --test vm_ping -- --nocapture +HARMONY_FLEET_VM_E2E=1 RUST_LOG=info \ + cargo test -p harmony-fleet-e2e --test vm_isolation -- --nocapture +HARMONY_FLEET_VM_E2E=1 RUST_LOG=info \ + cargo test -p harmony-fleet-e2e --test vm_deploy_lifecycle -- --nocapture + +# ---- all three sequentially (each is a separate binary → its own VM bring-up) ---- +HARMONY_FLEET_VM_E2E=1 FLEET_E2E_VM_ARCH=x86_64 RUST_LOG=info cargo test -p harmony-fleet-e2e \ --test vm_ping --test vm_isolation --test vm_deploy_lifecycle -- --nocapture --test-threads=1 -# Everything in the crate at once (skips disabled, runs enabled): +# ---- everything in the crate at once (pod + vm, gates honored per-test) ---- HARMONY_FLEET_E2E=1 HARMONY_FLEET_VM_E2E=1 RUST_LOG=info \ cargo test -p harmony-fleet-e2e -- --nocapture --test-threads=1 ``` +### Wall-clock breakdown (measured on this host) + +`vm_ping` from cold libvirt + cold cargo cache (one-time pain) to a +green test: + +| Step | aarch64 TCG | x86_64 KVM | 
Speedup | +|---|---|---|---| +| Agent build (cold) | 85 s (cross) | 72 s (native) | 1.2× | +| qemu start → DHCP | 48 s | 9 s | 5.3× | +| sshd accepts | 9 s | <1 s | ≥10× | +| Ansible Python detect | 15 s | 1 s | 15× | +| `apt install podman + systemd-container` | **261 s** | **23 s** | **11.3×** | +| FleetDeviceSetup steps 3-7 + restart | ~50 s | ~4 s | ~12× | +| `wait_until_ready` ping retry | ~2 s | <1 s | 2× | +| **Total test future (`finished in …s`)** | **440 s** | **149 s** | **2.95×** | + +The single biggest swing is `apt install podman` inside the guest: +4 min 21 s on TCG vs 23 s on KVM. The whole-test speedup is only +2.95× because the cold cargo build costs about the same on both +paths (~80 s either way); nearly all of the gain comes from the +in-guest work. **Warm-cache iteration is closer to 6× because the +cargo build vanishes.** + ### Debugging a failed bring-up ```bash @@ -139,6 +183,3 @@ bring-up. `FleetNatsScore::user_pass` mode. The Zitadel-JWT path is exercised by `examples/fleet_e2e_demo` (currently `#[ignore]`'d pending a CI runner with full bring-up capacity). -- **x86_64 VM bring-up.** Locked to aarch64 because that's the - production target. An x86_64 fast-path can be added by widening - `VmStackOptions::arch`; out of scope today. diff --git a/fleet/harmony-fleet-e2e/src/vm/agent_build.rs b/fleet/harmony-fleet-e2e/src/vm/agent_build.rs index a075dbaf..cd2b1bda 100644 --- a/fleet/harmony-fleet-e2e/src/vm/agent_build.rs +++ b/fleet/harmony-fleet-e2e/src/vm/agent_build.rs @@ -27,11 +27,6 @@ use tokio::process::Command; /// Rust target triple for the aarch64 cross-build. pub const AGENT_AARCH64_TARGET_TRIPLE: &str = "aarch64-unknown-linux-gnu"; -/// Back-compat re-export — older code references the aarch64 triple -/// under this name.
-#[deprecated(note = "use AGENT_AARCH64_TARGET_TRIPLE")] -pub const AGENT_TARGET_TRIPLE: &str = AGENT_AARCH64_TARGET_TRIPLE; - #[derive(Debug, Error)] pub enum AgentBuildError { #[error("spawn `{cmd}`: {source}")] diff --git a/fleet/harmony-fleet-e2e/src/vm/stack.rs b/fleet/harmony-fleet-e2e/src/vm/stack.rs index d4f98dda..c93e8938 100644 --- a/fleet/harmony-fleet-e2e/src/vm/stack.rs +++ b/fleet/harmony-fleet-e2e/src/vm/stack.rs @@ -98,9 +98,7 @@ impl VmStackOptions { match raw.to_ascii_lowercase().as_str() { "aarch64" | "arm64" => opts.arch = VmArchitecture::Aarch64, "x86_64" | "x86-64" | "x86" | "amd64" => opts.arch = VmArchitecture::X86_64, - other => panic!( - "{ENV_VM_ARCH}={other:?} not recognized — use aarch64 or x86_64", - ), + other => panic!("{ENV_VM_ARCH}={other:?} not recognized — use aarch64 or x86_64"), } } opts @@ -111,7 +109,7 @@ impl VmStackOptions { pub enum VmBringUpError { #[error("infra bring-up: {0}")] Infra(#[from] BringUpError), - #[error("aarch64 agent cross-build: {0}")] + #[error("agent build: {0}")] AgentBuild(#[from] AgentBuildError), #[error("libvirt gateway IP discovery: {0}")] GatewayIp(#[from] NetworkLookupError), -- 2.39.5 From 433a66dac247d3d2606db262246fd20e57154d04 Mon Sep 17 00:00:00 2001 From: Jean-Gabriel Gill-Couture Date: Fri, 22 May 2026 12:39:43 -0400 Subject: [PATCH 3/3] feat: fleet e2e x86 support --- ROADMAP/fleet_platform/pre_merge_checklist.md | 216 ++++++++---------- fleet/README.md | 6 + fleet/harmony-fleet-deploy/src/main.rs | 2 + fleet/harmony-fleet-deploy/src/nats.rs | 6 + 4 files changed, 114 insertions(+), 116 deletions(-) diff --git a/ROADMAP/fleet_platform/pre_merge_checklist.md b/ROADMAP/fleet_platform/pre_merge_checklist.md index 74bfd03c..1282c439 100644 --- a/ROADMAP/fleet_platform/pre_merge_checklist.md +++ b/ROADMAP/fleet_platform/pre_merge_checklist.md @@ -29,26 +29,33 @@ why the negative path is intentionally untested (inquire has no stdin mock; covering it would need a `Config` type with a manual 
non-prompting `InteractiveParseObj` impl — separate refactor). -### 1.2 — Manual end-to-end verification per fleet component +### 1.2 — End-to-end verification per fleet component -The user-stated bar: every component of the fleet stack deploys -reliably manually. Not yet a single automated suite. Run through -this matrix on a developer box with libvirt + k3d + podman -available. Mark date + initials when each row passes. +Rows the `harmony-fleet-e2e` crate now covers as automated tests: + +| Component | How to run | Status | +|---|---|---| +| Pod-target agent + NATS in k3d | `HARMONY_FLEET_E2E=1 cargo test -p harmony-fleet-e2e --test ping` | ✓ automated | +| ARM VM bring-up + agent (aarch64 cloud image, AAVMF firmware) | `HARMONY_FLEET_VM_E2E=1 cargo test -p harmony-fleet-e2e --test vm_ping` | ✓ automated | +| x86 VM bring-up + agent (KVM, fast path) | `HARMONY_FLEET_VM_E2E=1 FLEET_E2E_VM_ARCH=x86_64 cargo test … --test vm_ping` | ✓ automated | +| Device-setup over SSH (FleetDeviceSetupScore) | Exercised by every `vm_*` test bring-up | ✓ automated | +| Ping (operator → agent over NATS request/reply) | Both `ping` (Pod) and `vm_ping` (VM) | ✓ automated | +| Agent KV isolation (own filter only) | `vm_isolation` | ✓ automated | +| Podman deployment lifecycle (deploy → upgrade → delete) | `vm_deploy_lifecycle` (+ `podman ps` ground-truth via SSH) | ✓ automated | + +Verified at least once each on the dev host (aarch64 ~7 min, +x86_64 ~2.5 min); see `fleet/harmony-fleet-e2e/README.md` for +copy-paste commands and the wall-clock breakdown. 
+ +Rows still **manual** (no Rust automation yet — verify by hand +before merge and record date + initials): | Component | How to deploy | What "works" looks like | Owner | Last verified | |---|---|---|---|---| -| x86 VM (cloud-init Ubuntu) | `cargo run -p example_fleet_vm_setup` | `virsh list` shows running VM with SSH key trust | | | -| ARM VM (aarch64 + AAVMF firmware) | `cargo run -p example_fleet_vm_setup --features aarch64` (or `fleet/scripts/smoke-a3-arm.sh`) | aarch64 VM boots, fleet-agent comes up on it | | | | Zitadel (full setup) | `cargo run -p example_fleet_staging_install -- --base-domain <…>` | Zitadel admin UI reachable, persisted admin password set, IAM PAT secret created | | | | NATS + auth callout | `cargo run -p example_fleet_auth_callout` (deploy phase) | NATS pod running on k3d; callout pod healthy; JWKS fetch logs visible | | | | Operator | `cargo run -p example_fleet_server_install` | Operator pod up, Deployment CRD registered, NATS KV buckets created | | | -| Agent on x86 VM | follow `examples/fleet_e2e_demo/RUNBOOK.md` | Agent connects to NATS, publishes DeviceInfo to KV | | | -| Agent on ARM VM | same + arm64 target | same | | | | Enrollment via Zitadel SSO | `cargo run -p example-fleet-sso-login` + `fleet-device-enroll --device-id …` | Device JWT minted, machine user provisioned, agent connects with bearer-token JWT | | | -| Device-setup over SSH (FleetDeviceSetupScore) | from `examples/fleet_e2e_demo::apply_setup` flow | agent binary installed, systemd unit enabled, agent running | | | -| Ping (operator → agent over NATS request/reply) | `HARMONY_FLEET_E2E=1 cargo test -p harmony-fleet-e2e --test ping` | green test, ping round-trip | | | -| Podman deployment | apply a `Deployment` CRD with `PodmanV0Score` payload, watch agent reconcile | `podman ps` on the device shows the requested container | | | Outputs of each manual run go into a follow-up issue / PR description, not committed here — this matrix is the index, not @@ -64,45 +71,48 
@@ For each item below, the question is: **does the code on this branch honor the principle?** - **P1. Deploy with Scores, not handrolled manifests.** - - `fleet/harmony-fleet-e2e/src/stack.rs`: already cleaned in - the ADR-023 refactor. Re-confirm no `k8s_openapi::api::*` - structs survive in test/example code. - - `fleet/harmony-fleet-deploy/src/agent.rs`: builds - `Deployment` / `ConfigMap` / `Service` manually inside - `interpret`. **Technically** within ADR-023's letter (it's - inside a Score's interpret body) but is the right - abstraction to compose `K8sResourceScore` instead? - *Flagged for review.* + - `fleet/harmony-fleet-e2e/src/stack.rs` + `vm/*` confirmed + handroll-free: only `*Score` types are composed; the only + `k8s_openapi` use is the readiness-poll `Deployment` get + (cluster query, not a manifest build). + - `fleet/harmony-fleet-deploy/src/agent.rs` still builds + `Deployment` / `ConfigMap` manually inside `interpret`. ADR-023 + letter is honored (manifests are inside a Score's interpret + body, not in test/CLI code), so accepted for this branch. A + future cleanup could compose `K8sResourceScore` instead — + track in a follow-up issue, not a blocker. - **P2. E2E uses the same Scores as production.** - - `harmony-fleet-e2e` is the test of this. Confirm `stack.rs` - composes the same Scores as `example_fleet_server_install`. + - ✓ verified by both Pod (`stack.rs`) and VM (`vm/*.rs`) + harnesses — they compose `FleetNatsScore` + `FleetAgentScore` + + `ProvisionVmScore` + `FleetDeviceSetupScore` exactly as + `example_fleet_server_install` / `example_fleet_vm_setup` do. - **P3. One Score per deployable component.** - - `harmony/src/modules/fleet/setup_score.rs` is 1049 lines and - composes Zitadel + NATS + callout + operator. ADR-023 says - "composition is the user-facing primitive; don't build - monolithic deploy-everything Scores." Confirm this file is a - composition of primitives, not a megascore that bypasses - them. 
- - **The 3 open code review comments still apply** (see §3.1). + - `harmony/src/modules/fleet/setup_score.rs` (1049 lines) is a + *device-side composition* (podman + user + linger + config + + systemd unit), not a multi-service deploy. Acceptable under + P3; the file is on the deferred move-to-`*-deploy` list (§1.7 + ADR-024 scope). - **P4. Deploy returns only after smoke-test success.** - - This is *not* enforced today — see §3.2. Track as known - debt, not a merge blocker (ADR-023 left it open). + - Not enforced framework-wide; see §3.2. The e2e harness now + has `VmStack::wait_until_ready` (ping retry until subscribed) + as a per-test stand-in. Track as known debt, not a blocker. - **P5. Deploy logic lives in a `*-deploy` crate.** - - Confirm: `harmony-fleet-deploy` is the canonical home. The - `harmony/src/modules/fleet/` directory should shrink, not - grow, in follow-ups. ADR-024 proposes pulling more out. + - ✓ `harmony-fleet-deploy` is the canonical home. New + `companion/` module added there. The `harmony/src/modules/ + fleet/` directory should still shrink — see §1.7. - **P6. Topologies compile-time, selected at runtime.** - - No `Box` plugin loaders introduced. Confirm - with `rg 'Box.ping" "" ``` +Or if you don't want to install the nats binary : + +``` +alias natsbox='podman run --network=host --rm docker.io/natsio/nats-box:latest nats --server nats://localhost:30423 --user admin --password e2e-admin' +``` + You should see something like `{"device_id":"vm-device-00-","agent_version":"0.1.0","uptime_s":12}`. 
### Cleaning up diff --git a/fleet/harmony-fleet-deploy/src/main.rs b/fleet/harmony-fleet-deploy/src/main.rs index 62d40af2..fda170f0 100644 --- a/fleet/harmony-fleet-deploy/src/main.rs +++ b/fleet/harmony-fleet-deploy/src/main.rs @@ -29,6 +29,8 @@ use harmony_fleet_deploy::{FleetAgentScore, FleetNatsScore, FleetOperatorScore, name = "harmony-fleet-deploy", about = "Deploy the harmony fleet stack to a Kubernetes cluster" )] +// TODO: all env vars should be prefixed with HARMONY, and k8s namespaces should likewise +// begin with `harmony-`. struct CliConfig { /// Namespace every component lands in. Production override comes /// from `FLEET_NAMESPACE`. diff --git a/fleet/harmony-fleet-deploy/src/nats.rs b/fleet/harmony-fleet-deploy/src/nats.rs index 4e5347bb..3f2d2cbd 100644 --- a/fleet/harmony-fleet-deploy/src/nats.rs +++ b/fleet/harmony-fleet-deploy/src/nats.rs @@ -92,6 +92,12 @@ impl FleetNatsScore { /// callout. The defaults are deliberately weak (`admin/e2e-admin`, /// `device/e2e-device`); override with [`with_user_pass`]. pub fn user_pass(namespace: impl Into, node_port: u16) -> Self { + // TODO: this should be behind a feature flag; this code should not exist in the + // production build. + // + // Simpler alternative: hardcode the dev credentials in the e2e crate rather than in the + // deployment crate. The e2e crate can easily use the score and pass it the proper + // config, or use `.with_user_pass(...)`. Self { namespace: namespace.into(), release_name: "fleet-nats".to_string(), -- 2.39.5