harmony/fleet/harmony-fleet-e2e/tests/vm_deploy_lifecycle.rs

//! VM-side smoke test #3 — full deploy lifecycle: create, upgrade,
//! delete a podman deployment on the VM via the KV reconciler.
//!
//! Bundled into one test instead of three so the (very expensive)
//! VM bring-up and one-time podman image pull on the VM are
//! amortized across all three lifecycle assertions. Each phase
//! waits on the agent's authoritative `device-state` KV writes —
//! the operator reads the same signal — and ends with one
//! SSH-level ground-truth check (`podman ps`) so we don't trust
//! the agent's self-report alone.
//!
//! Gating: skipped unless `HARMONY_FLEET_VM_E2E` is set to `1` or `true`.

use std::time::Duration;

use harmony::modules::podman::{PodmanService, PodmanV0Score};
use harmony::topology::RestartPolicy;
use harmony_fleet_e2e::{AdminKv, PhaseExpectation, VmStackOptions, shared_vm_stack};
use harmony_reconciler_contracts::{DeploymentName, Phase};

/// Environment variable that opts in to this (expensive) VM test.
const ENV_GATE: &str = "HARMONY_FLEET_VM_E2E";

/// Reports whether the VM e2e gate is switched on.
///
/// Only the exact values `1` and `true` count; an unset variable, a
/// non-Unicode value, or any other string leaves the test disabled.
fn enabled() -> bool {
    match std::env::var(ENV_GATE) {
        Ok(value) => value == "1" || value == "true",
        Err(_) => false,
    }
}

/// Shorthand for building a [`DeploymentName`] from a string literal.
///
/// Panics if `s` is rejected by `DeploymentName::try_new`; the names used in
/// this file are fixed at compile time, so a rejection is a bug in the test.
fn dn(s: &str) -> DeploymentName {
    let parsed = DeploymentName::try_new(s);
    parsed.expect("test-static valid deployment name")
}

/// Time allowed for the initial deploy to reach `Running` (10 minutes).
///
/// Deliberately generous: the first image pull on a TCG aarch64 guest can
/// take several minutes even for a 25 MB image.
const RUN_BUDGET: Duration = Duration::from_secs(10 * 60);
/// Time allowed for the upgrade (3 minutes).
///
/// Shorter than the deploy budget because the upgrade reuses an image that
/// is already in the store on the same VM — but the guest is still TCG-slow.
const UPGRADE_BUDGET: Duration = Duration::from_secs(3 * 60);
/// Time allowed for the delete (1 minute).
///
/// Deletion is local-only (no network), so it should converge fast.
const DELETE_BUDGET: Duration = Duration::from_secs(60);

/// End-to-end lifecycle check against a real VM: deploy, upgrade, then
/// delete one podman deployment through the KV reconciler.
///
/// Each phase first waits on the agent's `device-state` entry and then
/// cross-checks the outcome with `podman ps` over SSH, so the agent's
/// self-report is never the only evidence.
///
/// When the `ENV_GATE` variable is not enabled the test logs a skip notice
/// and returns `Ok(())` without touching any infrastructure.
///
/// # Errors
///
/// Propagates any error returned by the stack, KV, or SSH helpers, and bails
/// if the upgraded image is not visible on the device within
/// `UPGRADE_BUDGET` of polling.
#[tokio::test(flavor = "multi_thread", worker_threads = 4)]
async fn vm_agent_drives_full_deploy_lifecycle() -> anyhow::Result<()> {
    // `podman ps` as the `fleet-agent` user, printing name(s) and image for
    // each running container.
    const PODMAN_PS: &str = "sudo -iu fleet-agent podman ps --format '{{.Names}}\\t{{.Image}}'";
    // Service name set by `podman_score`; the checks below expect it to show
    // up as the container name in `podman ps`.
    const CONTAINER: &str = "hello-web-svc";
    // Image tag the upgrade phase switches to.
    const UPGRADED_IMAGE_TAG: &str = "nginx:stable-alpine";

    if !enabled() {
        eprintln!("skipping {ENV_GATE}-gated VM e2e test (set {ENV_GATE}=1 to run)");
        return Ok(());
    }

    // `try_init` fails if a subscriber is already installed (e.g. by another
    // test in the same process); that is fine, so the result is ignored.
    let _ = tracing_subscriber::fmt()
        .with_env_filter(
            tracing_subscriber::EnvFilter::try_from_default_env()
                .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info")),
        )
        .try_init();

    let stack = shared_vm_stack(VmStackOptions::from_env()).await?;
    stack.print_debug_info();
    stack.wait_until_ready(Duration::from_secs(60)).await?;

    let device = stack.devices.first().expect("at least one VM device");
    let device_id = device.device_id.to_string();
    let deployment = dn("hello-web");
    let admin = AdminKv::connect(&stack.infra.nats_client).await?;

    // ---- phase 1: deploy ----
    tracing::info!(
        device = %device_id,
        deployment = %deployment,
        "phase 1/3 — deploy",
    );
    let v1 = podman_score("nginx:alpine");
    admin.put_podman(&device_id, &deployment, &v1).await?;
    let state = admin
        .wait_for_phase(
            &device_id,
            &deployment,
            PhaseExpectation::running(),
            RUN_BUDGET,
        )
        .await?
        .expect("Running phase implies an existing state entry");
    assert_eq!(state.phase, Phase::Running);
    assert_eq!(state.deployment, deployment);
    assert_eq!(state.device_id.to_string(), device_id);

    // Ground-truth via SSH: the container is actually present.
    let ps = device
        .ssh(PODMAN_PS)
        .await?
        .into_successful()
        .map_err(|e| anyhow::anyhow!("sudo -iu fleet-agent podman ps failed: {e}"))?;
    assert!(
        ps.stdout.contains(CONTAINER),
        "sudo -iu fleet-agent podman ps must show our service, got:\n{}",
        ps.stdout,
    );

    // ---- phase 2: upgrade ----
    tracing::info!(
        device = %device_id,
        deployment = %deployment,
        "phase 2/3 — upgrade",
    );
    let v2 = podman_score(UPGRADED_IMAGE_TAG);
    admin.put_podman(&device_id, &deployment, &v2).await?;
    // The state-bucket phase may stay `Running` across an in-place
    // recreate (the new container reaches Running quickly), so the returned
    // state proves nothing about the image and is discarded. The
    // ground-truth poll below catches the image change.
    let _ = admin
        .wait_for_phase(
            &device_id,
            &deployment,
            PhaseExpectation::running(),
            UPGRADE_BUDGET,
        )
        .await?;
    // Wait for the image to flip on the device. The reconcile is
    // event-driven on KV put + periodic; we re-poll `podman ps` so we don't
    // race the recreate. The poll clock starts only now, so phase 2 can take
    // up to twice `UPGRADE_BUDGET` if the phase wait above used its budget.
    let upgraded_at = std::time::Instant::now();
    loop {
        let ps = device
            .ssh(PODMAN_PS)
            .await?
            .into_successful()
            .map_err(|e| anyhow::anyhow!("sudo -iu fleet-agent podman ps failed: {e}"))?;
        // Name and image must appear on the same row: matching them
        // independently across the whole output would let some other
        // container running the new image mask a stale `hello-web-svc`.
        if container_runs_image(&ps.stdout, CONTAINER, UPGRADED_IMAGE_TAG) {
            break;
        }
        if upgraded_at.elapsed() > UPGRADE_BUDGET {
            anyhow::bail!(
                "upgrade not visible on device within {UPGRADE_BUDGET:?}, last podman ps:\n{}",
                ps.stdout,
            );
        }
        tokio::time::sleep(Duration::from_secs(2)).await;
    }

    // ---- phase 3: delete ----
    tracing::info!(
        device = %device_id,
        deployment = %deployment,
        "phase 3/3 — delete",
    );
    admin.delete_desired_state(&device_id, &deployment).await?;
    // The agent's tombstone path drops the device-state entry once
    // the container is removed.
    let final_state = admin
        .wait_for_phase(
            &device_id,
            &deployment,
            PhaseExpectation::Absent,
            DELETE_BUDGET,
        )
        .await?;
    assert!(
        final_state.is_none(),
        "device-state entry should be absent after delete, got {final_state:?}",
    );

    // Ground-truth: container is actually gone. `--all` also lists stopped
    // containers, so a merely-stopped container still fails the check.
    let ps_final = device
        .ssh("sudo -iu fleet-agent podman ps --all --format '{{.Names}}'")
        .await?
        .into_successful()
        .map_err(|e| anyhow::anyhow!("final podman ps failed: {e}"))?;
    assert!(
        !ps_final.stdout.contains(CONTAINER),
        "container hello-web-svc still present after delete:\n{}",
        ps_final.stdout,
    );

    Ok(())
}

/// Returns `true` when a single line of `podman ps` output mentions both
/// `container` and `image`.
///
/// Matching per line keeps an unrelated container that happens to run the
/// same image from satisfying the check.
fn container_runs_image(ps_stdout: &str, container: &str, image: &str) -> bool {
    ps_stdout
        .lines()
        .any(|line| line.contains(container) && line.contains(image))
}

/// Builds the single-service podman score used by every lifecycle phase.
///
/// `image_tag` is the `name:tag` part of the image; it is prefixed with
/// `docker.io/library/` to form the full image reference.
fn podman_score(image_tag: &str) -> PodmanV0Score {
    // Fully qualify the image so the VM does not depend on whichever
    // default registry list its podman ships with. nginx:alpine is
    // multi-arch and the smallest battle-tested long-running ARM image.
    let image = format!("docker.io/library/{image_tag}");

    let service = PodmanService {
        name: String::from("hello-web-svc"),
        image,
        ports: Vec::new(),
        env: Vec::new(),
        volumes: Vec::new(),
        restart_policy: RestartPolicy::default(),
    };

    PodmanV0Score {
        services: vec![service],
        init_containers: Vec::new(),
    }
}