harmony/fleet/harmony-fleet-e2e/tests/vm_isolation.rs

//! VM-side smoke test #2 — the agent does NOT react to desired
//! state addressed to a different device.
//!
//! Asserts the agent's KV watch filter is scoped to its own
//! `device_id`. The harness uses the [`AgentObservation`]
//! companion (from `harmony-fleet-deploy`) to derive the same KV
//! filter the agent uses, then writes a desired-state entry under
//! a **foreign** device id and verifies that:
//!   1. the foreign key is outside the agent's filter prefix, and
//!   2. no `device-state` entry shows up for that key within a
//!      bounded window.
//!
//! Negative-assertion timeouts are inherently a tradeoff against
//! TCG slowness; 30 s is a generous upper bound — the agent
//! normally reacts to a KV put within seconds.
//!
//! Gating: skipped unless `HARMONY_FLEET_VM_E2E` is set to `1` or `true`.

use std::collections::BTreeMap;
use std::time::Duration;

use harmony::modules::podman::{PodmanService, PodmanV0Score};
use harmony::topology::RestartPolicy;
use harmony_fleet_auth::{AgentConfig, AgentSection, CredentialsSection, NatsSection};
use harmony_fleet_deploy::companion::AgentObservation;
use harmony_fleet_e2e::{AdminKv, PhaseExpectation, VmStackOptions, shared_vm_stack};
use harmony_reconciler_contracts::{DeploymentName, Id, desired_state_key};

/// Name of the environment variable that opts in to the VM-backed e2e test.
const ENV_GATE: &str = "HARMONY_FLEET_VM_E2E";

/// Reports whether the VM e2e gate is open.
///
/// Returns `true` only when [`ENV_GATE`] is set to exactly `1` or `true`.
/// An unset variable, a value that is not valid Unicode, or any other
/// string keeps the gate closed.
fn enabled() -> bool {
    match std::env::var(ENV_GATE) {
        Ok(value) => value == "1" || value == "true",
        // Covers both "not present" and "not Unicode".
        Err(_) => false,
    }
}

/// Shorthand for building a [`DeploymentName`] from a test-owned string.
///
/// # Panics
///
/// Panics if `DeploymentName::try_new` rejects `s`; callers in this file
/// only pass hard-coded names, so a rejection is a bug in the test itself.
fn dn(s: &str) -> DeploymentName {
    let parsed = DeploymentName::try_new(s);
    parsed.expect("test-static valid deployment name")
}

/// Writes a desired-state entry under a foreign device id and checks that
/// the VM agent never reports state for it.
///
/// Two checks are made:
///   1. a static one — the foreign key must not fall under the filter
///      prefix reported by [`AgentObservation`], and
///   2. a runtime one — waiting for a running-or-failed phase on the
///      foreign key must not succeed within 30 seconds.
///
/// Returns `Ok(())` immediately (test reported as passed) when the
/// `ENV_GATE` environment gate is closed.
#[tokio::test(flavor = "multi_thread", worker_threads = 4)]
async fn agent_ignores_other_devices_keys() -> anyhow::Result<()> {
    if !enabled() {
        eprintln!("skipping {ENV_GATE}-gated VM e2e test (set {ENV_GATE}=1 to run)");
        return Ok(());
    }

    // Logging: honour the default env filter when it parses, otherwise
    // fall back to "info". The `try_init` result is deliberately dropped.
    let env_filter = tracing_subscriber::EnvFilter::try_from_default_env()
        .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info"));
    let _ = tracing_subscriber::fmt()
        .with_env_filter(env_filter)
        .try_init();

    // Bring up (or attach to) the shared VM stack and wait for readiness.
    let ready_timeout = Duration::from_secs(60);
    let stack = shared_vm_stack(VmStackOptions::from_env()).await?;
    stack.print_debug_info();
    stack.wait_until_ready(ready_timeout).await?;

    // The foreign id is derived from the first VM's real id by appending
    // a suffix.
    let vm = stack.devices.first().expect("at least one VM device");
    let own_device_id = vm.device_id.to_string();
    let foreign_device_id = format!("{own_device_id}-bystander");

    // Rebuild the agent's config locally and ask the companion what the
    // agent would be watching. This lets the isolation contract be checked
    // up front, before the slow negative wait further down.
    let agent_cfg = synthesize_agent_config(&own_device_id);
    let observed = AgentObservation::of_config(&agent_cfg);
    assert!(observed.is_watching(), "VM agent must be watching");

    let intruder = dn("intruder");
    let foreign_key = desired_state_key(&foreign_device_id, &intruder);

    // Static cross-check at the wire-format level: strip the trailing
    // `>` characters from the filter and make sure the foreign key does
    // not sit under the remaining prefix.
    if let AgentObservation::Watching { filter, .. } = &observed {
        let own_prefix = filter.trim_end_matches('>');
        let leaks_into_own_scope = foreign_key.starts_with(own_prefix);
        assert!(
            !leaks_into_own_scope,
            "foreign key {foreign_key} unexpectedly inside own filter {filter}",
        );
    }

    // Payload placed under the foreign key: a single nginx service with
    // no ports, env, volumes or init containers.
    let intruder_service = PodmanService {
        name: "intruder-svc".to_string(),
        image: "docker.io/library/nginx:alpine".to_string(),
        ports: vec![],
        env: vec![],
        volumes: vec![],
        restart_policy: RestartPolicy::default(),
    };
    let payload = PodmanV0Score {
        services: vec![intruder_service],
        init_containers: vec![],
    };

    let kv = AdminKv::connect(&stack.infra.nats_client).await?;
    kv.put_podman_at_key(&foreign_key, &payload).await?;

    // Negative wait: poll the *foreign* device's state (not the agent's
    // own) and require that the expected phase never shows up.
    //
    // NOTE(review): any `Err` from `wait_for_phase` satisfies this check,
    // not only a timeout — confirm it cannot fail for unrelated reasons.
    let negative_window = Duration::from_secs(30);
    let outcome = kv
        .wait_for_phase(
            &foreign_device_id,
            &intruder,
            PhaseExpectation::running_or_failed(),
            negative_window,
        )
        .await;
    if let Ok(state) = outcome {
        panic!(
            "agent unexpectedly reacted to a foreign device key: state={:?}",
            state,
        );
    }

    Ok(())
}

/// Rebuilds, on the test side, the agent config the device was deployed
/// with, so the test controls exactly what is fed to `AgentObservation`.
///
/// `device_id` becomes the agent's id; the runtime is enabled, the NATS
/// URL is a placeholder, credentials use the shared-TOML variant, and no
/// labels are set.
///
/// This is a stop-gap: the deploy path does not (yet) return the typed
/// `AgentConfig` it placed on the VM. Once `VmDevice` exposes it, this
/// helper can be removed.
fn synthesize_agent_config(device_id: &str) -> AgentConfig {
    let agent = AgentSection {
        device_id: Id::from(device_id.to_owned()),
        runtime_enabled: true,
    };

    let nats = NatsSection {
        urls: vec![String::from("nats://placeholder:4222")],
    };

    let credentials = CredentialsSection::TomlShared {
        nats_user: String::from("device"),
        nats_pass: String::from("e2e-device"),
    };

    AgentConfig {
        agent,
        nats,
        credentials,
        labels: BTreeMap::new(),
    }
}