harmony/examples/fleet_device_enroll/src/main.rs

//! Per-device enrollment driver — runs `FleetDeviceSetupScore` with
//! the new `FleetDeviceAuth::ZitadelEnroll` variant. Two workflows
//! land on the same code path:
//!
//! - **Dev-on-device**: developer runs this on a Pi they have a
//!   keyboard / display attached to. They target their own Pi via
//!   `--target ssh://<user>@127.0.0.1` (sshd is enabled in the
//!   factory image so this works out of the box). The score opens
//!   the local browser to Zitadel SSO, the dev signs in with their
//!   personal account (must hold the admin role), the score mints
//!   a per-device user + key, drops the keyfile + config in place,
//!   and brings the agent up.
//!
//! - **Production-via-SSH**: operator runs this from a workstation,
//!   targets each device over SSH (`--target ssh://pi@10.0.0.42`).
//!   Browser opens once on the workstation; for v0 the resulting
//!   token is held in memory only — re-running for the next device
//!   re-prompts. Token caching is on the roadmap.
//!
//! `--vm-rehearsal` boots an aarch64 KVM VM and enrolls it through
//! the same path, so we can dry-run the whole flow without a Pi.

use std::path::PathBuf;

use anyhow::{Context, Result};
use clap::Parser;
use harmony::inventory::Inventory;
use harmony::modules::fleet::{
    AdminAuth, FleetDeviceAuth, FleetDeviceSetupConfig, FleetDeviceSetupScore,
    ensure_fleet_ssh_keypair,
};
use harmony::modules::linux::{LinuxHostTopology, LinuxLocalhostTopology, SshCredentials};
use harmony_types::id::Id;

// VM-rehearsal-only imports. Hidden behind a feature so `cargo build
// --no-default-features` (the device-side / aarch64 cross-compile)
// doesn't pull in libvirt — `libvirt-dev` doesn't link against arm64
// targets on most distros.
#[cfg(feature = "vm-rehearsal")]
use harmony::modules::fleet::{ProvisionVmScore, check_fleet_smoke_preflight_for_arch};
#[cfg(feature = "vm-rehearsal")]
use harmony::modules::kvm::KvmVirtualMachineHost;
#[cfg(feature = "vm-rehearsal")]
use harmony::modules::kvm::config::init_executor;
#[cfg(feature = "vm-rehearsal")]
use harmony::topology::{VirtualMachineSpec, VmArchitecture, VmFirstBootConfig};

// Command-line surface of the enrollment driver. The `///` comments on
// the fields below are rendered by clap as `--help` text, so they are
// user-facing strings. Enrollment-only options are `Option` so that
// `--launch-pi-vm` (which never enrolls) can omit them; `main` checks
// their presence before building the setup score.
#[derive(Parser, Debug)]
#[command(
    name = "fleet_device_enroll",
    about = "Enroll a device into the fleet by minting its Zitadel \
             credentials inline (browser SSO or pre-acquired token)"
)]
struct Cli {
    // ---- target ----------------------------------------------------------
    /// Where to apply the score.
    ///
    /// - **Omitted** → run on the same machine the binary is invoked
    ///   on (no SSH, no keypair). Ansible's `-c local` connection
    ///   does the work; sudo still goes through your normal
    ///   credentials.
    /// - **`ssh://user@host`** → drive the score against a remote
    ///   device over SSH using the harmony fleet SSH key.
    ///
    /// Ignored when `--vm-rehearsal` is set (the rehearsal targets
    /// the freshly-booted VM) and with `--launch-pi-vm` (no
    /// enrollment runs).
    #[arg(long)]
    target: Option<String>,

    /// Spin up a fresh aarch64 libvirt VM and enroll it. Pulls the
    /// stock Ubuntu cloud image, attaches to the libvirt `default`
    /// network, waits for SSH, then runs the setup score against it.
    /// Requires the `vm-rehearsal` feature (enabled by default on
    /// host builds, disabled on device-side aarch64 builds).
    #[cfg(feature = "vm-rehearsal")]
    #[arg(long)]
    vm_rehearsal: bool,

    /// Boot a Pi-equivalent aarch64 VM (Debian trixie generic-cloud
    /// image — the same distribution base as Raspberry Pi OS, since
    /// Pi OS itself is locked to Pi hardware and won't boot in
    /// generic KVM) and **exit**. Prints the SSH connection details
    /// so you can connect manually and run `fleet_device_enroll`
    /// against the booted VM as a separate command. Useful for
    /// dev-on-device rehearsal: launch once, then iterate with the
    /// enrollment binary against the running VM. Requires the
    /// `vm-rehearsal` feature.
    #[cfg(feature = "vm-rehearsal")]
    #[arg(long)]
    launch_pi_vm: bool,

    // ---- Zitadel + NATS endpoints ----------------------------------------
    /// Zitadel issuer URL — what the agent will use as its OIDC
    /// issuer and what the score talks to during enrollment.
    /// Required for enrollment; ignored with `--launch-pi-vm`.
    #[arg(long)]
    issuer_url: Option<String>,

    /// Zitadel project ID (the project's numeric id). Becomes the
    /// agent's `audience` for JWT-bearer mint requests, and tags the
    /// machine user so the auth callout's `aud` check passes.
    #[arg(long)]
    audience: Option<String>,

    /// Project name (human-readable) the device's machine user
    /// belongs to. Must already exist — created by the staging
    /// install's `ZitadelSetupScore`.
    #[arg(long, default_value = "fleet")]
    project_name: String,

    /// NATS URL the agent should connect to.
    #[arg(long)]
    nats_url: Option<String>,

    // ---- device identity -------------------------------------------------
    /// Device id baked into the agent's TOML, the Zitadel machine
    /// username (`device-<device_id>`), and the Kubernetes Device CR
    /// name on the operator side. **Required.**
    ///
    /// Must be a valid RFC1123 DNS label / subdomain since the
    /// operator builds Kubernetes resource names from it. The
    /// validator in this binary rejects anything else upfront so
    /// enrollment can't produce a Zitadel machine user that the
    /// operator will later choke on with `metadata.name: Invalid value`.
    ///
    /// Allowed: lowercase alphanumerics + `-`, must start and end with
    /// an alphanumeric, max 63 chars per segment. Segments separated
    /// by `.` are accepted (full RFC1123 subdomain) but `-` is the
    /// usual choice.
    ///
    /// Examples that pass: `pi-001`, `lab-rehearsal-3`, `dev-jg-vm`.
    /// Examples that fail: `pi_001` (underscore), `Pi001` (uppercase),
    /// `-pi001` (leading dash), `pi001-` (trailing dash).
    #[arg(long)]
    device_id: String,

    /// Zitadel machine username for this device. Defaults to
    /// `device-<device_id>` so re-running with the same device_id
    /// reuses the same Zitadel user.
    #[arg(long)]
    device_username: Option<String>,

    /// Project-scoped Zitadel role to grant the device's user.
    /// Defaults to `device` — the role the auth callout maps to
    /// per-device-scoped pub/sub permissions.
    #[arg(long, default_value = "device")]
    device_role: String,

    /// Routing labels (`key=value,key=value`) the agent publishes in
    /// every DeviceInfo heartbeat.
    #[arg(long, default_value = "group=group-a")]
    labels: String,

    // ---- admin auth ------------------------------------------------------
    /// Pre-acquired Bearer token (PAT or out-of-band access token).
    /// When set, skips the browser device-code flow.
    #[arg(long, env = "HARMONY_ZITADEL_ADMIN_TOKEN")]
    admin_token: Option<String>,

    /// Zitadel OIDC `client_id` for the device-code app — the
    /// **numeric id** Zitadel assigns when the app is created (e.g.
    /// `371639797157987125@fleet`), NOT the human-readable app name
    /// (`harmony-cli`). The staging install prints this value in its
    /// final summary; copy it from there. Required when using SSO
    /// (omit only when `--admin-token` is set).
    #[arg(long)]
    admin_oidc_client_id: Option<String>,

    /// Forward to the agent's HTTP client AND to our admin-side calls
    /// to Zitadel. Set when talking to a dev cluster with a
    /// self-signed cert.
    #[arg(long)]
    danger_accept_invalid_certs: bool,

    /// Override the Zitadel **org context** (`x-zitadel-orgid` header)
    /// for management API calls. Set when the SSO operator's primary
    /// org differs from where the project + device users live —
    /// typical for human SSO accounts on a Zitadel where the project
    /// was provisioned by the system iam-admin (their org defaults
    /// don't match). Symptom: `Project '<name>' not found in
    /// Zitadel` even though the project clearly exists. Find the
    /// right value in Zitadel's admin UI → Organization → Resource
    /// ID, or via `/admin/v1/orgs/_search`.
    #[arg(long)]
    admin_org_id: Option<String>,

    // ---- agent binary ----------------------------------------------------
    /// Path to the cross-compiled fleet-agent binary that gets
    /// uploaded to the device and installed at /usr/local/bin/fleet-agent.
    /// Optional when `--launch-pi-vm` is set (no enrollment runs).
    #[arg(long)]
    agent_binary: Option<PathBuf>,

    // ---- VM knobs (used by both --vm-rehearsal and --launch-pi-vm) -------
    /// libvirt domain name for the rehearsal VM.
    #[cfg(feature = "vm-rehearsal")]
    #[arg(long, default_value = "fleet-enroll-rehearsal")]
    vm_name: String,
    /// libvirt network the rehearsal VM is attached to.
    #[cfg(feature = "vm-rehearsal")]
    #[arg(long, default_value = "default")]
    vm_network: String,
    /// Admin user set in the VM's first-boot config. The harmony
    /// fleet SSH key is authorized for this user, and it is the SSH
    /// login used to reach the VM.
    #[cfg(feature = "vm-rehearsal")]
    #[arg(long, default_value = "fleet-admin")]
    vm_admin_user: String,
    /// Disk size of the rehearsal VM, in GB.
    #[cfg(feature = "vm-rehearsal")]
    #[arg(long, default_value_t = 16)]
    vm_disk_size_gb: u32,
}

/// Entry point of the enrollment driver.
///
/// Parses the CLI and then takes one of these paths:
///
/// 1. `--launch-pi-vm` (feature `vm-rehearsal`): boot the Debian trixie
///    VM, print how to reach it and how to enroll it, and exit without
///    enrolling anything.
/// 2. Otherwise validate the device id, require the enrollment-only
///    options, build a `FleetDeviceSetupScore`, and run it against
///    - a freshly booted rehearsal VM (`--vm-rehearsal`),
///    - the local machine (no `--target`), or
///    - the `ssh://user@host` device named by `--target`.
///
/// # Errors
/// Returns an error when the device id is invalid, a required option is
/// missing, `--target` / `--labels` cannot be parsed, or any
/// provisioning / setup step fails.
#[tokio::main]
async fn main() -> Result<()> {
    // `try_init().ok()`: a logger that is already installed is not an error.
    env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info"))
        .try_init()
        .ok();

    let cli = Cli::parse();

    #[cfg(feature = "vm-rehearsal")]
    if cli.launch_pi_vm {
        let vm_ip = boot_pi_rehearsal_vm(&cli).await?;
        let ssh = ensure_fleet_ssh_keypair()
            .await
            .map_err(|e| anyhow::anyhow!("ssh keypair: {e}"))?;
        println!();
        println!("=== Pi-equivalent VM ready ===");
        println!("VM:   {} (debian-trixie arm64)", cli.vm_name);
        println!("IP:   {vm_ip}");
        println!(
            "SSH:  ssh -i {} {}@{vm_ip}",
            ssh.private_key.display(),
            cli.vm_admin_user
        );
        println!();
        println!("To enroll this VM, run from your workstation:");
        // The note lives on its own line: a `#` comment placed before a
        // trailing `\` swallows the backslash, which would end the shell
        // command right after `--device-id` when copy-pasted.
        println!("  # --device-id is required and must be RFC1123 (e.g. pi-001)");
        println!(
            "  fleet_device_enroll \\\n    \
             --target ssh://{}@{vm_ip} \\\n    \
             --device-id <ID> \\\n    \
             --issuer-url <ISSUER> \\\n    \
             --audience <PROJECT_ID> \\\n    \
             --nats-url <NATS_URL> \\\n    \
             --admin-oidc-client-id <CLIENT_ID> \\\n    \
             --agent-binary <AGENT_BIN>",
            cli.vm_admin_user
        );
        return Ok(());
    }

    // Reject ids the operator could not turn into Kubernetes names before
    // anything is created in Zitadel.
    validate_device_id(&cli.device_id)?;
    let device_id = Id::from(cli.device_id.clone());
    let device_username = cli
        .device_username
        .clone()
        .unwrap_or_else(|| format!("device-{device_id}"));

    let labels = parse_labels(&cli.labels)?;

    // These are `Option` on the CLI only so `--launch-pi-vm` can omit
    // them; from here on enrollment needs every one of them.
    let issuer_url = cli
        .issuer_url
        .clone()
        .context("--issuer-url is required for enrollment (omit only with --launch-pi-vm)")?;
    let audience = cli
        .audience
        .clone()
        .context("--audience is required for enrollment")?;
    let nats_url = cli
        .nats_url
        .clone()
        .context("--nats-url is required for enrollment")?;
    let agent_binary = cli
        .agent_binary
        .clone()
        .context("--agent-binary is required for enrollment")?;

    let auth = FleetDeviceAuth::ZitadelEnroll {
        oidc_issuer_url: issuer_url,
        audience,
        project_name: cli.project_name.clone(),
        device_username: device_username.clone(),
        device_display_name: format!("Fleet Device {device_id}"),
        device_role_keys: vec![cli.device_role.clone()],
        // A pre-acquired token wins; otherwise SSO needs the OIDC client id.
        admin: match &cli.admin_token {
            Some(t) => AdminAuth::Token(t.clone()),
            None => AdminAuth::Sso {
                client_id: cli.admin_oidc_client_id.clone().context(
                    "--admin-oidc-client-id is required for SSO login. \
                     This is the **numeric** Zitadel client_id (e.g. \
                     `371639797157987125@fleet`), not the app name. \
                     The staging install prints it in its final summary. \
                     Alternatively, pass --admin-token <PAT> to skip SSO.",
                )?,
            },
        },
        admin_org_id: cli.admin_org_id.clone(),
        danger_accept_invalid_certs: cli.danger_accept_invalid_certs,
    };

    let setup_config = FleetDeviceSetupConfig {
        device_id: device_id.clone(),
        labels,
        nats_urls: vec![nats_url],
        auth,
        agent_binary_path: agent_binary,
        hosts_entries: vec![],
    };
    let setup_score = FleetDeviceSetupScore::new(setup_config);

    #[cfg(feature = "vm-rehearsal")]
    if cli.vm_rehearsal {
        let vm_ip = boot_rehearsal_vm(&cli).await?;
        let ssh = ensure_fleet_ssh_keypair()
            .await
            .map_err(|e| anyhow::anyhow!("ssh keypair: {e}"))?;
        let topology = LinuxHostTopology::new(
            format!("rehearsal-{}", cli.vm_name),
            vm_ip
                .parse()
                .context("rehearsal VM did not yield a valid IP")?,
            SshCredentials {
                user: cli.vm_admin_user.clone(),
                private_key_path: ssh.private_key.clone(),
                remote_python: Some("/usr/bin/python3".to_string()),
                sudo_password: None,
            },
        );
        run_setup(&setup_score, &topology).await?;
        println!(
            "✅ rehearsal device '{device_id}' enrolled via VM {} ({vm_ip})",
            cli.vm_name
        );
        return Ok(());
    }

    match cli.target.as_deref() {
        // No `--target` → run on the same machine. ansible's `-c
        // local` connection skips SSH entirely; sudo still works the
        // usual way (operator types the password if not configured
        // passwordless).
        None => {
            let topology = LinuxLocalhostTopology::new("localhost");
            run_setup(&setup_score, &topology).await?;
        }
        // `--target ssh://user@host` → drive the device over SSH with the
        // harmony fleet key. The host part must parse as an IP address.
        Some(target) => {
            let (user, host) = parse_ssh_target(target)?;
            let ssh = ensure_fleet_ssh_keypair()
                .await
                .map_err(|e| anyhow::anyhow!("ssh keypair: {e}"))?;
            let topology = LinuxHostTopology::new(
                format!("ssh-{host}"),
                host.parse().context("--target host is not a valid IP")?,
                SshCredentials {
                    user,
                    private_key_path: ssh.private_key.clone(),
                    remote_python: Some("/usr/bin/python3".to_string()),
                    sudo_password: None,
                },
            );
            run_setup(&setup_score, &topology).await?;
        }
    }
    println!("✅ device '{device_id}' enrolled");
    Ok(())
}

/// Boots the rehearsal VM used by `--vm-rehearsal` (Ubuntu base image)
/// and returns its IP address as reported by `boot_vm`.
#[cfg(feature = "vm-rehearsal")]
async fn boot_rehearsal_vm(cli: &Cli) -> Result<String> {
    let image = RehearsalImage::Ubuntu;
    boot_vm(cli, image).await
}

/// Boots the Pi-equivalent VM used by `--launch-pi-vm` (Debian trixie
/// base image) and returns its IP address as reported by `boot_vm`.
#[cfg(feature = "vm-rehearsal")]
async fn boot_pi_rehearsal_vm(cli: &Cli) -> Result<String> {
    let image = RehearsalImage::DebianTrixie;
    boot_vm(cli, image).await
}

/// Base cloud image a rehearsal VM is provisioned from; `boot_vm`
/// maps each variant to the matching image-fetch helper.
#[cfg(feature = "vm-rehearsal")]
#[derive(Debug, Clone, Copy)]
enum RehearsalImage {
    /// Ubuntu 24.04 cloud image — selected by `boot_rehearsal_vm`.
    Ubuntu,
    /// Debian trixie arm64 cloud image — selected by
    /// `boot_pi_rehearsal_vm`.
    DebianTrixie,
}

/// Provisions an aarch64 VM from the requested base image and returns
/// the IP address reported by `ProvisionVmScore`.
///
/// The steps run in a fixed order: architecture preflight, base image,
/// libvirt pool, fleet SSH keypair and its public key, KVM executor,
/// and finally the provisioning score. VM name, network, admin user and
/// disk size come from the `--vm-*` options on `cli`; CPU count and
/// memory are fixed at 2 vCPUs / 2048 MiB.
///
/// # Errors
/// Each failing step is reported with a short prefix naming it. An
/// error is also returned when the score outcome carries no `ip=…`
/// detail.
#[cfg(feature = "vm-rehearsal")]
async fn boot_vm(cli: &Cli, image: RehearsalImage) -> Result<String> {
    use harmony::score::Score;

    let architecture = VmArchitecture::Aarch64;
    check_fleet_smoke_preflight_for_arch(architecture)
        .await
        .map_err(|e| anyhow::anyhow!("preflight: {e}"))?;

    let base_image = match image {
        RehearsalImage::Ubuntu => {
            harmony::modules::fleet::ensure_ubuntu_2404_cloud_image_for_arch(architecture)
                .await
                .map_err(|e| anyhow::anyhow!("cloud image: {e}"))?
        }
        RehearsalImage::DebianTrixie => {
            harmony::modules::fleet::ensure_debian_trixie_arm64_cloud_image()
                .await
                .map_err(|e| anyhow::anyhow!("debian cloud image: {e}"))?
        }
    };

    let storage_pool = harmony::modules::fleet::ensure_harmony_fleet_pool()
        .await
        .map_err(|e| anyhow::anyhow!("libvirt pool: {e}"))?;
    let keypair = ensure_fleet_ssh_keypair()
        .await
        .map_err(|e| anyhow::anyhow!("ssh keypair: {e}"))?;
    let public_key = harmony::modules::fleet::read_public_key(&keypair)
        .await
        .map_err(|e| anyhow::anyhow!("read ssh pubkey: {e}"))?;

    let executor = init_executor().map_err(|e| anyhow::anyhow!("KVM init: {e}"))?;
    let kvm_host = KvmVirtualMachineHost::new(
        "kvm-local",
        executor,
        storage_pool.name.clone(),
        storage_pool.path.clone(),
        base_image,
    );

    // The fleet public key is authorized for the admin user so the VM
    // can be reached over SSH afterwards.
    let first_boot = VmFirstBootConfig {
        hostname: Some(cli.vm_name.clone()),
        admin_user: Some(cli.vm_admin_user.clone()),
        authorized_keys: vec![public_key],
        admin_password: None,
    };
    let provision = ProvisionVmScore {
        spec: VirtualMachineSpec {
            name: cli.vm_name.clone(),
            architecture,
            cpus: 2,
            memory_mib: 2048,
            disk_size_gb: Some(cli.vm_disk_size_gb),
            network: cli.vm_network.clone(),
            first_boot: Some(first_boot),
        },
    };

    let outcome = Score::<KvmVirtualMachineHost>::create_interpret(&provision)
        .execute(&Inventory::empty(), &kvm_host)
        .await
        .map_err(|e| anyhow::anyhow!("ProvisionVmScore: {e}"))?;

    // The address is taken from the first outcome detail of the form
    // `ip=<addr>`.
    outcome
        .details
        .iter()
        .find_map(|detail| detail.strip_prefix("ip="))
        .map(|addr| addr.to_string())
        .ok_or_else(|| anyhow::anyhow!("ProvisionVmScore finished without an IP"))
}

/// Executes the device setup score against `topology` with an empty
/// inventory and prints the outcome's message and details.
///
/// # Errors
/// A failed execution is returned prefixed with
/// `FleetDeviceSetupScore:`.
async fn run_setup<T>(score: &FleetDeviceSetupScore, topology: &T) -> Result<()>
where
    T: harmony::topology::Topology + harmony::topology::LinuxHostConfiguration,
{
    use harmony::score::Score;

    let execution = Score::<T>::create_interpret(score)
        .execute(&Inventory::empty(), topology)
        .await;
    match execution {
        Ok(outcome) => {
            println!("setup outcome: {} ({:?})", outcome.message, outcome.details);
            Ok(())
        }
        Err(e) => Err(anyhow::anyhow!("FleetDeviceSetupScore: {e}")),
    }
}

/// Checks that `id` is a valid RFC1123 subdomain, so the operator's
/// Device CR upsert cannot later fail with
/// `metadata.name: Invalid value`. See
/// https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#dns-subdomain-names
///
/// Rules enforced:
/// - the id is non-empty and at most 253 bytes long;
/// - it consists of one or more labels separated by `.`;
/// - every label passes `validate_dns_label` (1-63 chars, lowercase
///   alphanumerics and `-`, alphanumeric at both ends).
///
/// The same id is also embedded in NATS subjects through the auth
/// callout's permission templates, where `_` or uppercase would pass
/// silently yet still break the Kubernetes side — hence the upfront
/// rejection here rather than a failure several layers down.
///
/// # Errors
/// Returns the first violation found; label-level errors are wrapped
/// with the offending device id as context.
fn validate_device_id(id: &str) -> Result<()> {
    match id.len() {
        0 => anyhow::bail!("device id is empty"),
        len if len > 253 => anyhow::bail!(
            "device id '{id}' is {len} chars, max 253 (RFC1123 subdomain limit)"
        ),
        _ => {}
    }
    id.split('.').try_for_each(|label| {
        validate_dns_label(label).with_context(|| format!("device id '{id}'"))
    })
}

/// Checks a single dot-free segment of a device id against the RFC1123
/// label rules.
///
/// Checks run in this order, and the first failure is returned:
/// 1. the label is not empty;
/// 2. it is at most 63 bytes long;
/// 3. its first character is an ASCII alphanumeric;
/// 4. its last character is an ASCII alphanumeric;
/// 5. every character is lowercase `a-z`, `0-9`, or `-`.
///
/// Steps 3 and 4 accept uppercase letters, so an uppercase first or
/// last character is reported by step 5 as an invalid character.
fn validate_dns_label(label: &str) -> Result<()> {
    let byte_len = label.len();
    if byte_len == 0 {
        anyhow::bail!("empty label (consecutive dots or leading/trailing dot)");
    }
    if byte_len > 63 {
        anyhow::bail!(
            "label '{label}' is {len} chars, max 63 per RFC1123 label",
            len = byte_len
        );
    }

    // The label is non-empty here, so both ends exist; `filter` keeps
    // the character only when it violates the rule.
    let bad_head = label
        .chars()
        .next()
        .filter(|ch| !ch.is_ascii_alphanumeric());
    if let Some(head) = bad_head {
        anyhow::bail!(
            "label '{label}' must start with an alphanumeric (got `{}`)",
            head
        );
    }
    let bad_tail = label
        .chars()
        .next_back()
        .filter(|ch| !ch.is_ascii_alphanumeric());
    if let Some(tail) = bad_tail {
        anyhow::bail!(
            "label '{label}' must end with an alphanumeric (got `{}`)",
            tail
        );
    }

    // Position is counted in characters, not bytes.
    let first_invalid = label
        .chars()
        .enumerate()
        .find(|&(_, ch)| !matches!(ch, 'a'..='z' | '0'..='9' | '-'));
    if let Some((i, c)) = first_invalid {
        anyhow::bail!(
            "label '{label}' has invalid char `{c}` at position {i}; \
                 only lowercase a-z, 0-9, and `-` are allowed (no `_`, no uppercase)"
        );
    }
    Ok(())
}

/// Splits a `--target` value of the form `ssh://user@host` into its
/// `(user, host)` parts.
///
/// Only the `ssh://` form is understood here. Running on the local
/// machine is selected by leaving `--target` out altogether, in which
/// case the caller never invokes this function — so the error text
/// points at that instead of advertising a `localhost` value that is
/// not accepted.
///
/// The host is returned verbatim (split at the first `@`); checking
/// that it is a usable address is left to the caller.
///
/// # Errors
/// Fails when the `ssh://` prefix or the `@` separator is missing, or
/// when the user or host part is empty.
fn parse_ssh_target(target: &str) -> Result<(String, String)> {
    let rest = target.strip_prefix("ssh://").context(
        "--target must start with `ssh://` (omit --target to run on the local machine)",
    )?;
    let (user, host) = rest
        .split_once('@')
        .context("--target must be `ssh://user@host`")?;
    if user.is_empty() || host.is_empty() {
        anyhow::bail!("--target ssh:// has empty user or host");
    }
    Ok((user.to_string(), host.to_string()))
}

/// Parses a `key=value,key=value` list into an ordered map.
///
/// Pieces are separated by `,`; surrounding whitespace is trimmed from
/// each piece, key and value, and empty pieces (e.g. from a trailing
/// comma) are skipped. Each piece is split at its first `=`, so a value
/// may itself contain `=`. When a key occurs more than once the last
/// occurrence wins.
///
/// # Errors
/// Fails on a piece without `=`, on an empty key or value, and when no
/// pair remains at all.
fn parse_labels(raw: &str) -> Result<std::collections::BTreeMap<String, String>> {
    let labels = raw
        .split(',')
        .map(str::trim)
        .filter(|piece| !piece.is_empty())
        .map(|piece| -> Result<(String, String)> {
            let (key, value) = match piece.split_once('=') {
                Some((k, v)) => (k.trim(), v.trim()),
                None => anyhow::bail!("label '{piece}' missing '='"),
            };
            if key.is_empty() || value.is_empty() {
                anyhow::bail!("label '{piece}' has empty key or value");
            }
            Ok((key.to_string(), value.to_string()))
        })
        .collect::<Result<std::collections::BTreeMap<String, String>>>()?;

    if labels.is_empty() {
        anyhow::bail!("--labels must include at least one key=value pair");
    }
    Ok(labels)
}

#[cfg(test)]
mod tests {
    use super::validate_device_id;

    /// Validates an id that is expected to be rejected and returns the
    /// rendered error.
    ///
    /// `{:#}` is used because anyhow's plain `to_string()` shows only
    /// the outermost context, while the validator's actual cause
    /// (`invalid char …`, `max 63`, …) sits further down the chain; the
    /// alternate form joins the whole chain with `: `.
    fn rejection(id: &str) -> String {
        let err = validate_device_id(id).unwrap_err();
        format!("{err:#}")
    }

    #[test]
    fn accepts_simple_labels() {
        let accepted = [
            "pi",
            "pi-001",
            "lab-rehearsal-3",
            "dev-jg-vm",
            "a",
            "0",
            "fb5310-qm2kpoq",
            // multi-label subdomain
            "pi-001.lab-east.fleet",
        ];
        for ok in accepted {
            let verdict = validate_device_id(ok);
            assert!(
                verdict.is_ok(),
                "expected '{ok}' to be accepted: {:?}",
                verdict
            );
        }
    }

    #[test]
    fn rejects_underscore() {
        // The original `Id::default()` shape that triggered this fix.
        let err = rejection("fb5310_Qm2kPoQ");
        assert!(err.contains("invalid char `_`"), "got: {err}");
    }

    #[test]
    fn rejects_uppercase() {
        let err = rejection("Pi001");
        assert!(err.contains("invalid char"), "got: {err}");
    }

    #[test]
    fn rejects_leading_or_trailing_dash() {
        for bad in ["-pi001", "pi001-"] {
            assert!(validate_device_id(bad).is_err());
        }
    }

    #[test]
    fn rejects_empty() {
        assert!(validate_device_id("").is_err());
    }

    #[test]
    fn rejects_consecutive_dots() {
        assert!(validate_device_id("a..b").is_err());
    }

    #[test]
    fn rejects_too_long_label() {
        let err = rejection(&"a".repeat(64));
        assert!(err.contains("max 63"), "got: {err}");
    }

    #[test]
    fn rejects_too_long_total() {
        // Four maximal 63-char labels joined by three dots:
        // 4 × 63 + 3 = 255 chars, which is over the 253 limit.
        let segment = "a".repeat(63);
        let id = vec![segment.as_str(); 4].join(".");
        assert!(id.len() > 253);
        let err = rejection(&id);
        assert!(err.contains("max 253"), "got: {err}");
    }
}