Files
harmony/fleet/harmony-fleet-agent/src/config.rs
Jean-Gabriel Gill-Couture d013246a68 feat(fleet): request/reply commands over NATS — wire types, agent server, operator client, e2e harness
First slice of the device-commands.* protocol from
fleet/requests_over_nats.md. Lands `Verb::Ping` plus the harness that
proves it works against a real in-cluster agent.

Wire types (`harmony-reconciler-contracts::commands`):
- `Verb::Ping`, `CommandRequest`, `PingReply`, `ErrorReply`/`ErrorKind`
- `device_command_subject` / `device_command_subscription` helpers
- `X-Harmony-*` header constants

Agent:
- `command_server.rs` subscribes on `device-commands.<id>.>` and
  dispatches verbs; ping handler replies with `PingReply`
- New `[agent].runtime_enabled` config flag (default true). When
  false, podman init + reconciler loop are skipped so the agent can
  run as a Pod on containerd-only k3d nodes; command server +
  heartbeat still run
- `Dockerfile`: canonical multi-stage build for production registries

Operator:
- `commands::FleetCommandsClient` with typed `CommandError`
  (`DeviceOffline` via `no_responders`, `Timeout`, `BadReply`, `Nats`)

E2E harness (`harmony-fleet-e2e`):
- Library crate + integration test. `Stack::bring_up` provisions a
  fresh `e2e-<uuid8>` namespace in a shared `fleet-e2e` k3d cluster,
  deploys NATS (UserPass auth, JetStream on) + the agent Pod, returns
  a connected admin NATS client, and tears the namespace down on Drop
- v1 ships `AuthMode::UserPass` only; the `Callout` variant is
  reserved on the public API for the follow-up PR that adds the mock
  OIDC fixture + NatsAuthCalloutScore deployment
- Operator pod deployment is also a follow-up; for ping, the test
  process drives `FleetCommandsClient` directly against the cluster's
  NATS NodePort
- `HARMONY_FLEET_E2E=1` gates the integration test so default
  `cargo test --workspace` runs don't depend on k3d/podman
- Image build + sideload mirrors the `fleet_auth_callout` pattern:
  host `cargo build --release` → single-stage Dockerfile → `podman
  build` → `k3d image import`. ~12s warm bring-up, ~80s cold
2026-05-18 09:47:36 -04:00

150 lines
3.9 KiB
Rust

use harmony_reconciler_contracts::Id;
use serde::Deserialize;
use std::collections::BTreeMap;
use std::path::Path;
// Re-export the shared credential types so existing call sites keep
// working with `crate::config::CredentialsSection`. The struct itself
// lives in `harmony_fleet_auth` and is shared with the operator.
pub use harmony_fleet_auth::CredentialsSection;
/// Top-level agent configuration, as produced by `load_config` from a
/// TOML document. Each field maps to a table of the same name:
/// `[agent]`, `[nats]`, `[credentials]` and the optional `[labels]`.
#[derive(Debug, Clone, Deserialize)]
pub struct AgentConfig {
/// `[agent]` table: device identity and the runtime toggle.
pub agent: AgentSection,
/// `[nats]` table: the list of NATS URLs.
pub nats: NatsSection,
/// `[credentials]` table. The type is re-exported from
/// `harmony_fleet_auth`, where it is defined.
pub credentials: CredentialsSection,
/// Routing labels published verbatim in every DeviceInfo
/// heartbeat. The operator reflects them into
/// `Device.metadata.labels` so Deployment `spec.targetSelector`
/// resolves against them (K8s-Node-analogue flow). Empty by
/// default — a device with no labels is targetable only by its
/// auto-published `device-id` label.
#[serde(default)]
pub labels: BTreeMap<String, String>,
}
/// `[agent]` table of the config: the device's identity plus the
/// switch that enables or disables the podman/reconciler arm.
#[derive(Debug, Clone, Deserialize)]
pub struct AgentSection {
/// Cross-boundary device identity. TOML deserializes the field
/// as a bare string thanks to `#[serde(transparent)]` on `Id`.
pub device_id: Id,
/// When false, skip the podman socket + reconciler loop and run
/// only the heartbeat + command-server arms. Lets the agent run
/// on hosts without podman (e.g. the in-cluster e2e harness on
/// containerd-only k3d nodes). Default true so existing RPi
/// configs are unaffected.
#[serde(default = "default_runtime_enabled")]
pub runtime_enabled: bool,
}
/// Serde fallback for `AgentSection::runtime_enabled`: a config that
/// omits the key keeps the runtime arm switched on.
fn default_runtime_enabled() -> bool {
    true
}
/// `[nats]` table of the config.
#[derive(Debug, Clone, Deserialize)]
pub struct NatsSection {
/// Values of the `urls` key, e.g. `nats://nats:4222`. The field has
/// no serde default, so the key must be present.
/// NOTE(review): how the agent picks among multiple URLs is not
/// visible in this file — confirm against the NATS client setup.
pub urls: Vec<String>,
}
pub fn load_config(path: &Path) -> anyhow::Result<AgentConfig> {
let content = std::fs::read_to_string(path)?;
let config: AgentConfig = toml::from_str(&content)?;
Ok(config)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Deserializes `toml_src` into an `AgentConfig`, panicking at the
    /// calling test's location when the document is rejected.
    #[track_caller]
    fn parse_config(toml_src: &str) -> AgentConfig {
        toml::from_str(toml_src).expect("valid config")
    }

    /// A fully populated document yields the labels and the
    /// `toml-shared` credential variant with its user/password pair.
    #[test]
    fn parses_toml_shared_credentials() {
        let toml_src = r#"
[agent]
device_id = "pi-42"
runtime_enabled = true
[credentials]
type = "toml-shared"
nats_user = "u"
nats_pass = "p"
[nats]
urls = ["nats://nats:4222"]
[labels]
group = "site-a"
arch = "aarch64"
"#;
        let parsed = parse_config(toml_src);

        let group = parsed.labels.get("group");
        assert_eq!(group, Some(&"site-a".to_string()));

        if let CredentialsSection::TomlShared {
            nats_user,
            nats_pass,
        } = &parsed.credentials
        {
            assert_eq!(nats_user, "u");
            assert_eq!(nats_pass, "p");
        } else {
            panic!("expected TomlShared");
        }
    }

    /// Leaving out `[labels]` entirely produces an empty label map.
    #[test]
    fn labels_section_optional_defaults_empty() {
        let toml_src = r#"
[agent]
device_id = "pi-42"
[credentials]
type = "toml-shared"
nats_user = "u"
nats_pass = "p"
[nats]
urls = ["nats://nats:4222"]
"#;
        let parsed = parse_config(toml_src);
        assert!(parsed.labels.is_empty());
    }

    /// Configs written before `runtime_enabled` existed do not set it.
    /// They must keep podman + the reconciler running; any other
    /// default would silently downgrade a production agent.
    #[test]
    fn runtime_enabled_defaults_to_true_when_omitted() {
        let toml_src = r#"
[agent]
device_id = "pi-42"
[credentials]
type = "toml-shared"
nats_user = "u"
nats_pass = "p"
[nats]
urls = ["nats://nats:4222"]
"#;
        let parsed = parse_config(toml_src);
        assert!(parsed.agent.runtime_enabled);
    }

    /// An explicit `runtime_enabled = false` overrides the default.
    #[test]
    fn runtime_enabled_false_is_honored() {
        let toml_src = r#"
[agent]
device_id = "pi-42"
runtime_enabled = false
[credentials]
type = "toml-shared"
nats_user = "u"
nats_pass = "p"
[nats]
urls = ["nats://nats:4222"]
"#;
        let parsed = parse_config(toml_src);
        assert!(!parsed.agent.runtime_enabled);
    }
}