feat/prepare-rpi #280

Merged
stremblay merged 14 commits from feat/prepare-rpi into feat/iot-walking-skeleton 2026-05-04 17:28:45 +00:00
13 changed files with 605 additions and 65 deletions

14
Cargo.lock generated
View File

@@ -3194,6 +3194,20 @@ dependencies = [
"tokio",
]
[[package]]
name = "example_fleet_rpi_setup"
version = "0.1.0"
dependencies = [
"anyhow",
"clap",
"harmony",
"harmony_cli",
"harmony_secret",
"harmony_types",
"log",
"tokio",
]
[[package]]
name = "example_fleet_vm_setup"
version = "0.1.0"

View File

@@ -0,0 +1,19 @@
[package]
name = "example_fleet_rpi_setup"
version.workspace = true
edition = "2024"
license.workspace = true
[[bin]]
name = "fleet_rpi_setup"
path = "src/main.rs"
[dependencies]
harmony = { path = "../../harmony" }
harmony_cli = { path = "../../harmony_cli" }
harmony_secret = { path = "../../harmony_secret" }
harmony_types = { path = "../../harmony_types" }
tokio.workspace = true
log.workspace = true
anyhow.workspace = true
clap.workspace = true

View File

@@ -0,0 +1,4 @@
export HARMONY_SECRET_NAMESPACE=fleet-rpi-setup
export HARMONY_SECRET_STORE=file
export HARMONY_DATABASE_URL=sqlite://harmony_fleet_rpi_setup.sqlite
export RUST_LOG=info

View File

@@ -0,0 +1,191 @@
//! Onboard a real, already-booted Raspberry Pi into the IoT fleet.
//!
//! This is the physical-device sibling of `fleet_vm_setup`: the VM
//! provisioning step is gone (you booted Pi OS yourself with rpi-imager
//! and preloaded an SSH key), and we go straight to applying
//! `FleetDeviceSetupScore` over SSH. That score installs podman +
//! systemd-container, creates the `fleet-agent` user, drops the agent
//! binary + config + systemd unit, and starts the service.
//!
//! Source `env.sh` first (sets `HARMONY_SECRET_NAMESPACE`,
//! `HARMONY_SECRET_STORE`, `HARMONY_DATABASE_URL`, `RUST_LOG`), then:
//!
//! ```bash
//! source examples/fleet_rpi_setup/env.sh
//! cargo run -p example_fleet_rpi_setup -- --pi-host <ip> ...
//! ```
//!
//! Output rendering (per-step traces and the final recap) is handled
//! by `harmony_cli::run` — same as every other harmony example. The
//! score's `Outcome.details` is structured for that path.
//!
//! Prereqs on the Pi (one-time, via rpi-imager or manual):
//! - SSH server enabled
//! - An admin user with sudo. Passwordless sudo is detected and
//! used silently; otherwise the example prompts for a sudo
//! password via `SecretManager` and caches it for next runs.
//! - Your driver-machine SSH public key in that user's
//! `~/.ssh/authorized_keys`
//!
//! Prereqs on the driver machine (where this binary runs):
//! - Python 3 + `python3-venv` (Ansible is auto-bootstrapped into a venv)
//! - A cross-compiled `fleet-agent` binary for aarch64
use anyhow::{Context, Result};
use clap::Parser;
use harmony::config::secret::SudoPassword;
use harmony::inventory::Inventory;
use harmony::modules::fleet::{FleetDeviceSetupConfig, FleetDeviceSetupScore};
use harmony::modules::linux::{LinuxHostTopology, SshCredentials, ensure_ansible_venv, ssh_exec};
use harmony_secret::SecretManager;
use harmony_types::id::Id;
use log::info;
use std::path::PathBuf;
#[derive(Parser, Debug)]
#[command(
name = "fleet_rpi_setup",
about = "Onboard a physical Raspberry Pi into the IoT fleet"
)]
struct Cli {
/// IP address of the Pi (e.g. 192.168.1.42).
#[arg(long)]
pi_host: String,
/// SSH user on the Pi with passwordless sudo.
#[arg(long, default_value = "pi")]
pi_user: String,
/// Path to the SSH private key whose public half is in the Pi
/// user's `~/.ssh/authorized_keys`.
#[arg(long, default_value = "~/.ssh/id_ed25519")]
ssh_key: PathBuf,
/// Device id the agent will announce to NATS. Defaults to a fresh
/// `Id` (sortable hex timestamp + random suffix).
#[arg(long)]
device_id: Option<String>,
/// Routing labels for `Deployment.spec.targetSelector` matching.
/// Comma-separated `key=value` pairs. At least one is required.
#[arg(long, default_value = "group=group-a,arch=aarch64")]
labels: String,
/// Path to the cross-compiled aarch64 fleet-agent binary on the
/// driver machine. Uploaded to `/usr/local/bin/fleet-agent`.
#[arg(long)]
agent_binary: PathBuf,
/// NATS URL the agent should connect to.
#[arg(long)]
nats_url: String,
#[arg(long, default_value = "smoke")]
nats_user: String,
#[arg(long, default_value = "smoke")]
nats_pass: String,
}
#[tokio::main]
async fn main() -> Result<()> {
harmony_cli::cli_logger::init();
let cli = Cli::parse();
ensure_ansible_venv()
.await
.map_err(|e| anyhow::anyhow!("ansible venv: {e}"))?;
let device_id = cli
.device_id
.clone()
.map(Id::from)
.unwrap_or_else(Id::default);
let ssh_key = expand_tilde(&cli.ssh_key);
let pi_ip = cli
.pi_host
.parse()
.with_context(|| format!("--pi-host '{}' is not a valid IP address", cli.pi_host))?;
let mut creds = SshCredentials {
user: cli.pi_user.clone(),
private_key_path: ssh_key,
// Pi OS Lite ships /usr/bin/python3 — skip auto-discovery.
remote_python: Some("/usr/bin/python3".to_string()),
sudo_password: None,
};
// If the Pi doesn't have passwordless sudo, fetch the password
// through SecretManager (same flow other scores use for SSH keys
// etc. — see harmony_secret/src/lib.rs:145). First run prompts;
// subsequent runs reuse the cached value. Probe with `sudo -n`
// first so we don't prompt the operator for a password they
// don't need.
let probe = ssh_exec(pi_ip, &creds, "sudo -n true", None)
.await
.map_err(|e| anyhow::anyhow!("sudo probe: {e}"))?;
if probe.rc != 0 {
info!("device requires a sudo password — fetching from secret store");
let secret = SecretManager::get_or_prompt::<SudoPassword>()
.await
.map_err(|e| anyhow::anyhow!("get sudo password: {e}"))?;
creds.sudo_password = Some(secret.password);
}
let topology = LinuxHostTopology::new(format!("rpi-{}", cli.pi_host), pi_ip, creds);
let labels = parse_labels(&cli.labels)?;
let score = FleetDeviceSetupScore::new(FleetDeviceSetupConfig {
device_id,
labels,
nats_urls: vec![cli.nats_url.clone()],
nats_user: cli.nats_user.clone(),
nats_pass: cli.nats_pass.clone(),
agent_binary_path: cli.agent_binary.clone(),
});
// We have our own clap CLI, so harmony_cli must NOT call
// `Args::parse()` (it would choke on --pi-host etc.). Pass an
// explicit Args with `yes: true` — the operator already committed
// to the run by typing the command, so the extra confirmation
// prompt would just add friction.
let harmony_args = harmony_cli::Args {
yes: true,
filter: None,
interactive: false,
all: true,
number: 0,
list: false,
};
harmony_cli::run(
Inventory::empty(),
topology,
vec![Box::new(score)],
Some(harmony_args),
)
.await
.map_err(|e| anyhow::anyhow!("{e}"))?;
Ok(())
}
fn parse_labels(raw: &str) -> Result<std::collections::BTreeMap<String, String>> {
let mut out = std::collections::BTreeMap::new();
for piece in raw.split(',').map(str::trim).filter(|p| !p.is_empty()) {
let (k, v) = piece
.split_once('=')
.ok_or_else(|| anyhow::anyhow!("label chunk '{piece}' missing '='"))?;
let k = k.trim();
let v = v.trim();
if k.is_empty() || v.is_empty() {
anyhow::bail!("label chunk '{piece}' has empty key or value");
}
out.insert(k.to_string(), v.to_string());
}
if out.is_empty() {
anyhow::bail!("--labels must include at least one key=value pair");
}
Ok(out)
}
fn expand_tilde(p: &std::path::Path) -> PathBuf {
let s = p.to_string_lossy();
if let Some(rest) = s.strip_prefix("~/") {
if let Ok(home) = std::env::var("HOME") {
return PathBuf::from(home).join(rest);
}
}
p.to_path_buf()
}

View File

@@ -196,6 +196,7 @@ async fn main() -> Result<()> {
user: cli.admin_user.clone(),
private_key_path: ssh.private_key.clone(),
remote_python: Some("/usr/bin/python3".to_string()),
sudo_password: None,
},
);

11
fleet/scripts/setup-rpi.sh Executable file
View File

@@ -0,0 +1,11 @@
cargo run -p example_fleet_rpi_setup -- \
--pi-host 192.168.2.19 \
--pi-user admin \
--ssh-key ~/.ssh/id_rsa \
--device-id paul \
--labels group=site-a,arch=aarch64 \
--agent-binary target/aarch64-unknown-linux-gnu/release/harmony-fleet-agent \
--nats-url nats://192.168.2.87:4222 \
--nats-user "" \
--nats-pass ""

View File

@@ -25,3 +25,15 @@ pub struct SshKeyPair {
pub struct RedhatSecret {
pub pull_secret: String,
}
/// Password used for **sudo escalation on a remote Linux host** — fed
/// to `sudo -S` (direct SSH calls) or to ansible's become-pass flow
/// when the bootstrap admin doesn't have passwordless sudo configured.
///
/// Not used for SSH login itself — `LinuxHostTopology` still requires
/// a `private_key_path` for that. SSH password auth is a possible
/// future extension; see the TODO on `SshCredentials`.
#[derive(Secret, Serialize, Deserialize, JsonSchema, Debug, PartialEq)]
pub struct SudoPassword {
pub password: String,
}

View File

@@ -51,6 +51,15 @@ pub trait FileDelivery: Send + Sync {
async fn ensure_file(&self, spec: &FileSpec) -> Result<ChangeReport, ExecutorError>;
}
/// Read the content of a remote file. Returns `None` if the file
/// doesn't exist; `Some(content)` otherwise. Useful for scores that
/// pre-flight-compare desired state against current state before
/// committing to a change.
#[async_trait]
pub trait FileFetcher: Send + Sync {
async fn fetch_file(&self, path: &str) -> Result<Option<String>, ExecutorError>;
}
/// Create and manage unix user accounts (POSIX systems).
///
/// Split from [`SystemdManager`] because some hosts run user accounts
@@ -110,12 +119,17 @@ pub trait SystemdManager: Send + Sync {
/// as a generic bound. Impls should implement each capability
/// individually.
pub trait LinuxHostConfiguration:
HostReachable + PackageInstaller + FileDelivery + UnixUserManager + SystemdManager
HostReachable + PackageInstaller + FileDelivery + FileFetcher + UnixUserManager + SystemdManager
{
}
impl<T> LinuxHostConfiguration for T where
T: HostReachable + PackageInstaller + FileDelivery + UnixUserManager + SystemdManager
T: HostReachable
+ PackageInstaller
+ FileDelivery
+ FileFetcher
+ UnixUserManager
+ SystemdManager
{
}

View File

@@ -8,7 +8,7 @@ use std::path::PathBuf;
use async_trait::async_trait;
use harmony_types::id::Id;
use log::{debug, info};
use log::{debug, info, warn};
use serde::{Deserialize, Serialize};
use crate::data::Version;
@@ -17,7 +17,7 @@ use crate::domain::interpret::{
};
use crate::domain::inventory::Inventory;
use crate::domain::topology::{
ChangeReport, FileDelivery, FileSource, FileSpec, HostReachable, LinuxHostConfiguration,
FileDelivery, FileFetcher, FileSource, FileSpec, HostReachable, LinuxHostConfiguration,
PackageInstaller, SystemdManager, SystemdScope, SystemdUnitSpec, Topology, UnixUserManager,
UserSpec,
};
@@ -182,16 +182,83 @@ impl<T: Topology + LinuxHostConfiguration> Interpret<T> for FleetDeviceSetupInte
topology: &T,
) -> Result<Outcome, InterpretError> {
let cfg = &self.config;
HostReachable::ping(topology).await.map_err(wrap)?;
let tag = format!("FleetSetup/{}", cfg.device_id);
let mut change_log: Vec<String> = Vec::new();
info!("[{tag}] Pinging device {}", topology.name());
HostReachable::ping(topology).await.map_err(wrap)?;
info!("[{tag}] Device {} reachable", topology.name());
let mut change_count = 0usize;
// Pre-flight: detect a pre-existing config and decide whether
// to proceed silently, log a warning, or prompt the operator
// before letting downstream FileDelivery overwrite it.
let desired_config = cfg.render_toml();
info!("[{tag}] Step 1/7 — checking existing config on device");
match FileFetcher::fetch_file(topology, "/etc/fleet-agent/config.toml")
.await
.map_err(wrap)?
{
None => {
info!("[{tag}] no existing config — first install on this device");
}
Some(existing) if existing == desired_config => {
warn!(
"[{tag}] device already has /etc/fleet-agent/config.toml \
with identical content; converging anyway"
);
}
Some(existing) => {
warn!(
"[{tag}] device already configured with a DIFFERENT config — \
proceeding will OVERWRITE it"
);
warn!("[{tag}] diff (- existing, + desired):");
let diff = similar::TextDiff::from_lines(existing.as_str(), desired_config.as_str());
let groups = diff.grouped_ops(2);
for (idx, group) in groups.iter().enumerate() {
if idx > 0 {
warn!("[{tag}] ...");
}
for op in group {
for change in diff.iter_changes(op) {
let prefix = match change.tag() {
similar::ChangeTag::Equal => " ",
similar::ChangeTag::Delete => "-",
similar::ChangeTag::Insert => "+",
};
warn!(
"[{tag}] {} {}",
prefix,
change.value().trim_end_matches('\n')
);
}
}
}
let confirmed = inquire::Confirm::new(
"Device already has /etc/fleet-agent/config.toml with different content.\n \
Overwrite it and apply the new config?",
)
.with_default(false)
.prompt()
.map_err(|e| InterpretError::new(format!("User prompt failed: {e}")))?;
if !confirmed {
return Err(InterpretError::new(
"User aborted: refused to overwrite existing config".to_string(),
));
}
}
}
// 1. Dependencies.
info!("[{tag}] Step 2/7 — ensuring system packages: podman, systemd-container");
for pkg in ["podman", "systemd-container"] {
let r = PackageInstaller::ensure_package(topology, pkg)
.await
.map_err(wrap)?;
log_change(&mut change_log, format!("package:{pkg}"), r);
if r.changed {
change_count += 1;
}
}
// 2. fleet-agent user. Not `--system`: Ubuntu's useradd skips
@@ -205,6 +272,7 @@ impl<T: Topology + LinuxHostConfiguration> Interpret<T> for FleetDeviceSetupInte
//
// Lingered so the user-systemd instance survives logout —
// required for the user podman.socket we enable below.
info!("[{tag}] Step 3/7 — ensuring fleet-agent user with linger");
let user_spec = UserSpec {
name: "fleet-agent".to_string(),
group: None,
@@ -213,28 +281,39 @@ impl<T: Topology + LinuxHostConfiguration> Interpret<T> for FleetDeviceSetupInte
system: false,
create_home: true,
};
let r = UnixUserManager::ensure_user(topology, &user_spec)
let user_r = UnixUserManager::ensure_user(topology, &user_spec)
.await
.map_err(wrap)?;
log_change(&mut change_log, "user:fleet-agent", r);
if user_r.changed {
change_count += 1;
}
let r = UnixUserManager::ensure_linger(topology, "fleet-agent")
let linger_r = UnixUserManager::ensure_linger(topology, "fleet-agent")
.await
.map_err(wrap)?;
log_change(&mut change_log, "linger:fleet-agent", r);
if linger_r.changed {
change_count += 1;
}
// 3. User-scoped podman socket. Required by `PodmanTopology` on
// the agent so it reaches /run/user/<uid>/podman/podman.sock.
let r = SystemdManager::ensure_user_unit_active(topology, "fleet-agent", "podman.socket")
info!("[{tag}] Step 4/7 — activating user-scoped podman.socket");
let socket_r = SystemdManager::ensure_user_unit_active(topology, "fleet-agent", "podman.socket")
.await
.map_err(wrap)?;
log_change(&mut change_log, "user-unit:podman.socket", r);
if socket_r.changed {
change_count += 1;
}
// 4. Binary. Ship via ansible's native copy-from-local-file
// path (`FileSource::LocalPath`). Ansible handles binary
// content over SFTP and reports `changed: true` only when the
// remote file actually differs from the local one — so
// re-running this Score without a new binary is a true NOOP.
info!(
"[{tag}] Step 5/7 — uploading agent binary {} -> /usr/local/bin/fleet-agent",
cfg.agent_binary_path.display()
);
let binary_r = FileDelivery::ensure_file(
topology,
&FileSpec {
@@ -247,13 +326,21 @@ impl<T: Topology + LinuxHostConfiguration> Interpret<T> for FleetDeviceSetupInte
)
.await
.map_err(wrap)?;
log_change(&mut change_log, "file:/usr/local/bin/fleet-agent", binary_r);
if binary_r.changed {
change_count += 1;
}
// 5. /etc/fleet-agent/ + config.toml
let config_toml = cfg.render_toml();
info!(
"[{tag}] Step 6/7 — rendering /etc/fleet-agent/config.toml ({} NATS URL{}, {} label{})",
cfg.nats_urls.len(),
if cfg.nats_urls.len() == 1 { "" } else { "s" },
cfg.labels.len(),
if cfg.labels.len() == 1 { "" } else { "s" },
);
let toml_spec = FileSpec {
path: "/etc/fleet-agent/config.toml".to_string(),
source: FileSource::Content(config_toml),
source: FileSource::Content(desired_config),
owner: Some("fleet-agent".to_string()),
group: Some("fleet-agent".to_string()),
mode: Some(0o600),
@@ -261,9 +348,12 @@ impl<T: Topology + LinuxHostConfiguration> Interpret<T> for FleetDeviceSetupInte
let toml_r = FileDelivery::ensure_file(topology, &toml_spec)
.await
.map_err(wrap)?;
log_change(&mut change_log, "file:/etc/fleet-agent/config.toml", toml_r);
if toml_r.changed {
change_count += 1;
}
// 6. systemd unit for the agent itself.
info!("[{tag}] Step 7/7 — installing fleet-agent.service");
let unit = SystemdUnitSpec {
name: "fleet-agent".to_string(),
unit_content: cfg.render_systemd_unit().to_string(),
@@ -273,33 +363,72 @@ impl<T: Topology + LinuxHostConfiguration> Interpret<T> for FleetDeviceSetupInte
let unit_r = SystemdManager::ensure_systemd_unit(topology, &unit)
.await
.map_err(wrap)?;
log_change(&mut change_log, "unit:fleet-agent", unit_r);
if unit_r.changed {
change_count += 1;
}
// 7. Restart the agent iff anything that affects it changed.
let needs_restart = toml_r.changed || unit_r.changed || binary_r.changed;
if needs_restart {
let service_state = if needs_restart {
info!("[{tag}] 🔄 Restarting fleet-agent (config/binary/unit changed)");
SystemdManager::restart_service(topology, "fleet-agent", SystemdScope::System)
.await
.map_err(wrap)?;
change_log.push("restart:fleet-agent".to_string());
info!("fleet-agent restarted to pick up config/unit change");
change_count += 1;
"🔄 restarted to pick up changes"
} else {
debug!("fleet-agent config + unit unchanged; no restart");
}
debug!("[{tag}] No restart needed (config + unit + binary unchanged)");
"✅ running unchanged"
};
let outcome = if change_log.is_empty() {
Outcome::noop(format!("{} already configured", cfg.device_id))
// Build human-readable recap. Style matches OKDAddNodeScore:
// one fact per line, "Key: value" prefix, all the operationally
// useful state in a single Outcome details block so a CLI run
// shows the operator exactly what was done without chasing
// log levels.
let labels_display = if cfg.labels.is_empty() {
"(none)".to_string()
} else {
Outcome::success_with_details(
format!(
"{} configured ({} changes)",
cfg.device_id,
change_log.len()
),
change_log,
cfg.labels
.iter()
.map(|(k, v)| format!("{k}={v}"))
.collect::<Vec<_>>()
.join(", ")
};
let user_state = match (user_r.changed, linger_r.changed) {
(true, _) => "🔄 newly created, linger enabled",
(false, true) => "🔄 already present, linger newly enabled",
(false, false) => "✅ already present, lingered",
};
let details = vec![
format!("Device ID: {}", cfg.device_id),
format!("NATS URLs: {}", cfg.nats_urls.join(", ")),
format!("Labels: {labels_display}"),
format!("User: fleet-agent ({user_state})"),
format!(
"Agent binary: {} -> /usr/local/bin/fleet-agent",
cfg.agent_binary_path.display()
),
format!("Service: fleet-agent.service ({service_state})"),
];
let (status, msg) = if change_count == 0 {
info!("[{tag}] ✅ Done — no changes (device already configured)");
(
InterpretStatus::NOOP,
format!("{} already configured", cfg.device_id),
)
} else {
info!(
"[{tag}] 🎉 Done — {change_count} change{} applied",
if change_count == 1 { "" } else { "s" }
);
(
InterpretStatus::SUCCESS,
format!("{} configured ({change_count} changes)", cfg.device_id),
)
};
Ok(outcome)
Ok(Outcome::new(status, msg, details))
}
}
@@ -307,12 +436,6 @@ fn wrap(e: crate::executors::ExecutorError) -> InterpretError {
InterpretError::new(e.to_string())
}
fn log_change(change_log: &mut Vec<String>, what: impl Into<String>, r: ChangeReport) {
if r.changed {
change_log.push(what.into());
}
}
#[cfg(test)]
mod tests {
use super::*;

View File

@@ -17,12 +17,14 @@
//! [`super::ansible_venv::ensure_ansible_venv`]. The operator does
//! *not* need to install `ansible` system-wide.
use std::io::Write;
use std::path::{Path, PathBuf};
use std::process::Stdio;
use harmony_types::net::IpAddress;
use serde::Serialize;
use serde_json::{Value, json};
use tempfile::NamedTempFile;
use tokio::process::Command;
use crate::domain::topology::{
@@ -31,9 +33,36 @@ use crate::domain::topology::{
use crate::executors::ExecutorError;
use super::ansible_venv::ensure_ansible_venv;
use super::ssh_executor::ssh_exec;
use super::ssh_executor::{SshCommandOutput, ssh_exec};
use super::topology::SshCredentials;
/// Run a command via `sudo` on the remote host, automatically choosing
/// between passwordless `sudo` and `sudo -S` (with the password piped
/// via stdin) based on `creds.sudo_password`. `cmd_after_sudo` is the
/// portion of the command line that comes after `sudo` — e.g. for
/// `sudo cat /etc/foo`, pass `"cat /etc/foo"`; for
/// `sudo -u fleet-agent env ... systemctl`, pass
/// `"-u fleet-agent env ... systemctl"`.
///
/// When chaining multiple privileged operations, fold them into one
/// `sudo` call (e.g. `sudo sh -c 'a && b'`) — `sudo -S` only reads
/// the password once, so two chained sudo calls would block on the
/// second prompt.
async fn sudo_exec(
host: IpAddress,
creds: &SshCredentials,
cmd_after_sudo: &str,
) -> Result<SshCommandOutput, ExecutorError> {
if let Some(pw) = &creds.sudo_password {
let line = format!("sudo -S {cmd_after_sudo}");
let stdin = format!("{pw}\n");
ssh_exec(host, creds, &line, Some(&stdin)).await
} else {
let line = format!("sudo {cmd_after_sudo}");
ssh_exec(host, creds, &line, None).await
}
}
pub struct AnsibleHostConfigurator;
impl AnsibleHostConfigurator {
@@ -148,6 +177,26 @@ impl AnsibleHostConfigurator {
.await
}
pub async fn fetch_file(
&self,
host: IpAddress,
creds: &SshCredentials,
path: &str,
) -> Result<Option<String>, ExecutorError> {
// Two SSH calls: existence probe, then read. Goes through
// sudo because the files we care about (e.g.
// /etc/fleet-agent/config.toml) are root- or service-owned
// and mode 0600 — readable only via privilege escalation.
let probe = sudo_exec(host, creds, &format!("test -e {path}")).await?;
if probe.rc != 0 {
return Ok(None);
}
let read = sudo_exec(host, creds, &format!("cat {path}"))
.await?
.into_successful()?;
Ok(Some(read.stdout))
}
pub async fn ensure_systemd_unit(
&self,
host: IpAddress,
@@ -257,12 +306,13 @@ impl AnsibleHostConfigurator {
host,
creds,
&format!("test -e /var/lib/systemd/linger/{user}"),
None,
)
.await?;
if check.rc == 0 {
return Ok(ChangeReport::NOOP);
}
ssh_exec(host, creds, &format!("sudo loginctl enable-linger {user}"))
sudo_exec(host, creds, &format!("loginctl enable-linger {user}"))
.await?
.into_successful()?;
Ok(ChangeReport::CHANGED)
@@ -279,23 +329,37 @@ impl AnsibleHostConfigurator {
// `XDG_RUNTIME_DIR` in the systemctl process env — a task-
// level `environment:` keyword only available in playbooks.
// Rather than pipe a one-task playbook, use russh directly:
// two small SSH calls, no Python wrapper, no inline YAML.
//
// Report `changed=true` unconditionally. systemctl
// enable --now is idempotent at the systemd level so re-
// running does no harm; reconcile-restart decisions
// upstream see only the outer-Score changes they care
// about (TOML and unit file changes), not this start-
// verification step.
let id_out = ssh_exec(host, creds, &format!("id -u {user}"))
// a probe + (optionally) an enable.
let id_out = ssh_exec(host, creds, &format!("id -u {user}"), None)
.await?
.into_successful()?;
let uid = id_out.stdout.trim();
let cmd = format!(
"sudo -u {user} env XDG_RUNTIME_DIR=/run/user/{uid} \
systemctl --user enable --now {unit}"
);
ssh_exec(host, creds, &cmd).await?.into_successful()?;
// Everything under sudo -u <user> with the user's
// XDG_RUNTIME_DIR. We fold both the probe (is-enabled +
// is-active chained) and the enable-now into a single sudo
// call each, because `sudo -S` only reads the password once
// — chaining two sudos would block on the second prompt.
let env_prefix = format!("-u {user} env XDG_RUNTIME_DIR=/run/user/{uid}");
// Already enabled and active → NOOP. `is-enabled --quiet`
// and `is-active --quiet` exit 0 only when the unit is
// both wanted and running, which is exactly the
// post-condition `enable --now` establishes.
let probe = sudo_exec(
host,
creds,
&format!(
"{env_prefix} sh -c 'systemctl --user is-enabled --quiet {unit} \
&& systemctl --user is-active --quiet {unit}'"
),
)
.await?;
if probe.rc == 0 {
return Ok(ChangeReport::NOOP);
}
let cmd = format!("{env_prefix} systemctl --user enable --now {unit}");
sudo_exec(host, creds, &cmd).await?.into_successful()?;
Ok(ChangeReport::CHANGED)
}
@@ -383,8 +447,25 @@ impl AnsibleHostConfigurator {
argv.push(format!("ansible_python_interpreter={py}"));
}
let output = Command::new(&bins.ansible)
.args(&argv)
// When a sudo password is configured, write it to a 0600
// tempfile and point ansible at it via ANSIBLE_BECOME_PASSWORD_FILE.
// The path goes in the env, not the value — `ps` on the driver
// sees a path, never the secret. The file is bound to a
// local that drops (and unlinks) at end of scope.
let pw_file: Option<NamedTempFile> = if let Some(pw) = &creds.sudo_password {
let mut f = NamedTempFile::new()
.map_err(|e| exec(format!("create become-password tempfile: {e}")))?;
f.write_all(pw.as_bytes())
.map_err(|e| exec(format!("write become-password tempfile: {e}")))?;
f.flush()
.map_err(|e| exec(format!("flush become-password tempfile: {e}")))?;
Some(f)
} else {
None
};
let mut cmd = Command::new(&bins.ansible);
cmd.args(&argv)
// Ad-hoc mode ignores ANSIBLE_STDOUT_CALLBACK unless
// ANSIBLE_LOAD_CALLBACK_PLUGINS is also set.
.env("ANSIBLE_LOAD_CALLBACK_PLUGINS", "True")
@@ -397,20 +478,40 @@ impl AnsibleHostConfigurator {
.env("ANSIBLE_PIPELINING", "True")
.env("ANSIBLE_SSH_CONTROL_PATH_DIR", control_path_dir())
.stdout(Stdio::piped())
.stderr(Stdio::piped())
.stderr(Stdio::piped());
if let Some(f) = &pw_file {
cmd.env("ANSIBLE_BECOME_PASSWORD_FILE", f.path());
}
let output = cmd
.output()
.await
.map_err(|e| exec(format!("spawn ansible: {e}")))?;
// pw_file dropped here → tempfile deleted.
drop(pw_file);
let stdout = String::from_utf8_lossy(&output.stdout);
let stderr = String::from_utf8_lossy(&output.stderr);
parse_oneline(&stdout).map_err(|parse_err| {
// For Ansible-parseable failures (UNREACHABLE!/FAILED!),
// `parse_err` already carries the verb + the human msg, so
// attaching the full rc/stderr/stdout envelope just
// duplicates the text. Only include the envelope when it
// adds debug signal: an unparseable stdout (real protocol
// mismatch) or a non-empty stderr.
let stderr = stderr.trim();
let already_parsed = parse_err.starts_with("UNREACHABLE!")
|| parse_err.starts_with("FAILED!");
if already_parsed && stderr.is_empty() {
return exec(format!(
"ansible module {module} failed against {host}: {parse_err}"
));
}
exec(format!(
"ansible module {module} failed against {host}: {parse_err} \
(ansible-exit={}, stderr={}, stdout={})",
output.status,
stderr.trim(),
stderr,
stdout.trim()
))
})

View File

@@ -54,6 +54,7 @@ pub async fn ssh_exec(
host: IpAddress,
creds: &SshCredentials,
command_line: &str,
stdin: Option<&str>,
) -> Result<SshCommandOutput, ExecutorError> {
let key_pair = load_secret_key(&creds.private_key_path, None).map_err(|e| {
ExecutorError::AuthenticationError(format!(
@@ -90,6 +91,19 @@ pub async fn ssh_exec(
.await
.map_err(|e| ExecutorError::NetworkError(format!("ssh exec: {e}")))?;
if let Some(input) = stdin {
channel
.data(input.as_bytes())
.await
.map_err(|e| ExecutorError::NetworkError(format!("ssh stdin write: {e}")))?;
// Signal EOF so the remote process's read on stdin returns,
// letting it proceed instead of hanging waiting for more.
channel
.eof()
.await
.map_err(|e| ExecutorError::NetworkError(format!("ssh stdin eof: {e}")))?;
}
let mut stdout = Vec::new();
let mut stderr = Vec::new();
let mut rc: Option<i32> = None;

View File

@@ -5,9 +5,9 @@ use harmony_types::net::IpAddress;
use serde::{Deserialize, Serialize};
use crate::domain::topology::{
ChangeReport, FileDelivery, FileSpec, HostReachable, PackageInstaller, PreparationError,
PreparationOutcome, SystemdManager, SystemdScope, SystemdUnitSpec, Topology, UnixUserManager,
UserSpec,
ChangeReport, FileDelivery, FileFetcher, FileSpec, HostReachable, PackageInstaller,
PreparationError, PreparationOutcome, SystemdManager, SystemdScope, SystemdUnitSpec, Topology,
UnixUserManager, UserSpec,
};
use crate::executors::ExecutorError;
@@ -34,8 +34,17 @@ pub struct LinuxHostTopology {
}
/// SSH credentials for reaching the host. Key-based only — password auth
/// is not supported here because the bootstrap story assumes a cloud-init
/// / rpi-imager / PXE step that preloads a public key.
/// for the SSH login itself is not supported here because the bootstrap
/// story assumes a cloud-init / rpi-imager / PXE step that preloads a
/// public key.
///
/// TODO: support SSH password auth as well — would mean making
/// `private_key_path` optional (or turning this into an enum
/// `Key { ... } | Password { ... }`), threading a password into both
/// `russh::authenticate_password` (`ssh_executor.rs`) and ansible's
/// `ANSIBLE_PASSWORD_FILE` / `-e ansible_ssh_pass=...` plumbing
/// (`ansible_configurator.rs`). Today's `sudo_password` field is
/// strictly for *sudo escalation on the remote*, not SSH login.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SshCredentials {
pub user: String,
@@ -45,6 +54,16 @@ pub struct SshCredentials {
/// wrong-interpreter traps). For Ubuntu-based cloud images `Some("/
/// usr/bin/python3")` is a safe, fast default.
pub remote_python: Option<String>,
/// Sudo password for `user`, used for **privilege escalation on the
/// remote host only** — fed to `sudo -S` for direct ssh_exec calls
/// and to ansible's become-pass machinery. **Not** used for SSH
/// login; that still requires `private_key_path`.
///
/// `None` = expect passwordless sudo on the host (the historical
/// assumption). Skipped from serde so it never lands in
/// cached/checkpointed inventories.
#[serde(skip)]
pub sudo_password: Option<String>,
}
impl LinuxHostTopology {
@@ -106,6 +125,15 @@ impl FileDelivery for LinuxHostTopology {
}
}
#[async_trait]
impl FileFetcher for LinuxHostTopology {
async fn fetch_file(&self, path: &str) -> Result<Option<String>, ExecutorError> {
self.configurator
.fetch_file(self.host, &self.credentials, path)
.await
}
}
#[async_trait]
impl UnixUserManager for LinuxHostTopology {
async fn ensure_user(&self, spec: &UserSpec) -> Result<ChangeReport, ExecutorError> {

View File

@@ -22,7 +22,15 @@ pub fn init() {
score: _,
outcome: Ok(outcome),
} => {
if outcome.status == harmony::interpret::InterpretStatus::SUCCESS {
// Surface details for both SUCCESS and NOOP — an
// idempotent re-run that converges to the desired
// state still has operationally useful info to
// share (current config, what's running, etc.).
if matches!(
outcome.status,
harmony::interpret::InterpretStatus::SUCCESS
| harmony::interpret::InterpretStatus::NOOP,
) {
details.extend(outcome.details.clone());
}
}