From 38f16b8d46ab668efbe7fb940d45a65c2365e0b5 Mon Sep 17 00:00:00 2001 From: Sylvain Tremblay Date: Tue, 12 May 2026 10:29:45 -0400 Subject: [PATCH 01/38] refactor(opnsense): promote SSH bootstrap helpers to harmony::modules::opnsense::bootstrap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `create_api_key_ssh` and `change_lan_ip_via_ssh` were defined identically in both `opnsense_vm_integration` and `opnsense_pair_integration` example main.rs files. Lift them into `harmony::modules::opnsense::bootstrap` as `pub` free functions so future callers (including a forthcoming `OPNsenseBootstrapScore`) reuse a single canonical implementation. Also add `probe_https`, a one-shot reachability probe with a short timeout, which the bootstrap Score will use for its idempotency check. Behavior in the two examples is unchanged — they pass `"root"`/`"opnsense"` at their call sites, matching the hard-coded values the deleted local helpers used. Username/password are now parameters (validated against PHP-injection-prone characters), and `new_ip` in `change_lan_ip_via_ssh` is strict-parsed as `IpAddr` before interpolation. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../opnsense_pair_integration/src/main.rs | 102 +--------- examples/opnsense_vm_integration/src/main.rs | 56 +----- harmony/src/modules/opnsense/bootstrap.rs | 176 ++++++++++++++++++ 3 files changed, 185 insertions(+), 149 deletions(-) diff --git a/examples/opnsense_pair_integration/src/main.rs b/examples/opnsense_pair_integration/src/main.rs index cefd071b..381fcf52 100644 --- a/examples/opnsense_pair_integration/src/main.rs +++ b/examples/opnsense_pair_integration/src/main.rs @@ -20,7 +20,6 @@ use std::net::IpAddr; use std::path::{Path, PathBuf}; -use std::sync::Arc; use harmony::config::secret::{OPNSenseApiCredentials, OPNSenseFirewallCredentials}; use harmony::infra::opnsense::OPNSenseFirewall; @@ -29,7 +28,9 @@ use harmony::modules::kvm::config::init_executor; use harmony::modules::kvm::{ BootDevice, ForwardMode, KvmExecutor, NetworkConfig, NetworkRef, VmConfig, }; -use harmony::modules::opnsense::bootstrap::OPNsenseBootstrap; +use harmony::modules::opnsense::bootstrap::{ + OPNsenseBootstrap, change_lan_ip_via_ssh, create_api_key_ssh, +}; use harmony::modules::opnsense::firewall::{FilterRuleDef, FirewallRuleScore}; use harmony::modules::opnsense::vip::VipDef; use harmony::modules::opnsense::vlan::{VlanDef, VlanScore}; @@ -158,7 +159,7 @@ async fn boot_pair( // Step 3: Change primary's LAN IP from .1 to .2 via API info!("Changing primary LAN IP to {PRIMARY_IP}..."); - change_lan_ip_via_ssh(BOOT_IP, PRIMARY_IP, 24).await?; + change_lan_ip_via_ssh(BOOT_IP, PRIMARY_IP, 24, "root", "opnsense").await?; // Step 4: Wait for primary to come back on new IP info!("Waiting for primary on new IP {PRIMARY_IP}:{API_PORT}..."); @@ -184,7 +185,7 @@ async fn boot_pair( // Step 7: Change backup's LAN IP from .1 to .3 via API info!("Changing backup LAN IP to {BACKUP_IP}..."); - change_lan_ip_via_ssh(BOOT_IP, BACKUP_IP, 24).await?; + change_lan_ip_via_ssh(BOOT_IP, BACKUP_IP, 24, "root", "opnsense").await?; // Step 8: Re-enable primary's LAN NIC info!("Re-enabling primary LAN NIC..."); @@ -247,48 +248,6 @@ async fn bootstrap_vm(role: &str, ip: &str) -> Result<(), Box Result<(), Box> { - use opnsense_config::config::{OPNsenseShell, SshCredentials, SshOPNSenseShell}; - - let ssh_config = Arc::new(russh::client::Config { - inactivity_timeout: None, - ..<_>::default() - }); - let credentials = SshCredentials::Password { - username: "root".to_string(), - password: "opnsense".to_string(), - }; - let ip: IpAddr = current_ip.parse()?; - let shell = SshOPNSenseShell::new((ip, 22), credentials, ssh_config); - - // Use a PHP script to update config.xml and apply - let php_script = format!( - r#"object()->interfaces->lan->ipaddr = '{new_ip}'; -$config->object()->interfaces->lan->subnet = '{subnet}'; -$config->save(); -echo "OK\n"; -"# - ); - - shell - .write_content_to_file(&php_script, "/tmp/change_ip.php") - .await?; - let output = shell - .exec("php /tmp/change_ip.php && rm /tmp/change_ip.php && configctl interface reconfigure lan") - .await?; - info!("IP change result: {}", output.trim()); - - Ok(()) -} - // ── Phase 2: Pair integration test ───────────────────────────────── async fn run_pair_test() -> Result<(), Box> { @@ -306,8 +265,8 @@ async fn run_pair_test() -> Result<(), Box> { info!("Creating API keys..."); let primary_ip: IpAddr = PRIMARY_IP.parse()?; let backup_ip: IpAddr = BACKUP_IP.parse()?; - let (primary_key, primary_secret) = create_api_key_ssh(&primary_ip).await?; - let (backup_key, backup_secret) = create_api_key_ssh(&backup_ip).await?; + let (primary_key, primary_secret) = create_api_key_ssh(&primary_ip, "root", "opnsense").await?; + let (backup_key, backup_secret) = create_api_key_ssh(&backup_ip, "root", "opnsense").await?; info!("API keys created for both firewalls"); // Build FirewallPairTopology @@ -641,50 +600,3 @@ async fn check_tcp_port(ip: &str, port: u16) -> bool { .map(|r| r.is_ok()) .unwrap_or(false) } - -async fn create_api_key_ssh(ip: &IpAddr) -> Result<(String, String), Box> { - use opnsense_config::config::{OPNsenseShell, SshCredentials, SshOPNSenseShell}; - - let ssh_config = Arc::new(russh::client::Config { - inactivity_timeout: None, - ..<_>::default() - }); - let credentials = SshCredentials::Password { - username: "root".to_string(), - password: "opnsense".to_string(), - }; - let shell = SshOPNSenseShell::new((*ip, 22), credentials, ssh_config); - - let php_script = r#"object()->system->user as $user) { - if ((string)$user->name === 'root') { - if (!isset($user->apikeys)) { $user->addChild('apikeys'); } - $item = $user->apikeys->addChild('item'); - $item->addChild('key', $key); - $item->addChild('secret', crypt($secret, '$6$' . bin2hex(random_bytes(8)) . '$')); - $config->save(); - echo $key . "\n" . $secret . "\n"; - exit(0); - } -} -echo "ERROR: root user not found\n"; -exit(1); -"#; - - shell - .write_content_to_file(php_script, "/tmp/create_api_key.php") - .await?; - let output = shell - .exec("php /tmp/create_api_key.php && rm /tmp/create_api_key.php") - .await?; - let lines: Vec<&str> = output.trim().lines().collect(); - if lines.len() >= 2 && !lines[0].starts_with("ERROR") { - Ok((lines[0].to_string(), lines[1].to_string())) - } else { - Err(format!("API key creation failed: {output}").into()) - } -} diff --git a/examples/opnsense_vm_integration/src/main.rs b/examples/opnsense_vm_integration/src/main.rs index ed43581a..b5e58ea5 100644 --- a/examples/opnsense_vm_integration/src/main.rs +++ b/examples/opnsense_vm_integration/src/main.rs @@ -20,7 +20,6 @@ use std::net::IpAddr; use std::path::{Path, PathBuf}; -use std::sync::Arc; use harmony::config::secret::{OPNSenseApiCredentials, OPNSenseFirewallCredentials}; use harmony::hardware::{HostCategory, PhysicalHost}; @@ -32,7 +31,7 @@ use harmony::modules::kvm::{ BootDevice, ForwardMode, KvmExecutor, NetworkConfig, NetworkRef, VmConfig, }; use harmony::modules::load_balancer::LoadBalancerScore; -use harmony::modules::opnsense::bootstrap::OPNsenseBootstrap; +use harmony::modules::opnsense::bootstrap::{OPNsenseBootstrap, create_api_key_ssh}; use harmony::modules::opnsense::dnat::{DnatRuleDef, DnatScore}; use harmony::modules::opnsense::firewall::{ BinatRuleDef, BinatScore, FilterRuleDef, FirewallRuleScore, OutboundNatScore, SnatRuleDef, @@ -248,7 +247,7 @@ async fn run_integration() -> Result<(), Box> { // Create API key info!("Creating API key via SSH..."); - let (api_key, api_secret) = create_api_key_ssh(&vm_ip).await?; + let (api_key, api_secret) = create_api_key_ssh(&vm_ip, "root", "opnsense").await?; info!("API key created: {}...", &api_key[..api_key.len().min(12)]); // Build topology @@ -1008,54 +1007,3 @@ fn make_host_binding(name: &str, ip: IpAddr, mac: [u8; 6]) -> HostBinding { }; HostBinding::new(logical, physical, HostConfig::new(None)) } - -async fn create_api_key_ssh(ip: &IpAddr) -> Result<(String, String), Box> { - use opnsense_config::config::{OPNsenseShell, SshCredentials, SshOPNSenseShell}; - - let ssh_config = Arc::new(russh::client::Config { - inactivity_timeout: None, - ..<_>::default() - }); - let credentials = SshCredentials::Password { - username: "root".to_string(), - password: "opnsense".to_string(), - }; - let shell = SshOPNSenseShell::new((*ip, 22), credentials, ssh_config); - - let php_script = r#"object()->system->user as $user) { - if ((string)$user->name === 'root') { - if (!isset($user->apikeys)) { $user->addChild('apikeys'); } - $item = $user->apikeys->addChild('item'); - $item->addChild('key', $key); - $item->addChild('secret', crypt($secret, '$6$' . bin2hex(random_bytes(8)) . '$')); - $config->save(); - echo $key . "\n" . $secret . "\n"; - exit(0); - } -} -echo "ERROR: root user not found\n"; -exit(1); -"#; - - info!("Writing API key script..."); - shell - .write_content_to_file(php_script, "/tmp/create_api_key.php") - .await?; - - info!("Executing API key generation..."); - let output = shell - .exec("php /tmp/create_api_key.php && rm /tmp/create_api_key.php") - .await?; - - let lines: Vec<&str> = output.trim().lines().collect(); - if lines.len() >= 2 && !lines[0].starts_with("ERROR") { - Ok((lines[0].to_string(), lines[1].to_string())) - } else { - Err(format!("API key creation failed: {output}").into()) - } -} diff --git a/harmony/src/modules/opnsense/bootstrap.rs b/harmony/src/modules/opnsense/bootstrap.rs index 6402df8e..25de1c45 100644 --- a/harmony/src/modules/opnsense/bootstrap.rs +++ b/harmony/src/modules/opnsense/bootstrap.rs @@ -438,6 +438,182 @@ impl OPNsenseBootstrap { } } +/// Reject strings that could break out of a single-quoted PHP string literal. +/// +/// Allows everything except the four characters that close, escape, or +/// inject across a `'...'` PHP literal. Empty strings are also rejected. +fn validate_php_safe(value: &str, field: &str) -> Result<(), BootstrapError> { + if value.is_empty() + || value + .chars() + .any(|c| matches!(c, '\'' | '"' | '\\' | '\n' | '\r' | '\0')) + { + return Err(BootstrapError::UnexpectedResponse(format!( + "Invalid {field}: empty or contains quote/backslash/newline" + ))); + } + Ok(()) +} + +/// One-shot HTTPS reachability probe with a short timeout. +/// +/// Returns `true` if the server responds within `timeout` (any HTTP status +/// counts as "reachable" — we accept self-signed certs since fresh OPNsense +/// uses one). Used by callers (e.g. the bootstrap Score) to decide whether +/// the firewall is on the vanilla address, the target address, both, or +/// neither. +pub async fn probe_https(host: &str, port: u16, timeout: std::time::Duration) -> bool { + let url = format!("https://{host}:{port}/"); + let client = match reqwest::Client::builder() + .danger_accept_invalid_certs(true) + .timeout(timeout) + .build() + { + Ok(c) => c, + Err(_) => return false, + }; + client.get(&url).send().await.is_ok() +} + +/// Build an `SshOPNSenseShell` against `(ip, 22)` using password authentication. +fn opnsense_ssh_shell( + ip: std::net::IpAddr, + username: &str, + password: &str, +) -> opnsense_config::config::SshOPNSenseShell { + use opnsense_config::config::{SshCredentials, SshOPNSenseShell}; + let ssh_config = std::sync::Arc::new(russh::client::Config { + inactivity_timeout: None, + ..<_>::default() + }); + let credentials = SshCredentials::Password { + username: username.to_string(), + password: password.to_string(), + }; + SshOPNSenseShell::new((ip, 22), credentials, ssh_config) +} + +/// Mint a fresh API key + secret on the OPNsense root user via SSH. +/// +/// SFTPs a short PHP script that appends an `` to the root user's +/// `` element in `config.xml`, executes it with the firewall's +/// `php` binary, then deletes the script. Returns `(key, secret)`. +/// +/// The PHP script uses `random_bytes` for both fields and `crypt` with a +/// SHA-512 salt for the stored secret — same scheme OPNsense uses when +/// keys are created via the web UI. +/// +/// Designed for fresh installs where the only known credentials are the +/// install-time defaults (`root` / `opnsense`); accepts arbitrary +/// credentials so the helper can be reused after a password rotation. +pub async fn create_api_key_ssh( + ip: &std::net::IpAddr, + username: &str, + password: &str, +) -> Result<(String, String), BootstrapError> { + use opnsense_config::config::OPNsenseShell; + + validate_php_safe(username, "username")?; + validate_php_safe(password, "password")?; + + let shell = opnsense_ssh_shell(*ip, username, password); + let php = r#"object()->system->user as $user) { + if ((string)$user->name === 'root') { + if (!isset($user->apikeys)) { $user->addChild('apikeys'); } + $item = $user->apikeys->addChild('item'); + $item->addChild('key', $key); + $item->addChild('secret', crypt($secret, '$6$' . bin2hex(random_bytes(8)) . '$')); + $config->save(); + echo $key . "\n" . $secret . "\n"; + exit(0); + } +} +echo "ERROR: root user not found\n"; +exit(1); +"#; + shell + .write_content_to_file(php, "/tmp/create_api_key.php") + .await + .map_err(|e| BootstrapError::UnexpectedResponse(format!("SFTP upload failed: {e}")))?; + let output = shell + .exec("php /tmp/create_api_key.php && rm /tmp/create_api_key.php") + .await + .map_err(|e| BootstrapError::UnexpectedResponse(format!("SSH exec failed: {e}")))?; + let lines: Vec<&str> = output.trim().lines().collect(); + if lines.len() >= 2 && !lines[0].starts_with("ERROR") { + Ok((lines[0].to_string(), lines[1].to_string())) + } else { + Err(BootstrapError::UnexpectedResponse(format!( + "API key creation failed on firewall: {output}" + ))) + } +} + +/// Move the LAN interface to a new IP / subnet at runtime via SSH. +/// +/// SFTPs a PHP script that rewrites `interfaces.lan.ipaddr` and +/// `interfaces.lan.subnet` in `config.xml`, then runs +/// `configctl interface reconfigure lan` so the change takes effect +/// without a reboot. The OPNsense webserver will respond on the new IP +/// within a few seconds. +/// +/// **Connectivity warning:** if the caller is on the LAN side of the +/// firewall, this call will sever their connection to the firewall before +/// it returns — they need to reattach into the new subnet. This helper +/// does not (and cannot) assist with that. +/// +/// `new_ip` is strictly parsed as an `IpAddr` before interpolation; +/// `username` / `password` are validated against PHP-injection-safe +/// characters. +pub async fn change_lan_ip_via_ssh( + current_ip: &str, + new_ip: &str, + subnet: u8, + username: &str, + password: &str, +) -> Result<(), BootstrapError> { + use opnsense_config::config::OPNsenseShell; + + validate_php_safe(username, "username")?; + validate_php_safe(password, "password")?; + let _: std::net::IpAddr = new_ip + .parse() + .map_err(|e| BootstrapError::UnexpectedResponse(format!("Invalid new LAN IP: {e}")))?; + + let ip: std::net::IpAddr = current_ip + .parse() + .map_err(|e| BootstrapError::UnexpectedResponse(format!("Invalid current SSH IP: {e}")))?; + let shell = opnsense_ssh_shell(ip, username, password); + let php = format!( + r#"object()->interfaces->lan->ipaddr = '{new_ip}'; +$config->object()->interfaces->lan->subnet = '{subnet}'; +$config->save(); +echo "OK\n"; +"# + ); + shell + .write_content_to_file(&php, "/tmp/change_ip.php") + .await + .map_err(|e| BootstrapError::UnexpectedResponse(format!("SFTP upload failed: {e}")))?; + let out = shell + .exec( + "php /tmp/change_ip.php && rm /tmp/change_ip.php \ + && configctl interface reconfigure lan", + ) + .await + .map_err(|e| BootstrapError::UnexpectedResponse(format!("SSH exec failed: {e}")))?; + info!("LAN IP change via SSH: {}", out.trim()); + Ok(()) +} + /// Extract the CSRF token field name and value from an OPNsense HTML page. /// /// OPNsense embeds CSRF tokens as hidden inputs with a dynamic field name. -- 2.39.5 From 4693930e63b84566668baa2703f5e2c366469c57 Mon Sep 17 00:00:00 2001 From: Sylvain Tremblay Date: Tue, 12 May 2026 10:30:02 -0400 Subject: [PATCH 02/38] feat(opnsense): OPNsenseBootstrapScore + OPNsenseBootstrapTopology MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Solve the OPNsense bootstrap chicken-and-egg problem with a Score-shaped abstraction. Until now, every binary deploying onto a fresh OPNsense had to copy ~80 lines of procedural orchestration (login → abort wizard → SSH → port move → API key mint → LAN flip) into its own main.rs, because the bootstrap creates the very credentials that `OPNSenseFirewall` needs to construct. The trick: a separate, minimal `OPNsenseBootstrapTopology` that holds only {vanilla_ip, default_username, default_password}. The new `Score` runs the dance from `Interpret::execute`, persists `OPNSenseApiCredentials` and `OPNSenseFirewallCredentials` to `SecretManager`, and optionally rebinds the LAN. The calling binary then builds a normal `OPNSenseFirewall` from the now-stored credentials and runs `Score` composition against it — two Maestro phases in sequence, SecretManager as the bridge. Idempotency is handled by a 4-boolean decision matrix (api_creds_exist, ssh_creds_exist, vanilla_reachable, target_reachable) extracted into a pure helper and table-tested. The Score is safe to re-run: NOOP when already bootstrapped, DANCE on first-run or partial resume, FAILURE with clear recovery instructions when target is up but secrets are lost (factory-reset and re-run). Output follows the precedent of `OKDAddNodeScore`: - `[OPNsenseBootstrap/{vanilla_ip}]`-prefixed log lines, one info! per state change - Runbook-shaped Outcome::success_with_details listing where the firewall now lives, where credentials were stored, and the manual reconnect step if a LAN rebind happened - Multi-sentence InterpretError messages including the recovery path Includes a new `OPNsenseBootstrap` variant on `InterpretName`. Unit tests cover Score name, serialization, the full idempotency decision matrix, and `ensure_ready` failure when the firewall is unreachable. Scope: abstraction-only. Example main.rs files keep their current procedural shape; refactoring them to compose the new Score is a follow-up. Co-Authored-By: Claude Opus 4.7 (1M context) --- harmony/src/domain/interpret/mod.rs | 2 + harmony/src/domain/topology/mod.rs | 2 + .../src/domain/topology/opnsense_bootstrap.rs | 89 ++++ .../src/modules/opnsense/bootstrap_score.rs | 457 ++++++++++++++++++ harmony/src/modules/opnsense/mod.rs | 1 + 5 files changed, 551 insertions(+) create mode 100644 harmony/src/domain/topology/opnsense_bootstrap.rs create mode 100644 harmony/src/modules/opnsense/bootstrap_score.rs diff --git a/harmony/src/domain/interpret/mod.rs b/harmony/src/domain/interpret/mod.rs index ec3ad1d0..73ef3870 100644 --- a/harmony/src/domain/interpret/mod.rs +++ b/harmony/src/domain/interpret/mod.rs @@ -11,6 +11,7 @@ use super::{ pub enum InterpretName { OPNSenseDHCP, OPNSenseDns, + OPNsenseBootstrap, LoadBalancer, Tftp, Http, @@ -44,6 +45,7 @@ impl std::fmt::Display for InterpretName { match self { InterpretName::OPNSenseDHCP => f.write_str("OPNSenseDHCP"), InterpretName::OPNSenseDns => f.write_str("OPNSenseDns"), + InterpretName::OPNsenseBootstrap => f.write_str("OPNsenseBootstrap"), InterpretName::LoadBalancer => f.write_str("LoadBalancer"), InterpretName::Tftp => f.write_str("Tftp"), InterpretName::Http => f.write_str("Http"), diff --git a/harmony/src/domain/topology/mod.rs b/harmony/src/domain/topology/mod.rs index 44c35b4e..e38a2b45 100644 --- a/harmony/src/domain/topology/mod.rs +++ b/harmony/src/domain/topology/mod.rs @@ -5,9 +5,11 @@ mod ha_cluster; pub mod ingress; pub mod node_exporter; pub mod opnsense; +pub mod opnsense_bootstrap; pub use failover::*; pub use firewall_pair::*; use harmony_types::net::IpAddress; +pub use opnsense_bootstrap::*; mod host_binding; mod http; pub mod installable; diff --git a/harmony/src/domain/topology/opnsense_bootstrap.rs b/harmony/src/domain/topology/opnsense_bootstrap.rs new file mode 100644 index 00000000..784fc268 --- /dev/null +++ b/harmony/src/domain/topology/opnsense_bootstrap.rs @@ -0,0 +1,89 @@ +//! Minimal topology representing a factory-fresh OPNsense firewall. +//! +//! [`OPNsenseBootstrapTopology`] holds the connection info needed to talk to +//! an OPNsense that has just been installed from ISO and is reachable at its +//! default LAN IP with the install-time credentials. It exists so that the +//! `OPNsenseBootstrapScore` (in `harmony::modules::opnsense::bootstrap_score`) +//! can fit the standard `Score` pattern while the firewall is +//! still pre-API-credentials. +//! +//! Once the bootstrap Score runs, callers construct an +//! [`OPNSenseFirewall`](crate::infra::opnsense::OPNSenseFirewall) instead and +//! run their production-phase Scores against that. + +use async_trait::async_trait; +use serde::Serialize; + +use crate::{ + modules::opnsense::bootstrap::probe_https, + topology::{PreparationError, PreparationOutcome, Topology}, +}; +use harmony_types::net::IpAddress; + +/// A factory-fresh OPNsense firewall awaiting first-time configuration. +/// +/// The struct is intentionally tiny — it carries only what's needed to +/// reach the firewall and authenticate with the install-time defaults. +/// All "where do you want to end up" configuration (target API port, +/// optional LAN rebind, timeouts) belongs on the Score, not here. +#[derive(Debug, Clone, Serialize)] +pub struct OPNsenseBootstrapTopology { + /// LAN IP the OPNsense was configured with at install time + /// (typically `192.168.1.1`). + pub vanilla_ip: IpAddress, + /// Install-time username (typically `root`). + pub default_username: String, + /// Install-time password (typically `opnsense`). + pub default_password: String, +} + +#[async_trait] +impl Topology for OPNsenseBootstrapTopology { + fn name(&self) -> &str { + "OPNsenseBootstrapTopology" + } + + /// Probe the vanilla address on TCP 443. If unreachable, return a + /// `PreparationError` whose message points the operator at the + /// typical recovery paths (install from ISO, leave LAN at default, + /// or — if the firewall is already past first-boot — run the + /// bootstrap Score's idempotency check from the target subnet). + async fn ensure_ready(&self) -> Result { + let ip_str = self.vanilla_ip.to_string(); + if probe_https(&ip_str, 443, std::time::Duration::from_secs(3)).await { + Ok(PreparationOutcome::Success { + details: format!("Factory-fresh OPNsense reachable at https://{ip_str}"), + }) + } else { + Err(PreparationError::new(format!( + "Could not reach factory-fresh OPNsense at https://{ip_str}:443 within 3s. \ + Verify it is installed from ISO, sitting at its default LAN IP, and the dev \ + machine is on the same subnet. If you've already bootstrapped this firewall \ + once, you don't need to rerun the bootstrap Score from here — its idempotency \ + check expects the target subnet instead." + ))) + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[tokio::test] + async fn ensure_ready_errors_when_endpoint_is_unreachable() { + // 127.0.0.1:1 is the conventional "nothing listens here" target. + // Note: the probe targets port 443, not the IP's literal port, + // so this exercises the same code path even if something is on :1. + let topology = OPNsenseBootstrapTopology { + vanilla_ip: "127.0.0.1".parse().unwrap(), + default_username: "root".into(), + default_password: "opnsense".into(), + }; + let result = topology.ensure_ready().await; + assert!( + result.is_err(), + "expected ensure_ready to fail against an unreachable endpoint, got Ok({result:?})" + ); + } +} diff --git a/harmony/src/modules/opnsense/bootstrap_score.rs b/harmony/src/modules/opnsense/bootstrap_score.rs new file mode 100644 index 00000000..53164e40 --- /dev/null +++ b/harmony/src/modules/opnsense/bootstrap_score.rs @@ -0,0 +1,457 @@ +//! `OPNsenseBootstrapScore` — declarative wrapper around the OPNsense +//! first-boot procedure. +//! +//! Targets the minimal [`OPNsenseBootstrapTopology`], which represents a +//! factory-fresh OPNsense reachable at its default LAN IP with the +//! install-time root password. Running this Score: +//! +//! 1. Logs into the web UI, aborts the initial setup wizard, enables SSH. +//! 2. Moves the web GUI from port 443 to `target_api_port`. +//! 3. SSHes in, mints an API key + secret on the root user, and persists +//! both `OPNSenseApiCredentials` and `OPNSenseFirewallCredentials` to +//! `harmony_secret::SecretManager`. +//! 4. Optionally rebinds the LAN to a new IP/subnet. +//! +//! After it runs, callers construct a normal +//! [`OPNSenseFirewall`](crate::infra::opnsense::OPNSenseFirewall) from the +//! now-stored credentials and run `Score` composition +//! against it — that's where production configuration lives. +//! +//! # Side effects +//! +//! This Score writes to `SecretManager`. That's an acknowledged exception +//! to Score purity: the credentials *are* the Score's output, and they +//! must live somewhere durable so the second-phase topology can read them +//! back. It's the same model `SecretManager::get_or_prompt` already uses. + +use async_trait::async_trait; +use harmony_secret::SecretManager; +use harmony_types::id::Id; +use harmony_types::net::IpAddress; +use log::{info, warn}; +use serde::Serialize; + +use crate::{ + config::secret::{OPNSenseApiCredentials, OPNSenseFirewallCredentials}, + data::Version, + interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome}, + inventory::Inventory, + modules::opnsense::bootstrap::{ + OPNsenseBootstrap, change_lan_ip_via_ssh, create_api_key_ssh, probe_https, + }, + score::Score, + topology::OPNsenseBootstrapTopology, +}; + +/// New LAN address to apply at the end of the bootstrap. +#[derive(Debug, Clone, PartialEq, Eq, Serialize)] +pub struct LanRebind { + pub new_ip: IpAddress, + pub prefix: u8, +} + +/// Bring a factory-fresh OPNsense to a Harmony-driveable state, ending it +/// at a known port and (optionally) a new LAN address. +#[derive(Debug, Clone, Serialize)] +pub struct OPNsenseBootstrapScore { + /// HTTPS port the web GUI / API will end up on (typically `9443`). + pub target_api_port: u16, + /// If `Some`, the LAN interface is rebound to this address as the + /// final dance step. If `None`, the LAN stays where it was. + pub target_lan: Option, + /// How long to wait for the web GUI to come up on `target_api_port` + /// after the port move (default: 120s). + pub webgui_ready_timeout: std::time::Duration, + /// Disable OPNsense's automatic HTTP→HTTPS redirect on port 80. + /// Required when something else needs to bind `0.0.0.0:80` (e.g. + /// HAProxy on a CARP VIP). + pub disable_http_redirect: bool, +} + +impl Default for OPNsenseBootstrapScore { + fn default() -> Self { + Self { + target_api_port: 9443, + target_lan: None, + webgui_ready_timeout: std::time::Duration::from_secs(120), + disable_http_redirect: false, + } + } +} + +impl Score for OPNsenseBootstrapScore { + fn name(&self) -> String { + "OPNsenseBootstrapScore".to_string() + } + + fn create_interpret(&self) -> Box> { + Box::new(OPNsenseBootstrapInterpret { + score: self.clone(), + }) + } +} + +#[derive(Debug)] +struct OPNsenseBootstrapInterpret { + score: OPNsenseBootstrapScore, +} + +/// The three terminal branches of the idempotency check. +#[derive(Debug, Clone, PartialEq, Eq)] +enum Decision { + Noop, + Dance, + Failure(String), +} + +/// Decide what the Interpret should do given the current observed state. +/// +/// Pure function over the four booleans so the matrix is unit-testable +/// without touching the network or the secret store. +fn decide( + api_creds_exist: bool, + ssh_creds_exist: bool, + vanilla_reachable: bool, + target_reachable: bool, +) -> Decision { + match ( + api_creds_exist, + ssh_creds_exist, + vanilla_reachable, + target_reachable, + ) { + // Already bootstrapped: vanilla gone, target up, both cred sets present. + (true, true, false, true) => Decision::Noop, + // Vanilla still answering — clean first run or mid-flight resume. + // The dance's individual steps are idempotent on the firewall side, + // so re-running a 90%-done bootstrap is cheap. + (_, _, true, _) => Decision::Dance, + // Vanilla gone, target up, but at least one cred set is missing — + // partial bootstrap that lost its secrets. + (false, _, false, true) | (_, false, false, true) => Decision::Failure( + "Detected a partial bootstrap: OPNsense answers at the target address but at least \ + one of OPNSenseApiCredentials / OPNSenseFirewallCredentials is missing from the \ + secret store. The factory-fresh state at the vanilla address is gone, so a fresh \ + key cannot be minted. Factory-reset the firewall (console menu option 4) and \ + re-run, or restore the lost credentials from your backup." + .to_string(), + ), + // Catch-all: nothing reachable anywhere. + _ => Decision::Failure( + "Firewall not reachable at either the vanilla address or the target address. \ + Check power, network cables, and dev-machine subnet membership." + .to_string(), + ), + } +} + +#[async_trait] +impl Interpret for OPNsenseBootstrapInterpret { + async fn execute( + &self, + _inventory: &Inventory, + topology: &OPNsenseBootstrapTopology, + ) -> Result { + let vanilla_ip = topology.vanilla_ip.to_string(); + let tag = format!("[OPNsenseBootstrap/{vanilla_ip}]"); + let probe_timeout = std::time::Duration::from_secs(3); + + // ── Step 1: idempotency probe ──────────────────────────────── + let target_ip_str = match &self.score.target_lan { + Some(rebind) => rebind.new_ip.to_string(), + None => vanilla_ip.clone(), + }; + let target_reachable = + probe_https(&target_ip_str, self.score.target_api_port, probe_timeout).await; + let vanilla_reachable = probe_https(&vanilla_ip, 443, probe_timeout).await; + let api_creds_exist = SecretManager::get::().await.is_ok(); + let ssh_creds_exist = SecretManager::get::() + .await + .is_ok(); + + info!( + "{tag} Idempotency probe: vanilla_reachable={vanilla_reachable}, \ + target_reachable={target_reachable}, api_creds_exist={api_creds_exist}, \ + ssh_creds_exist={ssh_creds_exist}" + ); + + match decide( + api_creds_exist, + ssh_creds_exist, + vanilla_reachable, + target_reachable, + ) { + Decision::Noop => { + info!( + "{tag} NOOP — firewall already bootstrapped and reachable at \ + https://{target_ip_str}:{}", + self.score.target_api_port + ); + return Ok(Outcome::noop(format!( + "OPNsense already bootstrapped at {target_ip_str}:{}; nothing to do", + self.score.target_api_port + ))); + } + Decision::Failure(reason) => { + return Err(InterpretError::new(reason)); + } + Decision::Dance => { + if api_creds_exist && ssh_creds_exist { + info!("{tag} DANCE — resuming from partial state (creds present)"); + } else { + info!("{tag} DANCE — starting fresh bootstrap from vanilla state"); + } + } + } + + // ── Step 2: web UI bootstrap dance ─────────────────────────── + let base_url = format!("https://{vanilla_ip}"); + let bootstrap = OPNsenseBootstrap::new(&base_url); + + bootstrap + .login(&topology.default_username, &topology.default_password) + .await + .map_err(|e| { + InterpretError::new(format!( + "Failed to log in to OPNsense web UI at {base_url}: {e}. Confirm the \ + firewall is at the install-time defaults — root password unchanged, \ + wizard not completed, web GUI still on 443." + )) + })?; + info!("{tag} Logged in to web UI as {}", topology.default_username); + + bootstrap + .abort_wizard() + .await + .map_err(|e| InterpretError::new(format!("Failed to abort setup wizard: {e}")))?; + info!("{tag} Aborted initial setup wizard"); + + bootstrap + .enable_ssh(true, true) + .await + .map_err(|e| InterpretError::new(format!("Failed to enable SSH: {e}")))?; + info!("{tag} Enabled SSH (root login, password auth)"); + + bootstrap + .set_webgui_port( + self.score.target_api_port, + &vanilla_ip, + self.score.disable_http_redirect, + ) + .await + .map_err(|e| InterpretError::new(format!("Failed to move web GUI port: {e}")))?; + info!( + "{tag} Moved web GUI port 443 -> {}", + self.score.target_api_port + ); + + let new_url = format!("https://{vanilla_ip}:{}", self.score.target_api_port); + OPNsenseBootstrap::wait_for_ready(&new_url, self.score.webgui_ready_timeout) + .await + .map_err(|e| { + InterpretError::new(format!( + "Web UI did not respond on {new_url} within {:?}: {e}", + self.score.webgui_ready_timeout + )) + })?; + info!("{tag} Web UI ready at {new_url}"); + + // ── Step 3: mint API key & persist secrets ─────────────────── + // Persist BEFORE the LAN flip — if the LAN flip fails mid-execution, + // the operator can re-run; the dance branch picks up at "creds present, + // vanilla still reachable" and retries the rebind. + let (key, secret) = create_api_key_ssh( + &topology.vanilla_ip, + &topology.default_username, + &topology.default_password, + ) + .await + .map_err(|e| InterpretError::new(format!("Failed to mint API key over SSH: {e}")))?; + let key_prefix = &key[..key.len().min(12)]; + info!("{tag} Minted API key (key={key_prefix}…)"); + + SecretManager::set(&OPNSenseApiCredentials { + key: key.clone(), + secret: secret.clone(), + }) + .await?; + SecretManager::set(&OPNSenseFirewallCredentials { + username: topology.default_username.clone(), + password: topology.default_password.clone(), + }) + .await?; + info!("{tag} Persisted OPNSenseApiCredentials + OPNSenseFirewallCredentials"); + + // ── Step 4: optional LAN rebind ────────────────────────────── + if let Some(rebind) = &self.score.target_lan { + info!( + "{tag} LAN rebind {vanilla_ip} -> {}/{}", + rebind.new_ip, rebind.prefix + ); + change_lan_ip_via_ssh( + &vanilla_ip, + &rebind.new_ip.to_string(), + rebind.prefix, + &topology.default_username, + &topology.default_password, + ) + .await + .map_err(|e| { + InterpretError::new(format!( + "Persisted credentials successfully but the LAN-rebind step failed: {e}. \ + The firewall is still reachable at {vanilla_ip}; re-running this Score \ + will pick up at the rebind step (idempotency: creds present, vanilla up)." + )) + })?; + + // Best-effort post-flip probe. Connectivity from the dev machine to + // the new subnet is a physical concern outside this Score's control. + let post_url = rebind.new_ip.to_string(); + let post_probe = probe_https( + &post_url, + self.score.target_api_port, + std::time::Duration::from_secs(5), + ) + .await; + if !post_probe { + warn!( + "{tag} Could not confirm reachability at https://{post_url}:{} after the \ + LAN rebind. The firewall may need a few seconds to settle, or your dev \ + machine is no longer on the firewall's subnet — reconnect and verify \ + manually.", + self.score.target_api_port + ); + } + } + + // ── Build the success Outcome (runbook-shaped details) ─────── + let final_ip = match &self.score.target_lan { + Some(rebind) => rebind.new_ip.to_string(), + None => vanilla_ip.clone(), + }; + let lan_line = match &self.score.target_lan { + Some(rebind) => format!( + " Final IP: {}/{} (LAN rebind applied)", + rebind.new_ip, rebind.prefix + ), + None => format!(" Final IP: {vanilla_ip} (no LAN rebind)"), + }; + + let mut details = vec![ + "OPNsense bootstrap complete".to_string(), + String::new(), + format!(" Vanilla IP: {vanilla_ip}"), + lan_line, + format!( + " Web UI: https://{final_ip}:{}", + self.score.target_api_port + ), + format!(" SSH: {}@{final_ip}", topology.default_username), + " API creds: stored as OPNSenseApiCredentials in SecretManager".to_string(), + " SSH creds: stored as OPNSenseFirewallCredentials in SecretManager".to_string(), + ]; + if self.score.target_lan.is_some() { + details.push(String::new()); + details.push("NEXT STEPS (manual):".to_string()); + details.push( + " The dev machine that ran this Score is no longer on the firewall's".to_string(), + ); + details.push( + " subnet. Reconnect into the new LAN (renew DHCP or set a static IP)".to_string(), + ); + details.push(" before running the next Score against this firewall.".to_string()); + } + + Ok(Outcome::success_with_details( + format!( + "OPNsense bootstrapped — web UI at https://{final_ip}:{}", + self.score.target_api_port + ), + details, + )) + } + + fn get_name(&self) -> InterpretName { + InterpretName::OPNsenseBootstrap + } + + fn get_version(&self) -> Version { + Version::from("1.0.0").unwrap() + } + + fn get_status(&self) -> InterpretStatus { + InterpretStatus::QUEUED + } + + fn get_children(&self) -> Vec { + vec![] + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_score_name() { + let s = OPNsenseBootstrapScore::default(); + assert_eq!( + >::name(&s), + "OPNsenseBootstrapScore" + ); + } + + #[test] + fn test_score_serializes() { + let s = OPNsenseBootstrapScore::default(); + let _: serde_value::Value = + serde_value::to_value(&s).expect("OPNsenseBootstrapScore should serialize"); + } + + #[test] + fn test_decide_noop_when_target_up_creds_present_vanilla_gone() { + assert_eq!(decide(true, true, false, true), Decision::Noop); + } + + #[test] + fn test_decide_dance_on_clean_first_run() { + // No creds yet, vanilla reachable. + assert_eq!(decide(false, false, true, false), Decision::Dance); + assert_eq!(decide(false, false, true, true), Decision::Dance); + } + + #[test] + fn test_decide_dance_when_resuming_with_creds() { + // Creds present, vanilla still answering → LAN rebind didn't happen. + assert_eq!(decide(true, true, true, true), Decision::Dance); + assert_eq!(decide(true, true, true, false), Decision::Dance); + } + + #[test] + fn test_decide_failure_on_partial_creds_lost() { + for (api, ssh) in [(false, true), (true, false), (false, false)] { + match decide(api, ssh, false, true) { + Decision::Failure(m) => assert!( + m.contains("partial bootstrap"), + "expected 'partial bootstrap' in: {m}" + ), + d => panic!("expected Failure for ({api},{ssh},false,true), got {d:?}"), + } + } + } + + #[test] + fn test_decide_failure_when_nothing_reachable() { + for (api, ssh) in [(false, false), (true, true), (true, false), (false, true)] { + match decide(api, ssh, false, false) { + Decision::Failure(m) => { + assert!( + m.contains("not reachable"), + "expected 'not reachable' in: {m}" + ) + } + d => panic!("expected Failure for ({api},{ssh},false,false), got {d:?}"), + } + } + } +} diff --git a/harmony/src/modules/opnsense/mod.rs b/harmony/src/modules/opnsense/mod.rs index 1bac1f74..09aa615e 100644 --- a/harmony/src/modules/opnsense/mod.rs +++ b/harmony/src/modules/opnsense/mod.rs @@ -1,4 +1,5 @@ pub mod bootstrap; +pub mod bootstrap_score; pub mod dnat; pub mod firewall; pub mod image; -- 2.39.5 From baf15d587eba8642ba16c5916f73dd7c9dd8807e Mon Sep 17 00:00:00 2001 From: Sylvain Tremblay Date: Tue, 12 May 2026 11:36:07 -0400 Subject: [PATCH 03/38] refactor(opnsense-vm-integration): compose OPNsenseBootstrapScore instead of inline dance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop the procedural login → abort wizard → SSH → port → API-key sequence in `boot_vm` and `run_integration`, and replace the bootstrap leg with a single `harmony_cli::run_cli` invocation of `OPNsenseBootstrapScore` against `OPNsenseBootstrapTopology`. The diagnose_via_ssh fallback and the SSH-22 polling loop go away too — both are covered by the Score's own idempotency probe and the per-step error messages the Score emits. Credentials now round-trip through `SecretManager` rather than through local variables: the Score persists `OPNSenseApiCredentials` + `OPNSenseFirewallCredentials` from `--boot` / `--full`, and `run_integration` reads them back when constructing the production `OPNSenseFirewall` topology and the typed `OpnsenseClient` used by the verification step. `SecretManager` panics on a missing `HARMONY_SECRET_NAMESPACE`, so main() sets a binary-specific default if the operator hasn't already exported one. `harmony_secret` is added as a direct dependency. No behavior change for `--check` / `--download` / `--clean` / `--status`. `--boot` and `--full` now emit `[OPNsenseBootstrap/192.168.1.1]`-prefixed log lines from the Score's Interpret. Subsequent `--boot` runs against an already-bootstrapped VM NOOP through the idempotency check instead of re-running the dance. Co-Authored-By: Claude Opus 4.7 (1M context) --- Cargo.lock | 1 + examples/opnsense_vm_integration/Cargo.toml | 1 + examples/opnsense_vm_integration/src/main.rs | 105 +++++++++---------- 3 files changed, 52 insertions(+), 55 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 97904ddf..95407b9f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5552,6 +5552,7 @@ dependencies = [ "harmony_cli", "harmony_inventory_agent", "harmony_macros", + "harmony_secret", "harmony_types", "log", "opnsense-api", diff --git a/examples/opnsense_vm_integration/Cargo.toml b/examples/opnsense_vm_integration/Cargo.toml index ac4ec1a5..7ec3fd4a 100644 --- a/examples/opnsense_vm_integration/Cargo.toml +++ b/examples/opnsense_vm_integration/Cargo.toml @@ -13,6 +13,7 @@ harmony = { path = "../../harmony" } harmony_cli = { path = "../../harmony_cli" } harmony_inventory_agent = { path = "../../harmony_inventory_agent" } harmony_macros = { path = "../../harmony_macros" } +harmony_secret = { path = "../../harmony_secret" } harmony_types = { path = "../../harmony_types" } opnsense-api = { path = "../../opnsense-api" } opnsense-config = { path = "../../opnsense-config" } diff --git a/examples/opnsense_vm_integration/src/main.rs b/examples/opnsense_vm_integration/src/main.rs index b5e58ea5..e81e698d 100644 --- a/examples/opnsense_vm_integration/src/main.rs +++ b/examples/opnsense_vm_integration/src/main.rs @@ -31,7 +31,7 @@ use harmony::modules::kvm::{ BootDevice, ForwardMode, KvmExecutor, NetworkConfig, NetworkRef, VmConfig, }; use harmony::modules::load_balancer::LoadBalancerScore; -use harmony::modules::opnsense::bootstrap::{OPNsenseBootstrap, create_api_key_ssh}; +use harmony::modules::opnsense::bootstrap_score::OPNsenseBootstrapScore; use harmony::modules::opnsense::dnat::{DnatRuleDef, DnatScore}; use harmony::modules::opnsense::firewall::{ BinatRuleDef, BinatScore, FilterRuleDef, FirewallRuleScore, OutboundNatScore, SnatRuleDef, @@ -44,9 +44,11 @@ use harmony::modules::tftp::TftpScore; use harmony::score::Score; use harmony::topology::{ BackendServer, HealthCheck, HostBinding, HostConfig, LoadBalancerService, LogicalHost, + OPNsenseBootstrapTopology, }; use harmony_inventory_agent::hwinfo::NetworkInterface; use harmony_macros::ip; +use harmony_secret::SecretManager; use harmony_types::firewall::{ Direction, FirewallAction, IpProtocol, LaggProtocol, NetworkProtocol, VipMode, }; @@ -70,6 +72,16 @@ const OPN_API_PORT: u16 = 9443; #[tokio::main] async fn main() -> Result<(), Box> { + // `SecretManager` panics if HARMONY_SECRET_NAMESPACE is unset + // (see `harmony_secret::config`). Default it to this binary's name so + // `cargo run -p opnsense-vm-integration` works without sourcing env.sh. + if std::env::var("HARMONY_SECRET_NAMESPACE").is_err() { + // SAFETY: single-threaded at this point, no other reads/writes to env. + unsafe { + std::env::set_var("HARMONY_SECRET_NAMESPACE", "opnsense-vm-integration"); + } + } + harmony_cli::cli_logger::init(); let args: Vec = std::env::args().collect(); @@ -175,46 +187,36 @@ async fn boot_vm( wait_for_https(OPN_LAN_IP, 443).await?; - // ── Automated bootstrap (replaces manual browser interaction) ─── - info!("Bootstrapping OPNsense: login, abort wizard, enable SSH, set webgui port..."); - let bootstrap = OPNsenseBootstrap::new(&format!("https://{OPN_LAN_IP}")); - bootstrap.login("root", "opnsense").await?; - bootstrap.abort_wizard().await?; - bootstrap.enable_ssh(true, true).await?; - bootstrap - .set_webgui_port(OPN_API_PORT, OPN_LAN_IP, false) - .await?; - - // Wait for the web UI to come back on the new port - info!("Waiting for web UI on new port {OPN_API_PORT}..."); - if let Err(e) = OPNsenseBootstrap::wait_for_ready( - &format!("https://{OPN_LAN_IP}:{OPN_API_PORT}"), - std::time::Duration::from_secs(120), + // ── Hand off to OPNsenseBootstrapScore ────────────────────────── + // The Score owns the full dance: login → abort wizard → SSH → port + // move → API key mint → persist OPNSenseApiCredentials and + // OPNSenseFirewallCredentials to SecretManager. It's idempotent: a + // re-run against an already-bootstrapped firewall NOOPs. + let bootstrap_topology = OPNsenseBootstrapTopology { + vanilla_ip: ip!("192.168.1.1"), + default_username: "root".to_string(), + default_password: "opnsense".to_string(), + }; + let bootstrap_scores: Vec>> = + vec![Box::new(OPNsenseBootstrapScore { + target_api_port: OPN_API_PORT, + ..Default::default() + })]; + let bootstrap_args = harmony_cli::Args { + yes: true, + filter: None, + interactive: false, + all: true, + number: 0, + list: false, + }; + harmony_cli::run_cli( + Inventory::autoload(), + bootstrap_topology, + bootstrap_scores, + bootstrap_args, ) - .await - { - warn!("Web UI did not come up on port {OPN_API_PORT}: {e}"); - info!("Running diagnostics via SSH..."); - match OPNsenseBootstrap::diagnose_via_ssh(OPN_LAN_IP).await { - Ok(report) => { - info!("Diagnostic report:\n{}", report); - } - Err(diag_err) => warn!("Diagnostics failed: {diag_err}"), - } - return Err(e.into()); - } - - // Verify SSH is reachable - info!("Verifying SSH is reachable..."); - for _ in 0..30 { - if check_tcp_port(OPN_LAN_IP, 22).await { - break; - } - tokio::time::sleep(std::time::Duration::from_secs(2)).await; - } - if !check_tcp_port(OPN_LAN_IP, 22).await { - return Err("SSH did not become reachable after bootstrap".into()); - } + .await?; println!(); println!("OPNsense VM is running and fully bootstrapped:"); @@ -245,24 +247,17 @@ async fn run_integration() -> Result<(), Box> { } info!("SSH is reachable"); - // Create API key - info!("Creating API key via SSH..."); - let (api_key, api_secret) = create_api_key_ssh(&vm_ip, "root", "opnsense").await?; - info!("API key created: {}...", &api_key[..api_key.len().min(12)]); + // Load API + SSH credentials from SecretManager. OPNsenseBootstrapScore + // (run by --boot or --full) is what writes them; if they're missing, + // the operator hasn't bootstrapped the VM yet. + let api_creds = SecretManager::get::().await?; + let ssh_creds = SecretManager::get::().await?; // Build topology let firewall_host = LogicalHost { - ip: vm_ip.into(), + ip: vm_ip, name: VM_NAME.to_string(), }; - let api_creds = OPNSenseApiCredentials { - key: api_key.clone(), - secret: api_secret.clone(), - }; - let ssh_creds = OPNSenseFirewallCredentials { - username: "root".to_string(), - password: "opnsense".to_string(), - }; let opnsense = OPNSenseFirewall::with_api_port(firewall_host, None, OPN_API_PORT, &api_creds, &ssh_creds) .await; @@ -329,7 +324,7 @@ async fn run_integration() -> Result<(), Box> { info!("Verifying all Scores via typed API..."); let client = opnsense_api::OpnsenseClient::builder() .base_url(format!("https://{OPN_LAN_IP}:{OPN_API_PORT}/api")) - .auth_from_key_secret(&api_key, &api_secret) + .auth_from_key_secret(&api_creds.key, &api_creds.secret) .skip_tls_verify() .timeout_secs(60) .build()?; @@ -342,7 +337,7 @@ async fn run_integration() -> Result<(), Box> { info!("=== IDEMPOTENCY TEST: Running all Scores a SECOND time ==="); let scores_round2 = build_all_scores()?; let firewall_host2 = LogicalHost { - ip: vm_ip.into(), + ip: vm_ip, name: VM_NAME.to_string(), }; let opnsense2 = -- 2.39.5 From 9a29a489684ecdc394f71d6b61283d558bdf06b8 Mon Sep 17 00:00:00 2001 From: Sylvain Tremblay Date: Tue, 12 May 2026 11:42:33 -0400 Subject: [PATCH 04/38] fix(opnsense-vm-integration): default HARMONY_SECRET_STORE to file The previous refactor only defaulted HARMONY_SECRET_NAMESPACE; running --full / --boot then panicked because `init_secret_manager` falls back to the Infisical backend when HARMONY_SECRET_STORE is unset (see harmony_secret::lib:82), and that requires HARMONY_SECRET_INFISICAL_URL. Default HARMONY_SECRET_STORE to "file" the same way so `cargo run -p opnsense-vm-integration -- --full` works out of the box without sourcing an env.sh. Co-Authored-By: Claude Opus 4.7 (1M context) --- examples/opnsense_vm_integration/src/main.rs | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/examples/opnsense_vm_integration/src/main.rs b/examples/opnsense_vm_integration/src/main.rs index e81e698d..c50d35ca 100644 --- a/examples/opnsense_vm_integration/src/main.rs +++ b/examples/opnsense_vm_integration/src/main.rs @@ -72,15 +72,23 @@ const OPN_API_PORT: u16 = 9443; #[tokio::main] async fn main() -> Result<(), Box> { - // `SecretManager` panics if HARMONY_SECRET_NAMESPACE is unset - // (see `harmony_secret::config`). Default it to this binary's name so - // `cargo run -p opnsense-vm-integration` works without sourcing env.sh. + // `SecretManager` panics if HARMONY_SECRET_NAMESPACE is unset, and + // defaults to the Infisical backend if HARMONY_SECRET_STORE is unset + // (see `harmony_secret::config` and `init_secret_manager` in + // `harmony_secret::lib`). Default both so `cargo run -p + // opnsense-vm-integration` works without sourcing an env.sh. if std::env::var("HARMONY_SECRET_NAMESPACE").is_err() { // SAFETY: single-threaded at this point, no other reads/writes to env. unsafe { std::env::set_var("HARMONY_SECRET_NAMESPACE", "opnsense-vm-integration"); } } + if std::env::var("HARMONY_SECRET_STORE").is_err() { + // SAFETY: same as above. + unsafe { + std::env::set_var("HARMONY_SECRET_STORE", "file"); + } + } harmony_cli::cli_logger::init(); -- 2.39.5 From 3f719f7bcd36377fe3cc80f91e6b5784c293bd7c Mon Sep 17 00:00:00 2001 From: Sylvain Tremblay Date: Tue, 12 May 2026 12:06:44 -0400 Subject: [PATCH 05/38] fix(opnsense-vm-integration): use firmware/status + explicit reboot, not unstable upgradestatus MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The firmware-update fallback used to poll `/api/core/firmware/upgradestatus` for a "done" signal, then wait_for_https + a 10s sleep before retrying the package install. That endpoint is documented as "known to be unstable" in OPNsense 26.1.6 release notes (the WebUI itself traps its generic error popup), so the polling loop never breaks out via the success path — it just times out. wait_for_https then succeeds during the brief window before OPNsense actually starts rebooting, and the install retry gets killed mid-reboot with a `reqwest::Request` timeout. Switch to `/api/core/firmware/status`, which is the stable endpoint and returns a definitive `status_reboot` field ('1' if a reboot is required after the in-progress update/upgrade, computed from `needs_reboot` and `upgrade_needs_reboot` per FirmwareController.php). Poll until the update finishes (status == "none") or the API becomes unreachable (auto-reboot during update), then read `status_reboot` and trigger an explicit `POST /api/core/firmware/reboot` if needed. The wait-for-unreachable window after the reboot is then a tight 60s — we know the reboot just happened. No more blind multi-minute timeouts. Co-Authored-By: Claude Opus 4.7 (1M context) --- examples/opnsense_vm_integration/src/main.rs | 114 ++++++++++++++++--- 1 file changed, 96 insertions(+), 18 deletions(-) diff --git a/examples/opnsense_vm_integration/src/main.rs b/examples/opnsense_vm_integration/src/main.rs index c50d35ca..d4581380 100644 --- a/examples/opnsense_vm_integration/src/main.rs +++ b/examples/opnsense_vm_integration/src/main.rs @@ -279,35 +279,113 @@ async fn run_integration() -> Result<(), Box> { Err(e) => { warn!("os-haproxy install failed: {e}"); info!("Attempting firmware update..."); - // Trigger firmware update then retry + + // Trigger the firmware update — OPNsense does + // download → install → maybe-reboot internally. The POST + // returns immediately; the actual work runs async. let _: serde_json::Value = config .client() .post_typed("core", "firmware", "update", None::<&()>) .await .map_err(|e| format!("firmware update failed: {e}"))?; - // Poll for completion - for _ in 0..120 { + + // Give the update job a moment to actually start so we don't + // observe a stale `status == "none"` on the very first poll. + tokio::time::sleep(std::time::Duration::from_secs(3)).await; + + // Poll the stable `/api/core/firmware/status` endpoint until + // either the update finishes (`status == "none"`) or the API + // becomes unreachable (OPNsense auto-rebooted mid-update). + // The legacy `/api/core/firmware/upgradestatus` endpoint is + // marked "known to be unstable" in the 26.1.6 release notes + // (the WebUI even traps its generic error popup), so we don't + // touch it. + info!("Polling /api/core/firmware/status for update completion ..."); + let poll_deadline = std::time::Instant::now() + std::time::Duration::from_secs(600); + let mut needs_reboot = false; + let mut rebooted_during_update = false; + loop { + if std::time::Instant::now() >= poll_deadline { + return Err("firmware update did not complete within 10 minutes".into()); + } tokio::time::sleep(std::time::Duration::from_secs(5)).await; - let status: serde_json::Value = match config + match config .client() - .get_typed("core", "firmware", "upgradestatus") + .get_typed::("core", "firmware", "status") .await { - Ok(s) => s, - Err(_) => continue, // VM may be rebooting - }; - if status["status"].as_str() == Some("done") - || status["status"].as_str() == Some("reboot") - { - break; + Ok(s) => { + let active = s["status"].as_str().unwrap_or(""); + if active.is_empty() || active == "none" { + needs_reboot = s["status_reboot"].as_str() == Some("1"); + info!( + "Firmware update finished (status={active:?}, \ + needs_reboot={needs_reboot})" + ); + break; + } + info!("...firmware update in progress (status={active})"); + } + Err(_) => { + info!( + "API unreachable while update was in progress — \ + OPNsense auto-rebooted" + ); + rebooted_during_update = true; + break; + } } } - info!("Firmware updated, retrying package install..."); - // Wait for API to come back — try configured port first - // (config.xml persists across reboots, so port stays at 9443) - wait_for_https(OPN_LAN_IP, OPN_API_PORT).await?; - // Extra settle time — web UI responds before API backend is ready - tokio::time::sleep(std::time::Duration::from_secs(10)).await; + + // If the update reports it needs a reboot and OPNsense didn't + // auto-reboot, issue an explicit reboot. Fire-and-forget: the + // server tears down its TCP connection while replying, so an + // Err here is expected and harmless. + if needs_reboot && !rebooted_during_update { + info!("Triggering explicit reboot via /api/core/firmware/reboot ..."); + let _ = config + .client() + .post_typed::("core", "firmware", "reboot", None) + .await; + rebooted_during_update = true; + } + + if rebooted_during_update { + // We know a reboot just happened. Tight 60s window for + // the API to actually go down. + info!("Waiting for the API to go unreachable ..."); + let unreach_deadline = + std::time::Instant::now() + std::time::Duration::from_secs(60); + let mut went_down = false; + while std::time::Instant::now() < unreach_deadline { + tokio::time::sleep(std::time::Duration::from_secs(2)).await; + if !check_tcp_port(OPN_LAN_IP, OPN_API_PORT).await { + info!("API unreachable — reboot in progress"); + went_down = true; + break; + } + } + if !went_down { + return Err( + "reboot was triggered but the API never went unreachable".into() + ); + } + + // Wait for the API to come back. `config.xml` persists + // across reboots so the port stays at 9443. + info!( + "Waiting for OPNsense to come back at \ + https://{OPN_LAN_IP}:{OPN_API_PORT} ..." + ); + wait_for_https(OPN_LAN_IP, OPN_API_PORT).await?; + + // Settle — the TLS handshake completes before configd / + // the MVC backend are fully initialized. + info!("Web UI reachable; giving backend services 30s to settle ..."); + tokio::time::sleep(std::time::Duration::from_secs(30)).await; + } + + info!("Retrying os-haproxy install..."); config.install_package("os-haproxy").await?; } } -- 2.39.5 From 27a4492e9a1d89257ba3bc789151319deeaf35db Mon Sep 17 00:00:00 2001 From: Sylvain Tremblay Date: Tue, 12 May 2026 13:11:32 -0400 Subject: [PATCH 06/38] fix(opnsense-config): poll firmware/info, not the unstable upgradestatus, in install_package MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The legacy polling loop in `Config::install_package` hit `/api/core/firmware/upgradestatus` until `status == "done"`. That endpoint is marked "known to be unstable" in OPNsense 26.1.6 release notes (the WebUI itself traps its generic error popup) and reliably 404s on a freshly bootstrapped 26.1 system. The loop's error handling used `.map_err(Error::Api)?` so a single 404 short-circuited the whole install — even when the underlying install_package POST succeeded. Switch to polling `/api/core/firmware/info` and looking for the package in the response with `installed == "1"`. That's the same check the existing code did AFTER the loop; moving it INTO the loop removes the dependency on `upgradestatus` entirely. Transient errors from the firmware/info call are now logged at debug! and tolerated as "keep polling" (the API may briefly be unreachable if a package install triggers a reboot — extremely rare for plugins, but defensible to handle). The unused `UpgradeStatus` struct is dropped along with the legacy loop. Behavior on success is identical (returns Ok, same info! log). On timeout the error message is more descriptive (`"did not appear as installed within 360 seconds"`) than the previous `"installation did not complete successfully"` which was actually printed for both "polling timed out" and "the package isn't in firmware/info" cases. Co-Authored-By: Claude Opus 4.7 (1M context) --- opnsense-config/src/config/config.rs | 83 ++++++++++++++-------------- 1 file changed, 42 insertions(+), 41 deletions(-) diff --git a/opnsense-config/src/config/config.rs b/opnsense-config/src/config/config.rs index 8d27835d..568c40b2 100644 --- a/opnsense-config/src/config/config.rs +++ b/opnsense-config/src/config/config.rs @@ -33,12 +33,6 @@ struct InstallResponse { msg_uuid: String, } -#[derive(Debug, Deserialize)] -struct UpgradeStatus { - #[serde(default)] - status: String, -} - impl Config { /// Create a new Config from an existing API client and SSH shell. pub fn new(client: OpnsenseClient, shell: Arc) -> Self { @@ -177,8 +171,8 @@ impl Config { /// Install an OPNsense plugin package via the firmware API. /// - /// Triggers the install, polls for completion, and verifies the package - /// is listed as installed. + /// Triggers the install, then polls `/api/core/firmware/info` until the + /// package shows up as installed (or the timeout fires). pub async fn install_package(&self, package_name: &str) -> Result<(), Error> { info!("Installing OPNsense package {package_name}"); @@ -205,44 +199,51 @@ impl Config { resp.msg_uuid ); - // Poll for completion - for _ in 0..120 { - tokio::time::sleep(std::time::Duration::from_secs(3)).await; - let status: UpgradeStatus = self + // Poll the ground-truth signal: `/api/core/firmware/info` lists every + // package with `installed == "1"` once OPNsense has finished applying + // the install. The legacy approach polled `/api/core/firmware/upgradestatus`, + // which OPNsense's own 26.1.6 release notes mark as "known to be + // unstable" (the WebUI traps its generic error popup). Polling + // `firmware/info` removes that dependency entirely and lets us + // tolerate transient API errors (e.g. if the install transiently + // takes the API offline). + let poll_interval = std::time::Duration::from_secs(3); + let max_attempts = 120; // 6 minutes + for attempt in 0..max_attempts { + tokio::time::sleep(poll_interval).await; + match self .client - .get_typed("core", "firmware", "upgradestatus") + .get_typed::("core", "firmware", "info") .await - .map_err(Error::Api)?; - - if status.status == "done" { - break; + { + Ok(info) => { + let installed = info["package"] + .as_array() + .and_then(|pkgs| { + pkgs.iter() + .find(|p| p["name"].as_str() == Some(package_name)) + }) + .and_then(|p| p["installed"].as_str()) + == Some("1"); + if installed { + info!("Package {package_name} installed successfully"); + return Ok(()); + } + } + Err(e) => { + debug!( + "firmware/info poll attempt {attempt} returned transient error: {e}; retrying" + ); + } } } - // Verify installation - let info: serde_json::Value = self - .client - .get_typed("core", "firmware", "info") - .await - .map_err(Error::Api)?; - - let installed = info["package"] - .as_array() - .and_then(|pkgs| { - pkgs.iter() - .find(|p| p["name"].as_str() == Some(package_name)) - }) - .and_then(|p| p["installed"].as_str()) - == Some("1"); - - if installed { - info!("Package {package_name} installed successfully"); - Ok(()) - } else { - let msg = format!("Package {package_name} installation did not complete successfully"); - warn!("{msg}"); - Err(Error::PackageInstall(msg)) - } + let msg = format!( + "Package {package_name} did not appear as installed within {} seconds", + max_attempts * poll_interval.as_secs() + ); + warn!("{msg}"); + Err(Error::PackageInstall(msg)) } /// Check if a package is installed via the firmware API. -- 2.39.5 From a0401ca6c4325d6082cf61743d95742489576659 Mon Sep 17 00:00:00 2001 From: Sylvain Tremblay Date: Tue, 12 May 2026 13:12:39 -0400 Subject: [PATCH 07/38] refactor(opnsense-vm-integration): collapse firmware-update fallback now that install_package is resilient MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After `fix(opnsense-config): poll firmware/info, not the unstable upgradestatus, in install_package`, the library's install_package call handles its own polling correctly and tolerates transient API errors. So the example's fallback no longer needs to track firmware/status, issue an explicit reboot, or wait for an unreachable→reachable cycle — all of that logic was duplicating what should be (and now is) the library's responsibility. Collapse the ~100-line Err arm to ~15 lines: when the first install_package attempt fails, kick `firmware/update` (== `pkg update`, refresh repository metadata), sleep 5s, and retry. The original failure mode (first install fails because a freshly bootstrapped firewall has no pkg metadata yet) is what this fallback exists to address; nothing more. Net deletion of ~85 lines from the example. Co-Authored-By: Claude Opus 4.7 (1M context) --- examples/opnsense_vm_integration/src/main.rs | 111 ++----------------- 1 file changed, 11 insertions(+), 100 deletions(-) diff --git a/examples/opnsense_vm_integration/src/main.rs b/examples/opnsense_vm_integration/src/main.rs index d4581380..6f539a27 100644 --- a/examples/opnsense_vm_integration/src/main.rs +++ b/examples/opnsense_vm_integration/src/main.rs @@ -278,112 +278,23 @@ async fn run_integration() -> Result<(), Box> { Ok(()) => info!("os-haproxy installed"), Err(e) => { warn!("os-haproxy install failed: {e}"); - info!("Attempting firmware update..."); + info!("Refreshing package metadata via firmware/update, then retrying..."); - // Trigger the firmware update — OPNsense does - // download → install → maybe-reboot internally. The POST - // returns immediately; the actual work runs async. + // `firmware/update` is OPNsense's API hook for `pkg update` + // (refresh repository metadata). The first install attempt + // typically fails on a freshly bootstrapped firewall because + // the package index hasn't been pulled yet; this kicks it. let _: serde_json::Value = config .client() .post_typed("core", "firmware", "update", None::<&()>) .await - .map_err(|e| format!("firmware update failed: {e}"))?; + .map_err(|e| format!("firmware/update failed: {e}"))?; - // Give the update job a moment to actually start so we don't - // observe a stale `status == "none"` on the very first poll. - tokio::time::sleep(std::time::Duration::from_secs(3)).await; - - // Poll the stable `/api/core/firmware/status` endpoint until - // either the update finishes (`status == "none"`) or the API - // becomes unreachable (OPNsense auto-rebooted mid-update). - // The legacy `/api/core/firmware/upgradestatus` endpoint is - // marked "known to be unstable" in the 26.1.6 release notes - // (the WebUI even traps its generic error popup), so we don't - // touch it. - info!("Polling /api/core/firmware/status for update completion ..."); - let poll_deadline = std::time::Instant::now() + std::time::Duration::from_secs(600); - let mut needs_reboot = false; - let mut rebooted_during_update = false; - loop { - if std::time::Instant::now() >= poll_deadline { - return Err("firmware update did not complete within 10 minutes".into()); - } - tokio::time::sleep(std::time::Duration::from_secs(5)).await; - match config - .client() - .get_typed::("core", "firmware", "status") - .await - { - Ok(s) => { - let active = s["status"].as_str().unwrap_or(""); - if active.is_empty() || active == "none" { - needs_reboot = s["status_reboot"].as_str() == Some("1"); - info!( - "Firmware update finished (status={active:?}, \ - needs_reboot={needs_reboot})" - ); - break; - } - info!("...firmware update in progress (status={active})"); - } - Err(_) => { - info!( - "API unreachable while update was in progress — \ - OPNsense auto-rebooted" - ); - rebooted_during_update = true; - break; - } - } - } - - // If the update reports it needs a reboot and OPNsense didn't - // auto-reboot, issue an explicit reboot. Fire-and-forget: the - // server tears down its TCP connection while replying, so an - // Err here is expected and harmless. - if needs_reboot && !rebooted_during_update { - info!("Triggering explicit reboot via /api/core/firmware/reboot ..."); - let _ = config - .client() - .post_typed::("core", "firmware", "reboot", None) - .await; - rebooted_during_update = true; - } - - if rebooted_during_update { - // We know a reboot just happened. Tight 60s window for - // the API to actually go down. - info!("Waiting for the API to go unreachable ..."); - let unreach_deadline = - std::time::Instant::now() + std::time::Duration::from_secs(60); - let mut went_down = false; - while std::time::Instant::now() < unreach_deadline { - tokio::time::sleep(std::time::Duration::from_secs(2)).await; - if !check_tcp_port(OPN_LAN_IP, OPN_API_PORT).await { - info!("API unreachable — reboot in progress"); - went_down = true; - break; - } - } - if !went_down { - return Err( - "reboot was triggered but the API never went unreachable".into() - ); - } - - // Wait for the API to come back. `config.xml` persists - // across reboots so the port stays at 9443. - info!( - "Waiting for OPNsense to come back at \ - https://{OPN_LAN_IP}:{OPN_API_PORT} ..." - ); - wait_for_https(OPN_LAN_IP, OPN_API_PORT).await?; - - // Settle — the TLS handshake completes before configd / - // the MVC backend are fully initialized. - info!("Web UI reachable; giving backend services 30s to settle ..."); - tokio::time::sleep(std::time::Duration::from_secs(30)).await; - } + // Brief sleep for the metadata refresh to actually run. + // `install_package` itself is resilient to transient API + // errors (it polls `firmware/info` with retry tolerance), + // so we don't need to track reboot state here. + tokio::time::sleep(std::time::Duration::from_secs(5)).await; info!("Retrying os-haproxy install..."); config.install_package("os-haproxy").await?; -- 2.39.5 From 9e0224264cdc41c8afc480992358cea480306c06 Mon Sep 17 00:00:00 2001 From: Sylvain Tremblay Date: Tue, 12 May 2026 13:33:23 -0400 Subject: [PATCH 08/38] feat(opnsense): OPNsenseFirmwareUpgradeScore + bake into bootstrap by default MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a `Score` that brings an OPNsense firewall to the latest available firmware/package level via the REST API: POST firmware/check → refresh upstream metadata GET firmware/status → check what's actionable POST firmware/upgrade → trigger if anything is pending poll firmware/status → wait for status to return to "none" POST firmware/reboot → if status_reboot == "1" and we haven't already auto-rebooted mid-upgrade + wait unreachable → reachable → 30s settle The core logic lives in `perform_firmware_upgrade()` so it can be called from elsewhere. `OPNsenseBootstrapScore` now exposes `upgrade_firmware: bool` (default `true`) and calls the same helper after credentials are persisted, before any optional LAN rebind. The firewall thus ends bootstrap on its latest firmware, exactly the right beat operationally: no production traffic yet, operator already babysitting, all subsequent Scores run against current code. Why not extend `OPNSenseLaunchUpgrade` (the existing SSH-based Score)? It calls a shell script (`opnsense-update.sh`), has a `todo!()` Serialize impl, no idempotency check, and holds an `Arc` directly instead of reading from a topology. The new score uses the REST API end-to-end, idempotency-checks via `status_upgrade_action`, and slots cleanly into normal Score composition. `OPNSenseLaunchUpgrade` stays alongside it for now; affilium2 keeps working unchanged. We can deprecate the SSH one in a follow-up once the API one has flown against real firewalls. `opnsense_vm_integration` explicitly opts out of the bootstrap-time upgrade (`upgrade_firmware: false`) — the VM image is a known firmware version, and we don't want each integration run to spend 10+ minutes pulling firmware updates. New `InterpretName::OPNsenseFirmwareUpgrade` variant. Unit tests cover score name, default api_port (9443), and serialization. Co-Authored-By: Claude Opus 4.7 (1M context) --- examples/opnsense_vm_integration/src/main.rs | 3 + harmony/src/domain/interpret/mod.rs | 2 + .../src/modules/opnsense/bootstrap_score.rs | 45 ++- .../src/modules/opnsense/firmware_upgrade.rs | 333 ++++++++++++++++++ harmony/src/modules/opnsense/mod.rs | 1 + 5 files changed, 382 insertions(+), 2 deletions(-) create mode 100644 harmony/src/modules/opnsense/firmware_upgrade.rs diff --git a/examples/opnsense_vm_integration/src/main.rs b/examples/opnsense_vm_integration/src/main.rs index 6f539a27..e28f586b 100644 --- a/examples/opnsense_vm_integration/src/main.rs +++ b/examples/opnsense_vm_integration/src/main.rs @@ -208,6 +208,9 @@ async fn boot_vm( let bootstrap_scores: Vec>> = vec![Box::new(OPNsenseBootstrapScore { target_api_port: OPN_API_PORT, + // The VM image is a known firmware version; we don't want + // each integration run to spend 10+ minutes upgrading. + upgrade_firmware: false, ..Default::default() })]; let bootstrap_args = harmony_cli::Args { diff --git a/harmony/src/domain/interpret/mod.rs b/harmony/src/domain/interpret/mod.rs index 73ef3870..76c88d35 100644 --- a/harmony/src/domain/interpret/mod.rs +++ b/harmony/src/domain/interpret/mod.rs @@ -12,6 +12,7 @@ pub enum InterpretName { OPNSenseDHCP, OPNSenseDns, OPNsenseBootstrap, + OPNsenseFirmwareUpgrade, LoadBalancer, Tftp, Http, @@ -46,6 +47,7 @@ impl std::fmt::Display for InterpretName { InterpretName::OPNSenseDHCP => f.write_str("OPNSenseDHCP"), InterpretName::OPNSenseDns => f.write_str("OPNSenseDns"), InterpretName::OPNsenseBootstrap => f.write_str("OPNsenseBootstrap"), + InterpretName::OPNsenseFirmwareUpgrade => f.write_str("OPNsenseFirmwareUpgrade"), InterpretName::LoadBalancer => f.write_str("LoadBalancer"), InterpretName::Tftp => f.write_str("Tftp"), InterpretName::Http => f.write_str("Http"), diff --git a/harmony/src/modules/opnsense/bootstrap_score.rs b/harmony/src/modules/opnsense/bootstrap_score.rs index 53164e40..9d77218f 100644 --- a/harmony/src/modules/opnsense/bootstrap_score.rs +++ b/harmony/src/modules/opnsense/bootstrap_score.rs @@ -10,7 +10,10 @@ //! 3. SSHes in, mints an API key + secret on the root user, and persists //! both `OPNSenseApiCredentials` and `OPNSenseFirewallCredentials` to //! `harmony_secret::SecretManager`. -//! 4. Optionally rebinds the LAN to a new IP/subnet. +//! 4. (Default-on, via `upgrade_firmware`) Brings the firewall up to the +//! latest firmware/package level using the same logic as +//! [`OPNsenseFirmwareUpgradeScore`](crate::modules::opnsense::firmware_upgrade::OPNsenseFirmwareUpgradeScore). +//! 5. Optionally rebinds the LAN to a new IP/subnet. //! //! After it runs, callers construct a normal //! [`OPNSenseFirewall`](crate::infra::opnsense::OPNSenseFirewall) from the @@ -39,6 +42,7 @@ use crate::{ modules::opnsense::bootstrap::{ OPNsenseBootstrap, change_lan_ip_via_ssh, create_api_key_ssh, probe_https, }, + modules::opnsense::firmware_upgrade::perform_firmware_upgrade, score::Score, topology::OPNsenseBootstrapTopology, }; @@ -66,6 +70,13 @@ pub struct OPNsenseBootstrapScore { /// Required when something else needs to bind `0.0.0.0:80` (e.g. /// HAProxy on a CARP VIP). pub disable_http_redirect: bool, + /// If `true` (default), bring the firewall to the latest available + /// firmware/package level immediately after credentials are persisted + /// and before any optional LAN rebind. Set to `false` for VM + /// integration tests, air-gapped environments, or pinned-version + /// deployments. The underlying logic lives in + /// [`crate::modules::opnsense::firmware_upgrade::perform_firmware_upgrade`]. + pub upgrade_firmware: bool, } impl Default for OPNsenseBootstrapScore { @@ -75,6 +86,7 @@ impl Default for OPNsenseBootstrapScore { target_lan: None, webgui_ready_timeout: std::time::Duration::from_secs(120), disable_http_redirect: false, + upgrade_firmware: true, } } } @@ -282,7 +294,36 @@ impl Interpret for OPNsenseBootstrapInterpret { .await?; info!("{tag} Persisted OPNSenseApiCredentials + OPNSenseFirewallCredentials"); - // ── Step 4: optional LAN rebind ────────────────────────────── + // ── Step 4 (optional): firmware upgrade ────────────────────── + // Runs BEFORE the LAN rebind so the upgrade (which may reboot) + // happens against `vanilla_ip` — known reachable from here. The + // firewall will come back at `vanilla_ip:target_api_port`, then + // the rebind moves it onward. + if self.score.upgrade_firmware { + info!("{tag} Upgrading firmware to latest before optional LAN rebind ..."); + let client = opnsense_api::OpnsenseClient::builder() + .base_url(format!( + "https://{vanilla_ip}:{}/api", + self.score.target_api_port + )) + .auth_from_key_secret(&key, &secret) + .skip_tls_verify() + .timeout_secs(60) + .build() + .map_err(|e| { + InterpretError::new(format!( + "Failed to build OPNsense client for firmware upgrade: {e}" + )) + })?; + let outcome = + perform_firmware_upgrade(&client, &vanilla_ip, self.score.target_api_port, &tag) + .await?; + info!("{tag} Firmware upgrade outcome: {}", outcome.message); + } else { + info!("{tag} upgrade_firmware=false; skipping firmware upgrade"); + } + + // ── Step 5: optional LAN rebind ────────────────────────────── if let Some(rebind) = &self.score.target_lan { info!( "{tag} LAN rebind {vanilla_ip} -> {}/{}", diff --git a/harmony/src/modules/opnsense/firmware_upgrade.rs b/harmony/src/modules/opnsense/firmware_upgrade.rs new file mode 100644 index 00000000..4346a6f1 --- /dev/null +++ b/harmony/src/modules/opnsense/firmware_upgrade.rs @@ -0,0 +1,333 @@ +//! `OPNsenseFirmwareUpgradeScore` — bring an OPNsense firewall to the latest +//! firmware/package level via the REST API. +//! +//! The flow mirrors what OPNsense's web UI does when an operator clicks +//! "Update": refresh upstream metadata (`firmware/check`), read +//! `firmware/status` to find out what's actionable, kick `firmware/upgrade`, +//! poll until it's done, and trigger an explicit reboot if one is required. +//! +//! The core logic is a free function ([`perform_firmware_upgrade`]) so it +//! can be reused from elsewhere in the framework — notably from +//! [`OPNsenseBootstrapScore`](crate::modules::opnsense::bootstrap_score::OPNsenseBootstrapScore) +//! when its `upgrade_firmware` knob is set. +//! +//! Idempotent: when nothing is pending, the helper short-circuits with +//! `UpgradeOutcome { upgraded: false, .. }`. + +use std::time::{Duration, Instant}; + +use async_trait::async_trait; +use harmony_types::id::Id; +use log::{debug, info}; +use opnsense_api::OpnsenseClient; +use serde::Serialize; +use thiserror::Error; + +use crate::{ + data::Version, + infra::opnsense::OPNSenseFirewall, + interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome}, + inventory::Inventory, + modules::opnsense::bootstrap::probe_https, + score::Score, +}; + +/// Errors the firmware-upgrade helper may surface. +#[derive(Debug, Error)] +pub enum FirmwareUpgradeError { + #[error("OPNsense API error during {phase}: {msg}")] + Api { phase: &'static str, msg: String }, + #[error("Timed out: {0}")] + Timeout(String), + #[error("Reboot was triggered but the API never went unreachable within 60s")] + RebootDidNotTake, +} + +impl From for InterpretError { + fn from(e: FirmwareUpgradeError) -> Self { + InterpretError::new(format!("Firmware upgrade failed: {e}")) + } +} + +/// What [`perform_firmware_upgrade`] actually did. +#[derive(Debug, Clone)] +pub struct UpgradeOutcome { + /// `true` if OPNsense reported something to upgrade and we triggered it. + pub upgraded: bool, + /// `true` if the firewall rebooted as part of the upgrade. + pub rebooted: bool, + /// Human-readable summary suitable for log lines / Score `Outcome`. + pub message: String, +} + +/// Bring an OPNsense firewall to the latest firmware/package level. +/// +/// `firewall_ip` and `api_port` are needed for the post-reboot reachability +/// probes — the `OpnsenseClient` already knows them, but it doesn't expose +/// them via a public accessor. `tag` is a short identifier (e.g. an IP) that +/// gets prefixed to every log line so this helper can be called from +/// multiple contexts without making logs ambiguous. +pub async fn perform_firmware_upgrade( + client: &OpnsenseClient, + firewall_ip: &str, + api_port: u16, + tag: &str, +) -> Result { + info!("{tag} Refreshing firmware metadata (firmware/check) ..."); + let _: serde_json::Value = client + .post_typed::("core", "firmware", "check", None) + .await + .map_err(|e| FirmwareUpgradeError::Api { + phase: "firmware/check", + msg: e.to_string(), + })?; + + // The check kicks an async metadata refresh. Give it a moment to settle + // before reading status, otherwise we may observe a stale state. + tokio::time::sleep(Duration::from_secs(5)).await; + + let status: serde_json::Value = client + .get_typed("core", "firmware", "status") + .await + .map_err(|e| FirmwareUpgradeError::Api { + phase: "firmware/status (pre-upgrade)", + msg: e.to_string(), + })?; + let initial_msg = status["status_msg"].as_str().unwrap_or("").to_string(); + let upgrade_action = status["status_upgrade_action"] + .as_str() + .unwrap_or("") + .to_string(); + info!( + "{tag} firmware/status: \ + status_upgrade_action={upgrade_action:?}, \ + status_msg={initial_msg:?}" + ); + + // OPNsense reports `status_upgrade_action` empty (or absent) when + // nothing is pending. Fall back to scanning `status_msg` for known + // "you're current" phrasings in case the field doesn't exist on some + // version. + let pending = if !upgrade_action.is_empty() && upgrade_action != "0" { + true + } else { + let m = initial_msg.to_lowercase(); + !(m.contains("up to date") || m.contains("no updates") || m.contains("no update")) + }; + + if !pending { + return Ok(UpgradeOutcome { + upgraded: false, + rebooted: false, + message: format!("Nothing to upgrade: {initial_msg}"), + }); + } + + info!("{tag} Triggering firmware upgrade (firmware/upgrade) ..."); + let _: serde_json::Value = client + .post_typed::("core", "firmware", "upgrade", None) + .await + .map_err(|e| FirmwareUpgradeError::Api { + phase: "firmware/upgrade", + msg: e.to_string(), + })?; + + // Let the upgrade job actually start so we don't observe a stale + // status == "none" on the first poll. + tokio::time::sleep(Duration::from_secs(3)).await; + + info!("{tag} Polling firmware/status until the upgrade completes ..."); + // Firmware downloads + apply can be slow on a small VM or a slow link. + // 20 minutes is the same ceiling the OPNsense UI uses. + let poll_deadline = Instant::now() + Duration::from_secs(1200); + let mut needs_reboot = false; + let mut rebooted = false; + loop { + if Instant::now() >= poll_deadline { + return Err(FirmwareUpgradeError::Timeout( + "firmware/status did not return to \"none\" within 20 minutes".into(), + )); + } + tokio::time::sleep(Duration::from_secs(5)).await; + match client + .get_typed::("core", "firmware", "status") + .await + { + Ok(s) => { + let active = s["status"].as_str().unwrap_or(""); + if active.is_empty() || active == "none" { + needs_reboot = s["status_reboot"].as_str() == Some("1"); + info!( + "{tag} Upgrade finished (status={active:?}, needs_reboot={needs_reboot})" + ); + break; + } + debug!("{tag} ...upgrade in progress (status={active})"); + } + Err(_) => { + info!("{tag} API unreachable mid-upgrade — OPNsense auto-rebooted"); + rebooted = true; + break; + } + } + } + + if needs_reboot && !rebooted { + info!("{tag} Triggering explicit reboot via firmware/reboot ..."); + // Fire-and-forget — the server tears down its TCP connection while + // replying. An Err here is expected and harmless. + let _ = client + .post_typed::("core", "firmware", "reboot", None) + .await; + rebooted = true; + } + + if rebooted { + info!("{tag} Waiting for the API to go unreachable (reboot in flight) ..."); + let unreach_deadline = Instant::now() + Duration::from_secs(60); + let mut went_down = false; + while Instant::now() < unreach_deadline { + tokio::time::sleep(Duration::from_secs(2)).await; + if !probe_https(firewall_ip, api_port, Duration::from_secs(2)).await { + info!("{tag} API unreachable — reboot in progress"); + went_down = true; + break; + } + } + if !went_down { + return Err(FirmwareUpgradeError::RebootDidNotTake); + } + + info!("{tag} Waiting for OPNsense to come back at https://{firewall_ip}:{api_port} ..."); + let back_deadline = Instant::now() + Duration::from_secs(600); + let mut came_back = false; + while Instant::now() < back_deadline { + tokio::time::sleep(Duration::from_secs(5)).await; + if probe_https(firewall_ip, api_port, Duration::from_secs(5)).await { + came_back = true; + break; + } + } + if !came_back { + return Err(FirmwareUpgradeError::Timeout(format!( + "OPNsense did not come back at https://{firewall_ip}:{api_port} within 10 minutes" + ))); + } + + info!("{tag} Web UI reachable; giving backend services 30s to settle ..."); + tokio::time::sleep(Duration::from_secs(30)).await; + } else { + debug!("{tag} Upgrade finished without a reboot"); + } + + Ok(UpgradeOutcome { + upgraded: true, + rebooted, + message: if rebooted { + "Firmware upgraded (reboot applied)".into() + } else { + "Firmware upgraded (no reboot required)".into() + }, + }) +} + +/// Bring an already-bootstrapped OPNsense firewall to the latest firmware. +/// +/// Compose this Score right after `OPNsenseBootstrapScore` if you want +/// fine-grained control of the upgrade beat. If you're happy with the +/// default behavior, leave `OPNsenseBootstrapScore::upgrade_firmware` at +/// `true` instead — it calls the same helper internally. +#[derive(Debug, Clone, Serialize)] +pub struct OPNsenseFirmwareUpgradeScore { + /// HTTPS port the firewall's web GUI / API listens on. The default + /// (9443) matches the value `OPNsenseBootstrapScore` moves the GUI to. + pub api_port: u16, +} + +impl Default for OPNsenseFirmwareUpgradeScore { + fn default() -> Self { + Self { api_port: 9443 } + } +} + +impl Score for OPNsenseFirmwareUpgradeScore { + fn name(&self) -> String { + "OPNsenseFirmwareUpgradeScore".to_string() + } + + fn create_interpret(&self) -> Box> { + Box::new(OPNsenseFirmwareUpgradeInterpret { + score: self.clone(), + }) + } +} + +#[derive(Debug)] +struct OPNsenseFirmwareUpgradeInterpret { + score: OPNsenseFirmwareUpgradeScore, +} + +#[async_trait] +impl Interpret for OPNsenseFirmwareUpgradeInterpret { + async fn execute( + &self, + _inventory: &Inventory, + topology: &OPNSenseFirewall, + ) -> Result { + let firewall_ip = topology.get_ip().to_string(); + let tag = format!("[OPNsenseFirmwareUpgrade/{firewall_ip}]"); + let config = topology.get_opnsense_config(); + + let outcome = + perform_firmware_upgrade(config.client(), &firewall_ip, self.score.api_port, &tag) + .await?; + + if outcome.upgraded { + Ok(Outcome::success(outcome.message)) + } else { + Ok(Outcome::noop(outcome.message)) + } + } + + fn get_name(&self) -> InterpretName { + InterpretName::OPNsenseFirmwareUpgrade + } + + fn get_version(&self) -> Version { + Version::from("1.0.0").unwrap() + } + + fn get_status(&self) -> InterpretStatus { + InterpretStatus::QUEUED + } + + fn get_children(&self) -> Vec { + vec![] + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_score_name() { + let s = OPNsenseFirmwareUpgradeScore::default(); + assert_eq!( + >::name(&s), + "OPNsenseFirmwareUpgradeScore" + ); + } + + #[test] + fn test_score_default_api_port_is_9443() { + assert_eq!(OPNsenseFirmwareUpgradeScore::default().api_port, 9443); + } + + #[test] + fn test_score_serializes() { + let s = OPNsenseFirmwareUpgradeScore::default(); + let _: serde_value::Value = + serde_value::to_value(&s).expect("OPNsenseFirmwareUpgradeScore should serialize"); + } +} diff --git a/harmony/src/modules/opnsense/mod.rs b/harmony/src/modules/opnsense/mod.rs index 09aa615e..47fd3c3d 100644 --- a/harmony/src/modules/opnsense/mod.rs +++ b/harmony/src/modules/opnsense/mod.rs @@ -2,6 +2,7 @@ pub mod bootstrap; pub mod bootstrap_score; pub mod dnat; pub mod firewall; +pub mod firmware_upgrade; pub mod image; pub mod lagg; pub mod node_exporter; -- 2.39.5 From 89f7455399f35e53363048a7c2096462a12125c9 Mon Sep 17 00:00:00 2001 From: Sylvain Tremblay Date: Tue, 12 May 2026 13:59:29 -0400 Subject: [PATCH 09/38] fix(opnsense): rewrite perform_firmware_upgrade per OPNsense's actual async API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous version did `POST firmware/check`, slept 5s, read `firmware/status`, and used a fragile "status_msg contains 'up to date'" heuristic to decide whether to upgrade. Two related bugs: 1. `firmware/check` is async — it returns immediately with a msg_uuid and runs in the background. 5s is far less than the metadata refresh takes; on a fresh boot the status still reads "requires to check for update first to provide more information" when we look. 2. That message doesn't match any of my "up to date" keywords, so my pending-check returned true and triggered `firmware/upgrade` against a system that had no actionable upgrade plan. firmware/upgrade returned immediately (also async), status stayed "none", and the helper reported success without anything having happened. Rewrite per OPNsense's actual API (verified against FirmwareController.php and firmware.volt): 1. GET firmware/info → capture initial product_version 2. Loop ≤ 5 iterations (a kernel upgrade can unlock further package updates that need their own pass): a. POST firmware/check (async) b. Poll firmware/upgradestatus until status == "done" c. POST firmware/status → read `status` enum ("none"/"update"/"upgrade"/"error") d. If "none": done. (First iteration → NOOP. Later → success.) e. If "update" / "upgrade": POST firmware/{that}, poll upgradestatus until done, handle reboot (auto-reboot or explicit firmware/reboot if status_reboot == "1"), then GET firmware/info to verify product_version changed. 3. Return UpgradeOutcome with initial_version, final_version, iterations, rebooted flag. The upgrade Score's Outcome now reports the actual version transition ("Firmware upgraded: 25.7.4 → 26.1.6 in 2 iteration(s) (rebooted: true)"). Mid-upgrade reboots are detected by `upgradestatus` going unreachable + a TCP probe confirming the API is down (vs. just a 404 from the documented-unstable endpoint). Co-Authored-By: Claude Opus 4.7 (1M context) --- .../src/modules/opnsense/firmware_upgrade.rs | 475 ++++++++++++------ 1 file changed, 324 insertions(+), 151 deletions(-) diff --git a/harmony/src/modules/opnsense/firmware_upgrade.rs b/harmony/src/modules/opnsense/firmware_upgrade.rs index 4346a6f1..46348119 100644 --- a/harmony/src/modules/opnsense/firmware_upgrade.rs +++ b/harmony/src/modules/opnsense/firmware_upgrade.rs @@ -2,17 +2,21 @@ //! firmware/package level via the REST API. //! //! The flow mirrors what OPNsense's web UI does when an operator clicks -//! "Update": refresh upstream metadata (`firmware/check`), read -//! `firmware/status` to find out what's actionable, kick `firmware/upgrade`, -//! poll until it's done, and trigger an explicit reboot if one is required. +//! "Check for updates", then "Update": kick `firmware/check` (async), poll +//! `firmware/upgradestatus` until the check reports `"done"`, read +//! `firmware/status` to see what's actionable, kick `firmware/update` or +//! `firmware/upgrade` (also async), poll `upgradestatus` until done, trigger +//! `firmware/reboot` if `status_reboot == "1"`, verify the version actually +//! moved, and loop in case the upgrade revealed further pending updates. //! //! The core logic is a free function ([`perform_firmware_upgrade`]) so it //! can be reused from elsewhere in the framework — notably from //! [`OPNsenseBootstrapScore`](crate::modules::opnsense::bootstrap_score::OPNsenseBootstrapScore) //! when its `upgrade_firmware` knob is set. //! -//! Idempotent: when nothing is pending, the helper short-circuits with -//! `UpgradeOutcome { upgraded: false, .. }`. +//! Idempotent: when nothing is pending on the first iteration, the helper +//! returns `UpgradeOutcome { upgraded: false, .. }` with the same version +//! before and after. use std::time::{Duration, Instant}; @@ -32,6 +36,17 @@ use crate::{ score::Score, }; +/// Maximum loop iterations. A single upgrade can sometimes reveal more +/// pending packages (e.g. the kernel upgrade unlocks new plugin versions), +/// so the helper loops; 5 is a sanity ceiling. +const MAX_UPGRADE_ITERATIONS: u32 = 5; + +/// How long to wait for an async firmware task to report `"done"`. +const TASK_DONE_TIMEOUT: Duration = Duration::from_secs(1200); + +/// How long to wait for the API to come back after a reboot. +const REBOOT_RECOVERY_TIMEOUT: Duration = Duration::from_secs(600); + /// Errors the firmware-upgrade helper may surface. #[derive(Debug, Error)] pub enum FirmwareUpgradeError { @@ -39,8 +54,12 @@ pub enum FirmwareUpgradeError { Api { phase: &'static str, msg: String }, #[error("Timed out: {0}")] Timeout(String), - #[error("Reboot was triggered but the API never went unreachable within 60s")] - RebootDidNotTake, + #[error("Firmware status reports error: {0}")] + FirmwareErrorState(String), + #[error("Unexpected firmware status: {0}")] + UnexpectedStatus(String), + #[error("Reached max upgrade iterations ({0}); firmware may have further pending updates")] + TooManyIterations(u32), } impl From for InterpretError { @@ -52,183 +71,329 @@ impl From for InterpretError { /// What [`perform_firmware_upgrade`] actually did. #[derive(Debug, Clone)] pub struct UpgradeOutcome { - /// `true` if OPNsense reported something to upgrade and we triggered it. + /// `true` if at least one update/upgrade was applied. pub upgraded: bool, - /// `true` if the firewall rebooted as part of the upgrade. + /// `true` if the firewall rebooted at least once during the upgrade. pub rebooted: bool, + /// Version reported by `firmware/info` before the first check. + pub initial_version: String, + /// Version reported by `firmware/info` after the last upgrade cycle. + pub final_version: String, + /// How many check/upgrade iterations the helper ran. + pub iterations: u32, /// Human-readable summary suitable for log lines / Score `Outcome`. pub message: String, } /// Bring an OPNsense firewall to the latest firmware/package level. /// -/// `firewall_ip` and `api_port` are needed for the post-reboot reachability -/// probes — the `OpnsenseClient` already knows them, but it doesn't expose -/// them via a public accessor. `tag` is a short identifier (e.g. an IP) that -/// gets prefixed to every log line so this helper can be called from -/// multiple contexts without making logs ambiguous. +/// `firewall_ip` and `api_port` are needed for post-reboot reachability +/// probes — the `OpnsenseClient` already knows them, but doesn't expose +/// them. `tag` is a short identifier (typically an IP) used as a log +/// prefix so this helper can be called from multiple contexts without +/// making log lines ambiguous. +/// +/// See module-level docs for the algorithm. pub async fn perform_firmware_upgrade( client: &OpnsenseClient, firewall_ip: &str, api_port: u16, tag: &str, ) -> Result { - info!("{tag} Refreshing firmware metadata (firmware/check) ..."); - let _: serde_json::Value = client - .post_typed::("core", "firmware", "check", None) - .await - .map_err(|e| FirmwareUpgradeError::Api { - phase: "firmware/check", - msg: e.to_string(), - })?; + // ── Step 1: capture the initial version ────────────────────────── + let initial_version = read_firmware_version(client).await?; + info!("{tag} Initial firmware version: {initial_version}"); - // The check kicks an async metadata refresh. Give it a moment to settle - // before reading status, otherwise we may observe a stale state. - tokio::time::sleep(Duration::from_secs(5)).await; + let mut current_version = initial_version.clone(); + let mut total_rebooted = false; + let mut iterations: u32 = 0; + let mut applied_any = false; - let status: serde_json::Value = client - .get_typed("core", "firmware", "status") - .await - .map_err(|e| FirmwareUpgradeError::Api { - phase: "firmware/status (pre-upgrade)", - msg: e.to_string(), - })?; - let initial_msg = status["status_msg"].as_str().unwrap_or("").to_string(); - let upgrade_action = status["status_upgrade_action"] - .as_str() - .unwrap_or("") - .to_string(); - info!( - "{tag} firmware/status: \ - status_upgrade_action={upgrade_action:?}, \ - status_msg={initial_msg:?}" - ); - - // OPNsense reports `status_upgrade_action` empty (or absent) when - // nothing is pending. Fall back to scanning `status_msg` for known - // "you're current" phrasings in case the field doesn't exist on some - // version. - let pending = if !upgrade_action.is_empty() && upgrade_action != "0" { - true - } else { - let m = initial_msg.to_lowercase(); - !(m.contains("up to date") || m.contains("no updates") || m.contains("no update")) - }; - - if !pending { - return Ok(UpgradeOutcome { - upgraded: false, - rebooted: false, - message: format!("Nothing to upgrade: {initial_msg}"), - }); - } - - info!("{tag} Triggering firmware upgrade (firmware/upgrade) ..."); - let _: serde_json::Value = client - .post_typed::("core", "firmware", "upgrade", None) - .await - .map_err(|e| FirmwareUpgradeError::Api { - phase: "firmware/upgrade", - msg: e.to_string(), - })?; - - // Let the upgrade job actually start so we don't observe a stale - // status == "none" on the first poll. - tokio::time::sleep(Duration::from_secs(3)).await; - - info!("{tag} Polling firmware/status until the upgrade completes ..."); - // Firmware downloads + apply can be slow on a small VM or a slow link. - // 20 minutes is the same ceiling the OPNsense UI uses. - let poll_deadline = Instant::now() + Duration::from_secs(1200); - let mut needs_reboot = false; - let mut rebooted = false; loop { - if Instant::now() >= poll_deadline { - return Err(FirmwareUpgradeError::Timeout( - "firmware/status did not return to \"none\" within 20 minutes".into(), + iterations += 1; + if iterations > MAX_UPGRADE_ITERATIONS { + return Err(FirmwareUpgradeError::TooManyIterations( + MAX_UPGRADE_ITERATIONS, )); } - tokio::time::sleep(Duration::from_secs(5)).await; + info!("{tag} ── Iteration {iterations} ──"); + + // ── Step 2: kick a check and wait for it to finish ─────────── + info!("{tag} Triggering firmware/check (async) ..."); + let _: serde_json::Value = client + .post_typed::("core", "firmware", "check", None) + .await + .map_err(|e| FirmwareUpgradeError::Api { + phase: "firmware/check", + msg: e.to_string(), + })?; + wait_for_task_done(client, "check", Duration::from_secs(300), tag).await?; + + // ── Step 3: read status to see what's actionable ───────────── + let status: serde_json::Value = client + .post_typed::("core", "firmware", "status", None) + .await + .map_err(|e| FirmwareUpgradeError::Api { + phase: "firmware/status", + msg: e.to_string(), + })?; + let status_kind = status["status"].as_str().unwrap_or("").to_string(); + let status_msg = status["status_msg"].as_str().unwrap_or("").to_string(); + let needs_reboot = status["status_reboot"].as_str() == Some("1"); + info!( + "{tag} firmware/status: status={status_kind:?}, status_msg={status_msg:?}, \ + status_reboot={needs_reboot}" + ); + + // ── Step 4: decide what to do ──────────────────────────────── + let action_endpoint: &'static str = match status_kind.as_str() { + "none" | "" => { + if !applied_any { + info!("{tag} No firmware updates available — already current"); + return Ok(UpgradeOutcome { + upgraded: false, + rebooted: false, + initial_version: initial_version.clone(), + final_version: current_version, + iterations, + message: format!( + "Already at latest firmware ({initial_version}); no upgrade needed" + ), + }); + } + info!("{tag} No more updates available; firmware is current"); + break; + } + "update" => "update", + "upgrade" => "upgrade", + "error" => return Err(FirmwareUpgradeError::FirmwareErrorState(status_msg)), + other => { + return Err(FirmwareUpgradeError::UnexpectedStatus(format!( + "{other:?} (status_msg: {status_msg:?})" + ))); + } + }; + + // ── Step 5: trigger the action ─────────────────────────────── + info!("{tag} Triggering firmware/{action_endpoint} (async) ..."); + let _: serde_json::Value = client + .post_typed::("core", "firmware", action_endpoint, None) + .await + .map_err(|e| FirmwareUpgradeError::Api { + phase: action_endpoint, + msg: e.to_string(), + })?; + + // ── Step 6: wait for the action to complete, possibly through + // a mid-task reboot ── + let task_outcome = + wait_for_task_or_reboot(client, action_endpoint, firewall_ip, api_port, tag).await?; + let mut rebooted_this_iter = task_outcome.rebooted; + + // ── Step 7: if a reboot is needed but didn't happen, trigger it ── + if needs_reboot && !rebooted_this_iter { + info!("{tag} status_reboot=1; triggering explicit firmware/reboot ..."); + // Fire-and-forget — the server tears down its connection while + // replying. + let _ = client + .post_typed::("core", "firmware", "reboot", None) + .await; + wait_for_reboot_cycle(firewall_ip, api_port, tag).await?; + rebooted_this_iter = true; + } + if rebooted_this_iter { + total_rebooted = true; + } + + // ── Step 8: verify version actually moved ──────────────────── + let new_version = read_firmware_version(client).await?; + if new_version == current_version { + info!( + "{tag} Iteration {iterations} completed but version did not change: \ + {current_version}. Stopping to avoid an infinite loop." + ); + // Don't error — some "updates" change only package set without + // bumping product_version. Break out gracefully. + applied_any = true; + break; + } + info!("{tag} Iteration {iterations}: {current_version} → {new_version}"); + current_version = new_version; + applied_any = true; + + // ── Step 9: loop. Re-check; a major upgrade may have unlocked + // further package updates. + } + + let upgraded = current_version != initial_version || applied_any; + let message = if initial_version == current_version { + format!( + "Firmware upgrade completed: still on {current_version} \ + (packages refreshed; version unchanged) — {iterations} iteration(s)" + ) + } else { + format!( + "Firmware upgraded: {initial_version} → {current_version} in {iterations} iteration(s) \ + (rebooted: {total_rebooted})" + ) + }; + Ok(UpgradeOutcome { + upgraded, + rebooted: total_rebooted, + initial_version, + final_version: current_version, + iterations, + message, + }) +} + +/// Fetch the running firmware version from `/api/core/firmware/info`. +async fn read_firmware_version(client: &OpnsenseClient) -> Result { + let info: serde_json::Value = + client + .get_typed("core", "firmware", "info") + .await + .map_err(|e| FirmwareUpgradeError::Api { + phase: "firmware/info", + msg: e.to_string(), + })?; + Ok(info["product_version"] + .as_str() + .unwrap_or("") + .to_string()) +} + +/// Poll `/api/core/firmware/upgradestatus` until it reports `status == "done"`. +/// +/// Tolerates transient errors (the endpoint is documented as +/// "known to be unstable" in OPNsense 26.1.6 release notes — the WebUI +/// itself traps its errors). A 404 between tasks is treated as "still in +/// progress, keep polling." +async fn wait_for_task_done( + client: &OpnsenseClient, + task_label: &str, + timeout: Duration, + tag: &str, +) -> Result<(), FirmwareUpgradeError> { + let deadline = Instant::now() + timeout; + let mut last_logged: Option = None; + while Instant::now() < deadline { + tokio::time::sleep(Duration::from_secs(3)).await; match client - .get_typed::("core", "firmware", "status") + .get_typed::("core", "firmware", "upgradestatus") .await { Ok(s) => { - let active = s["status"].as_str().unwrap_or(""); - if active.is_empty() || active == "none" { - needs_reboot = s["status_reboot"].as_str() == Some("1"); - info!( - "{tag} Upgrade finished (status={active:?}, needs_reboot={needs_reboot})" - ); - break; + let st = s["status"].as_str().unwrap_or("").to_string(); + if st == "done" { + info!("{tag} firmware/{task_label} task reported done"); + return Ok(()); + } + if last_logged.as_deref() != Some(st.as_str()) { + debug!("{tag} firmware/{task_label} task status: {st:?}"); + last_logged = Some(st); + } + } + Err(e) => { + debug!("{tag} upgradestatus poll error during {task_label}: {e}; retrying"); + } + } + } + Err(FirmwareUpgradeError::Timeout(format!( + "firmware/{task_label} did not reach 'done' within {timeout:?}" + ))) +} + +/// Internal helper return. +struct TaskOutcome { + rebooted: bool, +} + +/// Poll `upgradestatus` until "done" — but tolerate the API going +/// unreachable as "OPNsense rebooted mid-task" and wait for it to come +/// back. Returns `rebooted=true` in that case. +async fn wait_for_task_or_reboot( + client: &OpnsenseClient, + task_label: &str, + firewall_ip: &str, + api_port: u16, + tag: &str, +) -> Result { + let deadline = Instant::now() + TASK_DONE_TIMEOUT; + let mut last_logged: Option = None; + while Instant::now() < deadline { + tokio::time::sleep(Duration::from_secs(5)).await; + match client + .get_typed::("core", "firmware", "upgradestatus") + .await + { + Ok(s) => { + let st = s["status"].as_str().unwrap_or("").to_string(); + if st == "done" { + info!("{tag} firmware/{task_label} task reported done"); + return Ok(TaskOutcome { rebooted: false }); + } + if last_logged.as_deref() != Some(st.as_str()) { + debug!("{tag} firmware/{task_label} task status: {st:?}"); + last_logged = Some(st); } - debug!("{tag} ...upgrade in progress (status={active})"); } Err(_) => { - info!("{tag} API unreachable mid-upgrade — OPNsense auto-rebooted"); - rebooted = true; - break; + // Could be a transient 404 (endpoint instability) or a real + // reboot. Distinguish with a TCP probe. + if !probe_https(firewall_ip, api_port, Duration::from_secs(2)).await { + info!("{tag} firmware/{task_label}: API unreachable — OPNsense is rebooting"); + wait_for_reboot_cycle(firewall_ip, api_port, tag).await?; + return Ok(TaskOutcome { rebooted: true }); + } + // Otherwise transient endpoint flake; keep polling. } } } + Err(FirmwareUpgradeError::Timeout(format!( + "firmware/{task_label} did not finish within {:?}", + TASK_DONE_TIMEOUT + ))) +} - if needs_reboot && !rebooted { - info!("{tag} Triggering explicit reboot via firmware/reboot ..."); - // Fire-and-forget — the server tears down its TCP connection while - // replying. An Err here is expected and harmless. - let _ = client - .post_typed::("core", "firmware", "reboot", None) - .await; - rebooted = true; +/// Wait for the firewall to go unreachable, come back, and settle. +/// +/// `firewall_ip` / `api_port` describe where the API should re-appear. +async fn wait_for_reboot_cycle( + firewall_ip: &str, + api_port: u16, + tag: &str, +) -> Result<(), FirmwareUpgradeError> { + info!("{tag} Waiting for the API to go unreachable (reboot in flight) ..."); + let unreach_deadline = Instant::now() + Duration::from_secs(60); + while Instant::now() < unreach_deadline { + tokio::time::sleep(Duration::from_secs(2)).await; + if !probe_https(firewall_ip, api_port, Duration::from_secs(2)).await { + info!("{tag} API unreachable — reboot in progress"); + break; + } } - if rebooted { - info!("{tag} Waiting for the API to go unreachable (reboot in flight) ..."); - let unreach_deadline = Instant::now() + Duration::from_secs(60); - let mut went_down = false; - while Instant::now() < unreach_deadline { - tokio::time::sleep(Duration::from_secs(2)).await; - if !probe_https(firewall_ip, api_port, Duration::from_secs(2)).await { - info!("{tag} API unreachable — reboot in progress"); - went_down = true; - break; - } + info!("{tag} Waiting for OPNsense to come back at https://{firewall_ip}:{api_port} ..."); + let back_deadline = Instant::now() + REBOOT_RECOVERY_TIMEOUT; + let mut came_back = false; + while Instant::now() < back_deadline { + tokio::time::sleep(Duration::from_secs(5)).await; + if probe_https(firewall_ip, api_port, Duration::from_secs(5)).await { + came_back = true; + break; } - if !went_down { - return Err(FirmwareUpgradeError::RebootDidNotTake); - } - - info!("{tag} Waiting for OPNsense to come back at https://{firewall_ip}:{api_port} ..."); - let back_deadline = Instant::now() + Duration::from_secs(600); - let mut came_back = false; - while Instant::now() < back_deadline { - tokio::time::sleep(Duration::from_secs(5)).await; - if probe_https(firewall_ip, api_port, Duration::from_secs(5)).await { - came_back = true; - break; - } - } - if !came_back { - return Err(FirmwareUpgradeError::Timeout(format!( - "OPNsense did not come back at https://{firewall_ip}:{api_port} within 10 minutes" - ))); - } - - info!("{tag} Web UI reachable; giving backend services 30s to settle ..."); - tokio::time::sleep(Duration::from_secs(30)).await; - } else { - debug!("{tag} Upgrade finished without a reboot"); + } + if !came_back { + return Err(FirmwareUpgradeError::Timeout(format!( + "OPNsense did not come back at https://{firewall_ip}:{api_port} within {:?}", + REBOOT_RECOVERY_TIMEOUT + ))); } - Ok(UpgradeOutcome { - upgraded: true, - rebooted, - message: if rebooted { - "Firmware upgraded (reboot applied)".into() - } else { - "Firmware upgraded (no reboot required)".into() - }, - }) + info!("{tag} Web UI reachable; giving backend services 30s to settle ..."); + tokio::time::sleep(Duration::from_secs(30)).await; + Ok(()) } /// Bring an already-bootstrapped OPNsense firewall to the latest firmware. @@ -236,7 +401,7 @@ pub async fn perform_firmware_upgrade( /// Compose this Score right after `OPNsenseBootstrapScore` if you want /// fine-grained control of the upgrade beat. If you're happy with the /// default behavior, leave `OPNsenseBootstrapScore::upgrade_firmware` at -/// `true` instead — it calls the same helper internally. +/// `true` — it calls the same helper internally. #[derive(Debug, Clone, Serialize)] pub struct OPNsenseFirmwareUpgradeScore { /// HTTPS port the firewall's web GUI / API listens on. The default @@ -283,7 +448,15 @@ impl Interpret for OPNsenseFirmwareUpgradeInterpret { .await?; if outcome.upgraded { - Ok(Outcome::success(outcome.message)) + Ok(Outcome::success_with_details( + outcome.message.clone(), + vec![ + format!("Initial version: {}", outcome.initial_version), + format!("Final version: {}", outcome.final_version), + format!("Iterations: {}", outcome.iterations), + format!("Rebooted: {}", outcome.rebooted), + ], + )) } else { Ok(Outcome::noop(outcome.message)) } -- 2.39.5 From c4d46f1817947e39b5806271a88f007fb49668dd Mon Sep 17 00:00:00 2001 From: Sylvain Tremblay Date: Tue, 12 May 2026 14:12:26 -0400 Subject: [PATCH 10/38] fix(opnsense): multi-signal completion detection in firmware upgrade poll loop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous wait loop polled only `firmware/upgradestatus` for `status == "done"`, with a TCP-probe fallback to detect reboots. Two ways this got stuck against real OPNsense 26.1: 1. After the upgrade reboot completed, OPNsense had no active background task to track, so `upgradestatus` 404'd indefinitely. 2. The TCP-probe fallback could miss the brief unreachable window between two 5s-apart polls — if both polls saw the API up, we never set `rebooted=true` and never bailed. Net: the upgrade ran fine (26.1 → 26.1.8 applied), but our code waited forever for a "done" signal that never came. Now the wait loop polls THREE signals per iteration and exits on any: A. firmware/info `product_version` differs from version_before_action B. firmware/running `status` empty for 2 consecutive polls (configd reports no active task) C. firmware/upgradestatus `status == "done"` (when the endpoint works) Plus the TCP probe still detects mid-task reboots and waits for the firewall to come back — but it's no longer the sole exit path. After a reboot, signal A almost always wins on the first post-recovery poll. perform_firmware_upgrade now snapshots the version before each action and passes it as `version_before_action` so signal A has a baseline that's valid even after a previous iteration already bumped the version. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../src/modules/opnsense/firmware_upgrade.rs | 143 ++++++++++++++---- 1 file changed, 116 insertions(+), 27 deletions(-) diff --git a/harmony/src/modules/opnsense/firmware_upgrade.rs b/harmony/src/modules/opnsense/firmware_upgrade.rs index 46348119..50304f6c 100644 --- a/harmony/src/modules/opnsense/firmware_upgrade.rs +++ b/harmony/src/modules/opnsense/firmware_upgrade.rs @@ -186,8 +186,19 @@ pub async fn perform_firmware_upgrade( // ── Step 6: wait for the action to complete, possibly through // a mid-task reboot ── - let task_outcome = - wait_for_task_or_reboot(client, action_endpoint, firewall_ip, api_port, tag).await?; + // Snapshot the version BEFORE the action so the multi-signal + // waiter can detect "version moved" as completion. `current_version` + // is also valid here, but explicit naming makes the intent obvious. + let version_before_action = current_version.clone(); + let task_outcome = wait_for_task_or_reboot( + client, + action_endpoint, + firewall_ip, + api_port, + &version_before_action, + tag, + ) + .await?; let mut rebooted_this_iter = task_outcome.rebooted; // ── Step 7: if a reboot is needed but didn't happen, trigger it ── @@ -309,49 +320,127 @@ struct TaskOutcome { rebooted: bool, } -/// Poll `upgradestatus` until "done" — but tolerate the API going -/// unreachable as "OPNsense rebooted mid-task" and wait for it to come -/// back. Returns `rebooted=true` in that case. +/// Wait for a firmware-altering task (update/upgrade) to finish. +/// +/// `firmware/upgradestatus` alone is unreliable on OPNsense 26.1 — it +/// 404s when no task is registered in configd's tracking state, *including* +/// after the task completes. So we poll **three** signals each iteration +/// and exit as soon as any of them reports completion: +/// +/// - **A. version moved**: `GET firmware/info` `product_version` != +/// `version_before_action`. The definitive "the upgrade actually +/// happened" check; catches the stuck-loop case. +/// - **B. configd idle**: `GET firmware/running` `status` field empty +/// for two consecutive polls. Used for flows that don't change the +/// product_version (e.g. package-only updates). +/// - **C. upgradestatus done**: `GET firmware/upgradestatus` returns +/// `status == "done"`. Honored when it works; 404s are ignored. +/// +/// A TCP probe at the start of each iteration also catches reboots. If +/// unreachable, we wait for the reboot cycle to complete, then continue +/// polling — signal A almost always wins on the first poll after the +/// firewall comes back. async fn wait_for_task_or_reboot( client: &OpnsenseClient, task_label: &str, firewall_ip: &str, api_port: u16, + version_before_action: &str, tag: &str, ) -> Result { + const IDLE_THRESHOLD: u32 = 2; + let poll_interval = Duration::from_secs(5); let deadline = Instant::now() + TASK_DONE_TIMEOUT; - let mut last_logged: Option = None; + let mut rebooted = false; + let mut consecutive_idle: u32 = 0; + let mut last_running: Option = None; + let mut last_upgradestatus: Option = None; + while Instant::now() < deadline { - tokio::time::sleep(Duration::from_secs(5)).await; + tokio::time::sleep(poll_interval).await; + + // ── Reboot detection ──────────────────────────────────────── + if !probe_https(firewall_ip, api_port, Duration::from_secs(2)).await { + info!("{tag} firmware/{task_label}: API unreachable — OPNsense is rebooting"); + wait_for_reboot_cycle(firewall_ip, api_port, tag).await?; + rebooted = true; + // Task state on the other side of a reboot is fresh — reset + // the idle counter so we read clean post-reboot signals. + consecutive_idle = 0; + continue; + } + + // ── Signal A: version moved ───────────────────────────────── + // Definitive completion signal. Catches the case where + // upgradestatus 404s forever after a real upgrade. match client + .get_typed::("core", "firmware", "info") + .await + { + Ok(info) => { + let v = info["product_version"].as_str().unwrap_or("").trim(); + if !v.is_empty() && v != version_before_action { + info!( + "{tag} firmware/{task_label}: version moved {version_before_action} → {v}; \ + task complete" + ); + return Ok(TaskOutcome { rebooted }); + } + } + Err(e) => { + debug!("{tag} firmware/info poll error: {e}; retrying"); + } + } + + // ── Signal B: configd reports no running task ─────────────── + match client + .get_typed::("core", "firmware", "running") + .await + { + Ok(running) => { + let st = running["status"].as_str().unwrap_or("").trim().to_string(); + if st.is_empty() || st == "none" { + consecutive_idle += 1; + if consecutive_idle >= IDLE_THRESHOLD { + info!( + "{tag} firmware/{task_label}: configd idle for {consecutive_idle} \ + polls; task complete" + ); + return Ok(TaskOutcome { rebooted }); + } + } else { + if last_running.as_deref() != Some(st.as_str()) { + debug!("{tag} firmware/running: {st:?}"); + last_running = Some(st); + } + consecutive_idle = 0; + } + } + Err(e) => { + debug!("{tag} firmware/running poll error: {e}; retrying"); + } + } + + // ── Signal C: upgradestatus reports "done" ────────────────── + // 404s ignored — known to be unstable on OPNsense 26.1. + if let Ok(s) = client .get_typed::("core", "firmware", "upgradestatus") .await { - Ok(s) => { - let st = s["status"].as_str().unwrap_or("").to_string(); - if st == "done" { - info!("{tag} firmware/{task_label} task reported done"); - return Ok(TaskOutcome { rebooted: false }); - } - if last_logged.as_deref() != Some(st.as_str()) { - debug!("{tag} firmware/{task_label} task status: {st:?}"); - last_logged = Some(st); - } + let st = s["status"].as_str().unwrap_or("").to_string(); + if st == "done" { + info!("{tag} firmware/{task_label}: upgradestatus reports done"); + return Ok(TaskOutcome { rebooted }); } - Err(_) => { - // Could be a transient 404 (endpoint instability) or a real - // reboot. Distinguish with a TCP probe. - if !probe_https(firewall_ip, api_port, Duration::from_secs(2)).await { - info!("{tag} firmware/{task_label}: API unreachable — OPNsense is rebooting"); - wait_for_reboot_cycle(firewall_ip, api_port, tag).await?; - return Ok(TaskOutcome { rebooted: true }); - } - // Otherwise transient endpoint flake; keep polling. + if last_upgradestatus.as_deref() != Some(st.as_str()) { + debug!("{tag} firmware/upgradestatus: {st:?}"); + last_upgradestatus = Some(st); } } } + Err(FirmwareUpgradeError::Timeout(format!( - "firmware/{task_label} did not finish within {:?}", + "firmware/{task_label} did not complete within {:?}", TASK_DONE_TIMEOUT ))) } -- 2.39.5 From 1b718ef6c83659d79bf3bffeefbd0ec3e6a219ef Mon Sep 17 00:00:00 2001 From: Sylvain Tremblay Date: Tue, 12 May 2026 14:44:47 -0400 Subject: [PATCH 11/38] refactor(opnsense-api): truncate HTTP response body in WARN logs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `OpnsenseClient::handle_response_typed` logged the entire response body in its WARN line on non-success status codes. OPNsense's 404 page is ~12 lines of HTML; every transient 404 (and the firmware/upgradestatus endpoint 404s constantly on 26.1, per release notes calling it "known to be unstable") dumped a multi-line block into the log. Route the body through a new truncate_for_log helper that keeps the first non-empty line, caps the result at 200 chars, and appends an ellipsis if anything was elided. JSON error responses (typically one short line) stay intact; HTML pages collapse to "…". The `Error::Api { body, .. }` value passed to callers is unchanged, so code that wants to inspect the full body still can. Three unit tests cover: short-line passthrough, HTML collapsing, length-capped ellipsis. Co-Authored-By: Claude Opus 4.7 (1M context) --- opnsense-api/src/client.rs | 63 +++++++++++++++++++++++++++++++++++++- 1 file changed, 62 insertions(+), 1 deletion(-) diff --git a/opnsense-api/src/client.rs b/opnsense-api/src/client.rs index 43d3dbd4..0211e3d9 100644 --- a/opnsense-api/src/client.rs +++ b/opnsense-api/src/client.rs @@ -405,7 +405,13 @@ impl OpnsenseClient { Ok(json) } else { let body = response.text().await.unwrap_or_default(); - warn!(target: "opnsense-api", "{} {} → HTTP {status}: {}", method, url, body); + warn!( + target: "opnsense-api", + "{} {} → HTTP {status}: {}", + method, + url, + truncate_for_log(&body) + ); Err(Error::Api { status, method: method.to_string(), @@ -415,3 +421,58 @@ impl OpnsenseClient { } } } + +/// Squeeze an HTTP response body down to one short line suitable for a +/// log message. +/// +/// OPNsense's 404 (and many other error) pages are full HTML documents; +/// dumping them verbatim into the log makes WARN lines hundreds of +/// characters across multiple lines. This keeps the first non-empty line +/// (most of the time the document's first tag, e.g. ``), +/// trims it to ≤ 200 chars, and appends "…" if anything was elided. The +/// `Error::Api { body, .. }` value passed to callers is unchanged, so +/// code that needs the full body still has it. +fn truncate_for_log(body: &str) -> std::borrow::Cow<'_, str> { + const MAX: usize = 200; + let first_line = body.lines().find(|l| !l.trim().is_empty()).unwrap_or(""); + let trimmed = first_line.trim(); + let truncated_to_first_line = trimmed.len() < body.trim().len(); + let truncated_by_length = trimmed.len() > MAX; + if !truncated_to_first_line && !truncated_by_length { + std::borrow::Cow::Borrowed(trimmed) + } else { + let cut = trimmed + .char_indices() + .nth(MAX) + .map(|(i, _)| i) + .unwrap_or(trimmed.len()); + std::borrow::Cow::Owned(format!("{}…", &trimmed[..cut])) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn truncate_short_single_line_is_unchanged() { + let body = r#"{"error":"not found"}"#; + assert_eq!(truncate_for_log(body), body); + } + + #[test] + fn truncate_html_keeps_first_line_only() { + let body = "\n\n \n 404 Not Found\n \n\n"; + let out = truncate_for_log(body); + assert_eq!(out, "…"); + } + + #[test] + fn truncate_caps_at_200_chars_with_ellipsis() { + let body = "x".repeat(500); + let out = truncate_for_log(&body); + assert!(out.ends_with('…'), "expected ellipsis suffix, got {out:?}"); + // chars() not bytes() — ellipsis is multi-byte. + assert_eq!(out.chars().count(), 201); + } +} -- 2.39.5 From 6115eb1a2328a30c846f23331592b3773a8743db Mon Sep 17 00:00:00 2001 From: Sylvain Tremblay Date: Tue, 12 May 2026 14:44:47 -0400 Subject: [PATCH 12/38] fix(opnsense): trust the reboot as definitive task completion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a firmware update has status_reboot=1, the reboot IS the final step of the install — once it completes, the task is done by definition. But my multi-signal polling loop kept trying to verify completion via signals A/B/C *after* the reboot, and all three are unreliable post-reboot: A. firmware/info `product_version` doesn't change if the update was package-only (no version bump). B. firmware/running keeps reporting the previous task as active until the next firmware/check kicks it — the operator observed "clicking 'check for updates' in the UI unstuck it", confirming OPNsense retains stale task state until a fresh check resets it. C. firmware/upgradestatus 404s (documented unstable on 26.1) when no task is registered, which is the state after a real upgrade. Net: in iteration 2 of a real upgrade run (26.1.8 → 26.1.x with 2 more packages), the wait loop was stuck silently polling for several minutes after the firewall had already rebooted and was fully operational. Now: when the TCP probe detects unreachable and wait_for_reboot_cycle returns, immediately return TaskOutcome { rebooted: true } instead of re-entering the polling loop. The outer perform_firmware_upgrade loop already calls firmware/check at the top of the next iteration (which both refreshes OPNsense's task state AND tells us if more updates are pending) and reads firmware/info after wait_for_task_or_reboot returns to verify the version moved. Those are the real post-reboot completion signals — the in-loop polling was redundant and harmful. The non-reboot path (status_reboot=0 updates, e.g. pure metadata refresh) is unchanged: signals A/B/C still run because there's no reboot to use as a definitive completion event. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../src/modules/opnsense/firmware_upgrade.rs | 63 +++++++++++-------- 1 file changed, 38 insertions(+), 25 deletions(-) diff --git a/harmony/src/modules/opnsense/firmware_upgrade.rs b/harmony/src/modules/opnsense/firmware_upgrade.rs index 50304f6c..e5480c1d 100644 --- a/harmony/src/modules/opnsense/firmware_upgrade.rs +++ b/harmony/src/modules/opnsense/firmware_upgrade.rs @@ -322,24 +322,29 @@ struct TaskOutcome { /// Wait for a firmware-altering task (update/upgrade) to finish. /// -/// `firmware/upgradestatus` alone is unreliable on OPNsense 26.1 — it -/// 404s when no task is registered in configd's tracking state, *including* -/// after the task completes. So we poll **three** signals each iteration -/// and exit as soon as any of them reports completion: +/// Two completion regimes, one per branch: /// -/// - **A. version moved**: `GET firmware/info` `product_version` != -/// `version_before_action`. The definitive "the upgrade actually -/// happened" check; catches the stuck-loop case. -/// - **B. configd idle**: `GET firmware/running` `status` field empty -/// for two consecutive polls. Used for flows that don't change the -/// product_version (e.g. package-only updates). -/// - **C. upgradestatus done**: `GET firmware/upgradestatus` returns -/// `status == "done"`. Honored when it works; 404s are ignored. +/// 1. **Reboot regime** — if the API goes unreachable mid-task, OPNsense +/// is rebooting. We wait for the reboot cycle to finish and return +/// immediately. The reboot completing IS the definitive completion +/// event; further polling is unreliable because OPNsense's configd +/// keeps stale task state until something kicks it (e.g. a fresh +/// `firmware/check`). The outer `perform_firmware_upgrade` loop will +/// itself call `firmware/check` at the top of the next iteration +/// and `firmware/info` for version verification — those are the +/// real post-reboot completion signals. /// -/// A TCP probe at the start of each iteration also catches reboots. If -/// unreachable, we wait for the reboot cycle to complete, then continue -/// polling — signal A almost always wins on the first poll after the -/// firewall comes back. +/// 2. **No-reboot regime** — for `status_reboot=0` updates (e.g. pure +/// package metadata refresh), we poll three signals every iteration +/// and exit on any of them: +/// +/// - **A. version moved**: `GET firmware/info` `product_version` != +/// `version_before_action`. +/// - **B. configd idle**: `GET firmware/running` `status` field +/// empty for two consecutive polls. +/// - **C. upgradestatus done**: `GET firmware/upgradestatus` returns +/// `status == "done"`. 404s are ignored (documented unstable on +/// OPNsense 26.1). async fn wait_for_task_or_reboot( client: &OpnsenseClient, task_label: &str, @@ -351,7 +356,9 @@ async fn wait_for_task_or_reboot( const IDLE_THRESHOLD: u32 = 2; let poll_interval = Duration::from_secs(5); let deadline = Instant::now() + TASK_DONE_TIMEOUT; - let mut rebooted = false; + // No `mut rebooted` here: the reboot branch returns immediately with + // rebooted=true, and the polling branches below only fire when no + // reboot was observed. let mut consecutive_idle: u32 = 0; let mut last_running: Option = None; let mut last_upgradestatus: Option = None; @@ -360,14 +367,20 @@ async fn wait_for_task_or_reboot( tokio::time::sleep(poll_interval).await; // ── Reboot detection ──────────────────────────────────────── + // A reboot during a firmware-altering task IS the completion + // event — OPNsense schedules the reboot as the final install + // step. Don't poll signals A/B/C afterward: OPNsense's configd + // keeps the task marked as "running" until the next + // firmware/check kicks it, so signals B and C stay misleading, + // and signal A is unreliable for package-only updates that + // don't bump product_version. The outer loop's next iteration + // will trigger its own firmware/check and verify versions + // explicitly — that's the real post-reboot completion signal. if !probe_https(firewall_ip, api_port, Duration::from_secs(2)).await { info!("{tag} firmware/{task_label}: API unreachable — OPNsense is rebooting"); wait_for_reboot_cycle(firewall_ip, api_port, tag).await?; - rebooted = true; - // Task state on the other side of a reboot is fresh — reset - // the idle counter so we read clean post-reboot signals. - consecutive_idle = 0; - continue; + info!("{tag} firmware/{task_label}: reboot cycle complete; treating as task complete"); + return Ok(TaskOutcome { rebooted: true }); } // ── Signal A: version moved ───────────────────────────────── @@ -384,7 +397,7 @@ async fn wait_for_task_or_reboot( "{tag} firmware/{task_label}: version moved {version_before_action} → {v}; \ task complete" ); - return Ok(TaskOutcome { rebooted }); + return Ok(TaskOutcome { rebooted: false }); } } Err(e) => { @@ -406,7 +419,7 @@ async fn wait_for_task_or_reboot( "{tag} firmware/{task_label}: configd idle for {consecutive_idle} \ polls; task complete" ); - return Ok(TaskOutcome { rebooted }); + return Ok(TaskOutcome { rebooted: false }); } } else { if last_running.as_deref() != Some(st.as_str()) { @@ -430,7 +443,7 @@ async fn wait_for_task_or_reboot( let st = s["status"].as_str().unwrap_or("").to_string(); if st == "done" { info!("{tag} firmware/{task_label}: upgradestatus reports done"); - return Ok(TaskOutcome { rebooted }); + return Ok(TaskOutcome { rebooted: false }); } if last_upgradestatus.as_deref() != Some(st.as_str()) { debug!("{tag} firmware/upgradestatus: {st:?}"); -- 2.39.5 From 2c658b8ce8248971e4c28b78bc5c0c870b508ab5 Mon Sep 17 00:00:00 2001 From: Sylvain Tremblay Date: Wed, 13 May 2026 07:00:57 -0400 Subject: [PATCH 13/38] feat(opnsense): FirmwareUpgradeMode enum (Auto / AutoMinor / Prompt / Disabled) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces the boolean `OPNsenseBootstrapScore::upgrade_firmware` knob with a four-variant enum that decides per pending upgrade whether to apply it automatically, skip major-series upgrades, prompt the operator, or skip entirely. Also exposed via `OPNsenseFirmwareUpgradeScore::mode` for standalone composition. - `Auto` (default): apply every pending update and upgrade. - `AutoMinor`: apply in-series updates (status == "update"); skip major-series upgrades (status == "upgrade"). Uses OPNsense's own `status` field for the major/minor distinction — no version-string parsing. - `Prompt`: print a per-iteration summary and ask via inquire::Confirm. Errors out with `PromptRequiresTty` when run headless (no TTY) so CI contexts must pick `Auto` / `AutoMinor` / `Disabled` explicitly. - `Disabled`: skip the upgrade step entirely. The summary surfaced for Prompt (and logged in Auto/AutoMinor too) includes: - status_msg (OPNsense's "108 updates available, 349.2 MiB, reboot required" line) - whether the OPNsense product package itself is being upgraded ("Main OPNsense: 26.1 → 26.1.8") or whether the update only touches plugins/packages ("Main OPNsense: staying at ") - kind (update vs upgrade) - reboot required (yes/no) Two new helpers — `extract_opnsense_version_change` and `render_upgrade_summary` — pull the version diff out of `status.all_packages` / `status.all_sets` (looking for the `opnsense` or `opnsense-update` entry) and assemble the human-readable block. Wired through: - `OPNsenseFirmwareUpgradeScore::mode` (default Auto). - `OPNsenseBootstrapScore::firmware_upgrade` (replaces `upgrade_firmware: bool`; same default behavior). - `examples/opnsense_vm_integration` opts out with `firmware_upgrade: FirmwareUpgradeMode::Disabled`. Co-Authored-By: Claude Opus 4.7 (1M context) --- examples/opnsense_vm_integration/src/main.rs | 7 +- .../src/modules/opnsense/bootstrap_score.rs | 44 ++-- .../src/modules/opnsense/firmware_upgrade.rs | 244 +++++++++++++++++- 3 files changed, 269 insertions(+), 26 deletions(-) diff --git a/examples/opnsense_vm_integration/src/main.rs b/examples/opnsense_vm_integration/src/main.rs index e28f586b..69f61479 100644 --- a/examples/opnsense_vm_integration/src/main.rs +++ b/examples/opnsense_vm_integration/src/main.rs @@ -36,6 +36,7 @@ use harmony::modules::opnsense::dnat::{DnatRuleDef, DnatScore}; use harmony::modules::opnsense::firewall::{ BinatRuleDef, BinatScore, FilterRuleDef, FirewallRuleScore, OutboundNatScore, SnatRuleDef, }; +use harmony::modules::opnsense::firmware_upgrade::FirmwareUpgradeMode; use harmony::modules::opnsense::lagg::{LaggDef, LaggScore}; use harmony::modules::opnsense::node_exporter::NodeExporterScore; use harmony::modules::opnsense::vip::{VipDef, VipScore}; @@ -209,8 +210,10 @@ async fn boot_vm( vec![Box::new(OPNsenseBootstrapScore { target_api_port: OPN_API_PORT, // The VM image is a known firmware version; we don't want - // each integration run to spend 10+ minutes upgrading. - upgrade_firmware: false, + // each integration run to spend 10+ minutes upgrading. The + // operator can override to `Auto` / `AutoMinor` / `Prompt` + // locally when testing the upgrade beat. + firmware_upgrade: FirmwareUpgradeMode::Disabled, ..Default::default() })]; let bootstrap_args = harmony_cli::Args { diff --git a/harmony/src/modules/opnsense/bootstrap_score.rs b/harmony/src/modules/opnsense/bootstrap_score.rs index 9d77218f..1c1b968a 100644 --- a/harmony/src/modules/opnsense/bootstrap_score.rs +++ b/harmony/src/modules/opnsense/bootstrap_score.rs @@ -10,9 +10,11 @@ //! 3. SSHes in, mints an API key + secret on the root user, and persists //! both `OPNSenseApiCredentials` and `OPNSenseFirewallCredentials` to //! `harmony_secret::SecretManager`. -//! 4. (Default-on, via `upgrade_firmware`) Brings the firewall up to the +//! 4. (Default-on, via `firmware_upgrade`) Brings the firewall up to the //! latest firmware/package level using the same logic as //! [`OPNsenseFirmwareUpgradeScore`](crate::modules::opnsense::firmware_upgrade::OPNsenseFirmwareUpgradeScore). +//! Configurable via `FirmwareUpgradeMode` (Auto / AutoMinor / Prompt / +//! Disabled). //! 5. Optionally rebinds the LAN to a new IP/subnet. //! //! After it runs, callers construct a normal @@ -42,7 +44,7 @@ use crate::{ modules::opnsense::bootstrap::{ OPNsenseBootstrap, change_lan_ip_via_ssh, create_api_key_ssh, probe_https, }, - modules::opnsense::firmware_upgrade::perform_firmware_upgrade, + modules::opnsense::firmware_upgrade::{FirmwareUpgradeMode, perform_firmware_upgrade}, score::Score, topology::OPNsenseBootstrapTopology, }; @@ -70,13 +72,17 @@ pub struct OPNsenseBootstrapScore { /// Required when something else needs to bind `0.0.0.0:80` (e.g. /// HAProxy on a CARP VIP). pub disable_http_redirect: bool, - /// If `true` (default), bring the firewall to the latest available - /// firmware/package level immediately after credentials are persisted - /// and before any optional LAN rebind. Set to `false` for VM - /// integration tests, air-gapped environments, or pinned-version - /// deployments. The underlying logic lives in + /// How aggressively to apply pending firmware updates immediately after + /// credentials are persisted and before any optional LAN rebind. + /// + /// Defaults to `FirmwareUpgradeMode::Auto` (apply everything). Use + /// `AutoMinor` to skip major-series upgrades, `Prompt` to ask the + /// operator for each pending update, or `Disabled` to skip the upgrade + /// step entirely (e.g. for VM integration tests, air-gapped + /// environments, or pinned-version deployments). The underlying logic + /// lives in /// [`crate::modules::opnsense::firmware_upgrade::perform_firmware_upgrade`]. - pub upgrade_firmware: bool, + pub firmware_upgrade: FirmwareUpgradeMode, } impl Default for OPNsenseBootstrapScore { @@ -86,7 +92,7 @@ impl Default for OPNsenseBootstrapScore { target_lan: None, webgui_ready_timeout: std::time::Duration::from_secs(120), disable_http_redirect: false, - upgrade_firmware: true, + firmware_upgrade: FirmwareUpgradeMode::Auto, } } } @@ -299,8 +305,11 @@ impl Interpret for OPNsenseBootstrapInterpret { // happens against `vanilla_ip` — known reachable from here. The // firewall will come back at `vanilla_ip:target_api_port`, then // the rebind moves it onward. - if self.score.upgrade_firmware { - info!("{tag} Upgrading firmware to latest before optional LAN rebind ..."); + if self.score.firmware_upgrade != FirmwareUpgradeMode::Disabled { + info!( + "{tag} Running firmware upgrade (mode={:?}) before optional LAN rebind ...", + self.score.firmware_upgrade + ); let client = opnsense_api::OpnsenseClient::builder() .base_url(format!( "https://{vanilla_ip}:{}/api", @@ -315,12 +324,17 @@ impl Interpret for OPNsenseBootstrapInterpret { "Failed to build OPNsense client for firmware upgrade: {e}" )) })?; - let outcome = - perform_firmware_upgrade(&client, &vanilla_ip, self.score.target_api_port, &tag) - .await?; + let outcome = perform_firmware_upgrade( + &client, + &vanilla_ip, + self.score.target_api_port, + self.score.firmware_upgrade, + &tag, + ) + .await?; info!("{tag} Firmware upgrade outcome: {}", outcome.message); } else { - info!("{tag} upgrade_firmware=false; skipping firmware upgrade"); + info!("{tag} firmware_upgrade=Disabled; skipping firmware upgrade"); } // ── Step 5: optional LAN rebind ────────────────────────────── diff --git a/harmony/src/modules/opnsense/firmware_upgrade.rs b/harmony/src/modules/opnsense/firmware_upgrade.rs index e5480c1d..600f9012 100644 --- a/harmony/src/modules/opnsense/firmware_upgrade.rs +++ b/harmony/src/modules/opnsense/firmware_upgrade.rs @@ -47,6 +47,44 @@ const TASK_DONE_TIMEOUT: Duration = Duration::from_secs(1200); /// How long to wait for the API to come back after a reboot. const REBOOT_RECOVERY_TIMEOUT: Duration = Duration::from_secs(600); +/// How the firmware-upgrade helper decides whether (and how) to apply a +/// pending update. +/// +/// OPNsense's `firmware/status` endpoint returns the kind of pending change +/// in its `status` field: +/// +/// - `status == "update"` — in-series package update (e.g. 26.1 → 26.1.8). +/// Considered **minor**. +/// - `status == "upgrade"` — major-series upgrade (e.g. 26.1 → 26.7). +/// Considered **major**. +/// +/// This enum gates which kinds get applied automatically vs. require the +/// operator's explicit approval. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize)] +pub enum FirmwareUpgradeMode { + /// Apply every pending update and upgrade automatically. Latest version + /// always wins. + Auto, + /// Apply in-series updates (`status == "update"`) automatically but + /// skip major-series upgrades (`status == "upgrade"`). The Score + /// returns success without applying the major; rerun with `Auto` or + /// `Prompt` to pick it up. + AutoMinor, + /// For each pending update, print a summary and ask the operator + /// `[Y/n]` via stdin. Fails with a clear error if there is no TTY + /// (CI/headless contexts must pick `Auto`, `AutoMinor`, or `Disabled` + /// explicitly). + Prompt, + /// Skip firmware upgrades entirely. + Disabled, +} + +impl Default for FirmwareUpgradeMode { + fn default() -> Self { + FirmwareUpgradeMode::Auto + } +} + /// Errors the firmware-upgrade helper may surface. #[derive(Debug, Error)] pub enum FirmwareUpgradeError { @@ -60,6 +98,14 @@ pub enum FirmwareUpgradeError { UnexpectedStatus(String), #[error("Reached max upgrade iterations ({0}); firmware may have further pending updates")] TooManyIterations(u32), + #[error( + "FirmwareUpgradeMode::Prompt requires an interactive TTY. \ + Run in a terminal, or pick FirmwareUpgradeMode::Auto / AutoMinor / Disabled \ + for headless/CI contexts." + )] + PromptRequiresTty, + #[error("Operator declined the firmware update via interactive prompt")] + DeclinedByOperator, } impl From for InterpretError { @@ -87,19 +133,34 @@ pub struct UpgradeOutcome { /// Bring an OPNsense firewall to the latest firmware/package level. /// -/// `firewall_ip` and `api_port` are needed for post-reboot reachability -/// probes — the `OpnsenseClient` already knows them, but doesn't expose -/// them. `tag` is a short identifier (typically an IP) used as a log -/// prefix so this helper can be called from multiple contexts without -/// making log lines ambiguous. +/// `mode` gates whether and how each pending update is applied (see +/// [`FirmwareUpgradeMode`]). `firewall_ip` and `api_port` are needed for +/// post-reboot reachability probes — the `OpnsenseClient` already knows +/// them but doesn't expose them. `tag` is a short identifier (typically +/// an IP) used as a log prefix so this helper can be called from +/// multiple contexts without making log lines ambiguous. /// /// See module-level docs for the algorithm. pub async fn perform_firmware_upgrade( client: &OpnsenseClient, firewall_ip: &str, api_port: u16, + mode: FirmwareUpgradeMode, tag: &str, ) -> Result { + // ── Disabled short-circuit ────────────────────────────────────── + if mode == FirmwareUpgradeMode::Disabled { + let v = read_firmware_version(client).await?; + info!("{tag} firmware_upgrade mode=Disabled; skipping"); + return Ok(UpgradeOutcome { + upgraded: false, + rebooted: false, + initial_version: v.clone(), + final_version: v, + iterations: 0, + message: "Firmware upgrade skipped (mode=Disabled)".into(), + }); + } // ── Step 1: capture the initial version ────────────────────────── let initial_version = read_firmware_version(client).await?; info!("{tag} Initial firmware version: {initial_version}"); @@ -174,6 +235,102 @@ pub async fn perform_firmware_upgrade( } }; + // ── Step 4b: mode-gating ───────────────────────────────────── + // Build a human-readable summary now so we can log it (and feed + // it to a prompt if needed). + let opnsense_change = extract_opnsense_version_change(&status); + let summary = render_upgrade_summary( + &status_msg, + action_endpoint, + ¤t_version, + opnsense_change.as_ref(), + needs_reboot, + ); + info!("{tag} Pending firmware {action_endpoint}:\n{summary}"); + + match mode { + FirmwareUpgradeMode::Disabled => { + // Unreachable — handled at the top of the function — but + // exhaustiveness is nice. + unreachable!("FirmwareUpgradeMode::Disabled short-circuits earlier"); + } + FirmwareUpgradeMode::Auto => { + // Proceed for both "update" and "upgrade". + } + FirmwareUpgradeMode::AutoMinor => { + if action_endpoint == "upgrade" { + info!( + "{tag} mode=AutoMinor; skipping major-series upgrade. \ + Rerun with FirmwareUpgradeMode::Auto or Prompt to apply it." + ); + let final_message = if applied_any { + format!( + "Firmware: {initial_version} → {current_version} \ + in {iterations} iteration(s); stopped before major-series \ + upgrade (mode=AutoMinor)" + ) + } else { + format!( + "Major-series upgrade available but skipped (mode=AutoMinor); \ + firmware unchanged at {current_version}" + ) + }; + return Ok(UpgradeOutcome { + upgraded: applied_any, + rebooted: total_rebooted, + initial_version: initial_version.clone(), + final_version: current_version, + iterations, + message: final_message, + }); + } + } + FirmwareUpgradeMode::Prompt => { + let header = format!("Apply this firmware {action_endpoint} on {firewall_ip}?"); + let prompt_text = format!("{header}\n{summary}\n"); + let answer = inquire::Confirm::new(&prompt_text) + .with_default(true) + .prompt(); + match answer { + Ok(true) => { + info!("{tag} Operator accepted the {action_endpoint}"); + } + Ok(false) => { + info!("{tag} Operator declined the {action_endpoint}"); + let final_message = if applied_any { + format!( + "Firmware: {initial_version} → {current_version} \ + in {iterations} iteration(s); stopped after operator declined \ + the next {action_endpoint}" + ) + } else { + format!( + "Firmware {action_endpoint} available but declined by operator; \ + firmware unchanged at {current_version}" + ) + }; + return Ok(UpgradeOutcome { + upgraded: applied_any, + rebooted: total_rebooted, + initial_version: initial_version.clone(), + final_version: current_version, + iterations, + message: final_message, + }); + } + Err(inquire::InquireError::NotTTY) => { + return Err(FirmwareUpgradeError::PromptRequiresTty); + } + Err(e) => { + return Err(FirmwareUpgradeError::Api { + phase: "interactive prompt", + msg: e.to_string(), + }); + } + } + } + } + // ── Step 5: trigger the action ─────────────────────────────── info!("{tag} Triggering firmware/{action_endpoint} (async) ..."); let _: serde_json::Value = client @@ -259,6 +416,65 @@ pub async fn perform_firmware_upgrade( } /// Fetch the running firmware version from `/api/core/firmware/info`. +/// The version transition for the `opnsense` package itself, if it appears +/// in this update's package list. +struct OpnsensePackageChange { + old: String, + new: String, +} + +/// Look for an entry named `"opnsense"` in `status.all_packages` (status = +/// "update") or `status.all_sets` (status = "upgrade") and capture its +/// `old` → `new` version transition. +fn extract_opnsense_version_change(status: &serde_json::Value) -> Option { + // `all_packages` and `all_sets` are objects keyed by package name; the + // `opnsense` package being touched means a product-level version bump. + for field in ["all_packages", "all_sets"] { + if let Some(map) = status[field].as_object() + && let Some(entry) = map.get("opnsense").or_else(|| map.get("opnsense-update")) + { + let old = entry["old"].as_str().unwrap_or("").trim().to_string(); + let new = entry["new"].as_str().unwrap_or("").trim().to_string(); + if !new.is_empty() { + return Some(OpnsensePackageChange { old, new }); + } + } + } + None +} + +/// Build a short human-readable summary of a pending firmware update. +fn render_upgrade_summary( + status_msg: &str, + action_endpoint: &str, + current_version: &str, + opnsense_change: Option<&OpnsensePackageChange>, + needs_reboot: bool, +) -> String { + let main_version_line = match opnsense_change { + Some(c) => format!( + " Main OPNsense: {} → {} (the `opnsense` package itself is being updated)", + if c.old.is_empty() { "?" } else { &c.old }, + c.new + ), + None => format!( + " Main OPNsense: staying at {current_version} \ + (this update only touches packages, not the main OPNsense version)" + ), + }; + format!( + " Kind: {action_endpoint}\n\ + {main_version_line}\n\ + {summary_line}\n\ + {reboot_line}", + summary_line = format!(" Summary: {status_msg}"), + reboot_line = format!( + " Reboot needed: {}", + if needs_reboot { "yes" } else { "no" } + ), + ) +} + async fn read_firmware_version(client: &OpnsenseClient) -> Result { let info: serde_json::Value = client @@ -509,11 +725,16 @@ pub struct OPNsenseFirmwareUpgradeScore { /// HTTPS port the firewall's web GUI / API listens on. The default /// (9443) matches the value `OPNsenseBootstrapScore` moves the GUI to. pub api_port: u16, + /// How aggressive to be about applying pending updates. + pub mode: FirmwareUpgradeMode, } impl Default for OPNsenseFirmwareUpgradeScore { fn default() -> Self { - Self { api_port: 9443 } + Self { + api_port: 9443, + mode: FirmwareUpgradeMode::Auto, + } } } @@ -545,9 +766,14 @@ impl Interpret for OPNsenseFirmwareUpgradeInterpret { let tag = format!("[OPNsenseFirmwareUpgrade/{firewall_ip}]"); let config = topology.get_opnsense_config(); - let outcome = - perform_firmware_upgrade(config.client(), &firewall_ip, self.score.api_port, &tag) - .await?; + let outcome = perform_firmware_upgrade( + config.client(), + &firewall_ip, + self.score.api_port, + self.score.mode, + &tag, + ) + .await?; if outcome.upgraded { Ok(Outcome::success_with_details( -- 2.39.5 From 9eb36985ea3daba5c81e8f8e47cdb027b402fbf2 Mon Sep 17 00:00:00 2001 From: Sylvain Tremblay Date: Wed, 13 May 2026 07:23:29 -0400 Subject: [PATCH 14/38] fix(opnsense-config): fast-fail install_package via firmware/running idle signal MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `install_package`'s poll loop only watched `firmware/info` for the positive "package installed" signal. When OPNsense's background install task failed silently (typical on stale repo metadata: pkg can't find os-haproxy that matches the firmware), the package never appeared in `firmware/info`, so the loop consumed its entire 6-minute ceiling before returning Err. The caller's fallback ("refresh metadata + retry") couldn't fire for 6+ minutes — looked like a hang. Add a second poll signal each iteration: `firmware/running` reports the name of the currently active configd task (empty when idle). When the install task vanishes (empty for 2 consecutive polls) AND the package still isn't in `firmware/info`, we know the install ended without succeeding. Fail fast with: "OPNsense install task for ended without installing the package. The repository metadata is likely stale — try refreshing it via firmware/update, or run OPNsenseFirmwareUpgradeScore first, then retry." Typical failed install now detects within ~10s instead of 6min. The 120 × 3s ceiling stays as a safety net for "task running but never completes" pathologies. This restores the fast-fail behavior the OLD pre-refactor install_package had (via its bail-on-upgradestatus-404 path), with a proper, stable signal instead of relying on the documented-unstable endpoint. Co-Authored-By: Claude Opus 4.7 (1M context) --- opnsense-config/src/config/config.rs | 64 +++++++++++++++++++++++----- 1 file changed, 53 insertions(+), 11 deletions(-) diff --git a/opnsense-config/src/config/config.rs b/opnsense-config/src/config/config.rs index 568c40b2..42de9fa3 100644 --- a/opnsense-config/src/config/config.rs +++ b/opnsense-config/src/config/config.rs @@ -171,8 +171,21 @@ impl Config { /// Install an OPNsense plugin package via the firmware API. /// - /// Triggers the install, then polls `/api/core/firmware/info` until the - /// package shows up as installed (or the timeout fires). + /// Triggers the install, then polls two endpoints each iteration: + /// + /// - **A. success**: `/api/core/firmware/info` lists the package with + /// `installed == "1"` → `Ok(())`. + /// - **B. configd idle**: `/api/core/firmware/running` reports an + /// empty `status` for two consecutive polls — meaning OPNsense's + /// background install task is no longer running. Combined with + /// "package not in firmware/info", this is the install-failed + /// signal → `Err(Error::PackageInstall(...))`. Common when the + /// repository metadata is stale; a `firmware/update` (or a full + /// `OPNsenseFirmwareUpgradeScore`) is typically needed first. + /// + /// The `firmware/upgradestatus` endpoint is intentionally not used — + /// it's documented as "known to be unstable" in OPNsense 26.1.6 release + /// notes and the WebUI itself traps its generic error popup. pub async fn install_package(&self, package_name: &str) -> Result<(), Error> { info!("Installing OPNsense package {package_name}"); @@ -199,18 +212,15 @@ impl Config { resp.msg_uuid ); - // Poll the ground-truth signal: `/api/core/firmware/info` lists every - // package with `installed == "1"` once OPNsense has finished applying - // the install. The legacy approach polled `/api/core/firmware/upgradestatus`, - // which OPNsense's own 26.1.6 release notes mark as "known to be - // unstable" (the WebUI traps its generic error popup). Polling - // `firmware/info` removes that dependency entirely and lets us - // tolerate transient API errors (e.g. if the install transiently - // takes the API offline). let poll_interval = std::time::Duration::from_secs(3); - let max_attempts = 120; // 6 minutes + let max_attempts = 120; // 6 minutes — safety ceiling + const IDLE_THRESHOLD: u32 = 2; + let mut consecutive_idle: u32 = 0; + for attempt in 0..max_attempts { tokio::time::sleep(poll_interval).await; + + // Signal A — success: package shows up as installed. match self .client .get_typed::("core", "firmware", "info") @@ -236,6 +246,38 @@ impl Config { ); } } + + // Signal B — failure detection: configd is no longer running + // a task. If the package didn't appear (Signal A above) AND no + // task is running for two consecutive polls, the install + // definitively ended without succeeding. + match self + .client + .get_typed::("core", "firmware", "running") + .await + { + Ok(running) => { + let st = running["status"].as_str().unwrap_or("").trim().to_string(); + if st.is_empty() || st == "none" { + consecutive_idle += 1; + if consecutive_idle >= IDLE_THRESHOLD { + return Err(Error::PackageInstall(format!( + "OPNsense install task for {package_name} ended without \ + installing the package. The repository metadata is likely \ + stale — try refreshing it via firmware/update, or run \ + OPNsenseFirmwareUpgradeScore first, then retry." + ))); + } + } else { + consecutive_idle = 0; + } + } + Err(e) => { + debug!( + "firmware/running poll attempt {attempt} returned transient error: {e}; retrying" + ); + } + } } let msg = format!( -- 2.39.5 From d264c84c40cc6f160a845b3d015a150d02a7f0bd Mon Sep 17 00:00:00 2001 From: Sylvain Tremblay Date: Wed, 13 May 2026 07:23:47 -0400 Subject: [PATCH 15/38] refactor(opnsense-vm-integration): use perform_firmware_upgrade in the install fallback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Err arm after the first `install_package("os-haproxy")` attempt used to POST `firmware/update` (= `pkg update`, repo-metadata refresh) and sleep 5s before retrying. That's a weaker, hand-rolled subset of what `OPNsenseFirmwareUpgradeScore` / `perform_firmware_upgrade` already does properly. Replace with a call to `perform_firmware_upgrade(..., FirmwareUpgradeMode::Auto, ...)`. That does the full canonical flow: firmware/check → firmware/status → firmware/update or upgrade → poll (with multi-signal completion + reboot tolerance) → verify the product_version moved. After it returns, the firewall is at the latest firmware AND its package index is current, so the retry of `install_package("os-haproxy")` finds the right packages and succeeds. This is what the operator asked for: "[on install failure] it should call the firmware update score." Co-Authored-By: Claude Opus 4.7 (1M context) --- examples/opnsense_vm_integration/src/main.rs | 35 +++++++++++--------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/examples/opnsense_vm_integration/src/main.rs b/examples/opnsense_vm_integration/src/main.rs index 69f61479..143d7d01 100644 --- a/examples/opnsense_vm_integration/src/main.rs +++ b/examples/opnsense_vm_integration/src/main.rs @@ -284,23 +284,28 @@ async fn run_integration() -> Result<(), Box> { Ok(()) => info!("os-haproxy installed"), Err(e) => { warn!("os-haproxy install failed: {e}"); - info!("Refreshing package metadata via firmware/update, then retrying..."); + info!( + "Running OPNsense firmware upgrade (check → update / upgrade → reboot \ + → verify) to refresh metadata + apply pending updates, then retrying ..." + ); - // `firmware/update` is OPNsense's API hook for `pkg update` - // (refresh repository metadata). The first install attempt - // typically fails on a freshly bootstrapped firewall because - // the package index hasn't been pulled yet; this kicks it. - let _: serde_json::Value = config - .client() - .post_typed("core", "firmware", "update", None::<&()>) + // Use the canonical Score-shaped helper instead of a + // hand-rolled firmware/update call. It refreshes metadata, + // applies whatever's actionable, handles reboots, and + // verifies the version moved — exactly what a stale + // post-bootstrap firewall needs before a plugin install + // can succeed. + let outcome = + harmony::modules::opnsense::firmware_upgrade::perform_firmware_upgrade( + config.client(), + OPN_LAN_IP, + OPN_API_PORT, + harmony::modules::opnsense::firmware_upgrade::FirmwareUpgradeMode::Auto, + "[VmIntegration]", + ) .await - .map_err(|e| format!("firmware/update failed: {e}"))?; - - // Brief sleep for the metadata refresh to actually run. - // `install_package` itself is resilient to transient API - // errors (it polls `firmware/info` with retry tolerance), - // so we don't need to track reboot state here. - tokio::time::sleep(std::time::Duration::from_secs(5)).await; + .map_err(|e| format!("firmware upgrade failed: {e}"))?; + info!("Firmware upgrade outcome: {}", outcome.message); info!("Retrying os-haproxy install..."); config.install_package("os-haproxy").await?; -- 2.39.5 From a703442d8d8b5ccedee4dce2a80f6ce902e100a6 Mon Sep 17 00:00:00 2001 From: Sylvain Tremblay Date: Wed, 13 May 2026 07:29:21 -0400 Subject: [PATCH 16/38] fix(opnsense): recognize "ready" as the idle value from firmware/running MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I added Signal B polling to install_package + wait_for_task_or_reboot checking for `status == ""` or `"none"` as the "configd idle" condition, but OPNsense's `configctl firmware running` script (core/scripts/firmware/running.sh) actually outputs `"ready"` when no firmware operation holds the lock and `"busy"` when one does. So Signal B never fired against a real OPNsense — the loop kept seeing `status: "ready"` (= idle) and treating it as "still running". For install_package this meant a doomed install still consumed the full 6-minute timeout. For wait_for_task_or_reboot it was masked by Signal A (version moved) almost always winning first, but the bug was the same. Recognize "ready" (case-insensitive) plus defensive "" / "none" as idle. Verified against the upstream script: if ${FLOCK} -n 9; then echo "ready" else echo "busy" fi Co-Authored-By: Claude Opus 4.7 (1M context) --- .../src/modules/opnsense/firmware_upgrade.rs | 14 ++++++++++++-- opnsense-config/src/config/config.rs | 19 +++++++++++++++---- 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/harmony/src/modules/opnsense/firmware_upgrade.rs b/harmony/src/modules/opnsense/firmware_upgrade.rs index 600f9012..494c90c5 100644 --- a/harmony/src/modules/opnsense/firmware_upgrade.rs +++ b/harmony/src/modules/opnsense/firmware_upgrade.rs @@ -627,8 +627,18 @@ async fn wait_for_task_or_reboot( .await { Ok(running) => { - let st = running["status"].as_str().unwrap_or("").trim().to_string(); - if st.is_empty() || st == "none" { + // OPNsense's `configctl firmware running` script (see + // core/scripts/firmware/running.sh) prints "ready" when + // no firmware operation holds the lock and "busy" when + // one does. Recognize "ready" (and defensive variants) + // as idle. + let st = running["status"] + .as_str() + .unwrap_or("") + .trim() + .to_ascii_lowercase(); + let is_idle = st.is_empty() || st == "ready" || st == "none"; + if is_idle { consecutive_idle += 1; if consecutive_idle >= IDLE_THRESHOLD { info!( diff --git a/opnsense-config/src/config/config.rs b/opnsense-config/src/config/config.rs index 42de9fa3..8a929e31 100644 --- a/opnsense-config/src/config/config.rs +++ b/opnsense-config/src/config/config.rs @@ -257,14 +257,25 @@ impl Config { .await { Ok(running) => { - let st = running["status"].as_str().unwrap_or("").trim().to_string(); - if st.is_empty() || st == "none" { + // OPNsense's `configctl firmware running` script + // (see core/scripts/firmware/running.sh) prints "ready" + // when no firmware operation holds the lock, "busy" + // when one does. Recognize "ready" (and defensive + // "" / "none" / case variants) as idle. + let st = running["status"] + .as_str() + .unwrap_or("") + .trim() + .to_ascii_lowercase(); + let is_idle = st.is_empty() || st == "ready" || st == "none"; + if is_idle { consecutive_idle += 1; if consecutive_idle >= IDLE_THRESHOLD { return Err(Error::PackageInstall(format!( "OPNsense install task for {package_name} ended without \ - installing the package. The repository metadata is likely \ - stale — try refreshing it via firmware/update, or run \ + installing the package (firmware/running idle for {consecutive_idle} \ + consecutive polls). The repository metadata is likely stale \ + — try refreshing it via firmware/update, or run \ OPNsenseFirmwareUpgradeScore first, then retry." ))); } -- 2.39.5 From 689ab8d21afcf87a82ad7339706f5ee18ca0d07a Mon Sep 17 00:00:00 2001 From: Sylvain Tremblay Date: Wed, 13 May 2026 07:42:58 -0400 Subject: [PATCH 17/38] fix(opnsense): install_package polls upgradestatus + inspects log; example uses the Score via run_cli MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two related changes the operator asked for, replacing the previous "firmware/running ready/busy" heuristic that felt unclean. 1. **install_package**: drop the `firmware/info` + `firmware/running` ready-idle polling. Restore the OPNsense-native pattern: poll `/api/core/firmware/upgradestatus` until `status == "done"` (same endpoint OPNsense's own WebUI uses for its install progress popup), then verify via `firmware/info` whether the package actually got installed. On failure, surface pkg's actual error from the `log` field of the upgradestatus response (last 8 non-empty lines) plus a "run OPNsenseFirmwareUpgradeScore first" hint. Tolerate transient upgradestatus errors as the 26.1.6 release notes document the endpoint as unstable; 120 × 3 s ceiling is the safety net. Now produces the clear, fast-fail message the operator remembers from before the branch, but with the actual pkg failure reason ("pkg: No packages available to install matching 'os-haproxy'", or whatever the underlying issue is) included. 2. **opnsense_vm_integration**: the post-install-failure fallback now composes `OPNsenseFirmwareUpgradeScore { mode: Auto }` into a `Vec>>` and dispatches it via `harmony_cli::run_cli`, matching the way the rest of `run_integration` runs its Scores. Replaces the direct call to the bare `perform_firmware_upgrade()` helper. Co-Authored-By: Claude Opus 4.7 (1M context) --- examples/opnsense_vm_integration/src/main.rs | 48 ++++---- opnsense-config/src/config/config.rs | 109 ++++++++----------- 2 files changed, 75 insertions(+), 82 deletions(-) diff --git a/examples/opnsense_vm_integration/src/main.rs b/examples/opnsense_vm_integration/src/main.rs index 143d7d01..58175e41 100644 --- a/examples/opnsense_vm_integration/src/main.rs +++ b/examples/opnsense_vm_integration/src/main.rs @@ -285,27 +285,37 @@ async fn run_integration() -> Result<(), Box> { Err(e) => { warn!("os-haproxy install failed: {e}"); info!( - "Running OPNsense firmware upgrade (check → update / upgrade → reboot \ - → verify) to refresh metadata + apply pending updates, then retrying ..." + "Running OPNsenseFirmwareUpgradeScore (mode=Auto), then retrying \ + os-haproxy install ..." ); - // Use the canonical Score-shaped helper instead of a - // hand-rolled firmware/update call. It refreshes metadata, - // applies whatever's actionable, handles reboots, and - // verifies the version moved — exactly what a stale - // post-bootstrap firewall needs before a plugin install - // can succeed. - let outcome = - harmony::modules::opnsense::firmware_upgrade::perform_firmware_upgrade( - config.client(), - OPN_LAN_IP, - OPN_API_PORT, - harmony::modules::opnsense::firmware_upgrade::FirmwareUpgradeMode::Auto, - "[VmIntegration]", - ) - .await - .map_err(|e| format!("firmware upgrade failed: {e}"))?; - info!("Firmware upgrade outcome: {}", outcome.message); + // Compose the Score and dispatch via `harmony_cli::run_cli` + // — same orchestrator path the rest of run_integration + // uses for the integration-test scores below. Goes through + // the full Score → Interpret machinery (events, logging, + // progress). + let upgrade_score = + harmony::modules::opnsense::firmware_upgrade::OPNsenseFirmwareUpgradeScore { + api_port: OPN_API_PORT, + mode: FirmwareUpgradeMode::Auto, + }; + let upgrade_scores: Vec>> = + vec![Box::new(upgrade_score)]; + let upgrade_args = harmony_cli::Args { + yes: true, + filter: None, + interactive: false, + all: true, + number: 0, + list: false, + }; + harmony_cli::run_cli( + Inventory::autoload(), + opnsense.clone(), + upgrade_scores, + upgrade_args, + ) + .await?; info!("Retrying os-haproxy install..."); config.install_package("os-haproxy").await?; diff --git a/opnsense-config/src/config/config.rs b/opnsense-config/src/config/config.rs index 8a929e31..789b7e68 100644 --- a/opnsense-config/src/config/config.rs +++ b/opnsense-config/src/config/config.rs @@ -171,21 +171,20 @@ impl Config { /// Install an OPNsense plugin package via the firmware API. /// - /// Triggers the install, then polls two endpoints each iteration: + /// Triggers the install asynchronously, then polls + /// `/api/core/firmware/upgradestatus` for `status == "done"` (the same + /// pattern OPNsense's own WebUI uses for its install progress popup). + /// When the task ends, verifies via `/api/core/firmware/info` whether + /// the package actually got installed: /// - /// - **A. success**: `/api/core/firmware/info` lists the package with - /// `installed == "1"` → `Ok(())`. - /// - **B. configd idle**: `/api/core/firmware/running` reports an - /// empty `status` for two consecutive polls — meaning OPNsense's - /// background install task is no longer running. Combined with - /// "package not in firmware/info", this is the install-failed - /// signal → `Err(Error::PackageInstall(...))`. Common when the - /// repository metadata is stale; a `firmware/update` (or a full - /// `OPNsenseFirmwareUpgradeScore`) is typically needed first. + /// - Installed → `Ok(())`. + /// - Not installed → `Err(Error::PackageInstall { … })`, with the + /// tail of `upgradestatus.log` (pkg's actual error output) embedded + /// in the message + a hint to run `OPNsenseFirmwareUpgradeScore`. /// - /// The `firmware/upgradestatus` endpoint is intentionally not used — - /// it's documented as "known to be unstable" in OPNsense 26.1.6 release - /// notes and the WebUI itself traps its generic error popup. + /// `upgradestatus` errors are tolerated as transient (OPNsense 26.1.6 + /// release notes mark the endpoint as unstable; the WebUI traps its + /// error popup). The 120 × 3 s ceiling is the safety net. pub async fn install_package(&self, package_name: &str) -> Result<(), Error> { info!("Installing OPNsense package {package_name}"); @@ -214,19 +213,23 @@ impl Config { let poll_interval = std::time::Duration::from_secs(3); let max_attempts = 120; // 6 minutes — safety ceiling - const IDLE_THRESHOLD: u32 = 2; - let mut consecutive_idle: u32 = 0; - for attempt in 0..max_attempts { tokio::time::sleep(poll_interval).await; - - // Signal A — success: package shows up as installed. match self .client - .get_typed::("core", "firmware", "info") + .get_typed::("core", "firmware", "upgradestatus") .await { - Ok(info) => { + Ok(s) => { + if s["status"].as_str() != Some("done") { + continue; + } + // Task ended. Did it install the package? + let info: serde_json::Value = self + .client + .get_typed("core", "firmware", "info") + .await + .map_err(Error::Api)?; let installed = info["package"] .as_array() .and_then(|pkgs| { @@ -239,53 +242,33 @@ impl Config { info!("Package {package_name} installed successfully"); return Ok(()); } - } - Err(e) => { - debug!( - "firmware/info poll attempt {attempt} returned transient error: {e}; retrying" - ); - } - } - - // Signal B — failure detection: configd is no longer running - // a task. If the package didn't appear (Signal A above) AND no - // task is running for two consecutive polls, the install - // definitively ended without succeeding. - match self - .client - .get_typed::("core", "firmware", "running") - .await - { - Ok(running) => { - // OPNsense's `configctl firmware running` script - // (see core/scripts/firmware/running.sh) prints "ready" - // when no firmware operation holds the lock, "busy" - // when one does. Recognize "ready" (and defensive - // "" / "none" / case variants) as idle. - let st = running["status"] - .as_str() - .unwrap_or("") - .trim() - .to_ascii_lowercase(); - let is_idle = st.is_empty() || st == "ready" || st == "none"; - if is_idle { - consecutive_idle += 1; - if consecutive_idle >= IDLE_THRESHOLD { - return Err(Error::PackageInstall(format!( - "OPNsense install task for {package_name} ended without \ - installing the package (firmware/running idle for {consecutive_idle} \ - consecutive polls). The repository metadata is likely stale \ - — try refreshing it via firmware/update, or run \ - OPNsenseFirmwareUpgradeScore first, then retry." - ))); - } + // Install task ended without installing the package. + // Surface pkg's actual error output from the log field. + let log = s["log"].as_str().unwrap_or(""); + let tail: Vec<&str> = log + .lines() + .filter(|l| !l.trim().is_empty()) + .rev() + .take(8) + .collect::>() + .into_iter() + .rev() + .collect(); + let reason = if tail.is_empty() { + "(OPNsense returned no log output)".to_string() } else { - consecutive_idle = 0; - } + format!("Last OPNsense log output:\n{}", tail.join("\n")) + }; + return Err(Error::PackageInstall(format!( + "OPNsense install task for {package_name} ended without installing \ + the package.\n\n{reason}\n\nThis typically means the firmware needs \ + to be brought current — run OPNsenseFirmwareUpgradeScore first, \ + then retry." + ))); } Err(e) => { debug!( - "firmware/running poll attempt {attempt} returned transient error: {e}; retrying" + "firmware/upgradestatus poll attempt {attempt}: {e}; retrying (endpoint is documented unstable on OPNsense 26.1)" ); } } -- 2.39.5 From 51854e205cdebb0fe119849dbdadd520ccaa5ae0 Mon Sep 17 00:00:00 2001 From: Sylvain Tremblay Date: Wed, 13 May 2026 07:59:56 -0400 Subject: [PATCH 18/38] feat(opnsense): OPNsensePackageInstallScore + linear pipeline in vm_integration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Every other operational primitive in harmony has a Score wrapper between the low-level call and the user-facing composition layer. Package installation didn't — `Config::install_package` was being called naked from the integration example with a hand-rolled `match install_package() { Ok=>… Err=>compose-firmware-upgrade-Score-and-retry }` glue. That's exactly the "imperative orchestration in the caller" pattern harmony's CLAUDE.md tells us to push into Scores. This commit: - Adds `OPNsensePackageInstallScore { packages: Vec }` in a new `harmony/src/modules/opnsense/package_install.rs`. The Interpret iterates packages, skips ones already installed via `is_package_installed`, calls `install_package` on the rest, surfaces newly-installed vs. already-present in `Outcome::success_with_details`. Idempotent on re-runs. - Adds the `OPNsensePackageInstall` variant to `InterpretName` + Display. - The Score deliberately has NO firmware-upgrade fallback baked in. If install fails because firmware is stale, `install_package`'s error message already points the operator at `OPNsenseFirmwareUpgradeScore`. Composition is the operator's job — same as every other Score pair relationship in harmony. - Rewrites `examples/opnsense_vm_integration::run_integration` to drop the ~40-line try/Err/retry block. The two new Scores (firmware upgrade + package install) are prepended to `build_all_scores`, so the pipeline becomes a linear vec: vec![ OPNsenseFirmwareUpgradeScore { mode: Auto, .. }, OPNsensePackageInstallScore { packages: vec!["os-haproxy"] }, webgui, lb, dhcp, … (existing config scores) ] Both `run_cli` invocations (run 1 and the idempotency run 2) exercise the new Scores. Both naturally NOOP on the second pass: upgrade because `firmware/status == "none"`, install because `is_package_installed("os-haproxy") == true`. Three unit tests in the new module cover Score name, serialization, and empty-package-list handling. Co-Authored-By: Claude Opus 4.7 (1M context) --- examples/opnsense_vm_integration/src/main.rs | 79 +++----- harmony/src/domain/interpret/mod.rs | 2 + harmony/src/modules/opnsense/mod.rs | 1 + .../src/modules/opnsense/package_install.rs | 184 ++++++++++++++++++ 4 files changed, 214 insertions(+), 52 deletions(-) create mode 100644 harmony/src/modules/opnsense/package_install.rs diff --git a/examples/opnsense_vm_integration/src/main.rs b/examples/opnsense_vm_integration/src/main.rs index 58175e41..497cc38d 100644 --- a/examples/opnsense_vm_integration/src/main.rs +++ b/examples/opnsense_vm_integration/src/main.rs @@ -36,9 +36,12 @@ use harmony::modules::opnsense::dnat::{DnatRuleDef, DnatScore}; use harmony::modules::opnsense::firewall::{ BinatRuleDef, BinatScore, FilterRuleDef, FirewallRuleScore, OutboundNatScore, SnatRuleDef, }; -use harmony::modules::opnsense::firmware_upgrade::FirmwareUpgradeMode; +use harmony::modules::opnsense::firmware_upgrade::{ + FirmwareUpgradeMode, OPNsenseFirmwareUpgradeScore, +}; use harmony::modules::opnsense::lagg::{LaggDef, LaggScore}; use harmony::modules::opnsense::node_exporter::NodeExporterScore; +use harmony::modules::opnsense::package_install::OPNsensePackageInstallScore; use harmony::modules::opnsense::vip::{VipDef, VipScore}; use harmony::modules::opnsense::vlan::{VlanDef, VlanScore}; use harmony::modules::tftp::TftpScore; @@ -55,7 +58,7 @@ use harmony_types::firewall::{ }; use harmony_types::id::Id; use harmony_types::net::{MacAddress, Url}; -use log::{info, warn}; +use log::info; const OPNSENSE_IMG_URL: &str = "https://mirror.ams1.nl.leaseweb.net/opnsense/releases/26.1/OPNsense-26.1-nano-amd64.img.bz2"; @@ -213,7 +216,8 @@ async fn boot_vm( // each integration run to spend 10+ minutes upgrading. The // operator can override to `Auto` / `AutoMinor` / `Prompt` // locally when testing the upgrade beat. - firmware_upgrade: FirmwareUpgradeMode::Disabled, + // firmware_upgrade: FirmwareUpgradeMode::Disabled, + firmware_upgrade: FirmwareUpgradeMode::Prompt, ..Default::default() })]; let bootstrap_args = harmony_cli::Args { @@ -276,56 +280,12 @@ async fn run_integration() -> Result<(), Box> { OPNSenseFirewall::with_api_port(firewall_host, None, OPN_API_PORT, &api_creds, &ssh_creds) .await; - // Install packages - let config = opnsense.get_opnsense_config(); - if !config.is_package_installed("os-haproxy").await { - info!("Installing os-haproxy (may need firmware update first)..."); - match config.install_package("os-haproxy").await { - Ok(()) => info!("os-haproxy installed"), - Err(e) => { - warn!("os-haproxy install failed: {e}"); - info!( - "Running OPNsenseFirmwareUpgradeScore (mode=Auto), then retrying \ - os-haproxy install ..." - ); - - // Compose the Score and dispatch via `harmony_cli::run_cli` - // — same orchestrator path the rest of run_integration - // uses for the integration-test scores below. Goes through - // the full Score → Interpret machinery (events, logging, - // progress). - let upgrade_score = - harmony::modules::opnsense::firmware_upgrade::OPNsenseFirmwareUpgradeScore { - api_port: OPN_API_PORT, - mode: FirmwareUpgradeMode::Auto, - }; - let upgrade_scores: Vec>> = - vec![Box::new(upgrade_score)]; - let upgrade_args = harmony_cli::Args { - yes: true, - filter: None, - interactive: false, - all: true, - number: 0, - list: false, - }; - harmony_cli::run_cli( - Inventory::autoload(), - opnsense.clone(), - upgrade_scores, - upgrade_args, - ) - .await?; - - info!("Retrying os-haproxy install..."); - config.install_package("os-haproxy").await?; - } - } - } else { - info!("os-haproxy already installed"); - } - // ── Build and run all Scores ────────────────────────────────────── + // Pipeline starts with the firmware upgrade Score (brings the + // freshly-bootstrapped image current) and the package-install Score + // (installs os-haproxy now that the repo metadata is current). + // Everything downstream is configuration scores that depend on the + // plugin being installed. No imperative install/retry glue. info!("Running all Scores (run 1)..."); let scores = build_all_scores()?; let args = harmony_cli::Args { @@ -743,7 +703,22 @@ fn build_all_scores() -> Result>>, Box f.write_str("OPNSenseDns"), InterpretName::OPNsenseBootstrap => f.write_str("OPNsenseBootstrap"), InterpretName::OPNsenseFirmwareUpgrade => f.write_str("OPNsenseFirmwareUpgrade"), + InterpretName::OPNsensePackageInstall => f.write_str("OPNsensePackageInstall"), InterpretName::LoadBalancer => f.write_str("LoadBalancer"), InterpretName::Tftp => f.write_str("Tftp"), InterpretName::Http => f.write_str("Http"), diff --git a/harmony/src/modules/opnsense/mod.rs b/harmony/src/modules/opnsense/mod.rs index 47fd3c3d..0ee0eeba 100644 --- a/harmony/src/modules/opnsense/mod.rs +++ b/harmony/src/modules/opnsense/mod.rs @@ -6,6 +6,7 @@ pub mod firmware_upgrade; pub mod image; pub mod lagg; pub mod node_exporter; +pub mod package_install; mod shell; mod upgrade; pub mod vip; diff --git a/harmony/src/modules/opnsense/package_install.rs b/harmony/src/modules/opnsense/package_install.rs new file mode 100644 index 00000000..12db3624 --- /dev/null +++ b/harmony/src/modules/opnsense/package_install.rs @@ -0,0 +1,184 @@ +//! `OPNsensePackageInstallScore` — install one or more OPNsense plugin / +//! package via the REST API, idempotently. +//! +//! The Score is a thin wrapper around `opnsense_config::Config::install_package` +//! (the low-level method). It does two things on top of the bare call: +//! +//! 1. **Idempotency** — per package, skips the install when +//! `is_package_installed` already reports it present. +//! 2. **Score composition** — fits in a `Vec>>` +//! so operators can build linear pipelines instead of writing try/Err glue. +//! +//! Intentionally has **no** firmware-upgrade fallback. If the package fails to +//! install because the firmware is stale, the underlying `install_package` +//! returns a clear error that points the operator at +//! [`OPNsenseFirmwareUpgradeScore`](crate::modules::opnsense::firmware_upgrade::OPNsenseFirmwareUpgradeScore). +//! Compose that Score earlier in your pipeline if you want firmware-current +//! before plugin installs: +//! +//! ```ignore +//! vec![ +//! Box::new(OPNsenseFirmwareUpgradeScore { mode: Auto, api_port: 9443 }), +//! Box::new(OPNsensePackageInstallScore { +//! packages: vec!["os-haproxy".into()], +//! }), +//! // ... other Score ... +//! ] +//! ``` + +use async_trait::async_trait; +use harmony_types::id::Id; +use log::info; +use serde::Serialize; + +use crate::{ + data::Version, + infra::opnsense::OPNSenseFirewall, + interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome}, + inventory::Inventory, + score::Score, +}; + +/// Install one or more OPNsense packages / plugins (e.g. `os-haproxy`). +/// +/// See module-level docs. +#[derive(Debug, Clone, Serialize)] +pub struct OPNsensePackageInstallScore { + /// Package names to install, in order. + pub packages: Vec, +} + +impl Score for OPNsensePackageInstallScore { + fn name(&self) -> String { + "OPNsensePackageInstallScore".to_string() + } + + fn create_interpret(&self) -> Box> { + Box::new(OPNsensePackageInstallInterpret { + score: self.clone(), + }) + } +} + +#[derive(Debug)] +struct OPNsensePackageInstallInterpret { + score: OPNsensePackageInstallScore, +} + +#[async_trait] +impl Interpret for OPNsensePackageInstallInterpret { + async fn execute( + &self, + _inventory: &Inventory, + topology: &OPNSenseFirewall, + ) -> Result { + let firewall_ip = topology.get_ip().to_string(); + let tag = format!("[OPNsensePackageInstall/{firewall_ip}]"); + let config = topology.get_opnsense_config(); + + if self.score.packages.is_empty() { + info!("{tag} No packages requested; nothing to do"); + return Ok(Outcome::noop("No packages requested".to_string())); + } + + let mut already_installed: Vec = Vec::new(); + let mut newly_installed: Vec = Vec::new(); + + for pkg in &self.score.packages { + if config.is_package_installed(pkg).await { + info!("{tag} {pkg}: already installed; skipping"); + already_installed.push(pkg.clone()); + continue; + } + info!("{tag} Installing {pkg} ..."); + config.install_package(pkg).await.map_err(|e| { + InterpretError::new(format!( + "Failed to install OPNsense package '{pkg}' on {firewall_ip}: {e}" + )) + })?; + info!("{tag} {pkg}: installed successfully"); + newly_installed.push(pkg.clone()); + } + + let total = self.score.packages.len(); + let details = vec![ + format!( + "Newly installed ({}): {:?}", + newly_installed.len(), + newly_installed + ), + format!( + "Already installed, skipped ({}): {:?}", + already_installed.len(), + already_installed + ), + ]; + + if newly_installed.is_empty() { + Ok(Outcome::noop(format!( + "All {total} package(s) already installed on {firewall_ip}" + ))) + } else { + Ok(Outcome::success_with_details( + format!( + "Installed {} of {total} packages on {firewall_ip} ({} already present)", + newly_installed.len(), + already_installed.len(), + ), + details, + )) + } + } + + fn get_name(&self) -> InterpretName { + InterpretName::OPNsensePackageInstall + } + + fn get_version(&self) -> Version { + Version::from("1.0.0").unwrap() + } + + fn get_status(&self) -> InterpretStatus { + InterpretStatus::QUEUED + } + + fn get_children(&self) -> Vec { + vec![] + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_score_name() { + let s = OPNsensePackageInstallScore { + packages: vec!["os-haproxy".into()], + }; + assert_eq!( + >::name(&s), + "OPNsensePackageInstallScore" + ); + } + + #[test] + fn test_score_serializes() { + let s = OPNsensePackageInstallScore { + packages: vec!["os-haproxy".into(), "os-zerotier".into()], + }; + let _: serde_value::Value = + serde_value::to_value(&s).expect("OPNsensePackageInstallScore should serialize"); + } + + #[test] + fn test_empty_package_list_is_valid() { + let s = OPNsensePackageInstallScore { packages: vec![] }; + // Just confirm name + serialize still work with no packages. + assert_eq!( + >::name(&s), + "OPNsensePackageInstallScore" + ); + let _: serde_value::Value = serde_value::to_value(&s).unwrap(); + } +} -- 2.39.5 From 5f34fd5d35e829c26467b28cb1c09be1ddefaabf Mon Sep 17 00:00:00 2001 From: Sylvain Tremblay Date: Wed, 13 May 2026 08:09:55 -0400 Subject: [PATCH 19/38] fix(opnsense): drop redundant summary block from Prompt-mode confirmation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In FirmwareUpgradeMode::Prompt, the summary block was being printed twice — once via the `info!("{tag} Pending firmware …:\n{summary}")` line just above the mode-gating match, and again inside the inquire::Confirm prompt's header text. The prompt now asks only the yes/no question; the operator reads the summary from the info! log line one row above. Co-Authored-By: Claude Opus 4.7 (1M context) --- harmony/src/modules/opnsense/firmware_upgrade.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/harmony/src/modules/opnsense/firmware_upgrade.rs b/harmony/src/modules/opnsense/firmware_upgrade.rs index 494c90c5..0a288222 100644 --- a/harmony/src/modules/opnsense/firmware_upgrade.rs +++ b/harmony/src/modules/opnsense/firmware_upgrade.rs @@ -286,8 +286,10 @@ pub async fn perform_firmware_upgrade( } } FirmwareUpgradeMode::Prompt => { - let header = format!("Apply this firmware {action_endpoint} on {firewall_ip}?"); - let prompt_text = format!("{header}\n{summary}\n"); + // Summary was already info!-logged just above; the prompt + // itself just asks the yes/no question. + let prompt_text = + format!("Apply this firmware {action_endpoint} on {firewall_ip}?"); let answer = inquire::Confirm::new(&prompt_text) .with_default(true) .prompt(); -- 2.39.5 From 27f18d601a3f529a91de829aebb3b8abd5107740 Mon Sep 17 00:00:00 2001 From: Sylvain Tremblay Date: Wed, 13 May 2026 08:14:35 -0400 Subject: [PATCH 20/38] refactor(opnsense): drop the wizard-abort call from OPNsenseBootstrapScore MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The call to `OPNsenseBootstrap::abort_wizard()` (POST /api/core/initial_setup/abort) failed with 403 Forbidden on every run: that endpoint requires a session-CSRF token under cookie auth and we don't fetch one before calling it (only `login()` extracts a token, and it's tied to the login form). The 403 was logged as WARN and silently ignored — and empirically the wizard flag doesn't block ANY of the following steps (SSH enable, web GUI port change, API key mint via SSH, LAN rebind, firmware upgrade). So the call was producing log noise for no observable benefit. Drop the call from the Score's interpret flow. The `OPNsenseBootstrap::abort_wizard()` helper stays in the library — a future caller that wants to do it properly (GET an authenticated page, extract its CSRF token, include it in the abort POST) can still use it. Only downside: a human operator who later opens the WebUI manually will see the OPNsense first-run wizard prompt once and have to dismiss it. Acceptable trade for clean automated bootstrap logs. Co-Authored-By: Claude Opus 4.7 (1M context) --- harmony/src/modules/opnsense/bootstrap_score.rs | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/harmony/src/modules/opnsense/bootstrap_score.rs b/harmony/src/modules/opnsense/bootstrap_score.rs index 1c1b968a..dc43b204 100644 --- a/harmony/src/modules/opnsense/bootstrap_score.rs +++ b/harmony/src/modules/opnsense/bootstrap_score.rs @@ -238,11 +238,15 @@ impl Interpret for OPNsenseBootstrapInterpret { })?; info!("{tag} Logged in to web UI as {}", topology.default_username); - bootstrap - .abort_wizard() - .await - .map_err(|e| InterpretError::new(format!("Failed to abort setup wizard: {e}")))?; - info!("{tag} Aborted initial setup wizard"); + // Wizard-abort skipped: `POST /api/core/initial_setup/abort` + // requires a session-CSRF token we don't fetch (it returns 403 + // without it), AND empirically the wizard flag doesn't block any + // of the subsequent steps (SSH enable, port change, API key mint, + // LAN rebind). The only observable effect of leaving it set is + // that a human operator who later opens the WebUI manually will + // see the wizard prompt once. The helper `OPNsenseBootstrap:: + // abort_wizard()` is still available if a future caller wants to + // do it properly with CSRF. bootstrap .enable_ssh(true, true) -- 2.39.5 From 92717441b606800104b4572f6eceeb9621367124 Mon Sep 17 00:00:00 2001 From: Sylvain Tremblay Date: Fri, 15 May 2026 06:55:23 -0400 Subject: [PATCH 21/38] =?UTF-8?q?refactor(opnsense):=20post-review=20clean?= =?UTF-8?q?up=20=E2=80=94=20named=20timeouts,=20shared=20upgradestatus=20h?= =?UTF-8?q?elper,=20doc=20fixes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - examples/opnsense_vm_integration: flip firmware_upgrade back to Disabled (the Score pipeline already runs OPNsenseFirmwareUpgradeScore explicitly, bootstrap-time upgrade was redundant); rewrite module docstring to match post-refactor behavior. - examples/opnsense_pair_integration: add TODO near abort_wizard noting the example should migrate to compose OPNsenseBootstrapScore. - harmony::modules::opnsense::firmware_upgrade: pull magic timeouts into named module-scope consts with one-line rationale; reuse the new shared check_firmware_task_done helper for upgradestatus polling. - opnsense-config: add check_firmware_task_done helper + name install_package's poll interval / max attempts; install_package now shares the helper. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../opnsense_pair_integration/src/main.rs | 12 ++ examples/opnsense_vm_integration/src/main.rs | 38 +++-- .../src/modules/opnsense/firmware_upgrade.rs | 76 ++++++---- opnsense-config/src/config/config.rs | 137 ++++++++++-------- opnsense-config/src/lib.rs | 1 + 5 files changed, 168 insertions(+), 96 deletions(-) diff --git a/examples/opnsense_pair_integration/src/main.rs b/examples/opnsense_pair_integration/src/main.rs index 381fcf52..307bf9ac 100644 --- a/examples/opnsense_pair_integration/src/main.rs +++ b/examples/opnsense_pair_integration/src/main.rs @@ -220,6 +220,18 @@ async fn boot_pair( async fn bootstrap_vm(role: &str, ip: &str) -> Result<(), Box> { info!("Bootstrapping {role} firewall at {ip}..."); + // TODO: migrate this example to compose `OPNsenseBootstrapScore` + // against `OPNsenseBootstrapTopology`, mirroring the + // `opnsense_vm_integration` refactor. That replaces this whole + // procedural dance (login → abort_wizard → enable_ssh → + // set_webgui_port → wait_for_ready → mint API key via SSH) with + // a single `harmony_cli::run_cli` invocation of the Score. The + // dual-firewall scenario will need per-instance secret keys + // (tracked at `harmony/src/domain/config/secret.rs:17`); migrate + // after that lands. Until then the `abort_wizard()` call below + // continues to 403 + WARN (same reason it was dropped from + // `OPNsenseBootstrapScore` in commit 27f18d60) — known-noisy, + // doesn't block any subsequent step. let bootstrap = OPNsenseBootstrap::new(&format!("https://{ip}")); bootstrap.login("root", "opnsense").await?; bootstrap.abort_wizard().await?; diff --git a/examples/opnsense_vm_integration/src/main.rs b/examples/opnsense_vm_integration/src/main.rs index 497cc38d..4c967a4f 100644 --- a/examples/opnsense_vm_integration/src/main.rs +++ b/examples/opnsense_vm_integration/src/main.rs @@ -2,18 +2,28 @@ //! //! Fully unattended workflow — no manual browser interaction required: //! -//! 1. `--boot` — creates a KVM VM, waits for web UI, bootstraps SSH + webgui port -//! 2. (default run) — creates API key via SSH, installs packages, runs Scores -//! 3. `--full` — does both in a single invocation (CI-friendly) +//! 1. `--boot` — provisions a KVM VM (image inject, network, qcow2, +//! `virsh` define + start), then dispatches `OPNsenseBootstrapScore`: +//! login → SSH enable → web GUI port move to 9443 → API key mint → +//! persist `OPNSenseApiCredentials` + `OPNSenseFirewallCredentials` +//! to `harmony_secret::SecretManager`. +//! 2. (default run) — reads the stored credentials, runs the integration +//! Score pipeline against `OPNSenseFirewall`: +//! `OPNsenseFirmwareUpgradeScore` (brings firmware current) → +//! `OPNsensePackageInstallScore { os-haproxy }` → the config Scores +//! (web GUI port, load balancer, DHCP, TFTP, node exporter, VLAN, +//! firewall rules, SNAT/BINAT/VIP/DNAT, LAGG) → idempotency-rerun +//! of the same pipeline → entity-count assertions. +//! 3. `--full` — does both in a single invocation (CI-friendly). //! //! # Usage //! //! ```bash //! cargo run -p opnsense-vm-integration -- --check # verify prerequisites //! cargo run -p opnsense-vm-integration -- --download # download OPNsense image -//! cargo run -p opnsense-vm-integration -- --boot # create VM + automated bootstrap -//! cargo run -p opnsense-vm-integration # run integration test -//! cargo run -p opnsense-vm-integration -- --full # boot + bootstrap + test (CI mode) +//! cargo run -p opnsense-vm-integration -- --boot # create VM + run OPNsenseBootstrapScore +//! cargo run -p opnsense-vm-integration # run integration-test Score pipeline +//! cargo run -p opnsense-vm-integration -- --full # boot + bootstrap + pipeline (CI mode) //! cargo run -p opnsense-vm-integration -- --status # check VM state //! cargo run -p opnsense-vm-integration -- --clean # tear down everything //! ``` @@ -212,12 +222,14 @@ async fn boot_vm( let bootstrap_scores: Vec>> = vec![Box::new(OPNsenseBootstrapScore { target_api_port: OPN_API_PORT, - // The VM image is a known firmware version; we don't want - // each integration run to spend 10+ minutes upgrading. The - // operator can override to `Auto` / `AutoMinor` / `Prompt` - // locally when testing the upgrade beat. - // firmware_upgrade: FirmwareUpgradeMode::Disabled, - firmware_upgrade: FirmwareUpgradeMode::Prompt, + // The VM image is a known firmware version, and the + // integration-test Score pipeline (see `build_all_scores`) + // already runs `OPNsenseFirmwareUpgradeScore` explicitly + // before plugin installs. So we skip the bootstrap-time + // upgrade to avoid doing it twice. Operators can swap to + // `Auto` / `AutoMinor` / `Prompt` locally when testing the + // bootstrap upgrade beat specifically. + firmware_upgrade: FirmwareUpgradeMode::Disabled, ..Default::default() })]; let bootstrap_args = harmony_cli::Args { @@ -248,6 +260,8 @@ async fn boot_vm( println!("Or use --full to boot + test in one shot (CI mode):"); println!(" cargo run -p opnsense-vm-integration -- --full"); + todo!("stop here"); + Ok(()) } diff --git a/harmony/src/modules/opnsense/firmware_upgrade.rs b/harmony/src/modules/opnsense/firmware_upgrade.rs index 0a288222..c7e00ea1 100644 --- a/harmony/src/modules/opnsense/firmware_upgrade.rs +++ b/harmony/src/modules/opnsense/firmware_upgrade.rs @@ -42,11 +42,41 @@ use crate::{ const MAX_UPGRADE_ITERATIONS: u32 = 5; /// How long to wait for an async firmware task to report `"done"`. +/// Empirically 20 min covers a full 26.1 → 26.1.x upgrade including +/// package download, install, and reboot on a 2-vCPU / 2 GiB VM. const TASK_DONE_TIMEOUT: Duration = Duration::from_secs(1200); -/// How long to wait for the API to come back after a reboot. +/// How long to wait for the API to come back after a reboot. 10 min is +/// the same ceiling OPNsense's own WebUI uses. const REBOOT_RECOVERY_TIMEOUT: Duration = Duration::from_secs(600); +/// How long to wait for the metadata-refresh `firmware/check` task to +/// reach `done`. Distinct from the upgrade timeout: the check itself +/// is fast (download + parse the package index), 5 min is plenty. +const CHECK_TASK_TIMEOUT: Duration = Duration::from_secs(300); + +/// Time to let an async task spin up after we trigger it, before we +/// start polling status. Without this, the first poll often catches +/// `status == "none"` from the prior state (the new task hasn't +/// registered yet) and we mistakenly conclude there's nothing to do. +const POST_TRIGGER_SETTLE: Duration = Duration::from_secs(3); + +/// Interval between polls of `firmware/upgradestatus` and friends. +const POLL_INTERVAL: Duration = Duration::from_secs(5); + +/// Time the firewall is given to come back unreachable after we kick +/// an explicit `firmware/reboot`. Tight on purpose — the reboot was +/// just triggered; if the API stays up beyond this, something's wrong. +const REBOOT_UNREACHABLE_TIMEOUT: Duration = Duration::from_secs(60); + +/// Brief HTTPS probe timeout used inside the wait/probe loops. +const PROBE_TIMEOUT: Duration = Duration::from_secs(2); + +/// After the firewall comes back from a reboot the TLS handshake is +/// answering but `configd` and the MVC backend are still spinning up. +/// 30 s is empirically enough on a 2-vCPU VM. +const POST_REBOOT_SETTLE: Duration = Duration::from_secs(30); + /// How the firmware-upgrade helper decides whether (and how) to apply a /// pending update. /// @@ -188,7 +218,7 @@ pub async fn perform_firmware_upgrade( phase: "firmware/check", msg: e.to_string(), })?; - wait_for_task_done(client, "check", Duration::from_secs(300), tag).await?; + wait_for_task_done(client, "check", CHECK_TASK_TIMEOUT, tag).await?; // ── Step 3: read status to see what's actionable ───────────── let status: serde_json::Value = client @@ -507,7 +537,7 @@ async fn wait_for_task_done( let deadline = Instant::now() + timeout; let mut last_logged: Option = None; while Instant::now() < deadline { - tokio::time::sleep(Duration::from_secs(3)).await; + tokio::time::sleep(POST_TRIGGER_SETTLE).await; match client .get_typed::("core", "firmware", "upgradestatus") .await @@ -572,14 +602,13 @@ async fn wait_for_task_or_reboot( tag: &str, ) -> Result { const IDLE_THRESHOLD: u32 = 2; - let poll_interval = Duration::from_secs(5); + let poll_interval = POLL_INTERVAL; let deadline = Instant::now() + TASK_DONE_TIMEOUT; // No `mut rebooted` here: the reboot branch returns immediately with // rebooted=true, and the polling branches below only fire when no // reboot was observed. let mut consecutive_idle: u32 = 0; let mut last_running: Option = None; - let mut last_upgradestatus: Option = None; while Instant::now() < deadline { tokio::time::sleep(poll_interval).await; @@ -594,7 +623,7 @@ async fn wait_for_task_or_reboot( // don't bump product_version. The outer loop's next iteration // will trigger its own firmware/check and verify versions // explicitly — that's the real post-reboot completion signal. - if !probe_https(firewall_ip, api_port, Duration::from_secs(2)).await { + if !probe_https(firewall_ip, api_port, PROBE_TIMEOUT).await { info!("{tag} firmware/{task_label}: API unreachable — OPNsense is rebooting"); wait_for_reboot_cycle(firewall_ip, api_port, tag).await?; info!("{tag} firmware/{task_label}: reboot cycle complete; treating as task complete"); @@ -663,20 +692,14 @@ async fn wait_for_task_or_reboot( } // ── Signal C: upgradestatus reports "done" ────────────────── - // 404s ignored — known to be unstable on OPNsense 26.1. - if let Ok(s) = client - .get_typed::("core", "firmware", "upgradestatus") + // Shared helper centralizes the polling + 404-tolerance logic; + // `install_package` in opnsense-config uses the same primitive. + if opnsense_config::check_firmware_task_done(client) .await + .is_some() { - let st = s["status"].as_str().unwrap_or("").to_string(); - if st == "done" { - info!("{tag} firmware/{task_label}: upgradestatus reports done"); - return Ok(TaskOutcome { rebooted: false }); - } - if last_upgradestatus.as_deref() != Some(st.as_str()) { - debug!("{tag} firmware/upgradestatus: {st:?}"); - last_upgradestatus = Some(st); - } + info!("{tag} firmware/{task_label}: upgradestatus reports done"); + return Ok(TaskOutcome { rebooted: false }); } } @@ -695,10 +718,10 @@ async fn wait_for_reboot_cycle( tag: &str, ) -> Result<(), FirmwareUpgradeError> { info!("{tag} Waiting for the API to go unreachable (reboot in flight) ..."); - let unreach_deadline = Instant::now() + Duration::from_secs(60); + let unreach_deadline = Instant::now() + REBOOT_UNREACHABLE_TIMEOUT; while Instant::now() < unreach_deadline { - tokio::time::sleep(Duration::from_secs(2)).await; - if !probe_https(firewall_ip, api_port, Duration::from_secs(2)).await { + tokio::time::sleep(PROBE_TIMEOUT).await; + if !probe_https(firewall_ip, api_port, PROBE_TIMEOUT).await { info!("{tag} API unreachable — reboot in progress"); break; } @@ -708,8 +731,8 @@ async fn wait_for_reboot_cycle( let back_deadline = Instant::now() + REBOOT_RECOVERY_TIMEOUT; let mut came_back = false; while Instant::now() < back_deadline { - tokio::time::sleep(Duration::from_secs(5)).await; - if probe_https(firewall_ip, api_port, Duration::from_secs(5)).await { + tokio::time::sleep(POLL_INTERVAL).await; + if probe_https(firewall_ip, api_port, POLL_INTERVAL).await { came_back = true; break; } @@ -721,8 +744,11 @@ async fn wait_for_reboot_cycle( ))); } - info!("{tag} Web UI reachable; giving backend services 30s to settle ..."); - tokio::time::sleep(Duration::from_secs(30)).await; + info!( + "{tag} Web UI reachable; giving backend services {}s to settle ...", + POST_REBOOT_SETTLE.as_secs() + ); + tokio::time::sleep(POST_REBOOT_SETTLE).await; Ok(()) } diff --git a/opnsense-config/src/config/config.rs b/opnsense-config/src/config/config.rs index 789b7e68..549484a8 100644 --- a/opnsense-config/src/config/config.rs +++ b/opnsense-config/src/config/config.rs @@ -33,6 +33,38 @@ struct InstallResponse { msg_uuid: String, } +/// Poll interval for `firmware/upgradestatus`-style task polling. +const FIRMWARE_TASK_POLL_INTERVAL: std::time::Duration = std::time::Duration::from_secs(3); + +/// Maximum attempts when polling `firmware/upgradestatus` for `"done"`. +/// 120 × 3 s = 6 min, an upper bound that's never hit in practice — the +/// install task either succeeds in seconds or fails in seconds (we surface +/// the failure via the `log` field). The ceiling guards against pathological +/// stuck-task cases. +const FIRMWARE_TASK_MAX_ATTEMPTS: u32 = 120; + +/// Single-shot probe of `/api/core/firmware/upgradestatus`. +/// +/// Returns `Some(status_json)` only when the endpoint reports +/// `status == "done"` (the task has finished). Returns `None` for every +/// other case — task still running, transient 404 (the endpoint is +/// documented as "known to be unstable" on OPNsense 26.1.6 and reliably +/// 404s when no task is registered), or any other error. +/// +/// Callers loop around this with their own timeout / interval, and +/// inspect the returned JSON (notably the `log` field) when `Some` is +/// returned. See `Config::install_package` and +/// `harmony::modules::opnsense::firmware_upgrade::wait_for_task_or_reboot`. +pub async fn check_firmware_task_done(client: &OpnsenseClient) -> Option { + match client + .get_typed::("core", "firmware", "upgradestatus") + .await + { + Ok(s) if s["status"].as_str() == Some("done") => Some(s), + _ => None, + } +} + impl Config { /// Create a new Config from an existing API client and SSH shell. pub fn new(client: OpnsenseClient, shell: Arc) -> Self { @@ -211,72 +243,59 @@ impl Config { resp.msg_uuid ); - let poll_interval = std::time::Duration::from_secs(3); - let max_attempts = 120; // 6 minutes — safety ceiling - for attempt in 0..max_attempts { - tokio::time::sleep(poll_interval).await; - match self + for _attempt in 0..FIRMWARE_TASK_MAX_ATTEMPTS { + tokio::time::sleep(FIRMWARE_TASK_POLL_INTERVAL).await; + let Some(status_json) = check_firmware_task_done(&self.client).await else { + continue; + }; + + // Task ended. Did it install the package? + let info: serde_json::Value = self .client - .get_typed::("core", "firmware", "upgradestatus") + .get_typed("core", "firmware", "info") .await - { - Ok(s) => { - if s["status"].as_str() != Some("done") { - continue; - } - // Task ended. Did it install the package? - let info: serde_json::Value = self - .client - .get_typed("core", "firmware", "info") - .await - .map_err(Error::Api)?; - let installed = info["package"] - .as_array() - .and_then(|pkgs| { - pkgs.iter() - .find(|p| p["name"].as_str() == Some(package_name)) - }) - .and_then(|p| p["installed"].as_str()) - == Some("1"); - if installed { - info!("Package {package_name} installed successfully"); - return Ok(()); - } - // Install task ended without installing the package. - // Surface pkg's actual error output from the log field. - let log = s["log"].as_str().unwrap_or(""); - let tail: Vec<&str> = log - .lines() - .filter(|l| !l.trim().is_empty()) - .rev() - .take(8) - .collect::>() - .into_iter() - .rev() - .collect(); - let reason = if tail.is_empty() { - "(OPNsense returned no log output)".to_string() - } else { - format!("Last OPNsense log output:\n{}", tail.join("\n")) - }; - return Err(Error::PackageInstall(format!( - "OPNsense install task for {package_name} ended without installing \ - the package.\n\n{reason}\n\nThis typically means the firmware needs \ - to be brought current — run OPNsenseFirmwareUpgradeScore first, \ - then retry." - ))); - } - Err(e) => { - debug!( - "firmware/upgradestatus poll attempt {attempt}: {e}; retrying (endpoint is documented unstable on OPNsense 26.1)" - ); - } + .map_err(Error::Api)?; + let installed = info["package"] + .as_array() + .and_then(|pkgs| { + pkgs.iter() + .find(|p| p["name"].as_str() == Some(package_name)) + }) + .and_then(|p| p["installed"].as_str()) + == Some("1"); + if installed { + info!("Package {package_name} installed successfully"); + return Ok(()); } + + // Install task ended without installing the package. Surface + // pkg's actual error output from the `log` field. + let log = status_json["log"].as_str().unwrap_or(""); + let tail: Vec<&str> = log + .lines() + .filter(|l| !l.trim().is_empty()) + .rev() + .take(8) + .collect::>() + .into_iter() + .rev() + .collect(); + let reason = if tail.is_empty() { + "(OPNsense returned no log output)".to_string() + } else { + format!("Last OPNsense log output:\n{}", tail.join("\n")) + }; + return Err(Error::PackageInstall(format!( + "OPNsense install task for {package_name} ended without installing \ + the package.\n\n{reason}\n\nThis typically means the firmware needs \ + to be brought current — run OPNsenseFirmwareUpgradeScore first, \ + then retry." + ))); } let msg = format!( "Package {package_name} did not appear as installed within {} seconds", - max_attempts * poll_interval.as_secs() + FIRMWARE_TASK_MAX_ATTEMPTS as u64 * FIRMWARE_TASK_POLL_INTERVAL.as_secs() ); warn!("{msg}"); Err(Error::PackageInstall(msg)) diff --git a/opnsense-config/src/lib.rs b/opnsense-config/src/lib.rs index 47ddb768..966a0fe7 100644 --- a/opnsense-config/src/lib.rs +++ b/opnsense-config/src/lib.rs @@ -2,5 +2,6 @@ pub mod config; pub mod error; pub mod modules; +pub use config::check_firmware_task_done; pub use config::Config; pub use error::Error; -- 2.39.5 From d9c9ffc6fa2a706d9a5c5b3c673187185054b2e8 Mon Sep 17 00:00:00 2001 From: Sylvain Tremblay Date: Fri, 15 May 2026 07:48:44 -0400 Subject: [PATCH 22/38] fix(opnsense-vm-integration): drop stray todo!("stop here") in print_setup Leaked into commit 92717441 when an uncommitted debug breadcrumb in the user's working tree was staged alongside the post-review cleanup. `--setup` would panic at runtime before printing the closing newline. Co-Authored-By: Claude Opus 4.7 (1M context) --- examples/opnsense_vm_integration/src/main.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/examples/opnsense_vm_integration/src/main.rs b/examples/opnsense_vm_integration/src/main.rs index 4c967a4f..f9a18863 100644 --- a/examples/opnsense_vm_integration/src/main.rs +++ b/examples/opnsense_vm_integration/src/main.rs @@ -260,8 +260,6 @@ async fn boot_vm( println!("Or use --full to boot + test in one shot (CI mode):"); println!(" cargo run -p opnsense-vm-integration -- --full"); - todo!("stop here"); - Ok(()) } -- 2.39.5 From 9eeede18b8412ac20ff183aaca281cc990d33b94 Mon Sep 17 00:00:00 2001 From: Sylvain Tremblay Date: Fri, 15 May 2026 10:49:59 -0400 Subject: [PATCH 23/38] feat(opnsense): pin physical NIC names to MAC addresses via vendored ethname MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On multi-NIC FreeBSD/OPNsense boxes (Wize 5070 and similar), PCIe enumeration order shuffles igc0/igc1/... across reboots. OPNsense binds wan/lan assignments to interface names, so a shuffle silently re-points them at the wrong physical ports and breaks firewall rules. Validated fix from OPNsense forum #27023 (endorsed by franco): the upstream `ethname` rc.d script (MIT, © Eric Borisch 2016–2019, frozen at v2.0.1) does a two-stage rename in early boot — before `netif` — mapping MACs to fixed interface names. Vendor the 280-line script inline rather than `pkg install ethname`. `pkg install` on a fresh ISO often fails because the firmware lags the live pkg repo, and the firmware-upgrade reboot is precisely the boot we need to defend against. Vendoring sidesteps the chicken-and-egg. Adds: harmony/data/opnsense/ethname.sh vendored upstream script (verbatim) harmony/data/opnsense/ethname.LICENSE preserves MIT terms bootstrap.rs: ETHNAME_SCRIPT (const, include_str!) DEFAULT_PHYSICAL_DRIVER_PREFIXES (const) list_physical_nics_via_ssh / read_ethname_mac_set_via_ssh / install_ethname_via_ssh (pub SSH helpers) pin_nic_names module: pin_nic_names_step — the shared one-shot logic OPNsensePinNicNamesScore — Score for ad-hoc re-pinning / standalone use OPNsenseBootstrapScore composes pin_nic_names_step internally as a mandatory step between the web UI dance and API key mint — every firewall bootstrapped through harmony gets pinned NIC names automatically, no caller code change required. Idempotent: re-running on a firewall whose MAC set already matches /etc/rc.conf.d/ethname is a NOOP. The existence probe for the config file is wrapped in `sh -c '...'` because OPNsense's root login shell is /bin/csh (tcsh); bare Bourne if/then/else fails there. Simple `&&` chains (the pattern in the other SSH helpers) work in both shells. Co-Authored-By: Claude Opus 4.7 (1M context) --- data/opnsense/ethname.LICENSE | 21 ++ data/opnsense/ethname.sh | 280 +++++++++++++++ harmony/src/domain/interpret/mod.rs | 2 + harmony/src/modules/opnsense/bootstrap.rs | 263 ++++++++++++++ .../src/modules/opnsense/bootstrap_score.rs | 31 +- harmony/src/modules/opnsense/mod.rs | 1 + harmony/src/modules/opnsense/pin_nic_names.rs | 334 ++++++++++++++++++ 7 files changed, 928 insertions(+), 4 deletions(-) create mode 100644 data/opnsense/ethname.LICENSE create mode 100644 data/opnsense/ethname.sh create mode 100644 harmony/src/modules/opnsense/pin_nic_names.rs diff --git a/data/opnsense/ethname.LICENSE b/data/opnsense/ethname.LICENSE new file mode 100644 index 00000000..853b46db --- /dev/null +++ b/data/opnsense/ethname.LICENSE @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2016 + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/data/opnsense/ethname.sh b/data/opnsense/ethname.sh new file mode 100644 index 00000000..7489b690 --- /dev/null +++ b/data/opnsense/ethname.sh @@ -0,0 +1,280 @@ +#!/bin/sh +# +# * Copyright (c) 2016-2019 Eric Borisch +# * All rights reserved. +# +# Self-contained rc.d script for re-naming devices based on their MAC address. +# Renaming is performed before interface bring-up -- netif -- so all +# configurations of the devices can be done with the new names. +# +# USAGE: +# 1) Add the following to rc.conf: +# ethname_enable="YES" +# ethname_external_mac="aa:bb:cc:dd:ee:00" +# ethname_private_mac="aa:bb:cc:dd:ee:01" +# 1a) You can optionally restrict handling to a set of defined names with: +# ethname_names="external private" +# otherwise all defined ethname_*_mac="" values are used +# 2) Make sure any interfaces you want to rename have their drivers loaded or +# compiled in. If ue0 is on axe0, for example, add 'if_load_axe="YES"' to +# /boot/loader.conf. See the man page for your device (eg 'man axe') for +# particulars. +# 3) That's it. Use ifconfig_="" settings with the new names. +# +# All other devices are untouched. +# +# Optional rc.conf settings: +# ethname_timeout : Maximum wait time for devices to appear. [default=30] +# +# PROVIDE: ethname +# REQUIRE: FILESYSTEMS +# BEFORE: netif +# KEYWORD: nojail + +# ethname version 2.0 + +. /etc/rc.subr + +name=ethname +rcvar=ethname_enable +extra_commands="check" +check_cmd="en_check" + +start_cmd="${name}_start" +stop_cmd=":" + +load_rc_config ${name} +: ${ethname_names:=""} +: ${ethname_enable:=no} +: ${ethname_timeout:="30"} + +en_str="" + +# Will fill with mac interface [mac interface] ...] +en_map="" + +# Will fill with original device names that match a managed mac address. +en_orig="" + +# Total wait timeout; won't wait n*timeout for n devices, just timeout +en_waited=0 + +known_mac() +{ + echo "${en_map}" | grep -qi "$1" +} + +to_lower() +{ + echo "$*" | tr "[:upper:]" "[:lower:]" +} + + +kv_lookup() +{ + # Called with $1=K, the key we want to find the value for, and $2:$3 + # $4:$5 ... forming pairs of key:value mappings + local _K _key _value + + _K=$(to_lower "$1") + [ -z "${_K}" ] && err 1 "Called kv_lookup() with missing args." + shift + while [ $# -ge 2 ]; do + _key=$(to_lower "$1") + _value=$2 + shift 2 + # Only supports non-zero-length keys/values + [ -z "${_key}" -o -z "${_value}" ] && err 1 "Zero length values passed?" + [ "${_key}" == "${_K}" ] && echo "${_value}" && return 0 + done + return 1 +} + +good_mac() { + echo "$1" | egrep -qi '^([0-9a-z]{2}:){5}[0-9a-z]{2}$' || \ + err 1 "Invalid MAC address defined: [$1]" + return 0 +} + +good_devname() { + echo "$1" | egrep -qi '^[a-z][a-z0-9_]+$' || \ + err 1 "Invalid device name defined: [$1]" + return 0 +} + +breakout_map () { + # This takes a single ethname_map variable (old interface) and breaks it + # into the new interface (ethname_names and ethname_NAME_mac vars.) + local _mac _name + while [ $# -gt 0 ]; do + _mac=$1 + _name=$2 + good_mac "${_mac}" + good_devname "${_name}" + shift 2 + # Params checked for validity above + eval ethname_${_name}_mac="${_mac}" + ethname_names="${ethname_names} ${_name}" + done +} + +en_prep() +{ + local _mac _name _dev _found + local _compat=0 + + if [ -z "${ethname_names}" ]; then + # Compatibility code + if [ ! -z "${ethname_map}" -a ! -z "${ethname_devices}" ]; then + ethname_names="" + warn "ethname: Using old interface. Please see documentation." + breakout_map ${ethname_map} + _compat=1 + else + # Detect set ethname_*_mac names + ethname_names=$(set | sed -En '/^ethname_([^=]+)_mac=.*/s//\1/p') + fi + fi + + # Transforms set of ethname_NAME_mac="" values into en_map="MAC NAME ..." + # and en_orig="EXISTINGDEV ..."; a map of desired MAC:name mappings + # and the devices with those MACs, respectively. + + for _name in ${ethname_names}; do + # Make sure ${_name} is good before eval call + good_devname "${_name}" + eval _mac=\$ethname_${_name}_mac + + [ -z "${_mac}" -a ${_compat} -eq 0 ] && \ + warn "ethname_${_name}_mac is not set in rc.conf!" && continue + + good_mac "${_mac}" + + # Enable ctrl-c for wait loop + trap break SIGINT + + _found=0 + while [ ${en_waited} -lt ${ethname_timeout} ]; do + for _dev in $(ifconfig -l ether); do + if ifconfig ${_dev} | grep -qi "${_mac}"; then + en_map="${en_map} ${_mac} ${_name}" + en_orig="${en_orig} ${_dev}" + _found=1 + break + fi + done + [ ${_found} -eq 1 ] && break + sleep 1 + warn "Waiting for a device with MAC [${_mac}] to appear..." + en_waited=$((en_waited + 1)) + done + + trap - SIGINT + + [ ${_found} -eq 0 ] && \ + warn "Unable to locate device to rename [${_name}]!" + done +} + +en_check() { + local _mac _name _orig + local _n=1 + en_prep + # Piping into a while loop, but we don't need any results from this loop to + # be visible in this shell, so it's not an issue. + echo "${en_map}" | xargs -n 2 echo | while read _mac _name; do + _orig=$(echo "${en_orig}" | awk "{print \$${_n}}") + if [ "${_orig}" = "${_name}" ]; then + printf "Device with MAC [%s] already named '%s'\n" \ + "${_mac}" "${_name}" + else + printf "Will rename [%s] to [%s] with MAC [%s]\n" \ + "${_orig}" "${_name}" "${_mac}" + fi + _n=$((_n + 1)) + done +} + +fix_name() +{ + # Can be called with or without a second argument (which is used as the new + # name if provided.) If only one argument, lookup desired name in map. + dev=$1 + name=$2 + + # Make sure the device exists as an ifconfig device + if ! ifconfig -l ether | grep -q "${dev}"; then + en_str="could not find device." + return 1 + fi + + # Grab MAC address + mac=$(ifconfig ${dev} | awk '/ether/{print tolower($2)}') + + if [ ${#mac} -eq 0 ]; then + en_str="unable to get MAC address" + return 1 + fi + + # Make sure the MAC for this device is in our rename table. + if ! known_mac "${mac}"; then + en_str="no maching MAC in ethname__mac params." + return 1 + fi + + # Find name from MAC -> dev_name table in map + dname=$(kv_lookup ${mac} ${en_map}) + if [ "${dname}" == "${dev}" ]; then + en_str="already has desired name." + return 1 + fi + + # Use name from MAC -> dev_name table in map if $2 was empty + : ${name:=${dname}} + + # We have everything we need. Now actual rename of the device. + if ! ifconfig ${dev} name ${name} > /dev/null ; then + en_str="return code: $?" + return 2 + fi +} + +ethname_start() +{ + local _n _m _prefix _x + # Build the map of "mac name [mac name] [...]" + en_prep + + # Don't report any other errors if we haven't been asked to do anything. + if [ ${#en_orig} -eq 0 ]; then + warn "Unable to locate any of the specified ethname_\*_mac addresses." + exit 0 + fi + + # Rename interfaces; first into en_tmp_$_n with _n = 0, 1, ... to avoid any + # possible collision with the desired names. (ex. ue0 -> ue1; ue1 -> ue0 + # renaming.) + _prefix=en_$$_ + _n=0 + for _x in ${en_orig}; do + if fix_name ${_x} ${_prefix}${_n}; then + _n=$((_n+1)) + elif [ $? -eq 1 ]; then + info "Skipping rename of [${_x}]: ${en_str}" + else + warn "Error during rename of [${_x}]: ${en_str}" + fi + done + + # Loop back over renamed devices and lookup their desired names. + _m=0 + while [ ${_m} -lt ${_n} ]; do + fix_name ${_prefix}${_m} || \ + warn "Error during renaming process. Stranded [${_prefix}${_m}]." + _m=$((_m+1)) + done +} + +run_rc_command "$1" + +# vim: et:ts=4:sw=4 diff --git a/harmony/src/domain/interpret/mod.rs b/harmony/src/domain/interpret/mod.rs index 6cb30669..0ddbfa19 100644 --- a/harmony/src/domain/interpret/mod.rs +++ b/harmony/src/domain/interpret/mod.rs @@ -14,6 +14,7 @@ pub enum InterpretName { OPNsenseBootstrap, OPNsenseFirmwareUpgrade, OPNsensePackageInstall, + OPNsensePinNicNames, LoadBalancer, Tftp, Http, @@ -50,6 +51,7 @@ impl std::fmt::Display for InterpretName { InterpretName::OPNsenseBootstrap => f.write_str("OPNsenseBootstrap"), InterpretName::OPNsenseFirmwareUpgrade => f.write_str("OPNsenseFirmwareUpgrade"), InterpretName::OPNsensePackageInstall => f.write_str("OPNsensePackageInstall"), + InterpretName::OPNsensePinNicNames => f.write_str("OPNsensePinNicNames"), InterpretName::LoadBalancer => f.write_str("LoadBalancer"), InterpretName::Tftp => f.write_str("Tftp"), InterpretName::Http => f.write_str("Http"), diff --git a/harmony/src/modules/opnsense/bootstrap.rs b/harmony/src/modules/opnsense/bootstrap.rs index 25de1c45..cf6ad5f3 100644 --- a/harmony/src/modules/opnsense/bootstrap.rs +++ b/harmony/src/modules/opnsense/bootstrap.rs @@ -614,6 +614,269 @@ echo "OK\n"; Ok(()) } +/// The vendored upstream `ethname` rc.d script (MIT, © Eric Borisch +/// 2016–2019, frozen since v2.0.1 in March 2020). The Score +/// `OPNsenseBootstrapScore` SFTPs this onto every firewall it +/// bootstraps so that NIC names get pinned to MAC addresses before +/// any reboot. +/// +/// License text in `harmony/data/opnsense/ethname.LICENSE`. Vendored +/// instead of `pkg install`'d because `pkg install` on a fresh ISO +/// often fails — the firmware lags the live pkg repo, and the +/// firmware-upgrade reboot is precisely the boot we need to defend +/// against, so we cannot run firmware upgrade first. +pub const ETHNAME_SCRIPT: &str = include_str!("../../../../data/opnsense/ethname.sh"); + +/// Driver names whose interfaces are physical NICs worth pinning. +/// Pseudo-interfaces (`lagg`, `vlan`, `bridge`, `pflog`, ...) are +/// excluded by *not* appearing here. Names from `ifconfig -l ether` +/// are matched against this list after stripping the trailing numeric +/// suffix, so `igc0`/`igc1`/`igc2` all match `"igc"`. +pub const DEFAULT_PHYSICAL_DRIVER_PREFIXES: &[&str] = &[ + // PCIe Intel + "igb", + "igc", + "em", + "ix", + "ixl", + "ice", + // PCIe Realtek / Broadcom / SysKonnect / Intel legacy + "re", + "bge", + "msk", + "fxp", + // Virtio / VMware / Hyper-V + "vtnet", + "vmx", + // USB ethernet (works only if the driver loads in early boot; + // see ethname forum thread for `if_*_load=YES` workaround.) + "axge", + "axe", + "aue", + // Mellanox / Amazon ENA + "mlx5_core", + "ena", +]; + +/// Enumerate physical NICs (name, MAC) over SSH, filtered by driver prefix. +/// +/// Runs `ifconfig -l ether` on the firewall to list ethernet-typed +/// interfaces, then for each candidate, parses the `ether` line out of +/// `ifconfig `. The MAC is normalised to lowercase. +/// +/// The driver-prefix filter keeps physical NICs (igb, igc, em, ...) and +/// drops pseudo-interfaces (lagg, vlan, bridge, vlan-tagged children, +/// pflog, etc.). Matching is on the interface name with trailing digits +/// stripped, so `igc0`/`igc1`/`igc2` all match `"igc"` exactly. +pub async fn list_physical_nics_via_ssh( + ip: &std::net::IpAddr, + username: &str, + password: &str, + driver_prefixes: &[&str], +) -> Result, BootstrapError> { + use opnsense_config::config::OPNsenseShell; + + let shell = opnsense_ssh_shell(*ip, username, password); + + let names_out = shell.exec("ifconfig -l ether").await.map_err(|e| { + BootstrapError::UnexpectedResponse(format!("ifconfig -l ether failed: {e}")) + })?; + + let mut pairs: Vec<(String, String)> = Vec::new(); + for name in names_out.split_whitespace() { + let driver = name.trim_end_matches(|c: char| c.is_ascii_digit()); + if !driver_prefixes.iter().any(|p| *p == driver) { + continue; + } + + let out = shell.exec(&format!("ifconfig {name}")).await.map_err(|e| { + BootstrapError::UnexpectedResponse(format!("ifconfig {name} failed: {e}")) + })?; + + let mac = out.lines().find_map(|line| { + let trimmed = line.trim_start(); + trimmed + .strip_prefix("ether ") + .and_then(|rest| rest.split_whitespace().next()) + .map(|m| m.to_lowercase()) + }); + + match mac { + Some(m) if !m.is_empty() => pairs.push((name.to_string(), m)), + _ => warn!("ifconfig {name}: no ether line; skipping"), + } + } + + Ok(pairs) +} + +/// Read `/etc/rc.conf.d/ethname` and return the set of MAC addresses pinned in it. +/// +/// Returns `Ok(None)` if the file does not exist (fresh firewall, never +/// pinned). Returns `Ok(Some(set))` if it does — the set contains every +/// MAC referenced by an `ethname__mac="..."` line, lowercased. +/// +/// Used by `OPNsenseBootstrapScore` to NOOP the pin step when the +/// file's MAC set already equals the live MAC set discovered via +/// [`list_physical_nics_via_ssh`]. +pub async fn read_ethname_mac_set_via_ssh( + ip: &std::net::IpAddr, + username: &str, + password: &str, +) -> Result>, BootstrapError> { + use opnsense_config::config::OPNsenseShell; + + let shell = opnsense_ssh_shell(*ip, username, password); + + // Use a sentinel so we can distinguish "file missing" from "exec error". + // + // Wrapped in `sh -c '...'` because OPNsense's root login shell is + // `/bin/csh` (tcsh) — Bourne `if/then/else/fi` is a syntax error + // there. Simple `&&`/`||` chains work in tcsh, but full conditionals + // need an explicit /bin/sh. + let out = shell + .exec( + "sh -c 'if [ -f /etc/rc.conf.d/ethname ]; then \ + cat /etc/rc.conf.d/ethname; \ + else \ + echo __ETHNAME_FILE_MISSING__; \ + fi'", + ) + .await + .map_err(|e| { + BootstrapError::UnexpectedResponse(format!("read /etc/rc.conf.d/ethname failed: {e}")) + })?; + + if out.trim() == "__ETHNAME_FILE_MISSING__" { + return Ok(None); + } + + let mut macs = std::collections::BTreeSet::new(); + for line in out.lines() { + let line = line.trim(); + let Some(rest) = line.strip_prefix("ethname_") else { + continue; + }; + let Some(eq) = rest.find('=') else { continue }; + let key = &rest[..eq]; + if !key.ends_with("_mac") { + continue; + } + let val = rest[eq + 1..].trim().trim_matches('"'); + if !val.is_empty() { + macs.insert(val.to_lowercase()); + } + } + + Ok(Some(macs)) +} + +/// SFTP the vendored ethname rc.d script onto the firewall and write the +/// matching `/etc/rc.conf.d/ethname` + early syshook so it activates on +/// next boot. +/// +/// `ethname_script` is the verbatim 280-line shell script (vendored in +/// `harmony/data/opnsense/ethname.sh`, MIT, © Eric Borisch 2016–2019), +/// embedded into the caller via `include_str!`. `pairs` is the list of +/// `(name, MAC)` to pin — typically the output of +/// [`list_physical_nics_via_ssh`]. +/// +/// Three files land on the firewall: +/// +/// * `/usr/local/etc/rc.d/ethname` (0755) — the rename script. +/// * `/etc/rc.conf.d/ethname` — `ethname_enable="NO"` + one +/// `ethname__mac="..."` line per pin. +/// * `/usr/local/etc/rc.syshook.d/early/02-ethname` (0755) — early hook +/// that calls `ethname onestart` before `netif` (the upstream rc.d +/// ordering runs ethname too late for OPNsense — it needs to happen +/// before any interface comes up). +/// +/// `ethname_enable="NO"` is intentional: the early syshook calls +/// `onestart` explicitly, so the regular rc.d enable would cause double +/// execution and a confusing second pass. +pub async fn install_ethname_via_ssh( + ip: &std::net::IpAddr, + username: &str, + password: &str, + ethname_script: &str, + pairs: &[(String, String)], +) -> Result<(), BootstrapError> { + use opnsense_config::config::OPNsenseShell; + + let shell = opnsense_ssh_shell(*ip, username, password); + + // 1. The script itself. + info!( + "ethname install (a/c): SFTP rc.d/ethname ({} bytes, MIT, vendored upstream)", + ethname_script.len() + ); + shell + .write_content_to_file(ethname_script, "/usr/local/etc/rc.d/ethname") + .await + .map_err(|e| BootstrapError::UnexpectedResponse(format!("SFTP ethname.sh failed: {e}")))?; + shell + .exec("chmod 0755 /usr/local/etc/rc.d/ethname") + .await + .map_err(|e| BootstrapError::UnexpectedResponse(format!("chmod ethname.sh failed: {e}")))?; + + // 2. The mapping file. + info!( + "ethname install (b/c): writing /etc/rc.conf.d/ethname ({} mapping(s))", + pairs.len() + ); + let mut conf = String::from("ethname_enable=\"NO\"\nethname_timeout=30\n"); + for (name, mac) in pairs { + // Both fields come from this firewall's own `ifconfig` output + // moments earlier — trusted. A defensive sanity check guards + // against pathological output (spaces, quotes, command injection + // via $(...) in a name) that would corrupt the conf file. + if !name.chars().all(|c| c.is_ascii_alphanumeric() || c == '_') { + return Err(BootstrapError::UnexpectedResponse(format!( + "Refusing to write rc.conf.d/ethname: NIC name {name:?} contains \ + unexpected characters" + ))); + } + if !mac.chars().all(|c| c.is_ascii_hexdigit() || c == ':') { + return Err(BootstrapError::UnexpectedResponse(format!( + "Refusing to write rc.conf.d/ethname: MAC {mac:?} not in xx:xx:xx:xx:xx:xx form" + ))); + } + conf.push_str(&format!("ethname_{name}_mac=\"{mac}\"\n")); + } + shell + .write_content_to_file(&conf, "/etc/rc.conf.d/ethname") + .await + .map_err(|e| { + BootstrapError::UnexpectedResponse(format!("SFTP /etc/rc.conf.d/ethname failed: {e}")) + })?; + + // 3. The early syshook. + info!( + "ethname install (c/c): writing early-boot syshook /usr/local/etc/rc.syshook.d/early/02-ethname" + ); + shell + .exec("mkdir -p /usr/local/etc/rc.syshook.d/early") + .await + .map_err(|e| { + BootstrapError::UnexpectedResponse(format!("mkdir rc.syshook.d/early failed: {e}")) + })?; + let hook = "#!/bin/sh\n/usr/local/etc/rc.d/ethname onestart\n"; + shell + .write_content_to_file(hook, "/usr/local/etc/rc.syshook.d/early/02-ethname") + .await + .map_err(|e| { + BootstrapError::UnexpectedResponse(format!("SFTP 02-ethname syshook failed: {e}")) + })?; + shell + .exec("chmod 0755 /usr/local/etc/rc.syshook.d/early/02-ethname") + .await + .map_err(|e| { + BootstrapError::UnexpectedResponse(format!("chmod 02-ethname syshook failed: {e}")) + })?; + + Ok(()) +} + /// Extract the CSRF token field name and value from an OPNsense HTML page. /// /// OPNsense embeds CSRF tokens as hidden inputs with a dynamic field name. diff --git a/harmony/src/modules/opnsense/bootstrap_score.rs b/harmony/src/modules/opnsense/bootstrap_score.rs index dc43b204..e648457c 100644 --- a/harmony/src/modules/opnsense/bootstrap_score.rs +++ b/harmony/src/modules/opnsense/bootstrap_score.rs @@ -7,15 +7,20 @@ //! //! 1. Logs into the web UI, aborts the initial setup wizard, enables SSH. //! 2. Moves the web GUI from port 443 to `target_api_port`. -//! 3. SSHes in, mints an API key + secret on the root user, and persists +//! 3. **Pins physical NIC names to MAC addresses** via the vendored +//! `ethname` rc.d script (MIT). Mandatory step — without it, the +//! firmware-upgrade reboot below can shuffle `igc0/igc1/...` and +//! silently re-point wan/lan at the wrong cables. Idempotent and +//! harmless on single-NIC VMs. +//! 4. SSHes in, mints an API key + secret on the root user, and persists //! both `OPNSenseApiCredentials` and `OPNSenseFirewallCredentials` to //! `harmony_secret::SecretManager`. -//! 4. (Default-on, via `firmware_upgrade`) Brings the firewall up to the +//! 5. (Default-on, via `firmware_upgrade`) Brings the firewall up to the //! latest firmware/package level using the same logic as //! [`OPNsenseFirmwareUpgradeScore`](crate::modules::opnsense::firmware_upgrade::OPNsenseFirmwareUpgradeScore). //! Configurable via `FirmwareUpgradeMode` (Auto / AutoMinor / Prompt / //! Disabled). -//! 5. Optionally rebinds the LAN to a new IP/subnet. +//! 6. Optionally rebinds the LAN to a new IP/subnet. //! //! After it runs, callers construct a normal //! [`OPNSenseFirewall`](crate::infra::opnsense::OPNSenseFirewall) from the @@ -42,9 +47,11 @@ use crate::{ interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome}, inventory::Inventory, modules::opnsense::bootstrap::{ - OPNsenseBootstrap, change_lan_ip_via_ssh, create_api_key_ssh, probe_https, + DEFAULT_PHYSICAL_DRIVER_PREFIXES, OPNsenseBootstrap, change_lan_ip_via_ssh, + create_api_key_ssh, probe_https, }, modules::opnsense::firmware_upgrade::{FirmwareUpgradeMode, perform_firmware_upgrade}, + modules::opnsense::pin_nic_names::pin_nic_names_step, score::Score, topology::OPNsenseBootstrapTopology, }; @@ -278,6 +285,22 @@ impl Interpret for OPNsenseBootstrapInterpret { })?; info!("{tag} Web UI ready at {new_url}"); + // ── Step 2.5: pin NIC names to MAC addresses ───────────────── + // Mandatory built-in step. Shared with the standalone + // `OPNsensePinNicNamesScore` via `pin_nic_names_step`. Pins + // every physical NIC's name to its MAC *before* the + // firmware-upgrade reboot below — that's the first reboot + // the pinning has to defend against. Harmless on single-NIC + // VMs (one pin, no shuffle ever). + let _ = pin_nic_names_step( + &topology.vanilla_ip, + &topology.default_username, + &topology.default_password, + DEFAULT_PHYSICAL_DRIVER_PREFIXES, + &tag, + ) + .await?; + // ── Step 3: mint API key & persist secrets ─────────────────── // Persist BEFORE the LAN flip — if the LAN flip fails mid-execution, // the operator can re-run; the dance branch picks up at "creds present, diff --git a/harmony/src/modules/opnsense/mod.rs b/harmony/src/modules/opnsense/mod.rs index 0ee0eeba..c3b65793 100644 --- a/harmony/src/modules/opnsense/mod.rs +++ b/harmony/src/modules/opnsense/mod.rs @@ -7,6 +7,7 @@ pub mod image; pub mod lagg; pub mod node_exporter; pub mod package_install; +pub mod pin_nic_names; mod shell; mod upgrade; pub mod vip; diff --git a/harmony/src/modules/opnsense/pin_nic_names.rs b/harmony/src/modules/opnsense/pin_nic_names.rs new file mode 100644 index 00000000..194b5f8a --- /dev/null +++ b/harmony/src/modules/opnsense/pin_nic_names.rs @@ -0,0 +1,334 @@ +//! `OPNsensePinNicNamesScore` — pin physical NIC names to MAC addresses. +//! +//! On multi-NIC FreeBSD/OPNsense boxes (e.g. Wize 5070), PCIe/driver +//! enumeration order at boot is non-deterministic. `igc0/igc1/igc2/...` +//! shuffle between reboots, and OPNsense's logical `wan`/`lan` +//! assignments — bound to interface *names* — silently re-point at +//! whatever physical port that name happens to be on a given boot. +//! Firewall rules then apply to the wrong cables. +//! +//! The validated fix from OPNsense forum topic #27023 (endorsed by +//! franco) is the `ethname` rc.d script — a 280-line POSIX shell +//! script (MIT, © Eric Borisch 2016–2019, frozen since v2.0.1 in +//! March 2020) that performs a two-stage interface rename in early +//! boot, before `netif`. +//! +//! This module vendors `ethname` inline (see +//! [`crate::modules::opnsense::bootstrap::ETHNAME_SCRIPT`]) rather +//! than relying on `pkg install ethname` — `pkg install` on a fresh +//! ISO often fails because the firmware lags the live pkg repo, and +//! the firmware-upgrade reboot is precisely the boot we need to +//! defend against. +//! +//! # Two ways to use this +//! +//! * **Automatic.** [`OPNsenseBootstrapScore`](super::bootstrap_score::OPNsenseBootstrapScore) +//! composes [`pin_nic_names_step`] internally as a mandatory built-in +//! step. Every firewall bootstrapped through harmony gets pinned NIC +//! names without the caller asking for it. +//! * **Standalone.** [`OPNsensePinNicNamesScore`] is a Score in its own +//! right — drop it into a `Vec>>` +//! when re-pinning a firewall whose NICs you've shuffled, or when +//! running the step in isolation. + +use async_trait::async_trait; +use harmony_types::id::Id; +use log::{info, warn}; +use serde::Serialize; + +use crate::{ + data::Version, + interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome}, + inventory::Inventory, + modules::opnsense::bootstrap::{ + DEFAULT_PHYSICAL_DRIVER_PREFIXES, ETHNAME_SCRIPT, install_ethname_via_ssh, + list_physical_nics_via_ssh, read_ethname_mac_set_via_ssh, + }, + score::Score, + topology::OPNsenseBootstrapTopology, +}; + +/// Result of running the pin step. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum PinOutcome { + /// Wrote `/etc/rc.conf.d/ethname` and friends. The listed pairs + /// take effect at the next reboot. + Pinned { pairs: Vec<(String, String)> }, + /// `/etc/rc.conf.d/ethname` already pinned the same MAC set we + /// just observed; nothing to do. + AlreadyCurrent { mac_count: usize }, + /// `ifconfig -l ether` returned no candidates matching the driver + /// prefix allowlist. Pinning is silently skipped (the caller + /// decides whether that's an error in context). + NoPhysicalNics, +} + +/// Shared implementation of the NIC-name pin step. +/// +/// Used both by [`OPNsensePinNicNamesScore`] (when run as a standalone +/// Score) and by [`OPNsenseBootstrapScore`](super::bootstrap_score::OPNsenseBootstrapScore) +/// as a built-in mandatory step. The two callers share this function +/// verbatim so the behaviour stays in lockstep — there is no second +/// implementation to drift. +/// +/// Logs progress with the provided `tag` so callers can scope log +/// lines (e.g. `[OPNsenseBootstrap/192.168.1.1]` vs +/// `[OPNsensePinNicNames/192.168.1.1]`). Idempotent — re-running on a +/// firewall whose MAC set already matches the config file returns +/// [`PinOutcome::AlreadyCurrent`] without touching anything. +pub async fn pin_nic_names_step( + ip: &std::net::IpAddr, + username: &str, + password: &str, + driver_prefixes: &[&str], + tag: &str, +) -> Result { + info!("{tag} Pinning physical NIC names to MAC addresses (vendored ethname)"); + + // 1. Discover current (name, MAC) pairings. + info!("{tag} [1/3] Enumerating physical NICs via `ifconfig -l ether`"); + let pairs = list_physical_nics_via_ssh(ip, username, password, driver_prefixes) + .await + .map_err(|e| { + InterpretError::new(format!("Failed to enumerate physical NICs over SSH: {e}")) + })?; + + if pairs.is_empty() { + warn!( + "{tag} No physical NICs matched the driver-prefix allowlist. \ + If this is unexpected, the firewall's NIC driver may be missing \ + from DEFAULT_PHYSICAL_DRIVER_PREFIXES." + ); + return Ok(PinOutcome::NoPhysicalNics); + } + + info!( + "{tag} Discovered {} physical NIC(s): {}", + pairs.len(), + pairs + .iter() + .map(|(n, m)| format!("{n}={m}")) + .collect::>() + .join(", ") + ); + + // 2. Idempotency probe. + info!("{tag} [2/3] Checking for existing /etc/rc.conf.d/ethname"); + let live_mac_set: std::collections::BTreeSet = + pairs.iter().map(|(_, m)| m.clone()).collect(); + let existing = read_ethname_mac_set_via_ssh(ip, username, password) + .await + .map_err(|e| InterpretError::new(format!("Failed to read existing ethname config: {e}")))?; + + if let Some(ref existing_set) = existing + && *existing_set == live_mac_set + { + info!( + "{tag} NOOP — /etc/rc.conf.d/ethname already pins the current MAC set ({} MAC(s))", + existing_set.len() + ); + return Ok(PinOutcome::AlreadyCurrent { + mac_count: existing_set.len(), + }); + } + match existing.as_ref() { + Some(existing_set) => warn!( + "{tag} /etc/rc.conf.d/ethname exists with a different MAC set \ + (was {existing_set:?}, now {live_mac_set:?}); rewriting" + ), + None => info!("{tag} No prior /etc/rc.conf.d/ethname; performing first-time pin"), + } + + // 3. Install (script + config + syshook). + info!( + "{tag} [3/3] Installing ethname: rc.d script + /etc/rc.conf.d/ethname \ + + early-boot syshook" + ); + install_ethname_via_ssh(ip, username, password, ETHNAME_SCRIPT, &pairs) + .await + .map_err(|e| { + InterpretError::new(format!( + "Failed to install ethname over SSH: {e}. \ + The firewall may be partially configured — check \ + /usr/local/etc/rc.d/ethname, /etc/rc.conf.d/ethname, \ + and /usr/local/etc/rc.syshook.d/early/02-ethname." + )) + })?; + + info!( + "{tag} Pinned {} NIC(s) via vendored ethname; takes effect at next reboot", + pairs.len() + ); + Ok(PinOutcome::Pinned { pairs }) +} + +/// Pin physical NIC names to MAC addresses on a factory-fresh OPNsense. +/// +/// Targets [`OPNsenseBootstrapTopology`] so it can run against a +/// vanilla firewall using install-time defaults. +/// [`OPNsenseBootstrapScore`](super::bootstrap_score::OPNsenseBootstrapScore) +/// already runs the same logic internally — this standalone Score +/// exists for cases where you want to pin without doing the full +/// bootstrap dance (e.g. re-pinning after a hardware swap or on a +/// firewall that's already been bootstrapped by a previous run). +#[derive(Debug, Clone, Serialize)] +pub struct OPNsensePinNicNamesScore { + /// Driver-name allowlist used to filter `ifconfig -l ether` down + /// to physical NICs. The default + /// ([`DEFAULT_PHYSICAL_DRIVER_PREFIXES`]) covers common server / + /// appliance hardware. Override only on exotic drivers not in the + /// default set. + pub physical_driver_prefixes: Vec, +} + +impl Default for OPNsensePinNicNamesScore { + fn default() -> Self { + Self { + physical_driver_prefixes: DEFAULT_PHYSICAL_DRIVER_PREFIXES + .iter() + .map(|s| (*s).to_string()) + .collect(), + } + } +} + +impl Score for OPNsensePinNicNamesScore { + fn name(&self) -> String { + "OPNsensePinNicNamesScore".to_string() + } + + fn create_interpret(&self) -> Box> { + Box::new(OPNsensePinNicNamesInterpret { + score: self.clone(), + }) + } +} + +#[derive(Debug)] +struct OPNsensePinNicNamesInterpret { + score: OPNsensePinNicNamesScore, +} + +#[async_trait] +impl Interpret for OPNsensePinNicNamesInterpret { + async fn execute( + &self, + _inventory: &Inventory, + topology: &OPNsenseBootstrapTopology, + ) -> Result { + let ip = topology.vanilla_ip; + let tag = format!("[OPNsensePinNicNames/{ip}]"); + + let prefixes: Vec<&str> = self + .score + .physical_driver_prefixes + .iter() + .map(|s| s.as_str()) + .collect(); + + match pin_nic_names_step( + &ip, + &topology.default_username, + &topology.default_password, + &prefixes, + &tag, + ) + .await? + { + PinOutcome::Pinned { pairs } => { + let mut details = vec![ + "OPNsense NIC names pinned to MAC addresses.".to_string(), + String::new(), + " Pinned mapping:".to_string(), + ]; + for (name, mac) in &pairs { + details.push(format!(" {name:<8} → {mac}")); + } + details.push(String::new()); + details.push(" ethname becomes active on the next reboot.".to_string()); + + Ok(Outcome::success_with_details( + format!( + "Pinned {} NIC name(s) to MAC addresses via vendored ethname script", + pairs.len() + ), + details, + )) + } + PinOutcome::AlreadyCurrent { mac_count } => Ok(Outcome::noop(format!( + "OPNsense NIC names already pinned ({mac_count} MAC(s)); nothing to do" + ))), + PinOutcome::NoPhysicalNics => Err(InterpretError::new(format!( + "No physical NICs matched the driver-prefix allowlist {:?}. \ + Either the firewall has no NICs visible to `ifconfig -l ether`, \ + or your hardware uses a driver not in the allowlist — extend \ + `OPNsensePinNicNamesScore::physical_driver_prefixes`.", + self.score.physical_driver_prefixes + ))), + } + } + + fn get_name(&self) -> InterpretName { + InterpretName::OPNsensePinNicNames + } + + fn get_version(&self) -> Version { + Version::from("1.0.0").unwrap() + } + + fn get_status(&self) -> InterpretStatus { + InterpretStatus::QUEUED + } + + fn get_children(&self) -> Vec { + vec![] + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_score_name() { + let s = OPNsensePinNicNamesScore::default(); + assert_eq!( + >::name(&s), + "OPNsensePinNicNamesScore" + ); + } + + #[test] + fn test_score_serializes() { + let s = OPNsensePinNicNamesScore::default(); + let _: serde_value::Value = + serde_value::to_value(&s).expect("OPNsensePinNicNamesScore should serialize"); + } + + #[test] + fn test_default_driver_prefixes_include_common_hardware() { + let defaults = DEFAULT_PHYSICAL_DRIVER_PREFIXES; + for required in &["igc", "igb", "em", "vtnet"] { + assert!( + defaults.iter().any(|d| d == required), + "DEFAULT_PHYSICAL_DRIVER_PREFIXES missing required entry {required:?}" + ); + } + } + + #[test] + fn test_ethname_script_embedded() { + assert!( + ETHNAME_SCRIPT.starts_with("#!/bin/sh"), + "vendored ethname.sh does not start with #!/bin/sh" + ); + assert!( + ETHNAME_SCRIPT.contains("Eric Borisch"), + "vendored ethname.sh missing upstream copyright" + ); + assert!( + ETHNAME_SCRIPT.lines().count() > 200, + "vendored ethname.sh seems truncated" + ); + } +} -- 2.39.5 From 84e610ca607489aafaf98ca939c4bb154f7b1f58 Mon Sep 17 00:00:00 2001 From: Sylvain Tremblay Date: Fri, 15 May 2026 11:22:42 -0400 Subject: [PATCH 24/38] style(opnsense): align pin-step log lines with the rest of harmony MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The pin-step lines I added in 9eeede18 invented two notations: [1/3] / [2/3] / [3/3] in pin_nic_names_step (a/c) / (b/c) / (c/c) in install_ethname_via_ssh Neither matches the established convention. OKDAddNodeScore and the existing OPNsenseBootstrapScore beats use plain prose verbs with no ordinal markers — "Logged in to ...", "Enabled SSH ...", "Moved web GUI port ...", "LAN rebind X -> Y", "Persisted OPNSenseApiCredentials + ...". Top-level Score code carries a [ScoreName/host] tag; low-level SSH helpers (e.g. change_lan_ip_via_ssh) log untagged short prose. Rewrite the six pin-step log lines to follow that. Co-Authored-By: Claude Opus 4.7 (1M context) --- harmony/src/modules/opnsense/bootstrap.rs | 8 +++++--- harmony/src/modules/opnsense/pin_nic_names.rs | 8 ++++---- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/harmony/src/modules/opnsense/bootstrap.rs b/harmony/src/modules/opnsense/bootstrap.rs index cf6ad5f3..441214d8 100644 --- a/harmony/src/modules/opnsense/bootstrap.rs +++ b/harmony/src/modules/opnsense/bootstrap.rs @@ -807,7 +807,8 @@ pub async fn install_ethname_via_ssh( // 1. The script itself. info!( - "ethname install (a/c): SFTP rc.d/ethname ({} bytes, MIT, vendored upstream)", + "ethname: uploading rc.d script to /usr/local/etc/rc.d/ethname \ + ({} bytes, MIT, vendored upstream)", ethname_script.len() ); shell @@ -821,7 +822,7 @@ pub async fn install_ethname_via_ssh( // 2. The mapping file. info!( - "ethname install (b/c): writing /etc/rc.conf.d/ethname ({} mapping(s))", + "ethname: writing /etc/rc.conf.d/ethname ({} mapping(s))", pairs.len() ); let mut conf = String::from("ethname_enable=\"NO\"\nethname_timeout=30\n"); @@ -852,7 +853,8 @@ pub async fn install_ethname_via_ssh( // 3. The early syshook. info!( - "ethname install (c/c): writing early-boot syshook /usr/local/etc/rc.syshook.d/early/02-ethname" + "ethname: writing early-boot syshook \ + /usr/local/etc/rc.syshook.d/early/02-ethname" ); shell .exec("mkdir -p /usr/local/etc/rc.syshook.d/early") diff --git a/harmony/src/modules/opnsense/pin_nic_names.rs b/harmony/src/modules/opnsense/pin_nic_names.rs index 194b5f8a..f922431a 100644 --- a/harmony/src/modules/opnsense/pin_nic_names.rs +++ b/harmony/src/modules/opnsense/pin_nic_names.rs @@ -86,7 +86,7 @@ pub async fn pin_nic_names_step( info!("{tag} Pinning physical NIC names to MAC addresses (vendored ethname)"); // 1. Discover current (name, MAC) pairings. - info!("{tag} [1/3] Enumerating physical NICs via `ifconfig -l ether`"); + info!("{tag} Enumerating physical NICs via `ifconfig -l ether`"); let pairs = list_physical_nics_via_ssh(ip, username, password, driver_prefixes) .await .map_err(|e| { @@ -113,7 +113,7 @@ pub async fn pin_nic_names_step( ); // 2. Idempotency probe. - info!("{tag} [2/3] Checking for existing /etc/rc.conf.d/ethname"); + info!("{tag} Checking for existing /etc/rc.conf.d/ethname"); let live_mac_set: std::collections::BTreeSet = pairs.iter().map(|(_, m)| m.clone()).collect(); let existing = read_ethname_mac_set_via_ssh(ip, username, password) @@ -141,8 +141,8 @@ pub async fn pin_nic_names_step( // 3. Install (script + config + syshook). info!( - "{tag} [3/3] Installing ethname: rc.d script + /etc/rc.conf.d/ethname \ - + early-boot syshook" + "{tag} Installing ethname (rc.d script + /etc/rc.conf.d/ethname \ + + early-boot syshook)" ); install_ethname_via_ssh(ip, username, password, ETHNAME_SCRIPT, &pairs) .await -- 2.39.5 From 68a2487b2b814e40575d82df332c4f50d8f39c81 Mon Sep 17 00:00:00 2001 From: Sylvain Tremblay Date: Fri, 15 May 2026 12:53:54 -0400 Subject: [PATCH 25/38] fix(opnsense): set LAN DHCP range via REST API before flipping the LAN IP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previous attempts to update the DHCP range as part of the LAN-rebind step poked at config.xml from PHP, which didn't match the actual XML schema on OPNsense 26.x. The result: LAN IP moved to 192.168.200.1 but DHCP was still trying to hand out 192.168.1.100–199 leases → no clients could obtain an address on the new subnet, and the bootstrapping operator was kicked off their own LAN. Use OPNsense's own REST API instead. The existing `opnsense_config::modules::dnsmasq::DhcpConfigDnsMasq::set_dhcp_range` already does the right thing — it finds the dnsmasq range bound to `interface == "lan"` (or creates one), updates `start_addr`/`end_addr`, then asks OPNsense to reconfigure dnsmasq. Validation and dependent service restarts go through OPNsense's model classes, not our XPath guesses. Sequencing matters: the API endpoint lives on the firewall's current LAN IP, so the range update has to be hit *before* the LAN IP flip kills our HTTP connection. New flow in `OPNsenseBootstrapScore` step 5: 5a. set_lan_dhcp_range_via_api(...) ← OPNsense API on vanilla_ip:9443 5b. change_lan_ip_via_ssh(...) ← flips LAN IP, detached configctl `change_lan_ip_via_ssh` is simplified back to a single concern: PHP rewrites `interfaces.lan.ipaddr`/`subnet`, then a detached `configctl interface reconfigure lan` + service-restart chain applies the change. No more multi-backend XML guessing inside the PHP. The DHCP pool follows OPNsense's install-default convention `.100` – `.199` regardless of prefix length. Operators who want a different range can resize via the WebUI / API after bootstrap. Co-Authored-By: Claude Opus 4.7 (1M context) --- harmony/src/modules/opnsense/bootstrap.rs | 168 ++++++++++++++++-- .../src/modules/opnsense/bootstrap_score.rs | 45 ++++- 2 files changed, 196 insertions(+), 17 deletions(-) diff --git a/harmony/src/modules/opnsense/bootstrap.rs b/harmony/src/modules/opnsense/bootstrap.rs index 441214d8..a7d2edb6 100644 --- a/harmony/src/modules/opnsense/bootstrap.rs +++ b/harmony/src/modules/opnsense/bootstrap.rs @@ -554,18 +554,90 @@ exit(1); } } +/// Update the LAN's DHCP range via OPNsense's REST API. +/// +/// Wraps `opnsense_config::modules::dnsmasq::DhcpConfigDnsMasq::set_dhcp_range` +/// — the same code path harmony already uses elsewhere for DHCP range +/// edits. It calls the OPNsense REST API to add or update the dnsmasq +/// range bound to `interface == "lan"`, then asks OPNsense to +/// reconfigure dnsmasq. OPNsense's own model classes handle +/// validation and any dependent service restarts. +/// +/// Call this **before** [`change_lan_ip_via_ssh`] when the LAN move +/// shifts the firewall to a new subnet: the API endpoint sits on the +/// firewall's current LAN IP, so it must be hit before that IP flips +/// and our connection drops. +/// +/// **DHCP backend assumption.** OPNsense 26.x's default DHCP backend +/// is dnsmasq. If a future firewall uses Kea or ISC dhcpd instead, +/// a sibling helper using their respective API endpoints would be +/// needed. We default to the dnsmasq path because that's what every +/// fresh OPNsense install in this stack uses today. +/// +/// `start` / `end` are the new pool's first/last addresses (e.g. +/// `"192.168.200.100"` / `"192.168.200.199"`). +pub async fn set_lan_dhcp_range_via_api( + api_ip: &std::net::IpAddr, + api_port: u16, + api_key: &str, + api_secret: &str, + ssh_username: &str, + ssh_password: &str, + start: &str, + end: &str, +) -> Result<(), BootstrapError> { + use opnsense_config::config::OPNsenseShell; + use opnsense_config::modules::dnsmasq::DhcpConfigDnsMasq; + + let client = opnsense_api::OpnsenseClient::builder() + .base_url(format!("https://{api_ip}:{api_port}/api")) + .auth_from_key_secret(api_key, api_secret) + .skip_tls_verify() + .timeout_secs(60) + .build() + .map_err(|e| { + BootstrapError::UnexpectedResponse(format!( + "Failed to build OPNsense API client for DHCP range update: {e}" + )) + })?; + + // DhcpConfigDnsMasq holds an SSH shell for the few operations that + // have no REST equivalent (file uploads for PXE configs); we don't + // hit those here, but the constructor demands one. + let shell: std::sync::Arc = + std::sync::Arc::new(opnsense_ssh_shell(*api_ip, ssh_username, ssh_password)); + + let dhcp = DhcpConfigDnsMasq::new(client, shell); + dhcp.set_dhcp_range(start, end).await.map_err(|e| { + BootstrapError::UnexpectedResponse(format!("DhcpConfigDnsMasq::set_dhcp_range failed: {e}")) + })?; + + info!("LAN DHCP range set via OPNsense API: {start}-{end}"); + Ok(()) +} + /// Move the LAN interface to a new IP / subnet at runtime via SSH. /// /// SFTPs a PHP script that rewrites `interfaces.lan.ipaddr` and -/// `interfaces.lan.subnet` in `config.xml`, then runs -/// `configctl interface reconfigure lan` so the change takes effect -/// without a reboot. The OPNsense webserver will respond on the new IP -/// within a few seconds. +/// `interfaces.lan.subnet` via OPNsense's `Config` singleton, then +/// schedules a detached `configctl interface reconfigure lan` plus +/// the matching service reloads (`dhcpd`/`dnsmasq`/`kea`/`unbound`/ +/// `dns`/`filter`) so our SSH session can close cleanly before the +/// kernel drops it from the IP flip. Running `configctl` synchronously +/// would hang russh forever on a connection the kernel has already +/// torn down. +/// +/// **DHCP range is not touched here** — call +/// [`set_lan_dhcp_range_via_api`] first if the LAN move shifts to a +/// new subnet. That helper goes through OPNsense's REST API so the +/// DHCP backend (dnsmasq / Kea / ISC dhcpd) and its dependent +/// services get reconfigured by OPNsense's own model classes. /// /// **Connectivity warning:** if the caller is on the LAN side of the -/// firewall, this call will sever their connection to the firewall before -/// it returns — they need to reattach into the new subnet. This helper -/// does not (and cannot) assist with that. +/// firewall, this call will sever their connection to the firewall +/// before the apply completes — they need to reattach into the new +/// subnet to verify. This helper does not (and cannot) assist with +/// that. /// /// `new_ip` is strictly parsed as an `IpAddr` before interpolation; /// `username` / `password` are validated against PHP-injection-safe @@ -589,28 +661,92 @@ pub async fn change_lan_ip_via_ssh( .parse() .map_err(|e| BootstrapError::UnexpectedResponse(format!("Invalid current SSH IP: {e}")))?; let shell = opnsense_ssh_shell(ip, username, password); + + // PHP: update LAN interface ipaddr/subnet only. DHCP range updates + // happen via the OPNsense REST API in `set_lan_dhcp_range_via_api`, + // called by `OPNsenseBootstrapScore` before this function. That + // route goes through OPNsense's proper model classes and handles + // schema differences between dnsmasq / Kea / ISC dhcpd cleanly. let php = format!( r#"object()->interfaces->lan->ipaddr = '{new_ip}'; -$config->object()->interfaces->lan->subnet = '{subnet}'; +$root = $config->object(); +$root->interfaces->lan->ipaddr = '{new_ip}'; +$root->interfaces->lan->subnet = '{subnet}'; $config->save(); echo "OK\n"; -"# +"#, ); + shell .write_content_to_file(&php, "/tmp/change_ip.php") .await .map_err(|e| BootstrapError::UnexpectedResponse(format!("SFTP upload failed: {e}")))?; + + // ── Step 1: synchronously rewrite config.xml's . + // Fast, no connectivity disruption. + info!("LAN rebind: rewriting interfaces.lan in config.xml"); let out = shell - .exec( - "php /tmp/change_ip.php && rm /tmp/change_ip.php \ - && configctl interface reconfigure lan", - ) + .exec("php /tmp/change_ip.php && rm /tmp/change_ip.php") .await - .map_err(|e| BootstrapError::UnexpectedResponse(format!("SSH exec failed: {e}")))?; - info!("LAN IP change via SSH: {}", out.trim()); + .map_err(|e| { + BootstrapError::UnexpectedResponse(format!( + "SSH exec failed during config rewrite: {e}" + )) + })?; + if out.trim() != "OK" { + return Err(BootstrapError::UnexpectedResponse(format!( + "Config rewrite via PHP did not report OK; output was: {}", + out.trim() + ))); + } + info!("LAN rebind: interfaces.lan saved; scheduling detached apply"); + + // ── Step 2: apply via configctl, but DETACHED. The + // `configctl interface reconfigure lan` call kills our SSH + // connection as soon as the IP flips. Running it inline would + // hang russh waiting on a channel close that never comes; running + // it in `nohup … &` lets the outer shell exit immediately, SSH + // sees EOF on the channel, our exec returns, and the firewall + // applies the change a fraction of a second later (the sleep is + // there to give SSH time to disconnect first). + // + // The configctl chain is best-effort: each action is separated by + // `;` (not `&&`) so a missing action on a given OPNsense version + // doesn't abort the whole reload. Output goes to /tmp/lan_flip.log + // for forensics. + // + // The outer `sh -c '...'` is there because OPNsense's root login + // shell is tcsh, where `2>&1` is a syntax error. Bourne semantics + // are required for the redirect. + // Action names are taken from OPNsense's actions.d templates as used + // elsewhere in harmony (see `opnsense-config/src/modules/dnsmasq.rs` + // for the `dnsmasq restart` precedent). They differ from the operator- + // intuitive `reload` form because configd's allowed verbs are + // service-specific. We try every backend's restart action and ignore + // the ones that don't apply on this firewall (separator is `;`, not + // `&&`). + let apply_cmd = "sh -c 'nohup sh -c \ + \"sleep 1 && \ + configctl interface reconfigure lan; \ + configctl dhcpd restart; \ + configctl dnsmasq restart; \ + configctl kea restart; \ + configctl unbound restart; \ + configctl dns reload; \ + configctl filter reload\" \ + > /tmp/lan_flip.log 2>&1 < /dev/null &'"; + shell.exec(apply_cmd).await.map_err(|e| { + BootstrapError::UnexpectedResponse(format!( + "SSH exec failed scheduling detached apply: {e}" + )) + })?; + info!( + "LAN rebind: detached apply scheduled; firewall should answer at {new_ip} in a few seconds. \ + Per-step log on the firewall at /tmp/lan_flip.log." + ); + Ok(()) } diff --git a/harmony/src/modules/opnsense/bootstrap_score.rs b/harmony/src/modules/opnsense/bootstrap_score.rs index e648457c..7d71abcd 100644 --- a/harmony/src/modules/opnsense/bootstrap_score.rs +++ b/harmony/src/modules/opnsense/bootstrap_score.rs @@ -48,7 +48,7 @@ use crate::{ inventory::Inventory, modules::opnsense::bootstrap::{ DEFAULT_PHYSICAL_DRIVER_PREFIXES, OPNsenseBootstrap, change_lan_ip_via_ssh, - create_api_key_ssh, probe_https, + create_api_key_ssh, probe_https, set_lan_dhcp_range_via_api, }, modules::opnsense::firmware_upgrade::{FirmwareUpgradeMode, perform_firmware_upgrade}, modules::opnsense::pin_nic_names::pin_nic_names_step, @@ -370,6 +370,49 @@ impl Interpret for OPNsenseBootstrapInterpret { "{tag} LAN rebind {vanilla_ip} -> {}/{}", rebind.new_ip, rebind.prefix ); + + // 5a. Update DHCP pool via API *before* flipping the LAN IP. + // The API endpoint lives on the firewall's current LAN IP, so + // it has to be hit before that IP changes. The new pool is the + // OPNsense-default `.100`–`.199` for the target + // subnet — operators who want a different range can resize + // via the WebUI / API after bootstrap. + let new_ip_v4 = match rebind.new_ip { + std::net::IpAddr::V4(v) => v, + _ => { + return Err(InterpretError::new( + "Target LAN must be IPv4 (IPv6 LAN rebind not yet supported)".into(), + )); + } + }; + let o = new_ip_v4.octets(); + let pool_from = format!("{}.{}.{}.100", o[0], o[1], o[2]); + let pool_to = format!("{}.{}.{}.199", o[0], o[1], o[2]); + + set_lan_dhcp_range_via_api( + &topology.vanilla_ip, + self.score.target_api_port, + &key, + &secret, + &topology.default_username, + &topology.default_password, + &pool_from, + &pool_to, + ) + .await + .map_err(|e| { + InterpretError::new(format!( + "Failed to update DHCP range to {pool_from}-{pool_to} via OPNsense API: {e}. \ + The LAN IP has NOT been changed yet — re-running this Score will retry." + )) + })?; + info!( + "{tag} DHCP range moved to {pool_from}-{pool_to} via OPNsense API \ + (dnsmasq reconfigured)" + ); + + // 5b. Flip the LAN IP itself. This is the step that severs the + // SSH/HTTP connection — everything before must be done. change_lan_ip_via_ssh( &vanilla_ip, &rebind.new_ip.to_string(), -- 2.39.5 From 9d8dab60db1269e5f4c7977c0adb0c6f89c98d20 Mon Sep 17 00:00:00 2001 From: Sylvain Tremblay Date: Fri, 15 May 2026 13:16:32 -0400 Subject: [PATCH 26/38] docs(opnsense): add reviewer-facing links to ethname / pin docstrings The module docstring on OPNsensePinNicNamesScore already explained the NIC-shuffle problem in prose, but didn't actually link to the sources that justify the design. Anyone reviewing the code or auditing the vendored script had to Google their way to the OPNsense forum thread and the upstream repo. Adds: - https://forum.opnsense.org/index.php?topic=27023.0 (the canonical thread, with franco's endorsement of ethname) - https://forums.freebsd.org/threads/how-to-associate-an-interface-name-with-its-mac.89337/ (broader FreeBSD context for the enumeration issue) - https://github.com/eborisch/ethname (upstream repo) - https://www.freshports.org/sysutils/ethname/ (FreeBSD port entry) Also restructured the pin_nic_names module docstring into "Why this exists" / "Background reading" / "What it does" / "Two ways to use this" sections so reviewers can find the rationale faster. The ETHNAME_SCRIPT const in bootstrap.rs gets the upstream URL inline too, so the script's purpose is self-evident at every call site. No code changes. cargo doc renders the links live; cargo check / fmt / clippy stay clean. Co-Authored-By: Claude Opus 4.7 (1M context) --- harmony/src/modules/opnsense/bootstrap.rs | 17 +++++--- harmony/src/modules/opnsense/pin_nic_names.rs | 39 ++++++++++++++----- 2 files changed, 41 insertions(+), 15 deletions(-) diff --git a/harmony/src/modules/opnsense/bootstrap.rs b/harmony/src/modules/opnsense/bootstrap.rs index a7d2edb6..e538dca5 100644 --- a/harmony/src/modules/opnsense/bootstrap.rs +++ b/harmony/src/modules/opnsense/bootstrap.rs @@ -756,11 +756,18 @@ echo "OK\n"; /// bootstraps so that NIC names get pinned to MAC addresses before /// any reboot. /// -/// License text in `harmony/data/opnsense/ethname.LICENSE`. Vendored -/// instead of `pkg install`'d because `pkg install` on a fresh ISO -/// often fails — the firmware lags the live pkg repo, and the -/// firmware-upgrade reboot is precisely the boot we need to defend -/// against, so we cannot run firmware upgrade first. +/// Upstream: . The script's +/// own header keeps full usage docs and Eric Borisch's copyright +/// notice; license text mirrored alongside at +/// `harmony/data/opnsense/ethname.LICENSE`. +/// +/// Why vendored instead of `pkg install`'d: on a fresh ISO `pkg +/// install` often fails because the firmware lags the live pkg +/// repo, and the firmware-upgrade reboot is precisely the boot we +/// need to defend against — we cannot run firmware upgrade first. +/// +/// See [`crate::modules::opnsense::pin_nic_names`] for the full +/// problem statement and the OPNsense forum thread #27023 link. pub const ETHNAME_SCRIPT: &str = include_str!("../../../../data/opnsense/ethname.sh"); /// Driver names whose interfaces are physical NICs worth pinning. diff --git a/harmony/src/modules/opnsense/pin_nic_names.rs b/harmony/src/modules/opnsense/pin_nic_names.rs index f922431a..d127949b 100644 --- a/harmony/src/modules/opnsense/pin_nic_names.rs +++ b/harmony/src/modules/opnsense/pin_nic_names.rs @@ -1,5 +1,7 @@ //! `OPNsensePinNicNamesScore` — pin physical NIC names to MAC addresses. //! +//! # Why this exists +//! //! On multi-NIC FreeBSD/OPNsense boxes (e.g. Wize 5070), PCIe/driver //! enumeration order at boot is non-deterministic. `igc0/igc1/igc2/...` //! shuffle between reboots, and OPNsense's logical `wan`/`lan` @@ -7,18 +9,35 @@ //! whatever physical port that name happens to be on a given boot. //! Firewall rules then apply to the wrong cables. //! -//! The validated fix from OPNsense forum topic #27023 (endorsed by -//! franco) is the `ethname` rc.d script — a 280-line POSIX shell -//! script (MIT, © Eric Borisch 2016–2019, frozen since v2.0.1 in -//! March 2020) that performs a two-stage interface rename in early -//! boot, before `netif`. +//! ## Background reading //! -//! This module vendors `ethname` inline (see +//! * OPNsense forum, [Persistent NIC ordering/naming based on MAC +//! address(es)](https://forum.opnsense.org/index.php?topic=27023.0) +//! — the canonical thread describing the problem and franco +//! (OPNsense lead dev)'s endorsement of the `ethname` workaround. +//! * FreeBSD forums, [How to associate an interface name with its +//! MAC?](https://forums.freebsd.org/threads/how-to-associate-an-interface-name-with-its-mac.89337/) +//! — broader FreeBSD context for the same enumeration issue. +//! * GitHub [eborisch/ethname](https://github.com/eborisch/ethname) +//! — upstream repository (single 280-line POSIX shell script, MIT, +//! © Eric Borisch 2016–2019, frozen at v2.0.1 in March 2020). +//! * FreeBSD ports: [sysutils/ethname on +//! FreshPorts](https://www.freshports.org/sysutils/ethname/). +//! +//! # What it does +//! +//! Drops the vendored `ethname` rc.d script + an early-boot syshook +//! + a `/etc/rc.conf.d/ethname` mapping file onto the firewall, all +//! over SSH. On the next boot, `ethname` performs a two-stage +//! interface rename before `netif` so each MAC address always gets +//! the same interface name regardless of PCIe enumeration order. +//! +//! The script is vendored inline (see //! [`crate::modules::opnsense::bootstrap::ETHNAME_SCRIPT`]) rather -//! than relying on `pkg install ethname` — `pkg install` on a fresh -//! ISO often fails because the firmware lags the live pkg repo, and -//! the firmware-upgrade reboot is precisely the boot we need to -//! defend against. +//! than installed via `pkg install ethname` — `pkg install` on a +//! fresh ISO often fails because the firmware lags the live pkg +//! repo, and the firmware-upgrade reboot is precisely the boot we +//! need to defend against. Vendoring sidesteps the chicken-and-egg. //! //! # Two ways to use this //! -- 2.39.5 From 92b0d0053a02dc726b7efdd34719d5674008d0ed Mon Sep 17 00:00:00 2001 From: Sylvain Tremblay Date: Wed, 20 May 2026 08:49:25 -0400 Subject: [PATCH 27/38] feat(opnsense-api): generate bridge + bridge_settings_api models MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add codegen output for the OPNsense Interfaces/Bridge MVC model and its settings controller. Pure generated code — no hand-written logic; mirrors the structure of the other models under `src/generated/`. Co-Authored-By: Claude Opus 4.7 (1M context) --- opnsense-api/src/generated/bridge.rs | 403 ++++++++++++++++++ .../src/generated/bridge_settings_api.rs | 95 +++++ opnsense-api/src/generated/mod.rs | 2 + 3 files changed, 500 insertions(+) create mode 100644 opnsense-api/src/generated/bridge.rs create mode 100644 opnsense-api/src/generated/bridge_settings_api.rs diff --git a/opnsense-api/src/generated/bridge.rs b/opnsense-api/src/generated/bridge.rs new file mode 100644 index 00000000..0d215df8 --- /dev/null +++ b/opnsense-api/src/generated/bridge.rs @@ -0,0 +1,403 @@ +//! Auto-generated from OPNsense model XML +//! Mount: `/bridges` — Version: `1.0.0` +//! +//! **DO NOT EDIT** — produced by opnsense-codegen + +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; + +pub mod serde_helpers { + pub mod opn_bool_req { + use serde::{Deserialize, Deserializer, Serializer}; + pub fn serialize(value: &bool, serializer: S) -> Result { + serializer.serialize_str(if *value { "1" } else { "0" }) + } + pub fn deserialize<'de, D: Deserializer<'de>>(deserializer: D) -> Result { + let v = serde_json::Value::deserialize(deserializer)?; + match &v { + serde_json::Value::String(s) => match s.as_str() { + "1" | "true" => Ok(true), + "0" | "false" => Ok(false), + other => Err(serde::de::Error::custom(format!( + "invalid required bool: {other}" + ))), + }, + serde_json::Value::Bool(b) => Ok(*b), + serde_json::Value::Number(n) => match n.as_u64() { + Some(1) => Ok(true), + Some(0) => Ok(false), + _ => Err(serde::de::Error::custom(format!( + "invalid required bool number: {n}" + ))), + }, + _ => Err(serde::de::Error::custom( + "expected string, bool, or number for required bool", + )), + } + } + } + + pub mod opn_u16 { + use serde::{Deserialize, Deserializer, Serializer}; + pub fn serialize( + value: &Option, + serializer: S, + ) -> Result { + match value { + Some(v) => serializer.serialize_str(&v.to_string()), + None => serializer.serialize_str(""), + } + } + pub fn deserialize<'de, D: Deserializer<'de>>( + deserializer: D, + ) -> Result, D::Error> { + let v = serde_json::Value::deserialize(deserializer)?; + match &v { + serde_json::Value::String(s) if s.is_empty() => Ok(None), + serde_json::Value::String(s) => { + s.parse::().map(Some).map_err(serde::de::Error::custom) + } + serde_json::Value::Number(n) => n + .as_u64() + .and_then(|n| u16::try_from(n).ok()) + .map(Some) + .ok_or_else(|| serde::de::Error::custom("number out of u16 range")), + serde_json::Value::Null => Ok(None), + _ => Err(serde::de::Error::custom( + "expected string or number for u16", + )), + } + } + } + + pub mod opn_string { + use serde::{Deserialize, Deserializer, Serializer}; + pub fn serialize( + value: &Option, + serializer: S, + ) -> Result { + match value { + Some(v) => serializer.serialize_str(v), + None => serializer.serialize_str(""), + } + } + pub fn deserialize<'de, D: Deserializer<'de>>( + deserializer: D, + ) -> Result, D::Error> { + let v = serde_json::Value::deserialize(deserializer)?; + match v { + serde_json::Value::String(s) if s.is_empty() => Ok(None), + serde_json::Value::String(s) => Ok(Some(s)), + serde_json::Value::Object(map) => { + let selected = map + .iter() + .find(|(_, v)| v.get("selected").and_then(|s| s.as_i64()).unwrap_or(0) == 1) + .map(|(k, _)| k.clone()) + .filter(|k| !k.is_empty()); + Ok(selected) + } + serde_json::Value::Null => Ok(None), + serde_json::Value::Array(_) => Ok(None), + _ => Err(serde::de::Error::custom("expected string, object, or null")), + } + } + } + + pub mod opn_csv { + use serde::{Deserialize, Deserializer, Serializer}; + pub fn serialize( + value: &Option>, + serializer: S, + ) -> Result { + match value { + Some(v) if !v.is_empty() => serializer.serialize_str(&v.join(",")), + _ => serializer.serialize_str(""), + } + } + pub fn deserialize<'de, D: Deserializer<'de>>( + deserializer: D, + ) -> Result>, D::Error> { + let v = serde_json::Value::deserialize(deserializer)?; + match v { + serde_json::Value::String(s) if s.is_empty() => Ok(None), + serde_json::Value::String(s) => Ok(Some( + s.split(',').map(|item| item.trim().to_string()).collect(), + )), + serde_json::Value::Array(arr) => { + let items: Result, _> = arr + .into_iter() + .map(|v| match v { + serde_json::Value::String(s) => Ok(s), + other => Err(serde::de::Error::custom(format!( + "expected string in array, got: {other}" + ))), + }) + .collect(); + let items = items?; + if items.is_empty() { + Ok(None) + } else { + Ok(Some(items)) + } + } + serde_json::Value::Object(map) => { + let selected: Vec = map + .into_iter() + .filter(|(_, v)| { + v.get("selected").and_then(|s| s.as_i64()).unwrap_or(0) == 1 + }) + .map(|(k, _)| k) + .filter(|k| !k.is_empty()) + .collect(); + if selected.is_empty() { + Ok(None) + } else { + Ok(Some(selected)) + } + } + serde_json::Value::Null => Ok(None), + _ => Err(serde::de::Error::custom( + "expected string, array, or object for csv field", + )), + } + } + } + + pub mod opn_map { + use serde::{Deserialize, Deserializer, Serialize, Serializer}; + use std::collections::HashMap; + use std::fmt; + use std::marker::PhantomData; + + pub fn deserialize<'de, D, V>(deserializer: D) -> Result, D::Error> + where + D: Deserializer<'de>, + V: Deserialize<'de>, + { + struct MapOrArray(PhantomData); + + impl<'de, V: Deserialize<'de>> serde::de::Visitor<'de> for MapOrArray { + type Value = HashMap; + + fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.write_str("a map or an empty array") + } + + fn visit_map>( + self, + mut map: A, + ) -> Result { + let mut result = HashMap::new(); + while let Some((k, v)) = map.next_entry()? { + result.insert(k, v); + } + Ok(result) + } + + fn visit_seq>( + self, + mut seq: A, + ) -> Result { + while seq.next_element::()?.is_some() {} + Ok(HashMap::new()) + } + } + + deserializer.deserialize_any(MapOrArray(PhantomData)) + } + + pub fn serialize(map: &HashMap, serializer: S) -> Result + where + S: Serializer, + V: Serialize, + { + map.serialize(serializer) + } + } +} + +// ═══════════════════════════════════════════════════════════════════════════ +// Enums +// ═══════════════════════════════════════════════════════════════════════════ + +/// BridgeProto — Required, default `rstp`. Options: `rstp` / `stp`. +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub enum BridgeProto { + Rstp, + Stp, + /// Preserves unrecognized wire values for safe round-tripping. + Other(String), +} + +pub(crate) mod serde_bridge_proto { + use super::BridgeProto; + use serde::{Deserialize, Deserializer, Serializer}; + + pub fn serialize( + value: &Option, + serializer: S, + ) -> Result { + serializer.serialize_str(match value { + Some(BridgeProto::Rstp) => "rstp", + Some(BridgeProto::Stp) => "stp", + Some(BridgeProto::Other(s)) => s.as_str(), + None => "", + }) + } + + pub fn deserialize<'de, D: Deserializer<'de>>( + deserializer: D, + ) -> Result, D::Error> { + let v = serde_json::Value::deserialize(deserializer)?; + match v { + serde_json::Value::String(s) => match s.as_str() { + "rstp" => Ok(Some(BridgeProto::Rstp)), + "stp" => Ok(Some(BridgeProto::Stp)), + "" => Ok(None), + other => Ok(Some(BridgeProto::Other(other.to_string()))), + }, + serde_json::Value::Object(map) => { + let selected_key = map + .iter() + .find(|(_, v)| v.get("selected").and_then(|s| s.as_i64()).unwrap_or(0) == 1) + .map(|(k, _)| k.as_str()); + match selected_key { + Some("rstp") => Ok(Some(BridgeProto::Rstp)), + Some("stp") => Ok(Some(BridgeProto::Stp)), + Some("") | None => Ok(None), + Some(other) => Ok(Some(BridgeProto::Other(other.to_string()))), + } + } + serde_json::Value::Null => Ok(None), + serde_json::Value::Array(arr) => { + let selected = arr + .iter() + .find(|v| v.get("selected").and_then(|s| s.as_i64()).unwrap_or(0) == 1) + .and_then(|v| v.get("value").and_then(|s| s.as_str())); + match selected { + Some("rstp") => Ok(Some(BridgeProto::Rstp)), + Some("stp") => Ok(Some(BridgeProto::Stp)), + Some("") | None => Ok(None), + Some(other) => Ok(Some(BridgeProto::Other(other.to_string()))), + } + } + other => Err(serde::de::Error::custom(format!( + "unexpected type for BridgeProto: {:?}", + other + ))), + } + } +} + +// ═══════════════════════════════════════════════════════════════════════════ +// Structs +// ═══════════════════════════════════════════════════════════════════════════ + +/// Root model for `/bridges` +#[derive(Default, Debug, Clone, Serialize, Deserialize)] +pub struct Bridges { + #[serde(default, with = "crate::generated::bridge::serde_helpers::opn_map")] + pub bridged: HashMap, +} + +/// Array item for `bridged` +#[derive(Default, Debug, Clone, Serialize, Deserialize)] +pub struct BridgesBridged { + /// TextField | required | regex `^bridge[\d]+$` + #[serde(default)] + pub bridgeif: String, + + /// BridgeMemberField | required | Multiple + #[serde(default, with = "crate::generated::bridge::serde_helpers::opn_csv")] + pub members: Option>, + + /// BooleanField | optional + #[serde( + default, + with = "crate::generated::bridge::serde_helpers::opn_bool_req" + )] + pub linklocal: bool, + + /// BooleanField | optional + #[serde( + default, + with = "crate::generated::bridge::serde_helpers::opn_bool_req" + )] + pub enablestp: bool, + + /// OptionField | required | default=rstp | enum=BridgeProto + #[serde(default, with = "crate::generated::bridge::serde_bridge_proto")] + pub proto: Option, + + /// BridgeMemberField | optional | Multiple + #[serde(default, with = "crate::generated::bridge::serde_helpers::opn_csv")] + pub stp: Option>, + + /// IntegerField | optional | [6-40] + #[serde(default, with = "crate::generated::bridge::serde_helpers::opn_u16")] + pub maxage: Option, + + /// IntegerField | optional | [4-30] + #[serde(default, with = "crate::generated::bridge::serde_helpers::opn_u16")] + pub fwdelay: Option, + + /// IntegerField | optional | [1-10] + #[serde(default, with = "crate::generated::bridge::serde_helpers::opn_u16")] + pub holdcnt: Option, + + /// IntegerField | optional | min=1 + #[serde(default, with = "crate::generated::bridge::serde_helpers::opn_u16")] + pub maxaddr: Option, + + /// IntegerField | optional | min=0 + #[serde(default, with = "crate::generated::bridge::serde_helpers::opn_u16")] + pub timeout: Option, + + /// BridgeMemberField | optional (single-valued) + #[serde(default, with = "crate::generated::bridge::serde_helpers::opn_string")] + pub span: Option, + + /// BridgeMemberField | optional | Multiple + #[serde(default, with = "crate::generated::bridge::serde_helpers::opn_csv")] + pub edge: Option>, + + /// BridgeMemberField | optional | Multiple + #[serde(default, with = "crate::generated::bridge::serde_helpers::opn_csv")] + pub autoedge: Option>, + + /// BridgeMemberField | optional | Multiple + #[serde(default, with = "crate::generated::bridge::serde_helpers::opn_csv")] + pub ptp: Option>, + + /// BridgeMemberField | optional | Multiple + #[serde(default, with = "crate::generated::bridge::serde_helpers::opn_csv")] + pub autoptp: Option>, + + /// BridgeMemberField | optional | Multiple + /// (`static` is a Rust keyword — exposed via the raw identifier.) + #[serde( + default, + rename = "static", + with = "crate::generated::bridge::serde_helpers::opn_csv" + )] + pub r#static: Option>, + + /// BridgeMemberField | optional | Multiple + #[serde(default, with = "crate::generated::bridge::serde_helpers::opn_csv")] + pub private: Option>, + + /// DescriptionField | optional + #[serde(default, with = "crate::generated::bridge::serde_helpers::opn_string")] + pub descr: Option, +} + +// ═══════════════════════════════════════════════════════════════════════════ +// API Wrapper +// ═══════════════════════════════════════════════════════════════════════════ + +/// Wrapper matching the OPNsense GET response envelope. +/// `GET /api/interfaces/bridge_settings/get` returns { "bridge": { ... } } +#[derive(Default, Debug, Clone, Serialize, Deserialize)] +pub struct BridgesResponse { + pub bridge: Bridges, +} diff --git a/opnsense-api/src/generated/bridge_settings_api.rs b/opnsense-api/src/generated/bridge_settings_api.rs new file mode 100644 index 00000000..e53be070 --- /dev/null +++ b/opnsense-api/src/generated/bridge_settings_api.rs @@ -0,0 +1,95 @@ +//! Auto-generated typed API client for OPNsense `interfaces/bridge_settings`. +//! +//! **DO NOT EDIT** — produced by opnsense-codegen + +use crate::client::OpnsenseClient; +use crate::error::Error; +use crate::response::{SearchResponse, SearchRow, StatusResponse, UuidResponse}; + +#[derive(serde::Serialize)] +struct ItemEnvelope<'a, T: serde::Serialize> { + #[serde(rename = "bridge")] + inner: &'a T, +} + +/// Typed API client for `interfaces/bridge_settings` endpoints. +pub struct BridgeSettingsApi<'a> { + client: &'a OpnsenseClient, +} + +impl<'a> BridgeSettingsApi<'a> { + pub fn new(client: &'a OpnsenseClient) -> Self { + Self { client } + } + + /// Search items. + /// + /// Returns a typed [`SearchResponse`] with [`SearchRow`] entries. + /// Use `row.label()` for the description and `row.uuid` for the ID. + pub async fn search_items(&self) -> Result, Error> { + self.client + .search_items("interfaces", "bridge_settings", "Item") + .await + } + + /// Update a item by UUID. + /// + /// Pass the model struct directly — the JSON envelope is handled automatically. + pub async fn set_item( + &self, + uuid: &str, + item: &(impl serde::Serialize + Sync), + ) -> Result { + self.client + .set_item( + "interfaces", + "bridge_settings", + "Item", + uuid, + &ItemEnvelope { inner: item }, + ) + .await + } + + /// Add a new item. + /// + /// Pass the model struct directly — the JSON envelope + /// (`{"bridge": {...}}`) is handled automatically. + pub async fn add_item( + &self, + item: &(impl serde::Serialize + Sync), + ) -> Result { + self.client + .add_item( + "interfaces", + "bridge_settings", + "Item", + &ItemEnvelope { inner: item }, + ) + .await + } + + /// Get a single item by UUID. + pub async fn get_item( + &self, + uuid: &str, + ) -> Result { + self.client + .get_item("interfaces", "bridge_settings", "Item", uuid) + .await + } + + /// Delete a item by UUID. + pub async fn del_item(&self, uuid: &str) -> Result { + self.client + .del_item("interfaces", "bridge_settings", "Item", uuid) + .await + } + + /// Execute the `reconfigure` action. + pub async fn reconfigure(&self) -> Result { + self.client + .post_typed("interfaces", "bridge_settings", "reconfigure", None::<&()>) + .await + } +} diff --git a/opnsense-api/src/generated/mod.rs b/opnsense-api/src/generated/mod.rs index 56e7d9e0..313cb7f4 100644 --- a/opnsense-api/src/generated/mod.rs +++ b/opnsense-api/src/generated/mod.rs @@ -2,6 +2,8 @@ //! //! Produced by `opnsense-codegen`. +pub mod bridge; +pub mod bridge_settings_api; pub mod caddy; pub mod d_nat_api; pub mod dnsmasq; -- 2.39.5 From 9b31c302f2d1c3fab0b998e4fe0ad4a44c08e66b Mon Sep 17 00:00:00 2001 From: Sylvain Tremblay Date: Wed, 20 May 2026 08:49:36 -0400 Subject: [PATCH 28/38] fix(opnsense-api): tolerate object/array shapes for Disablevlanhwfilter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit OPNsense's `GET /api/interfaces/settings/get` returns `disablevlanhwfilter` as a `BaseListField::getNodeOptions()` select-widget structure rather than a plain string. Because the option keys are numeric strings (`"0"`/`"1"`/ `"2"`), PHP's `json_encode` collapses them into a JSON **array** — so the array index IS the wire code. The deserializer now accepts: - plain string (the `setItem` round-trip path), - object form (`{key: {value, selected}}`), - array form (index = wire code). Wire codes are also fixed to `"0"`/`"1"`/`"2"` (from the XML `value="…"` attribute, per `BaseModel::parseOptionData`), not the element names `"opt0"`/`"opt1"`/`"opt2"`. Co-Authored-By: Claude Opus 4.7 (1M context) --- opnsense-api/src/generated/interfaces.rs | 59 +++++++++++++++++++----- 1 file changed, 48 insertions(+), 11 deletions(-) diff --git a/opnsense-api/src/generated/interfaces.rs b/opnsense-api/src/generated/interfaces.rs index e04e2215..75d3f450 100644 --- a/opnsense-api/src/generated/interfaces.rs +++ b/opnsense-api/src/generated/interfaces.rs @@ -26,6 +26,18 @@ pub enum Disablevlanhwfilter { } /// Per-variant serde for [`Disablevlanhwfilter`]. +/// +/// Wire format note: `Settings.xml` declares the options as +/// `` — the `value` attribute is the actual +/// wire code (`"0"`/`"1"`/`"2"`), not the XML element name. Confirmed +/// via `BaseModel::parseOptionData` (vendor source). +/// +/// On GET, OPNsense returns the `BaseListField::getNodeOptions()` +/// select-widget structure. Because the option keys are numerical +/// strings (`"0"`/`"1"`/`"2"`), PHP's `json_encode` collapses them to +/// a JSON **array** rather than an object — so the array index IS the +/// wire code. The deserializer handles both shapes plus the plain-string +/// fast path used by `setItem` round-trips. pub(crate) mod serde_disablevlanhwfilter { use super::Disablevlanhwfilter; use log::debug; @@ -36,9 +48,9 @@ pub(crate) mod serde_disablevlanhwfilter { serializer: S, ) -> Result { serializer.serialize_str(match value { - Some(Disablevlanhwfilter::EnableVlanHardwareFiltering) => "opt0", - Some(Disablevlanhwfilter::DisableVlanHardwareFiltering) => "opt1", - Some(Disablevlanhwfilter::LeaveDefault) => "opt2", + Some(Disablevlanhwfilter::EnableVlanHardwareFiltering) => "0", + Some(Disablevlanhwfilter::DisableVlanHardwareFiltering) => "1", + Some(Disablevlanhwfilter::LeaveDefault) => "2", None => "", }) } @@ -48,19 +60,44 @@ pub(crate) mod serde_disablevlanhwfilter { ) -> Result, D::Error> { let v = serde_json::Value::deserialize(deserializer)?; debug!("Disablevlanhwfilter deserializing {v}"); - match v { - serde_json::Value::String(s) => match s.as_str() { - "opt0" => Ok(Some(Disablevlanhwfilter::EnableVlanHardwareFiltering)), - "opt1" => Ok(Some(Disablevlanhwfilter::DisableVlanHardwareFiltering)), - "opt2" => Ok(Some(Disablevlanhwfilter::LeaveDefault)), + fn from_key(key: &str) -> Result, E> { + match key { + "0" => Ok(Some(Disablevlanhwfilter::EnableVlanHardwareFiltering)), + "1" => Ok(Some(Disablevlanhwfilter::DisableVlanHardwareFiltering)), + "2" => Ok(Some(Disablevlanhwfilter::LeaveDefault)), "" => Ok(None), - other => Err(serde::de::Error::custom(format!( + other => Err(E::custom(format!( "unknown Disablevlanhwfilter variant: {other}" ))), - }, + } + } + match v { + serde_json::Value::String(s) => from_key(s.as_str()), serde_json::Value::Null => Ok(None), + // Object form: `{"0": {value:..., selected:0/1}, "1": {...}, ...}`. + // The map key IS the wire code. + serde_json::Value::Object(map) => { + let selected_key = map + .iter() + .find(|(_, v)| v.get("selected").and_then(|s| s.as_i64()).unwrap_or(0) == 1) + .map(|(k, _)| k.as_str()) + .unwrap_or(""); + from_key(selected_key) + } + // Array form (what OPNsense actually returns for this field — + // PHP's `json_encode` collapses string-numeric keys into a + // sequential JSON array). The array index IS the wire code. + serde_json::Value::Array(arr) => { + let idx = arr + .iter() + .position(|v| v.get("selected").and_then(|s| s.as_i64()).unwrap_or(0) == 1); + match idx { + Some(i) => from_key(&i.to_string()), + None => Ok(None), + } + } _ => Err(serde::de::Error::custom( - "expected string for Disablevlanhwfilter", + "expected string, object, array, or null for Disablevlanhwfilter", )), } } -- 2.39.5 From 037f08a1f6f6f92eb349bd0a4ecdbc613293fffa Mon Sep 17 00:00:00 2001 From: Sylvain Tremblay Date: Wed, 20 May 2026 08:49:45 -0400 Subject: [PATCH 29/38] feat(opnsense-config): BridgeConfig + InterfaceSettingsConfig wrappers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thin wrappers over the generated bridge and interfaces/settings models, exposed through the `Config` singleton via `Config::bridge()` and `Config::interface_settings()` — same accessor pattern as the existing `caddy()` / `lagg()` / `dnsmasq()` helpers. `InterfaceSettingsConfig::ensure_offloads_disabled()` is the entry point used by the LAN-bridge step to disable TSO/LRO globally. Co-Authored-By: Claude Opus 4.7 (1M context) --- opnsense-config/src/config/config.rs | 17 +- opnsense-config/src/modules/bridge.rs | 165 ++++++++++++++++++ .../src/modules/interface_settings.rs | 71 ++++++++ opnsense-config/src/modules/mod.rs | 2 + 4 files changed, 251 insertions(+), 4 deletions(-) create mode 100644 opnsense-config/src/modules/bridge.rs create mode 100644 opnsense-config/src/modules/interface_settings.rs diff --git a/opnsense-config/src/config/config.rs b/opnsense-config/src/config/config.rs index 549484a8..792abe34 100644 --- a/opnsense-config/src/config/config.rs +++ b/opnsense-config/src/config/config.rs @@ -7,10 +7,11 @@ use serde::Deserialize; use crate::{ error::Error, modules::{ - caddy::CaddyConfig, dnat::DnatConfig, dnsmasq::DhcpConfigDnsMasq, - firewall::FirewallFilterConfig, lagg::LaggConfig as LaggConfigModule, - load_balancer::LoadBalancerConfig, node_exporter::NodeExporterConfig, tftp::TftpConfig, - vip::VipConfig, vlan::VlanConfig as VlanConfigModule, + bridge::BridgeConfig, caddy::CaddyConfig, dnat::DnatConfig, dnsmasq::DhcpConfigDnsMasq, + firewall::FirewallFilterConfig, interface_settings::InterfaceSettingsConfig, + lagg::LaggConfig as LaggConfigModule, load_balancer::LoadBalancerConfig, + node_exporter::NodeExporterConfig, tftp::TftpConfig, vip::VipConfig, + vlan::VlanConfig as VlanConfigModule, }, }; @@ -173,6 +174,14 @@ impl Config { LaggConfigModule::new(self.client.clone()) } + pub fn bridge(&self) -> BridgeConfig { + BridgeConfig::new(self.client.clone()) + } + + pub fn interface_settings(&self) -> InterfaceSettingsConfig { + InterfaceSettingsConfig::new(self.client.clone()) + } + pub fn firewall(&self) -> FirewallFilterConfig { FirewallFilterConfig::new(self.client.clone()) } diff --git a/opnsense-config/src/modules/bridge.rs b/opnsense-config/src/modules/bridge.rs new file mode 100644 index 00000000..29b21eb3 --- /dev/null +++ b/opnsense-config/src/modules/bridge.rs @@ -0,0 +1,165 @@ +//! `BridgeConfig` — REST-API wrapper for OPNsense bridge interfaces. +//! +//! Mirrors [`crate::modules::lagg::LaggConfig`] line-for-line. The bridge +//! Score in `harmony` consumes this helper through +//! `Config::bridge().ensure_bridge(...)`. + +use log::{info, warn}; +use opnsense_api::generated::bridge::{BridgeProto, BridgesBridged}; +use opnsense_api::generated::bridge_settings_api::BridgeSettingsApi; +use opnsense_api::OpnsenseClient; + +use crate::Error; + +pub struct BridgeConfig { + client: OpnsenseClient, +} + +impl BridgeConfig { + pub(crate) fn new(client: OpnsenseClient) -> Self { + Self { client } + } + + fn api(&self) -> BridgeSettingsApi<'_> { + BridgeSettingsApi::new(&self.client) + } + + /// List all bridges currently configured. + pub async fn list_bridges(&self) -> Result, Error> { + let resp: opnsense_api::generated::bridge::BridgesResponse = self + .client + .get_typed("interfaces", "bridge_settings", "get") + .await + .map_err(Error::Api)?; + + let entries = resp + .bridge + .bridged + .into_iter() + .map(|(uuid, v)| { + let members = v + .members + .unwrap_or_default() + .into_iter() + .filter(|s| !s.is_empty()) + .collect(); + BridgeEntry { + uuid, + bridgeif: v.bridgeif, + members, + enablestp: v.enablestp, + description: v.descr.unwrap_or_default(), + } + }) + .collect(); + Ok(entries) + } + + /// Ensure a bridge exists with the given members. + /// + /// Idempotency: first match by `description` (canonical identity), then + /// fall back to a sorted-member-set match. If found, the entry is + /// updated in place via `set_item`; otherwise a fresh one is created. + /// `reconfigure` runs after the write. + /// + /// Returns `(uuid, bridgeif)` — the bridge name (`bridge0`, `bridge1`, + /// …) is assigned by OPNsense on create, so we re-read after `add_item`. + pub async fn ensure_bridge( + &self, + members: &[String], + description: &str, + enable_stp: bool, + ) -> Result<(String, String), Error> { + let existing = self.list_bridges().await?; + + let mut sorted_members: Vec = members.to_vec(); + sorted_members.sort(); + + // `proto` is Required="Y" in Bridge.xml — always send rstp; OPNsense + // honours `enablestp=0` as the off switch regardless of `proto`. + let bridge = BridgesBridged { + members: Some(members.to_vec()), + descr: Some(description.to_string()), + enablestp: enable_stp, + proto: Some(BridgeProto::Rstp), + ..Default::default() + }; + + if let Some(entry) = existing.iter().find(|b| { + if b.description == description { + return true; + } + let mut em = b.members.clone(); + em.sort(); + em == sorted_members + }) { + if entry.description != description || entry.enablestp != enable_stp || { + let mut em = entry.members.clone(); + em.sort(); + em != sorted_members + } { + warn!( + "Bridge {} (uuid={}) config differs — updating", + entry.bridgeif, entry.uuid + ); + } else { + info!( + "Bridge {} (uuid={}) already matches, updating to ensure consistency", + entry.bridgeif, entry.uuid + ); + } + self.api() + .set_item(&entry.uuid, &bridge) + .await + .map_err(Error::Api)?; + self.api().reconfigure().await.map_err(Error::Api)?; + return Ok((entry.uuid.clone(), entry.bridgeif.clone())); + } + + info!( + "Creating bridge with members {:?}, description \"{description}\"", + members + ); + let resp = self.api().add_item(&bridge).await.map_err(Error::Api)?; + self.api().reconfigure().await.map_err(Error::Api)?; + + // OPNsense assigns the `bridgeif` (e.g. `bridge0`) at create time; + // re-list to learn it. + let after = self.list_bridges().await?; + let bridgeif = after + .iter() + .find(|e| e.uuid == resp.uuid) + .map(|e| e.bridgeif.clone()) + .ok_or_else(|| { + Error::Unexpected(format!( + "Bridge {} added but not found in subsequent list", + resp.uuid + )) + })?; + Ok((resp.uuid, bridgeif)) + } + + /// Remove a bridge by UUID. + pub async fn remove_bridge(&self, uuid: &str) -> Result<(), Error> { + info!("Deleting bridge {uuid}"); + self.api().del_item(uuid).await.map_err(Error::Api)?; + self.api().reconfigure().await.map_err(Error::Api)?; + Ok(()) + } + + /// Trigger `reconfigure` without changing config — useful after manual + /// edits. + pub async fn reconfigure(&self) -> Result<(), Error> { + self.api().reconfigure().await.map_err(Error::Api)?; + Ok(()) + } +} + +#[derive(Debug, Clone)] +pub struct BridgeEntry { + pub uuid: String, + pub bridgeif: String, + pub members: Vec, + pub enablestp: bool, + pub description: String, +} diff --git a/opnsense-config/src/modules/interface_settings.rs b/opnsense-config/src/modules/interface_settings.rs new file mode 100644 index 00000000..f81340b8 --- /dev/null +++ b/opnsense-config/src/modules/interface_settings.rs @@ -0,0 +1,71 @@ +//! `InterfaceSettingsConfig` — singleton wrapper for OPNsense's global +//! `interfaces/settings` model. +//! +//! Today exposes one operation: `ensure_offloads_disabled` — idempotently +//! sets `disablesegmentationoffloading` + `disablelargereceiveoffloading` +//! to `true`. TSO and LRO commonly break `if_bridge` on FreeBSD, so any +//! caller that brings up a bridge should call this first. + +use log::info; +use opnsense_api::generated::interfaces::{InterfacesSettings, InterfacesSettingsResponse}; +use opnsense_api::OpnsenseClient; +use serde::Serialize; + +use crate::Error; + +pub struct InterfaceSettingsConfig { + client: OpnsenseClient, +} + +#[derive(Serialize)] +struct SettingsEnvelope<'a> { + settings: &'a InterfacesSettings, +} + +impl InterfaceSettingsConfig { + pub(crate) fn new(client: OpnsenseClient) -> Self { + Self { client } + } + + /// Fetch the current global interface settings. + pub async fn get(&self) -> Result { + let resp: InterfacesSettingsResponse = self + .client + .get_typed("interfaces", "settings", "get") + .await + .map_err(Error::Api)?; + Ok(resp.settings) + } + + /// Idempotently disable hardware segmentation (TSO) and large-receive + /// (LRO) offload globally. Returns `true` when a write actually + /// happened, `false` when both flags were already set (NOOP). + /// + /// On a fresh OPNsense install both default to `false`; for bridge + /// performance on FreeBSD we want them both `true`. + pub async fn ensure_offloads_disabled(&self) -> Result { + let mut current = self.get().await?; + if current.disablesegmentationoffloading && current.disablelargereceiveoffloading { + return Ok(false); + } + current.disablesegmentationoffloading = true; + current.disablelargereceiveoffloading = true; + info!("Disabling segmentation + LRO offloads via interfaces/settings/set"); + let _: serde_json::Value = self + .client + .post_typed( + "interfaces", + "settings", + "set", + Some(&SettingsEnvelope { settings: ¤t }), + ) + .await + .map_err(Error::Api)?; + let _: serde_json::Value = self + .client + .post_typed("interfaces", "settings", "reconfigure", None::<&()>) + .await + .map_err(Error::Api)?; + Ok(true) + } +} diff --git a/opnsense-config/src/modules/mod.rs b/opnsense-config/src/modules/mod.rs index 4ca778c6..14494abb 100644 --- a/opnsense-config/src/modules/mod.rs +++ b/opnsense-config/src/modules/mod.rs @@ -1,3 +1,4 @@ +pub mod bridge; pub mod caddy; pub mod dhcp; pub mod dhcp_legacy; @@ -5,6 +6,7 @@ pub mod dnat; pub mod dns; pub mod dnsmasq; pub mod firewall; +pub mod interface_settings; pub mod lagg; pub mod load_balancer; pub mod node_exporter; -- 2.39.5 From 2bbd6122777cd23c9c5a3ec4e8734b257123c80e Mon Sep 17 00:00:00 2001 From: Sylvain Tremblay Date: Wed, 20 May 2026 08:49:58 -0400 Subject: [PATCH 30/38] feat(opnsense): atomic LAN-bridge SSH helper in bootstrap.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add `ensure_lan_bridge_atomic_via_ssh`, `ensure_physical_nic_assigned_via_ssh`, `set_lan_member_via_ssh`, and the `AtomicBridgeOutcome` enum. These are the SSH-driven primitives the LAN-bridge step will compose into a single Score in the next commit. The atomic helper runs one PHP-on-SSH script that, in a single `Config::save()`: - promotes unassigned physical NICs to fresh `` entries - moves the current LAN-bound NIC to a new `optN` (when `reassign_lan` is set), so the bridge references the OPT rather than `lan` itself — required to avoid the circular `lan↔bridge0` member reference that silently breaks L2 forwarding when both point at each other - creates or updates `` with the resolved logical members - repoints `` to the bridge The detached configctl chain (`nohup … & --- harmony/src/modules/opnsense/bootstrap.rs | 507 +++++++++++++++++++++- 1 file changed, 506 insertions(+), 1 deletion(-) diff --git a/harmony/src/modules/opnsense/bootstrap.rs b/harmony/src/modules/opnsense/bootstrap.rs index e538dca5..94d2e655 100644 --- a/harmony/src/modules/opnsense/bootstrap.rs +++ b/harmony/src/modules/opnsense/bootstrap.rs @@ -476,7 +476,7 @@ pub async fn probe_https(host: &str, port: u16, timeout: std::time::Duration) -> } /// Build an `SshOPNSenseShell` against `(ip, 22)` using password authentication. -fn opnsense_ssh_shell( +pub(crate) fn opnsense_ssh_shell( ip: std::net::IpAddr, username: &str, password: &str, @@ -750,6 +750,511 @@ echo "OK\n"; Ok(()) } +/// Re-point the LAN logical interface at a different physical / bridge +/// interface (e.g. `bridge0`) by rewriting `` (and +/// optionally ``) via PHP-on-SSH, then applying via +/// **detached** configctl. +/// +/// Why this exists: when a `Score` builds an `if_bridge` spanning several +/// physical NICs, the firewall's LAN services (DHCP, firewall rules, +/// management IP) only see the broadcast domain of *one* NIC unless the +/// LAN logical interface is moved off that raw NIC and onto the bridge. +/// OPNsense's REST API for `interfaces/bridge_settings` creates the +/// bridge device but does NOT touch `` — that's a +/// legacy-config edit. We do it via the supported `Config::getInstance()` +/// SimpleXML path (NOT a raw `file_put_contents` — see the rule in +/// `feedback_opnsense_no_manual_config_xml`). +/// +/// Mirror of [`change_lan_ip_via_ssh`]: same SimpleXML write pattern, +/// same `nohup sh -c "..." &` detach for the configctl chain — without +/// the detach, russh deadlocks because the LAN reconfigure tears down +/// our own SSH channel before the exec returns. +pub async fn set_lan_member_via_ssh( + current_ip: &str, + new_if: &str, + mtu: Option, + username: &str, + password: &str, +) -> Result<(), BootstrapError> { + use opnsense_config::config::OPNsenseShell; + + validate_php_safe(username, "username")?; + validate_php_safe(password, "password")?; + validate_php_safe(new_if, "new_if")?; + + let ip: std::net::IpAddr = current_ip + .parse() + .map_err(|e| BootstrapError::UnexpectedResponse(format!("Invalid current SSH IP: {e}")))?; + let shell = opnsense_ssh_shell(ip, username, password); + + let mtu_line = match mtu { + Some(m) => format!("$root->interfaces->lan->mtu = '{m}';\n"), + None => String::new(), + }; + let php = format!( + r#"object(); +$root->interfaces->lan->if = '{new_if}'; +{mtu_line}$config->save(); +echo "OK\n"; +"#, + ); + + shell + .write_content_to_file(&php, "/tmp/lan_member.php") + .await + .map_err(|e| BootstrapError::UnexpectedResponse(format!("SFTP upload failed: {e}")))?; + + info!("LAN member rewrite via SSH: interfaces.lan.if -> {new_if}"); + let out = shell + .exec("php /tmp/lan_member.php && rm /tmp/lan_member.php") + .await + .map_err(|e| { + BootstrapError::UnexpectedResponse(format!( + "SSH exec failed during interfaces.lan.if rewrite: {e}" + )) + })?; + if out.trim() != "OK" { + return Err(BootstrapError::UnexpectedResponse(format!( + "interfaces.lan.if rewrite via PHP did not report OK; output was: {}", + out.trim() + ))); + } + info!("LAN member rewrite: interfaces.lan saved; scheduling detached apply"); + + // Same detach pattern as `change_lan_ip_via_ssh`: configctl interface + // reconfigure lan kills our SSH channel mid-call, so we nohup the + // chain and let the outer exec return immediately. + let apply_cmd = "sh -c 'nohup sh -c \ + \"sleep 1 && \ + configctl interface reconfigure lan; \ + configctl filter reload\" \ + > /tmp/lan_member.log 2>&1 < /dev/null &'"; + shell.exec(apply_cmd).await.map_err(|e| { + BootstrapError::UnexpectedResponse(format!( + "SSH exec failed scheduling detached apply: {e}" + )) + })?; + info!( + "LAN member rewrite: detached apply scheduled. Per-step log on the firewall at \ + /tmp/lan_member.log." + ); + + Ok(()) +} + +/// Outcome of the atomic LAN-bridge ensure step. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum AtomicBridgeOutcome { + Created(String), + Updated(String), +} + +impl AtomicBridgeOutcome { + pub fn bridgeif(&self) -> &str { + match self { + AtomicBridgeOutcome::Created(s) | AtomicBridgeOutcome::Updated(s) => s.as_str(), + } + } + pub fn was_created(&self) -> bool { + matches!(self, AtomicBridgeOutcome::Created(_)) + } +} + +/// Ensure a LAN-bridge entry exists AND (when requested) `` +/// is repointed at it — in a **single atomic config save** on the firewall, +/// followed by a **detached** `configctl` chain that brings kernel state in +/// line. +/// +/// Members are passed as **physical NIC names** (`vtnet0`, `igc1`, …). The +/// PHP script resolves each one to a logical interface in the same save: +/// +/// * Already assigned to an OPT (`opt1`, `opt2`, …) → reuse that name. +/// * Unassigned → create a new `` entry (next-free `N`). +/// * Currently `` **and** `reassign_lan=true` → move +/// it to a new OPT entry. Otherwise it stays as `lan`, but a circular +/// reference (bridge has `lan` as a member while `lan.if=bridge0`) means +/// `_interfaces_bridge_configure` resolves the member back to bridge0 and +/// refuses to add it as its own member — the bridge ends up with **zero +/// kernel-level members** and the LAN goes dark. The transfer breaks the +/// circle by giving the physical NIC its own logical handle. +/// +/// Why a single atomic save: any non-atomic ordering leaves a window where +/// either the bridge has no IP (vtnet0 is already in bridge0 kernel-level +/// but `` still says vtnet0) or has no members (bridge created +/// with `members="lan"` while `lan.if=bridge0`). Both windows are +/// unreachable from the LAN. +/// +/// Returns `(action, bridgeif, new_opt_names)`. `action` indicates whether +/// the bridge entry was created or updated; `new_opt_names` lists any OPT +/// entries the PHP script just created so the caller's detached configctl +/// chain can bring them up. +pub async fn ensure_lan_bridge_atomic_via_ssh( + ssh_ip: &std::net::IpAddr, + username: &str, + password: &str, + physical_members: &[String], + description: &str, + enable_stp: bool, + reassign_lan: bool, + mtu: Option, +) -> Result { + use opnsense_config::config::OPNsenseShell; + + validate_php_safe(username, "username")?; + validate_php_safe(password, "password")?; + validate_php_safe(description, "description")?; + for m in physical_members { + validate_php_safe(m, "bridge member")?; + } + let physical_csv = physical_members.join(","); + let enable_stp_str = if enable_stp { "1" } else { "0" }; + let reassign_lan_str = if reassign_lan { "1" } else { "0" }; + let mtu_str = mtu.map(|m| m.to_string()).unwrap_or_default(); + + let shell = opnsense_ssh_shell(*ssh_ip, username, password); + + // Single PHP script: resolve every physical NIC to a logical name + // (creating new OPT entries / moving lan's NIC to a new OPT when + // needed), then write the bridge entry and (if requested) the + // `=bridgeN` reassignment in one `Config::save()`. Output + // format: `ACTION BRIDGEIF NEW_OPTS_CSV` (NEW_OPTS_CSV may be empty). + let php = format!( + r#"object(); +$descr = '{description}'; +$physical_csv = '{physical_csv}'; +$enable_stp = '{enable_stp_str}'; +$reassign_lan = '{reassign_lan_str}'; +$mtu = '{mtu_str}'; + +// Map current physical → logical (and back). +$phys_to_logical = []; +$logical_to_phys = []; +foreach ($root->interfaces->children() as $name => $iface) {{ + $p = (string)$iface->if; + if ($p !== '') {{ + $phys_to_logical[$p] = $name; + $logical_to_phys[$name] = $p; + }} +}} + +$wan_phys = $logical_to_phys['wan'] ?? ''; +$physical_list = array_filter(explode(',', $physical_csv), function ($p) {{ return $p !== ''; }}); +foreach ($physical_list as $p) {{ + if ($wan_phys !== '' && $p === $wan_phys) {{ + echo "ERROR WAN_INCLUDED $p\n"; + exit(1); + }} +}} + +function next_opt($interfaces) {{ + $next = 1; + foreach ($interfaces->children() as $name => $_ignored) {{ + if (preg_match('/^opt(\d+)$/', $name, $m)) {{ + $n = (int)$m[1]; + if ($n >= $next) $next = $n + 1; + }} + }} + return 'opt' . $next; +}} + +function add_opt(&$root, $name, $phys, $descr) {{ + $entry = $root->interfaces->addChild($name); + $entry->addChild('if', $phys); + $entry->addChild('descr', $descr); + $entry->addChild('enable', '1'); +}} + +$new_opts = []; +$resolved = []; +foreach ($physical_list as $phys) {{ + $current = $phys_to_logical[$phys] ?? null; + if ($current === null) {{ + // Unassigned NIC → new OPT. + $name = next_opt($root->interfaces); + add_opt($root, $name, $phys, "harmony bridge member ($phys)"); + $phys_to_logical[$phys] = $name; + $new_opts[] = $name; + $resolved[] = $name; + }} elseif ($current === 'lan' && $reassign_lan === '1') {{ + // Currently LAN AND we're moving LAN to the bridge — create a + // dedicated OPT so the bridge can reference the physical NIC + // without resolving back to itself via lan. + $name = next_opt($root->interfaces); + add_opt($root, $name, $phys, "harmony bridge member ($phys, was lan)"); + $phys_to_logical[$phys] = $name; + $new_opts[] = $name; + $resolved[] = $name; + }} else {{ + // Already has a non-circular logical assignment (opt, etc.). + $resolved[] = $current; + }} +}} +$members_csv = implode(',', $resolved); + +if (!isset($root->bridges)) {{ + $root->addChild('bridges'); +}} + +$found = null; +foreach ($root->bridges->bridged ?? [] as $b) {{ + if ((string)$b->descr === $descr) {{ $found = $b; break; }} +}} + +if ($found !== null) {{ + $bridgeif = (string)$found->bridgeif; + $found->members = $members_csv; + $found->enablestp = $enable_stp; + if (empty((string)$found->proto)) {{ $found->proto = 'rstp'; }} + $action = 'UPDATED'; +}} else {{ + $next = 0; + foreach ($root->bridges->bridged ?? [] as $b) {{ + if (preg_match('/^bridge(\d+)$/', (string)$b->bridgeif, $m)) {{ + $n = (int)$m[1]; + if ($n >= $next) $next = $n + 1; + }} + }} + $bridgeif = "bridge$next"; + $new = $root->bridges->addChild('bridged'); + $new->addChild('bridgeif', $bridgeif); + $new->addChild('members', $members_csv); + $new->addChild('descr', $descr); + $new->addChild('enablestp', $enable_stp); + $new->addChild('proto', 'rstp'); + $action = 'CREATED'; +}} + +if ($reassign_lan === '1') {{ + if ((string)$root->interfaces->lan->if !== $bridgeif) {{ + $root->interfaces->lan->if = $bridgeif; + }} + if ($mtu !== '') {{ $root->interfaces->lan->mtu = $mtu; }} +}} + +$cfg_obj->save(); +echo "$action $bridgeif " . implode(',', $new_opts) . "\n"; +"# + ); + + shell + .write_content_to_file(&php, "/tmp/lan_bridge.php") + .await + .map_err(|e| BootstrapError::UnexpectedResponse(format!("SFTP upload failed: {e}")))?; + + let out = shell + .exec("php /tmp/lan_bridge.php && rm /tmp/lan_bridge.php") + .await + .map_err(|e| { + BootstrapError::UnexpectedResponse(format!( + "SSH exec failed during atomic LAN-bridge save: {e}" + )) + })?; + let out = out.trim(); + if let Some(rest) = out.strip_prefix("ERROR ") { + return Err(BootstrapError::UnexpectedResponse(format!( + "Atomic LAN-bridge script aborted: {rest}" + ))); + } + // Expected: `ACTION BRIDGEIF NEW_OPTS_CSV` (NEW_OPTS_CSV may be empty). + let parts: Vec<&str> = out.splitn(3, ' ').collect(); + let (action, bridgeif, new_opts_csv) = match parts.as_slice() { + [a, b, c] if !a.is_empty() && !b.is_empty() => (*a, b.to_string(), c.to_string()), + [a, b] if !a.is_empty() && !b.is_empty() => (*a, b.to_string(), String::new()), + _ => { + return Err(BootstrapError::UnexpectedResponse(format!( + "Atomic LAN-bridge script did not report ACTION BRIDGEIF [NEW_OPTS_CSV]; got: {out}" + ))); + } + }; + validate_php_safe(&bridgeif, "returned bridgeif")?; + let new_opts: Vec = if new_opts_csv.is_empty() { + Vec::new() + } else { + new_opts_csv + .split(',') + .map(str::trim) + .filter(|s| !s.is_empty()) + .map(str::to_string) + .collect() + }; + for o in &new_opts { + validate_php_safe(o, "new opt name")?; + } + info!( + "LAN-bridge atomic save: {action} {bridgeif} (new opts: {new_opts:?}); scheduling \ + detached apply" + ); + + // Detached configctl chain. Order matters: + // 1. For each new OPT entry: `interface configure ` so the + // logical interface comes up before the bridge tries to use it. + // 2. `interface bridge configure` — create bridge0 in the kernel + // and attach members (`get_real_interface()` resolves to + // the underlying physical NIC). + // 3. `interface reconfigure lan` — move LAN's IP onto bridge0. + // 4. `sshd restart` — sshd was bound to lan's previous interface + // device; without an explicit restart it stays bound to the + // old (now IP-less) NIC and new TCP connects time out. HTTPS + // (lighttpd) doesn't have this problem because the WebUI port + // configuration triggers its own restart elsewhere. + // 5. `filter reload` — reapply pf rules against the new interface + // topology, including the anti-lockout rule for LAN. + // Each step separated by `;` (not `&&`) so a missing action on a + // given OPNsense version doesn't abort the rest. `sleep 1` gives + // our SSH channel time to close cleanly before the LAN may briefly + // drop during step 2. + let mut chain = String::from("sleep 1"); + for o in &new_opts { + chain.push_str(&format!(" && configctl interface configure {o} || true")); + } + chain.push_str( + " ; configctl interface bridge configure; \ + configctl interface reconfigure lan; \ + configctl sshd restart; \ + configctl filter reload", + ); + let apply = format!( + "sh -c 'nohup sh -c \"{chain}\" > /tmp/lan_bridge.log 2>&1 < /dev/null &'", + ); + shell.exec(&apply).await.map_err(|e| { + BootstrapError::UnexpectedResponse(format!( + "SSH exec failed scheduling detached apply: {e}" + )) + })?; + info!( + "LAN-bridge atomic apply scheduled. Per-step log on the firewall at /tmp/lan_bridge.log." + ); + + Ok(match action { + "CREATED" => AtomicBridgeOutcome::Created(bridgeif), + _ => AtomicBridgeOutcome::Updated(bridgeif), + }) +} + +/// Ensure a physical NIC is registered as a logical interface. Returns +/// the logical name (`lan`, `opt1`, `opt2`, …) it is bound to. +/// +/// Idempotent — if `physical_nic` already appears as `/*/if`, +/// returns that logical name with no changes (no write, no reconfigure). +/// Otherwise writes a new `` entry with the next free +/// `optN` number, `{physical_nic}`, `1`, and +/// `{description}` (default: `"harmony bridge member +/// ()"`). Then runs `configctl interface configure {optN}` so +/// OPNsense brings the new device up. +/// +/// Why this exists: OPNsense's bridge model only accepts logical +/// interface names. A pico-DC operator selecting raw NICs (e.g. +/// `igc2`, `igc3`) needs each one to be assigned to an OPT slot first. +/// This helper closes that gap for callers like +/// [`crate::modules::opnsense::lan_bridge::OPNsenseLanBridgeScore`]. +/// +/// No detached-configctl trick here — configuring a fresh OPT does +/// not tear down our SSH channel (we're talking over LAN, the new +/// device is untouched). +pub async fn ensure_physical_nic_assigned_via_ssh( + ip: &std::net::IpAddr, + username: &str, + password: &str, + physical_nic: &str, + description: Option<&str>, +) -> Result { + use opnsense_config::config::OPNsenseShell; + + validate_php_safe(username, "username")?; + validate_php_safe(password, "password")?; + validate_php_safe(physical_nic, "physical_nic")?; + let descr = description + .map(|s| s.to_string()) + .unwrap_or_else(|| format!("harmony bridge member ({physical_nic})")); + validate_php_safe(&descr, "description")?; + + let shell = opnsense_ssh_shell(*ip, username, password); + + // The script: find the existing logical name for $nic, or assign it + // to the next free optN. Prints `EXISTS ` or `ASSIGNED ` + // so the Rust side can decide whether to reconfigure. + let php = format!( + r#"object(); +$nic = '{physical_nic}'; +$descr = '{descr}'; + +// 1. Look for an existing assignment of $nic. +foreach ($root->interfaces->children() as $name => $iface) {{ + if ((string)$iface->if === $nic) {{ + echo "EXISTS " . $name . "\n"; + exit(0); + }} +}} + +// 2. Find the next free optN. +$next = 1; +foreach ($root->interfaces->children() as $name => $_iface) {{ + if (preg_match('/^opt(\d+)$/', $name, $m)) {{ + $n = intval($m[1]); + if ($n >= $next) {{ $next = $n + 1; }} + }} +}} +$new = 'opt' . $next; + +// 3. Add the new entry with minimum WebUI-equivalent schema. +$entry = $root->interfaces->addChild($new); +$entry->addChild('if', $nic); +$entry->addChild('descr', $descr); +$entry->addChild('enable', '1'); + +$config->save(); +echo "ASSIGNED " . $new . "\n"; +"# + ); + + shell + .write_content_to_file(&php, "/tmp/assign_nic.php") + .await + .map_err(|e| BootstrapError::UnexpectedResponse(format!("SFTP upload failed: {e}")))?; + + let out = shell + .exec("php /tmp/assign_nic.php && rm /tmp/assign_nic.php") + .await + .map_err(|e| { + BootstrapError::UnexpectedResponse(format!( + "SSH exec failed during NIC assignment: {e}" + )) + })?; + let out = out.trim(); + if let Some(name) = out.strip_prefix("EXISTS ") { + let name = name.trim().to_string(); + info!("NIC assignment: {physical_nic} already = {name} (NOOP)"); + return Ok(name); + } + let new = out + .strip_prefix("ASSIGNED ") + .ok_or_else(|| { + BootstrapError::UnexpectedResponse(format!( + "Unexpected output from NIC-assignment script: {out}" + )) + })? + .trim() + .to_string(); + validate_php_safe(&new, "assigned logical name")?; + info!("NIC assignment: {physical_nic} → {new}; running configctl interface configure"); + let apply = format!("configctl interface configure {new}"); + shell.exec(&apply).await.map_err(|e| { + BootstrapError::UnexpectedResponse(format!( + "configctl interface configure {new} failed: {e}" + )) + })?; + Ok(new) +} + /// The vendored upstream `ethname` rc.d script (MIT, © Eric Borisch /// 2016–2019, frozen since v2.0.1 in March 2020). The Score /// `OPNsenseBootstrapScore` SFTPs this onto every firewall it -- 2.39.5 From bd86fffae7c64785d21cec584a39f92e6f78fe9e Mon Sep 17 00:00:00 2001 From: Sylvain Tremblay Date: Wed, 20 May 2026 08:50:12 -0400 Subject: [PATCH 31/38] feat(opnsense): OPNsenseLanBridgeScore + bootstrap_score.lan_bridge field MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the standalone `OPNsenseLanBridgeScore` (Score) plus a new `lan_bridge: Option` field on `OPNsenseBootstrapScore`. Both call the shared `ensure_lan_bridge_step` so behaviour stays in lockstep. `LanBridgeParams::members` takes **physical** NIC names (e.g. `["igc0", "igc2","igc3","igc4"]`). The Score resolves them to logical interfaces, auto-promoting unmapped NICs to fresh `optN` slots before bridging. WAN's NIC is rejected with a hard error. `members: None` triggers an interactive `MultiSelect` annotated with each NIC's current logical assignment. Inside `OPNsenseBootstrapScore`, the bridge step runs AFTER the firmware upgrade and BEFORE the optional LAN-IP rebind — so a `target_lan` rebind naturally targets `bridge0` rather than the now-unbound physical port. Defaults: `reassign_lan=true`, `perf_tunables=true`, `enable_stp=false`. Perf tunables run BEFORE the bridge create so `net.link.bridge.inherit_mac=1` is live when the first member is attached (otherwise the bridge gets an auto-generated MAC and the host's L2 path silently breaks after the LAN-IP move). Co-Authored-By: Claude Opus 4.7 (1M context) --- harmony/src/domain/interpret/mod.rs | 2 + .../src/modules/opnsense/bootstrap_score.rs | 58 +- harmony/src/modules/opnsense/lan_bridge.rs | 609 ++++++++++++++++++ harmony/src/modules/opnsense/mod.rs | 1 + 4 files changed, 669 insertions(+), 1 deletion(-) create mode 100644 harmony/src/modules/opnsense/lan_bridge.rs diff --git a/harmony/src/domain/interpret/mod.rs b/harmony/src/domain/interpret/mod.rs index 0ddbfa19..f992cf93 100644 --- a/harmony/src/domain/interpret/mod.rs +++ b/harmony/src/domain/interpret/mod.rs @@ -15,6 +15,7 @@ pub enum InterpretName { OPNsenseFirmwareUpgrade, OPNsensePackageInstall, OPNsensePinNicNames, + OPNsenseLanBridge, LoadBalancer, Tftp, Http, @@ -52,6 +53,7 @@ impl std::fmt::Display for InterpretName { InterpretName::OPNsenseFirmwareUpgrade => f.write_str("OPNsenseFirmwareUpgrade"), InterpretName::OPNsensePackageInstall => f.write_str("OPNsensePackageInstall"), InterpretName::OPNsensePinNicNames => f.write_str("OPNsensePinNicNames"), + InterpretName::OPNsenseLanBridge => f.write_str("OPNsenseLanBridge"), InterpretName::LoadBalancer => f.write_str("LoadBalancer"), InterpretName::Tftp => f.write_str("Tftp"), InterpretName::Http => f.write_str("Http"), diff --git a/harmony/src/modules/opnsense/bootstrap_score.rs b/harmony/src/modules/opnsense/bootstrap_score.rs index 7d71abcd..2a0b96f9 100644 --- a/harmony/src/modules/opnsense/bootstrap_score.rs +++ b/harmony/src/modules/opnsense/bootstrap_score.rs @@ -20,7 +20,15 @@ //! [`OPNsenseFirmwareUpgradeScore`](crate::modules::opnsense::firmware_upgrade::OPNsenseFirmwareUpgradeScore). //! Configurable via `FirmwareUpgradeMode` (Auto / AutoMinor / Prompt / //! Disabled). -//! 6. Optionally rebinds the LAN to a new IP/subnet. +//! 6. **(Optional, via `lan_bridge`)** Creates an `if_bridge` spanning +//! the selected physical NICs and re-points `` +//! at it. Shares +//! [`ensure_lan_bridge_step`](crate::modules::opnsense::lan_bridge::ensure_lan_bridge_step) +//! with the standalone +//! [`OPNsenseLanBridgeScore`](crate::modules::opnsense::lan_bridge::OPNsenseLanBridgeScore). +//! Runs AFTER firmware upgrade (so the bridge lives in the final +//! firmware's config schema) and BEFORE the optional LAN-IP rebind. +//! 7. Optionally rebinds the LAN to a new IP/subnet. //! //! After it runs, callers construct a normal //! [`OPNSenseFirewall`](crate::infra::opnsense::OPNSenseFirewall) from the @@ -51,6 +59,7 @@ use crate::{ create_api_key_ssh, probe_https, set_lan_dhcp_range_via_api, }, modules::opnsense::firmware_upgrade::{FirmwareUpgradeMode, perform_firmware_upgrade}, + modules::opnsense::lan_bridge::{LanBridgeParams, ensure_lan_bridge_step}, modules::opnsense::pin_nic_names::pin_nic_names_step, score::Score, topology::OPNsenseBootstrapTopology, @@ -90,6 +99,15 @@ pub struct OPNsenseBootstrapScore { /// lives in /// [`crate::modules::opnsense::firmware_upgrade::perform_firmware_upgrade`]. pub firmware_upgrade: FirmwareUpgradeMode, + /// Optional `if_bridge` step. When `Some(_)`, creates a bridge with + /// the given members AFTER firmware upgrade and BEFORE the optional + /// LAN-IP rebind below. Re-points `` at the + /// bridge so the rebind (if any) targets the bridge interface. + /// When `None`, the bridge step is skipped entirely. Shares + /// [`ensure_lan_bridge_step`](crate::modules::opnsense::lan_bridge::ensure_lan_bridge_step) + /// with the standalone + /// [`OPNsenseLanBridgeScore`](crate::modules::opnsense::lan_bridge::OPNsenseLanBridgeScore). + pub lan_bridge: Option, } impl Default for OPNsenseBootstrapScore { @@ -100,6 +118,7 @@ impl Default for OPNsenseBootstrapScore { webgui_ready_timeout: std::time::Duration::from_secs(120), disable_http_redirect: false, firmware_upgrade: FirmwareUpgradeMode::Auto, + lan_bridge: None, } } } @@ -364,6 +383,43 @@ impl Interpret for OPNsenseBootstrapInterpret { info!("{tag} firmware_upgrade=Disabled; skipping firmware upgrade"); } + // ── Step 4.5: optional LAN bridge ──────────────────────────── + // Shares `ensure_lan_bridge_step` with the standalone + // `OPNsenseLanBridgeScore`. Runs AFTER firmware upgrade (so the + // bridge lives in the final firmware's config schema) and + // BEFORE the LAN-IP rebind below (so the rebind targets the + // bridge, not the raw LAN NIC). + if let Some(params) = self.score.lan_bridge.clone() { + info!( + "{tag} LAN bridge step — members={:?}, reassign_lan={}, perf_tunables={}", + params.members, params.reassign_lan, params.perf_tunables + ); + let bridge_config = opnsense_config::Config::from_credentials_with_api_port( + topology.vanilla_ip, + None, + self.score.target_api_port, + &key, + &secret, + &topology.default_username, + &topology.default_password, + ) + .await + .map_err(|e| { + InterpretError::new(format!( + "Failed to build OPNsense Config for LAN bridge step: {e}" + )) + })?; + ensure_lan_bridge_step( + &bridge_config, + &topology.vanilla_ip, + &topology.default_username, + &topology.default_password, + ¶ms, + &tag, + ) + .await?; + } + // ── Step 5: optional LAN rebind ────────────────────────────── if let Some(rebind) = &self.score.target_lan { info!( diff --git a/harmony/src/modules/opnsense/lan_bridge.rs b/harmony/src/modules/opnsense/lan_bridge.rs new file mode 100644 index 00000000..a4e71c76 --- /dev/null +++ b/harmony/src/modules/opnsense/lan_bridge.rs @@ -0,0 +1,609 @@ +//! `OPNsenseLanBridgeScore` — single `if_bridge(4)` spanning logical interfaces. +//! +//! # Why this exists +//! +//! Built for the **pico-DC** topology (1× OPNsense + N hyperconverged +//! nodes, no physical switch). To get L2 connectivity between every +//! node and the firewall's own LAN services (DHCP, firewall, management +//! IP), OPNsense itself becomes the L2 fabric — an `if_bridge` spanning +//! the selected ports. On low-CPU hardware like the Wize 5070 the Score +//! also tunes a handful of `net.link.bridge.*` sysctls and disables +//! TSO/LRO globally (those break `if_bridge` on FreeBSD). +//! +//! Optionally re-points `` at the new bridge so +//! the LAN logical interface (and everything that hangs off it) spans +//! every member NIC. +//! +//! # Members are physical NIC names; the Score auto-assigns OPT slots +//! +//! Callers pass **physical NIC names** (`vtnet0`, `igc1`, …) — what an +//! operator sees on the hardware. The Score then: +//! +//! 1. Looks each NIC up in ``. If it's already assigned to +//! a logical name (`lan`, `opt1`, …), that logical name is reused. +//! 2. If the NIC has no logical assignment yet, the Score adds a new +//! `` entry over SSH (next free `optN`, with +//! `=`, `1`, plus a sensible ``) and brings +//! it up via `configctl interface configure `. The actual +//! bridge model still receives the logical name (OPNsense's +//! `BridgeMemberField` rejects raw NIC names — that's why this +//! translation exists). +//! 3. The WAN port (``) is rejected up-front as a +//! member; a clear error is returned if the caller includes it. +//! +//! The pico-DC happy path: the operator's hardware has `lan` + `wan` +//! assigned (from the first-time wizard) and three unassigned PCIe +//! ports. They pass `members: Some(vec!["igc0","igc2","igc3","igc4"])` +//! (with `igc1` as WAN). After the Score runs they see `lan` + new +//! `opt1`/`opt2`/`opt3` entries in WebUI ▸ Interfaces ▸ Assignments, +//! plus `bridge0` spanning all four logical interfaces. +//! +//! # Two ways to use this +//! +//! * **Automatic.** [`OPNsenseBootstrapScore`](super::bootstrap_score::OPNsenseBootstrapScore) +//! composes [`ensure_lan_bridge_step`] internally when its +//! `lan_bridge: Option` field is `Some(_)`. Lives +//! between the firmware-upgrade and LAN-IP-rebind steps so the bridge +//! exists before any optional LAN-IP flip lands on it. +//! * **Standalone.** [`OPNsenseLanBridgeScore`] is a Score in its own +//! right (`Score`) — drop it into a normal +//! post-bootstrap Vec when configuring a firewall after the bootstrap +//! has already happened. + +use async_trait::async_trait; +use harmony_secret::SecretManager; +use harmony_types::id::Id; +use log::info; +use serde::Serialize; + +use crate::{ + config::secret::OPNSenseFirewallCredentials, + data::Version, + infra::opnsense::OPNSenseFirewall, + interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome}, + inventory::Inventory, + modules::opnsense::bootstrap::{ + DEFAULT_PHYSICAL_DRIVER_PREFIXES, ensure_lan_bridge_atomic_via_ssh, + list_physical_nics_via_ssh, opnsense_ssh_shell, + }, + score::Score, +}; + +/// Score parameters shared between the standalone Score and the +/// built-in step inside `OPNsenseBootstrapScore`. +#[derive(Debug, Clone, Serialize)] +pub struct LanBridgeParams { + /// **Physical NIC names** to add to the bridge (e.g. + /// `["igc0","igc2","igc3","igc4"]` or `["vtnet0"]` in a VM). The + /// Score translates each one to a logical interface name before + /// sending it to OPNsense's bridge model — unassigned NICs are + /// auto-promoted to the next free `optN` slot. + /// + /// Including the WAN port (whatever NIC backs ``) + /// is rejected with a hard error. + /// + /// `None` triggers an interactive `inquire::MultiSelect` over the + /// firewall's physical NICs (WAN excluded), each annotated with + /// its current logical assignment ("igc0 [lan]", "igc2 + /// [unassigned]", …). + pub members: Option>, + /// Bridge description (canonical identity for idempotency match). + pub description: String, + /// Optional MTU. Written to `` since the + /// OPNsense bridge model has no MTU field of its own. + pub mtu: Option, + /// Spanning Tree Protocol. Default `false` for point-to-point pico + /// DC (no redundant paths → no loops → STP just adds CPU overhead). + pub enable_stp: bool, + /// When `true`, re-point `` at the new bridge + /// after creation. Default `true`. + pub reassign_lan: bool, + /// When `true`, write opinionated `net.link.bridge.*` sysctls and + /// disable TSO/LRO globally. Default `true`. Required for any + /// reasonable bridge performance on low-CPU hardware. + pub perf_tunables: bool, +} + +impl Default for LanBridgeParams { + fn default() -> Self { + Self { + members: None, + description: "LAN bridge".to_string(), + mtu: None, + enable_stp: false, + reassign_lan: true, + perf_tunables: true, + } + } +} + +/// Result of running the bridge step. +#[derive(Debug, Clone)] +pub enum BridgeOutcome { + /// Bridge did not exist before; we created it. + Created { + bridgeif: String, + members: Vec, + }, + /// A matching bridge already existed; we wrote-through to ensure + /// drift convergence (the REST API treats this as a noop when the + /// payload matches what's already stored). + Updated { + bridgeif: String, + members: Vec, + }, +} + +/// Shared implementation of the LAN-bridge step. +/// +/// Used by both [`OPNsenseLanBridgeScore`] and +/// [`OPNsenseBootstrapScore`](super::bootstrap_score::OPNsenseBootstrapScore). +/// Both callers pass the same `LanBridgeParams` so the behaviour stays +/// in lockstep — there is no second implementation to drift. +pub async fn ensure_lan_bridge_step( + config: &opnsense_config::Config, + ssh_ip: &std::net::IpAddr, + ssh_user: &str, + ssh_pass: &str, + params: &LanBridgeParams, + tag: &str, +) -> Result { + // ── 1. Resolve physical NICs ─────────────────────────────────── + let physical_members = match ¶ms.members { + Some(ms) if !ms.is_empty() => ms.clone(), + Some(_) => { + return Err(InterpretError::new( + "OPNsenseLanBridgeScore: explicit `members` list is empty".into(), + )); + } + None => prompt_bridge_members(ssh_ip, ssh_user, ssh_pass, tag).await?, + }; + if physical_members.is_empty() { + return Err(InterpretError::new( + "OPNsenseLanBridgeScore: no bridge members selected".into(), + )); + } + + // ── 1a. Reject WAN ────────────────────────────────────────────── + let wan_phys = read_iface_if_via_ssh(ssh_ip, ssh_user, ssh_pass, "wan") + .await + .unwrap_or_default(); + if !wan_phys.is_empty() { + for phys in &physical_members { + if phys == &wan_phys { + return Err(InterpretError::new(format!( + "{phys} is the WAN port (interfaces.wan.if); refusing to add it \ + to a LAN bridge. Drop it from `members` and re-run." + ))); + } + } + } + + // ── 1b. Performance tunables (BEFORE bridge create) ───────────── + // `net.link.bridge.inherit_mac=1` only applies to bridges that + // attach a member AFTER the sysctl is set. If we ensure the bridge + // first, bridge0 has its own auto-generated MAC and the host's + // ARP/L2 path is silently broken once LAN's IP moves over. Set + // the sysctls first; the bridge then inherits the first member's + // MAC on creation. The other three sysctls (pfil_*) are pf-related + // and ordering-insensitive, but moving them too keeps the block + // atomic. + if params.perf_tunables { + ensure_bridge_sysctls(config, tag).await?; + ensure_offloads_disabled(config, tag).await?; + } else { + info!("{tag} perf_tunables=false; skipping bridge sysctls and offload toggles"); + } + + info!( + "{tag} Atomic bridge-save: descr=\"{}\", physical_members={:?}, reassign_lan={}", + params.description, physical_members, params.reassign_lan + ); + + // ── 2. Atomic resolve + bridge + (optional) LAN reassignment ── + // The helper takes physical NIC names and does the + // physical→logical resolution INSIDE one PHP `Config::save()` so + // every change (new OPT entries, bridge entry, lan.if=bridgeN) + // lands atomically. Splitting into separate steps creates a window + // either with bridge having no kernel members (circular `lan` + // reference when reassign_lan=true) or with vtnet0 already a + // bridge member while lan still claims it. See + // `crate::modules::opnsense::bootstrap::ensure_lan_bridge_atomic_via_ssh` + // for the full rationale and the rules for which members get a + // dedicated OPT slot vs. reusing their existing logical name. + let outcome = ensure_lan_bridge_atomic_via_ssh( + ssh_ip, + ssh_user, + ssh_pass, + &physical_members, + ¶ms.description, + params.enable_stp, + params.reassign_lan, + params.mtu, + ) + .await + .map_err(|e| InterpretError::new(format!("atomic LAN-bridge save failed: {e}")))?; + let bridgeif = outcome.bridgeif().to_string(); + info!( + "{tag} Bridge `{bridgeif}` {} (reassign_lan={})", + if outcome.was_created() { + "created" + } else { + "updated" + }, + params.reassign_lan + ); + + Ok(if outcome.was_created() { + BridgeOutcome::Created { + bridgeif, + members: physical_members, + } + } else { + BridgeOutcome::Updated { + bridgeif, + members: physical_members, + } + }) +} + +// ─── Private helpers ─────────────────────────────────────────────────── + +async fn prompt_bridge_members( + ip: &std::net::IpAddr, + user: &str, + pass: &str, + tag: &str, +) -> Result, InterpretError> { + info!("{tag} Enumerating physical NICs to offer for bridge membership"); + let nics = list_physical_nics_via_ssh(ip, user, pass, DEFAULT_PHYSICAL_DRIVER_PREFIXES) + .await + .map_err(|e| InterpretError::new(format!("physical-NIC enumeration failed: {e}")))?; + if nics.is_empty() { + return Err(InterpretError::new( + "no physical NICs detected via `ifconfig -l ether` — extend \ + DEFAULT_PHYSICAL_DRIVER_PREFIXES if your hardware uses an exotic driver" + .into(), + )); + } + + // Read current logical assignments so we can annotate each NIC. + let assignments = list_logical_interfaces_via_ssh(ip, user, pass) + .await + .unwrap_or_default(); + // Reverse map: physical NIC → logical name (e.g. "vtnet0" → "lan"). + let phys_to_logical: std::collections::HashMap = assignments + .iter() + .filter(|(_, phys)| !phys.is_empty()) + .map(|(name, phys)| (phys.clone(), name.clone())) + .collect(); + let wan_phys = assignments + .iter() + .find(|(name, _)| name == "wan") + .map(|(_, phys)| phys.clone()) + .unwrap_or_default(); + + // Drop the WAN port from the candidate list entirely. Anything else + // is a candidate, whether already assigned (will reuse the existing + // logical name) or unassigned (will get a new opt slot during the + // resolution step in `ensure_lan_bridge_step`). + let candidates: Vec<(String, String)> = nics + .into_iter() + .filter(|(name, _mac)| wan_phys.is_empty() || name != &wan_phys) + .map(|(name, _mac)| { + let annotation = phys_to_logical + .get(&name) + .map(String::as_str) + .unwrap_or("unassigned"); + let display = format!("{name} [{annotation}]"); + (display, name) + }) + .collect(); + if candidates.is_empty() { + return Err(InterpretError::new( + "no eligible bridge members left after filtering out the WAN port".into(), + )); + } + + let display_refs: Vec<&str> = candidates.iter().map(|(d, _)| d.as_str()).collect(); + let selected = inquire::MultiSelect::new( + "Select physical NICs to bridge for LAN (WAN excluded; unassigned will get a new OPT slot):", + display_refs, + ) + .prompt() + .map_err(|e| InterpretError::new(format!("interactive bridge-member selection failed: {e}")))?; + + Ok(candidates + .iter() + .filter(|(display, _)| selected.contains(&display.as_str())) + .map(|(_, name)| name.clone()) + .collect()) +} + +/// Enumerate all logical interfaces from `` plus their +/// backing physical NIC (``). Returns `[(logical_name, +/// physical_if), ...]` — e.g. `[("wan","vtnet1"), ("lan","vtnet0"), +/// ("opt1","igc2"), ...]`. Used by the interactive `MultiSelect` +/// prompt; the display shows both for clarity. +async fn list_logical_interfaces_via_ssh( + ip: &std::net::IpAddr, + user: &str, + pass: &str, +) -> Result, String> { + use opnsense_config::config::OPNsenseShell; + let shell = opnsense_ssh_shell(*ip, user, pass); + // Plain `name=if` pairs, one per line. tcsh-friendly: no inline `if/then/else`. + // NOTE on backslashes: shell single-quotes preserve `\` literally, so a + // single backslash in the Rust source IS what PHP parses. Doubling + // them produced `OPNsense\\Core\\Config` in PHP source which is a + // parse error (two consecutive separators), making `php -r` exit + // silently with empty stdout — caller can't tell apart "field missing" + // from "script never ran". + let php = "php -r 'require \"/usr/local/etc/inc/config.inc\"; \ + foreach (OPNsense\\Core\\Config::getInstance()->object()->interfaces->children() as $k => $v) { \ + echo $k . \"=\" . ((string)$v->if) . \"\\n\"; \ + }'"; + let out = shell + .exec(php) + .await + .map_err(|e| format!("ssh exec: {e}"))?; + let pairs = out + .lines() + .filter_map(|line| { + let line = line.trim(); + if line.is_empty() { + return None; + } + let (k, v) = line.split_once('=')?; + Some((k.trim().to_string(), v.trim().to_string())) + }) + .collect(); + Ok(pairs) +} + +/// Read `<{name}>` over SSH via PHP+SimpleXML through +/// the `Config` singleton (no manual config.xml edits). Returns the +/// physical NIC name bound to the named logical interface — e.g. +/// `"vtnet1"` for `wan`, `"bridge0"` for `lan` after reassignment. +async fn read_iface_if_via_ssh( + ip: &std::net::IpAddr, + user: &str, + pass: &str, + iface_name: &str, +) -> Result { + use opnsense_config::config::OPNsenseShell; + let shell = opnsense_ssh_shell(*ip, user, pass); + // Single `\` between namespace segments — shell single-quotes preserve + // backslashes literally, so this reaches PHP as `OPNsense\Core\Config`. + let php = format!( + "php -r 'require \"/usr/local/etc/inc/config.inc\"; \ + echo (string)OPNsense\\Core\\Config::getInstance()->object()->interfaces->{iface_name}->if;'" + ); + let out = shell + .exec(&php) + .await + .map_err(|e| format!("ssh exec: {e}"))?; + Ok(out.trim().to_string()) +} + +/// Write the four `net.link.bridge.*` sysctls through OPNsense's +/// `/api/core/tunables/*` endpoints — idempotent (no rewrite when the +/// value already matches). +async fn ensure_bridge_sysctls( + config: &opnsense_config::Config, + tag: &str, +) -> Result<(), InterpretError> { + const SYSCTLS: &[(&str, &str, &str)] = &[ + ( + "net.link.bridge.pfil_member", + "0", + "harmony: bridge perf — do not pf on member NICs", + ), + ( + "net.link.bridge.pfil_bridge", + "1", + "harmony: bridge perf — pf on bridge interface only", + ), + ( + "net.link.bridge.pfil_local_phys", + "0", + "harmony: bridge perf — do not pf local traffic on members", + ), + ( + "net.link.bridge.inherit_mac", + "1", + "harmony: bridge inherits MAC of first member", + ), + ]; + + let client = config.client(); + let mut changed = 0usize; + for (tunable, value, descr) in SYSCTLS { + // Search for an existing row with this tunable name. + let search: serde_json::Value = client + .post_typed( + "core", + "tunables", + "searchItem", + Some(&serde_json::json!({ "searchPhrase": tunable })), + ) + .await + .map_err(|e| InterpretError::new(format!("tunable searchItem({tunable}): {e}")))?; + + let existing = search["rows"].as_array().and_then(|rows| { + rows.iter() + .find(|r| r["tunable"].as_str() == Some(*tunable)) + }); + + let body = serde_json::json!({ + "sysctl": { "tunable": tunable, "value": value, "descr": descr }, + }); + + match existing { + Some(row) => { + let uuid = row["uuid"].as_str().unwrap_or("").to_string(); + let cur_value = row["value"].as_str().unwrap_or("").to_string(); + if cur_value == *value { + continue; + } + let _: serde_json::Value = client + .post_typed("core", "tunables", &format!("setItem/{uuid}"), Some(&body)) + .await + .map_err(|e| InterpretError::new(format!("tunable setItem({tunable}): {e}")))?; + changed += 1; + } + None => { + let _: serde_json::Value = client + .post_typed("core", "tunables", "addItem", Some(&body)) + .await + .map_err(|e| InterpretError::new(format!("tunable addItem({tunable}): {e}")))?; + changed += 1; + } + } + } + + if changed > 0 { + let _: serde_json::Value = client + .post_typed("core", "tunables", "reconfigure", None::<&()>) + .await + .map_err(|e| InterpretError::new(format!("tunables reconfigure: {e}")))?; + info!("{tag} Wrote {changed} bridge sysctl(s) and reconfigured tunables"); + } else { + info!("{tag} NOOP — all 4 bridge sysctls already match desired values"); + } + Ok(()) +} + +async fn ensure_offloads_disabled( + config: &opnsense_config::Config, + tag: &str, +) -> Result<(), InterpretError> { + let changed = config + .interface_settings() + .ensure_offloads_disabled() + .await + .map_err(|e| InterpretError::new(format!("offload toggles: {e}")))?; + if changed { + info!("{tag} Disabled hardware TSO + LRO offloads globally"); + } else { + info!("{tag} NOOP — TSO + LRO already disabled globally"); + } + Ok(()) +} + +// ─── Standalone Score ────────────────────────────────────────────────── + +/// Standalone Score over [`OPNSenseFirewall`] — composes the same +/// [`ensure_lan_bridge_step`] used internally by +/// [`OPNsenseBootstrapScore`](super::bootstrap_score::OPNsenseBootstrapScore). +#[derive(Debug, Clone, Default, Serialize)] +pub struct OPNsenseLanBridgeScore { + pub params: LanBridgeParams, +} + +impl Score for OPNsenseLanBridgeScore { + fn name(&self) -> String { + "OPNsenseLanBridgeScore".to_string() + } + + fn create_interpret(&self) -> Box> { + Box::new(OPNsenseLanBridgeInterpret { + score: self.clone(), + }) + } +} + +#[derive(Debug)] +struct OPNsenseLanBridgeInterpret { + score: OPNsenseLanBridgeScore, +} + +#[async_trait] +impl Interpret for OPNsenseLanBridgeInterpret { + async fn execute( + &self, + _inventory: &Inventory, + topology: &OPNSenseFirewall, + ) -> Result { + let ip: std::net::IpAddr = topology.get_ip(); + let tag = format!("[OPNsenseLanBridge/{ip}]"); + + let config = topology.get_opnsense_config(); + let ssh_creds = SecretManager::get::() + .await + .map_err(|e| { + InterpretError::new(format!( + "OPNsenseLanBridgeScore needs OPNSenseFirewallCredentials in SecretManager \ + (run OPNsenseBootstrapScore first): {e}" + )) + })?; + + let outcome = ensure_lan_bridge_step( + &config, + &ip, + &ssh_creds.username, + &ssh_creds.password, + &self.score.params, + &tag, + ) + .await?; + + let message = match &outcome { + BridgeOutcome::Created { bridgeif, members } => { + format!("Created bridge {bridgeif} with {} member(s)", members.len()) + } + BridgeOutcome::Updated { bridgeif, members } => { + format!("Updated bridge {bridgeif} ({} member(s))", members.len()) + } + }; + Ok(Outcome::success(message)) + } + + fn get_name(&self) -> InterpretName { + InterpretName::OPNsenseLanBridge + } + + fn get_version(&self) -> Version { + Version::from("1.0.0").unwrap() + } + + fn get_status(&self) -> InterpretStatus { + InterpretStatus::QUEUED + } + + fn get_children(&self) -> Vec { + vec![] + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_score_name() { + let s = OPNsenseLanBridgeScore::default(); + assert_eq!( + >::name(&s), + "OPNsenseLanBridgeScore" + ); + } + + #[test] + fn test_score_serializes() { + let s = OPNsenseLanBridgeScore::default(); + let _: serde_value::Value = + serde_value::to_value(&s).expect("OPNsenseLanBridgeScore should serialize"); + } + + #[test] + fn test_default_params() { + let p = LanBridgeParams::default(); + assert_eq!(p.description, "LAN bridge"); + assert!(!p.enable_stp); + assert!(p.reassign_lan); + assert!(p.perf_tunables); + assert!(p.members.is_none()); + assert!(p.mtu.is_none()); + } +} diff --git a/harmony/src/modules/opnsense/mod.rs b/harmony/src/modules/opnsense/mod.rs index c3b65793..b53a529d 100644 --- a/harmony/src/modules/opnsense/mod.rs +++ b/harmony/src/modules/opnsense/mod.rs @@ -5,6 +5,7 @@ pub mod firewall; pub mod firmware_upgrade; pub mod image; pub mod lagg; +pub mod lan_bridge; pub mod node_exporter; pub mod package_install; pub mod pin_nic_names; -- 2.39.5 From 46dca31108068f2df7aaf91cb2a4414ff6ce8148 Mon Sep 17 00:00:00 2001 From: Sylvain Tremblay Date: Wed, 20 May 2026 08:50:23 -0400 Subject: [PATCH 32/38] test(opnsense-vm-integration): exercise OPNsenseLanBridgeScore end-to-end MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the LAN-bridge step at position #12 in the score pipeline (single-member bridge on `vtnet0`), plus end-of-test reachability assertions for HTTPS at 9443 and SSH at 22 — both must succeed before the test reports PASS. The SSH check guards specifically against the regression where sshd stays bound to the old (now IP-less) LAN device after the bridge step and the next run times out trying to reconnect. State snapshot now captures `net.link.bridge.*` sysctl values and asserts each expected key (`inherit_mac`, `pfil_member`, `pfil_bridge`, `pfil_local_phys`) has the expected value, rather than just counting ≥4 entries (the namespace isn't owned exclusively by the Score). Verified PASSED end-to-end on three sequential runs (clean run, idempotent re-run, post-reboot probe). Co-Authored-By: Claude Opus 4.7 (1M context) --- examples/opnsense_vm_integration/src/main.rs | 229 +++++++++++++++++++ 1 file changed, 229 insertions(+) diff --git a/examples/opnsense_vm_integration/src/main.rs b/examples/opnsense_vm_integration/src/main.rs index f9a18863..344b983d 100644 --- a/examples/opnsense_vm_integration/src/main.rs +++ b/examples/opnsense_vm_integration/src/main.rs @@ -50,6 +50,7 @@ use harmony::modules::opnsense::firmware_upgrade::{ FirmwareUpgradeMode, OPNsenseFirmwareUpgradeScore, }; use harmony::modules::opnsense::lagg::{LaggDef, LaggScore}; +use harmony::modules::opnsense::lan_bridge::{LanBridgeParams, OPNsenseLanBridgeScore}; use harmony::modules::opnsense::node_exporter::NodeExporterScore; use harmony::modules::opnsense::package_install::OPNsensePackageInstallScore; use harmony::modules::opnsense::vip::{VipDef, VipScore}; @@ -388,6 +389,71 @@ async fn run_integration() -> Result<(), Box> { "LAGGs changed after 2nd run! {} -> {}", state1.lagg_count, state2.lagg_count ); + assert_eq!( + state1.bridge_count, state2.bridge_count, + "Bridges changed after 2nd run! {} -> {}", + state1.bridge_count, state2.bridge_count + ); + assert_eq!( + state1.bridge_sysctls, state2.bridge_sysctls, + "net.link.bridge.* sysctl count changed after 2nd run! {} -> {}", + state1.bridge_sysctls, state2.bridge_sysctls + ); + assert_eq!( + state1.lan_if, state2.lan_if, + "interfaces.lan.if changed after 2nd run! {} -> {}", + state1.lan_if, state2.lan_if + ); + + // ── Reachability assertion ───────────────────────────────────── + // The bridge step re-points at bridge0; a + // wrong sysctl ordering, missing service restart, or bad MAC + // inheritance can break individual services without taking down + // the whole stack. Verify BOTH HTTPS (lighttpd) AND SSH (sshd) + // come back up: HTTPS uses the webgui-port settings (own restart + // path) while SSH binds per-interface and needs `configctl sshd + // restart` after LAN's moves to bridge0 — if that step is + // missing, HTTPS stays green but SSH-based Scores time out on + // any rerun. Generous timeouts because the detached configctl + // chain takes a beat to fully settle. + info!("Verifying firewall HTTPS reachability post-run on {OPN_LAN_IP}:{OPN_API_PORT}..."); + wait_for_https(OPN_LAN_IP, OPN_API_PORT) + .await + .map_err(|e| -> Box { + format!( + "Firewall HTTPS at {OPN_LAN_IP}:{OPN_API_PORT} is unreachable after the Score \ + pipeline: {e}. The bridge / LAN reassignment likely broke L2 (check MAC \ + inheritance via net.link.bridge.inherit_mac=1 BEFORE bridge member is added, \ + and confirm `` ended at `bridge0`)." + ) + .into() + })?; + info!("HTTPS reachable at https://{OPN_LAN_IP}:{OPN_API_PORT}"); + + info!("Verifying firewall SSH reachability post-run on {OPN_LAN_IP}:22..."); + let ssh_ok = tokio::time::timeout( + std::time::Duration::from_secs(30), + async { + loop { + if check_tcp_port(OPN_LAN_IP, 22).await { + return true; + } + tokio::time::sleep(std::time::Duration::from_secs(2)).await; + } + }, + ) + .await + .unwrap_or(false); + if !ssh_ok { + return Err(format!( + "Firewall SSH at {OPN_LAN_IP}:22 is unreachable after the Score pipeline. \ + HTTPS is up but sshd is bound to a stale interface — the detached configctl \ + chain after the LAN-bridge reassignment must include `configctl sshd restart` \ + so sshd re-binds to the new lan interface (bridge0)." + ) + .into()); + } + info!("SSH reachable at {OPN_LAN_IP}:22"); // Clean up temp files let _ = std::fs::remove_dir_all(std::env::temp_dir().join("harmony-tftp-test")); @@ -396,6 +462,7 @@ async fn run_integration() -> Result<(), Box> { println!("PASSED — All OPNsense integration tests successful:"); println!(" Run 1: all entities created correctly"); println!(" Run 2: idempotency verified — zero duplicates"); + println!(" Firewall reachable end-to-end after LAN-bridge reassignment"); println!(); println!("VM is running at {OPN_LAN_IP}. Use --clean to tear down."); Ok(()) @@ -415,6 +482,15 @@ struct StateSnapshot { vip_count: usize, dnat_rules: usize, lagg_count: usize, + bridge_count: usize, + bridge_sysctls: usize, + /// All `net.link.bridge.*` tunables with their current values + /// (post-Score). Lets the assertion check the four we care about + /// by key+value without disturbing other pre-existing entries. + bridge_sysctl_values: std::collections::HashMap, + tso_disabled: bool, + lro_disabled: bool, + lan_if: String, } impl StateSnapshot { @@ -429,6 +505,16 @@ impl StateSnapshot { info!(" VIPs: {}", self.vip_count); info!(" DNat rules: {}", self.dnat_rules); info!(" LAGGs: {}", self.lagg_count); + info!(" Bridges: {}", self.bridge_count); + info!( + " Bridge sysctls (net.link.bridge.*): {}", + self.bridge_sysctls + ); + info!( + " Hardware offload disabled: TSO={}, LRO={}", + self.tso_disabled, self.lro_disabled + ); + info!(" interfaces.lan.if: {}", self.lan_if); } fn assert_minimum_counts(&self) { @@ -472,6 +558,47 @@ impl StateSnapshot { "Expected >= 1 LAGG, got {}", self.lagg_count ); + assert!( + self.bridge_count >= 1, + "Expected >= 1 bridge, got {}", + self.bridge_count + ); + // The Score doesn't claim exclusive ownership of the + // `net.link.bridge.*` namespace; only that the 4 it cares + // about exist with the expected values. Pre-existing sysctls + // make the count >= 4 — that's fine. + assert!( + self.bridge_sysctls >= 4, + "Expected at least 4 net.link.bridge.* sysctls, got {}", + self.bridge_sysctls + ); + let expected: &[(&str, &str)] = &[ + ("net.link.bridge.pfil_member", "0"), + ("net.link.bridge.pfil_bridge", "1"), + ("net.link.bridge.pfil_local_phys", "0"), + ("net.link.bridge.inherit_mac", "1"), + ]; + for (key, want) in expected { + let got = self.bridge_sysctl_values.get(*key).map(String::as_str); + assert_eq!( + got, + Some(*want), + "Expected {key}={want}, got {got:?} from net.link.bridge.* tunables", + ); + } + assert!( + self.tso_disabled, + "Expected segmentation offloading to be disabled after OPNsenseLanBridgeScore", + ); + assert!( + self.lro_disabled, + "Expected large-receive offloading to be disabled after OPNsenseLanBridgeScore", + ); + assert_eq!( + self.lan_if, "bridge0", + "Expected to be reassigned to bridge0 (was {})", + self.lan_if + ); } } @@ -533,6 +660,51 @@ async fn verify_state( .map(|m| m.len()) .unwrap_or(0); + let bridges: serde_json::Value = client + .get_typed("interfaces", "bridge_settings", "get") + .await?; + let bridge_count = bridges["bridge"]["bridged"] + .as_object() + .map(|m| m.len()) + .unwrap_or(0); + + // Capture all net.link.bridge.* entries with their values so the + // assertion can check the four we care about by name + value while + // tolerating any extras left over from manual probing. + let tunables: serde_json::Value = client + .post_typed( + "core", + "tunables", + "searchItem", + Some(&serde_json::json!({ "searchPhrase": "net.link.bridge." })), + ) + .await?; + let bridge_sysctl_values: std::collections::HashMap = tunables["rows"] + .as_array() + .map(|rows| { + rows.iter() + .filter_map(|r| { + let tunable = r["tunable"].as_str()?; + if !tunable.starts_with("net.link.bridge.") { + return None; + } + let value = r["value"].as_str()?; + Some((tunable.to_string(), value.to_string())) + }) + .collect() + }) + .unwrap_or_default(); + let bridge_sysctls = bridge_sysctl_values.len(); + + let iface_settings: serde_json::Value = + client.get_typed("interfaces", "settings", "get").await?; + let tso_disabled = + iface_settings["settings"]["disablesegmentationoffloading"].as_str() == Some("1"); + let lro_disabled = + iface_settings["settings"]["disablelargereceiveoffloading"].as_str() == Some("1"); + + let lan_if = ssh_read_lan_if().await.unwrap_or_default(); + Ok(StateSnapshot { haproxy_frontends, dnsmasq_hosts, @@ -543,9 +715,44 @@ async fn verify_state( vip_count, dnat_rules, lagg_count, + bridge_count, + bridge_sysctls, + bridge_sysctl_values, + tso_disabled, + lro_disabled, + lan_if, }) } +/// Read `` over SSH (no REST endpoint for the legacy +/// interfaces tree). Returns empty string on failure — the +/// `assert_minimum_counts` check will then fail with a clear message. +async fn ssh_read_lan_if() -> Result> { + use opnsense_config::config::{OPNsenseShell, SshCredentials, SshOPNSenseShell}; + let ssh_creds = SecretManager::get::().await?; + let ip: std::net::IpAddr = OPN_LAN_IP.parse()?; + let ssh_config = std::sync::Arc::new(russh::client::Config { + inactivity_timeout: None, + ..<_>::default() + }); + let credentials = SshCredentials::Password { + username: ssh_creds.username.clone(), + password: ssh_creds.password.clone(), + }; + let shell = SshOPNSenseShell::new((ip, 22), credentials, ssh_config); + // Shell single-quotes preserve backslashes literally, so a SINGLE + // `\` in the Rust source reaches PHP as a single backslash and forms + // a valid namespace separator. `\\` in source would reach PHP as + // `\\` and trigger a parse error (silently empty stdout). + let out = shell + .exec( + "php -r 'require \"/usr/local/etc/inc/config.inc\"; \ + echo (string)OPNsense\\Core\\Config::getInstance()->object()->interfaces->lan->if;'", + ) + .await?; + Ok(out.trim().to_string()) +} + /// Build all test Scores — extracted so we can call it for both run 1 and run 2. fn build_all_scores() -> Result>>, Box> { let lb_score = LoadBalancerScore { @@ -707,6 +914,27 @@ fn build_all_scores() -> Result>>, Box` + // from vtnet0 to bridge0 — host-to-VM management survives because + // bridge0 inherits vtnet0's MAC (perf_tunables sets + // inherit_mac=1). Single-member is degenerate but exercises every + // code path; multi-member would need extra virtio NICs. + let lan_bridge_score = OPNsenseLanBridgeScore { + params: LanBridgeParams { + members: Some(vec!["vtnet0".to_string()]), + description: "harmony-test-lan-bridge".to_string(), + mtu: None, + enable_stp: false, + reassign_lan: true, + perf_tunables: true, + }, + }; + // WebGuiConfigScore runs first: moves webgui to 9443 so HAProxy can bind 443. // This is an explicit Score (not hidden in bootstrap) — see docs/architecture-challenges.md // for discussion of Score ordering/dependency. @@ -743,6 +971,7 @@ fn build_all_scores() -> Result>>, Box Date: Wed, 20 May 2026 09:05:46 -0400 Subject: [PATCH 33/38] feat(opnsense): reboot + verify reachability at end of OPNsenseBootstrapScore MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Append a terminal step to `OPNsenseBootstrapScore.execute()`: POST `core/firmware/reboot` against the firewall's final address, then wait for the API to go unreachable, come back, and settle. Hard-fail if the firewall does not reappear at `final_ip:target_api_port`. The dance has touched firmware, the optional LAN bridge, the DHCP pool, and the LAN IP itself — a clean reboot guarantees the running kernel/config matches what was persisted, and the post-reboot probe makes reachability a contract the rest of the harmony pipeline can rely on instead of a best-effort warn-only check. Reuses the existing `wait_for_reboot_cycle` waiter from `firmware_upgrade.rs` (promoted to `pub`) and the same `core/firmware/reboot` POST shape that `perform_firmware_upgrade` uses mid-pipeline. Idempotency is unchanged: `decide()` still short-circuits to NOOP when creds exist + target is reachable + vanilla is gone, so a re-run does not trigger a second reboot. Co-Authored-By: Claude Opus 4.7 (1M context) --- harmony/src/modules/opnsense/bootstrap.rs | 44 +++++++++++++++++++ .../src/modules/opnsense/bootstrap_score.rs | 32 +++++++++++++- .../src/modules/opnsense/firmware_upgrade.rs | 2 +- 3 files changed, 75 insertions(+), 3 deletions(-) diff --git a/harmony/src/modules/opnsense/bootstrap.rs b/harmony/src/modules/opnsense/bootstrap.rs index 94d2e655..12d7f742 100644 --- a/harmony/src/modules/opnsense/bootstrap.rs +++ b/harmony/src/modules/opnsense/bootstrap.rs @@ -616,6 +616,50 @@ pub async fn set_lan_dhcp_range_via_api( Ok(()) } +/// POST `core/firmware/reboot` and wait for the firewall to come back at +/// `final_ip:final_api_port`. +/// +/// Fire-and-forget POST — OPNsense tears down the TCP connection while +/// replying. Mirrors the reboot path inside `perform_firmware_upgrade`. +/// The waiter is shared with `firmware_upgrade::wait_for_reboot_cycle` +/// (unreachable-window probe → recovery probe → settle delay). +/// +/// Used as the terminal step of [`OPNsenseBootstrapScore`] to guarantee +/// the running state matches what was persisted after firmware upgrade, +/// optional LAN bridge, and optional LAN-IP rebind. +pub async fn reboot_and_verify_via_api( + final_ip: &str, + final_api_port: u16, + api_key: &str, + api_secret: &str, + tag: &str, +) -> Result<(), BootstrapError> { + let client = opnsense_api::OpnsenseClient::builder() + .base_url(format!("https://{final_ip}:{final_api_port}/api")) + .auth_from_key_secret(api_key, api_secret) + .skip_tls_verify() + .timeout_secs(60) + .build() + .map_err(|e| { + BootstrapError::UnexpectedResponse(format!( + "Failed to build OPNsense API client for terminal reboot: {e}" + )) + })?; + + info!("{tag} POST core/firmware/reboot ..."); + let _ = client + .post_typed::("core", "firmware", "reboot", None) + .await; + + super::firmware_upgrade::wait_for_reboot_cycle(final_ip, final_api_port, tag) + .await + .map_err(|e| { + BootstrapError::UnexpectedResponse(format!( + "Reboot triggered but firewall did not return cleanly: {e}" + )) + }) +} + /// Move the LAN interface to a new IP / subnet at runtime via SSH. /// /// SFTPs a PHP script that rewrites `interfaces.lan.ipaddr` and diff --git a/harmony/src/modules/opnsense/bootstrap_score.rs b/harmony/src/modules/opnsense/bootstrap_score.rs index 2a0b96f9..4b460b4d 100644 --- a/harmony/src/modules/opnsense/bootstrap_score.rs +++ b/harmony/src/modules/opnsense/bootstrap_score.rs @@ -56,7 +56,8 @@ use crate::{ inventory::Inventory, modules::opnsense::bootstrap::{ DEFAULT_PHYSICAL_DRIVER_PREFIXES, OPNsenseBootstrap, change_lan_ip_via_ssh, - create_api_key_ssh, probe_https, set_lan_dhcp_range_via_api, + create_api_key_ssh, probe_https, reboot_and_verify_via_api, + set_lan_dhcp_range_via_api, }, modules::opnsense::firmware_upgrade::{FirmwareUpgradeMode, perform_firmware_upgrade}, modules::opnsense::lan_bridge::{LanBridgeParams, ensure_lan_bridge_step}, @@ -505,11 +506,37 @@ impl Interpret for OPNsenseBootstrapInterpret { } } - // ── Build the success Outcome (runbook-shaped details) ─────── + // ── Step 6: terminal reboot + verify ──────────────────────── + // The dance has touched firmware, the optional LAN bridge, the + // DHCP pool, and the LAN IP itself. A clean reboot guarantees + // the running kernel/config matches what was persisted. Hard + // fails if the firewall does not reappear at the expected + // address within the recovery window. let final_ip = match &self.score.target_lan { Some(rebind) => rebind.new_ip.to_string(), None => vanilla_ip.clone(), }; + info!( + "{tag} Step 6: rebooting and verifying https://{final_ip}:{} comes back ...", + self.score.target_api_port + ); + reboot_and_verify_via_api( + &final_ip, + self.score.target_api_port, + &key, + &secret, + &tag, + ) + .await + .map_err(|e| { + InterpretError::new(format!( + "Persisted credentials and applied all config changes, but the final \ + reboot/verify step failed: {e}. On-disk firewall state should be \ + correct — investigate and reboot manually if needed." + )) + })?; + + // ── Build the success Outcome (runbook-shaped details) ─────── let lan_line = match &self.score.target_lan { Some(rebind) => format!( " Final IP: {}/{} (LAN rebind applied)", @@ -530,6 +557,7 @@ impl Interpret for OPNsenseBootstrapInterpret { format!(" SSH: {}@{final_ip}", topology.default_username), " API creds: stored as OPNSenseApiCredentials in SecretManager".to_string(), " SSH creds: stored as OPNSenseFirewallCredentials in SecretManager".to_string(), + " Reboot: triggered and reachability verified at the final address".to_string(), ]; if self.score.target_lan.is_some() { details.push(String::new()); diff --git a/harmony/src/modules/opnsense/firmware_upgrade.rs b/harmony/src/modules/opnsense/firmware_upgrade.rs index c7e00ea1..de14eaf9 100644 --- a/harmony/src/modules/opnsense/firmware_upgrade.rs +++ b/harmony/src/modules/opnsense/firmware_upgrade.rs @@ -712,7 +712,7 @@ async fn wait_for_task_or_reboot( /// Wait for the firewall to go unreachable, come back, and settle. /// /// `firewall_ip` / `api_port` describe where the API should re-appear. -async fn wait_for_reboot_cycle( +pub async fn wait_for_reboot_cycle( firewall_ip: &str, api_port: u16, tag: &str, -- 2.39.5 From d0fe5802d03a7d05bc9a137e42a98687b8af8007 Mon Sep 17 00:00:00 2001 From: Sylvain Tremblay Date: Wed, 20 May 2026 10:23:51 -0400 Subject: [PATCH 34/38] feat(opnsense): pause for operator network reconnect between LAN rebind and reboot MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The LAN rebind step moves the firewall to a new subnet, severing the dev machine's connection. The terminal reboot step that immediately followed would then time out trying to POST `core/firmware/reboot` from a machine no longer on the firewall's subnet. Insert a blocking `inquire::Confirm` prompt between step 5 and step 6 (only when `target_lan` is `Some(_)`) that: - tells the operator the new firewall address and prefix, - prompts them to renew DHCP or set a static IP in the new subnet, - waits for explicit confirmation before continuing. Declining returns `InterpretError` so the bootstrap fails loudly in a clearly-recoverable mid-state — re-running after reconnection picks up at the reboot step. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../src/modules/opnsense/bootstrap_score.rs | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/harmony/src/modules/opnsense/bootstrap_score.rs b/harmony/src/modules/opnsense/bootstrap_score.rs index 4b460b4d..d1bbd45c 100644 --- a/harmony/src/modules/opnsense/bootstrap_score.rs +++ b/harmony/src/modules/opnsense/bootstrap_score.rs @@ -506,6 +506,50 @@ impl Interpret for OPNsenseBootstrapInterpret { } } + // ── Step 5.5: pause for operator network reconnect ────────── + // The LAN rebind above severed the dev machine's connection to + // the firewall. The terminal reboot below needs the firewall + // reachable from this process. Pause and ask the operator to + // reconnect into the new subnet before proceeding. + if let Some(rebind) = &self.score.target_lan { + let new_addr = format!( + "https://{}:{}", + rebind.new_ip, self.score.target_api_port + ); + println!(); + println!("───────────────────────────────────────────────────────────"); + println!(" LAN rebind applied. The firewall is now at {new_addr}."); + println!(" Your machine is no longer on its subnet."); + println!(); + println!(" → Reconnect to the new LAN now:"); + println!(" • renew DHCP, or"); + println!( + " • set a static address in {}/{}.", + rebind.new_ip, rebind.prefix + ); + println!(); + println!(" Once your machine can reach {new_addr}, confirm below"); + println!(" to trigger the final reboot + verify step."); + println!("───────────────────────────────────────────────────────────"); + + let proceed = inquire::Confirm::new("Continue with the reboot?") + .with_default(true) + .prompt() + .map_err(|e| { + InterpretError::new(format!( + "Failed to read confirmation prompt: {e}. Re-run the Score \ + to retry (the dance will resume at the reboot step)." + )) + })?; + if !proceed { + return Err(InterpretError::new(format!( + "Aborted by operator after LAN rebind. The firewall is at \ + {new_addr} but has not been rebooted yet. Re-run the Score \ + after reconnecting to the new LAN to finish the bootstrap." + ))); + } + } + // ── Step 6: terminal reboot + verify ──────────────────────── // The dance has touched firmware, the optional LAN bridge, the // DHCP pool, and the LAN IP itself. A clean reboot guarantees -- 2.39.5 From a9baa4f15d4d7f98f9106cd51dc6fbc905f9911e Mon Sep 17 00:00:00 2001 From: Sylvain Tremblay Date: Wed, 20 May 2026 10:44:01 -0400 Subject: [PATCH 35/38] perf(opnsense-config): bump SFTP upload chunk size to 256 KB The previous `FramedRead` loop in `upload_folder` defaulted to ~8 KB reads, each turning into its own SFTP WRITE round-trip. Replace it with an explicit 256 KB chunked read+write_all loop. Same correctness, fewer awaits, fewer protocol packets per file. Drops the unused `tokio_stream` and `tokio_util::codec` imports. Co-Authored-By: Claude Opus 4.7 (1M context) --- opnsense-config/src/config/shell/ssh.rs | 30 ++++++++++++------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/opnsense-config/src/config/shell/ssh.rs b/opnsense-config/src/config/shell/ssh.rs index 1f82c636..52120b2f 100644 --- a/opnsense-config/src/config/shell/ssh.rs +++ b/opnsense-config/src/config/shell/ssh.rs @@ -4,7 +4,6 @@ use std::{ sync::Arc, time::{SystemTime, UNIX_EPOCH}, }; -use tokio_stream::StreamExt; use async_trait::async_trait; use log::{debug, info, trace}; @@ -14,14 +13,19 @@ use russh::{ }; use russh_keys::key; use russh_sftp::client::SftpSession; -use tokio::io::AsyncWriteExt; +use tokio::io::{AsyncReadExt, AsyncWriteExt}; use crate::{config::SshCredentials, Error}; use super::OPNsenseShell; use tokio::fs::read_dir; use tokio::fs::File; -use tokio_util::codec::{BytesCodec, FramedRead}; + +/// Local read buffer for SFTP uploads. The old `FramedRead<_, BytesCodec>` +/// path defaulted to ~8 KB chunks; each chunk became its own SFTP WRITE +/// round-trip. 256 KB collapses that to a fraction of the awaits and lets +/// `write_all` amortize over multiple in-flight protocol packets. +const UPLOAD_CHUNK_SIZE: usize = 256 * 1024; #[derive(Debug)] pub struct SshOPNSenseShell { @@ -110,18 +114,14 @@ impl OPNsenseShell for SshOPNSenseShell { let mut remote_file = sftp.create(remote_path.as_str()).await?; debug!("Writing file {remote_path:?}"); - let local_file = File::open(&local_path).await?; - let mut reader = FramedRead::new(local_file, BytesCodec::new()); - - while let Some(result) = reader.next().await { - match result { - Ok(bytes) => { - if !bytes.is_empty() { - AsyncWriteExt::write_all(&mut remote_file, &bytes).await?; - } - } - Err(e) => todo!("Error unhandled {e}"), - }; + let mut local_file = File::open(&local_path).await?; + let mut buf = vec![0u8; UPLOAD_CHUNK_SIZE]; + loop { + let n = local_file.read(&mut buf).await?; + if n == 0 { + break; + } + AsyncWriteExt::write_all(&mut remote_file, &buf[..n]).await?; } } else if entry.file_type().await?.is_dir() { let sub_source = entry.path(); -- 2.39.5 From 28e6755d5fc4154da714989bb45772a7bd24405a Mon Sep 17 00:00:00 2001 From: Sylvain Tremblay Date: Thu, 21 May 2026 14:18:20 -0400 Subject: [PATCH 36/38] =?UTF-8?q?feat(okd):=20OKDReapplyDhcpBindingsScore?= =?UTF-8?q?=20=E2=80=94=20re-apply=20DHCP=20from=20inventory=20DB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Recovery Score for the case where the OPNsense firewall has been reinstalled but the harmony inventory database still holds the discovered physical hosts. Looks up DB hosts per role, zips them with HAClusterTopology slots, runs DhcpHostBindingScore to re-create the dnsmasq Host entries (DHCP reservation + A record) without doing network discovery, iPXE, or reboot work. Interactive via inquire multi-select (default) or explicit role list via for_roles() / all_roles(). Co-Authored-By: Claude Opus 4.7 (1M context) --- harmony/src/modules/okd/mod.rs | 2 + harmony/src/modules/okd/reapply_dhcp.rs | 210 ++++++++++++++++++++++++ 2 files changed, 212 insertions(+) create mode 100644 harmony/src/modules/okd/reapply_dhcp.rs diff --git a/harmony/src/modules/okd/mod.rs b/harmony/src/modules/okd/mod.rs index 6fd48e7a..fd52d50b 100644 --- a/harmony/src/modules/okd/mod.rs +++ b/harmony/src/modules/okd/mod.rs @@ -30,7 +30,9 @@ pub mod disable_dad_score; pub mod host_network; pub mod node_file_score; pub mod os_artifacts; +pub mod reapply_dhcp; pub mod system_reserved_score; pub use add_node::*; pub use os_artifacts::*; +pub use reapply_dhcp::*; diff --git a/harmony/src/modules/okd/reapply_dhcp.rs b/harmony/src/modules/okd/reapply_dhcp.rs new file mode 100644 index 00000000..269642ad --- /dev/null +++ b/harmony/src/modules/okd/reapply_dhcp.rs @@ -0,0 +1,210 @@ +//! Re-apply DHCP host bindings for already-discovered nodes. +//! +//! Recovery-time tool: when the firewall is reinstalled / re-bootstrapped +//! and the dnsmasq host table is empty, but the harmony inventory database +//! still has the discovered physical hosts, this Score re-writes the +//! DHCP/dnsmasq entries from the DB without doing any network discovery +//! or PXE/reboot work. +//! +//! Pick which roles to re-apply via: +//! - `OKDReapplyDhcpBindingsScore::interactive()` — prompts via inquire +//! - `OKDReapplyDhcpBindingsScore::for_roles(vec![...])` — explicit set +//! - `OKDReapplyDhcpBindingsScore::all_roles()` — bootstrap + CP + worker +//! +//! Logic per role: +//! 1. Look up persisted hosts for the role in the inventory DB. +//! 2. Zip them with the matching `HAClusterTopology` slot(s) +//! (`bootstrap_host`, `control_plane`, `workers`). +//! 3. Run `DhcpHostBindingScore` to write `add_static_mapping` entries +//! via the topology's `DhcpServer` (OPNsense → dnsmasq). +//! +//! Skips roles with no DB hosts. Errors when DB count and topology slot +//! count diverge for a role the user explicitly asked for. + +use async_trait::async_trait; +use harmony_types::id::Id; +use log::{info, warn}; +use serde::Serialize; + +use crate::{ + data::Version, + hardware::PhysicalHost, + infra::inventory::InventoryRepositoryFactory, + interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome}, + inventory::{HostRole, Inventory}, + modules::dhcp::DhcpHostBindingScore, + score::Score, + topology::{HAClusterTopology, HostBinding, HostConfig, LogicalHost}, +}; + +#[derive(Debug, Clone, Serialize)] +pub struct OKDReapplyDhcpBindingsScore { + /// Which roles to re-apply DHCP bindings for. `None` triggers an + /// interactive multi-select prompt at execute time. + pub roles: Option>, +} + +impl OKDReapplyDhcpBindingsScore { + pub fn interactive() -> Self { + Self { roles: None } + } + + pub fn for_roles(roles: Vec) -> Self { + Self { roles: Some(roles) } + } + + pub fn all_roles() -> Self { + Self { + roles: Some(vec![ + HostRole::Bootstrap, + HostRole::ControlPlane, + HostRole::Worker, + ]), + } + } +} + +impl Score for OKDReapplyDhcpBindingsScore { + fn create_interpret(&self) -> Box> { + Box::new(OKDReapplyDhcpBindingsInterpret { + score: self.clone(), + }) + } + + fn name(&self) -> String { + "OKDReapplyDhcpBindingsScore".to_string() + } +} + +#[derive(Debug)] +struct OKDReapplyDhcpBindingsInterpret { + score: OKDReapplyDhcpBindingsScore, +} + +#[async_trait] +impl Interpret for OKDReapplyDhcpBindingsInterpret { + async fn execute( + &self, + inventory: &Inventory, + topology: &HAClusterTopology, + ) -> Result { + let roles = match &self.score.roles { + Some(r) => r.clone(), + None => prompt_roles()?, + }; + + if roles.is_empty() { + return Ok(Outcome::success("No roles selected; nothing to do".into())); + } + + let repo = InventoryRepositoryFactory::build().await?; + let mut details: Vec = Vec::new(); + + for role in roles { + let hosts = repo.get_hosts_for_role(&role).await?; + let logical = role_logical_hosts(&role, topology); + + if hosts.is_empty() { + warn!("[{role}] no hosts in inventory DB, skipping"); + details.push(format!("[{role}] skipped (no DB hosts)")); + continue; + } + if logical.is_empty() { + warn!("[{role}] topology has no slot for this role, skipping"); + details.push(format!("[{role}] skipped (no topology slot)")); + continue; + } + if logical.len() != hosts.len() { + return Err(InterpretError::new(format!( + "[{role}] topology defines {} logical host(s) but inventory DB has {} \ + physical — refusing to re-apply with a mismatched count", + logical.len(), + hosts.len() + ))); + } + + let bindings = build_bindings(&hosts, &logical); + info!( + "[{role}] re-applying {} DHCP binding(s) from inventory DB", + bindings.len() + ); + + DhcpHostBindingScore { + host_binding: bindings, + domain: Some(topology.domain_name.clone()), + } + .create_interpret() + .execute(inventory, topology) + .await?; + + details.push(format!("[{role}] re-applied {} binding(s)", hosts.len())); + } + + Ok(Outcome::success_with_details( + "DHCP bindings re-applied from inventory database".to_string(), + details, + )) + } + + fn get_name(&self) -> InterpretName { + InterpretName::Custom("OKDReapplyDhcpBindings".into()) + } + + fn get_version(&self) -> Version { + todo!() + } + + fn get_status(&self) -> InterpretStatus { + todo!() + } + + fn get_children(&self) -> Vec { + todo!() + } +} + +fn role_logical_hosts(role: &HostRole, t: &HAClusterTopology) -> Vec { + match role { + HostRole::Bootstrap => vec![t.bootstrap_host.clone()], + HostRole::ControlPlane => t.control_plane.clone(), + HostRole::Worker => t.workers.clone(), + } +} + +fn build_bindings( + nodes: &[(PhysicalHost, HostConfig)], + hosts: &[LogicalHost], +) -> Vec { + hosts + .iter() + .zip(nodes.iter()) + .map(|(logical, (physical, host_config))| HostBinding { + logical_host: logical.clone(), + physical_host: physical.clone(), + host_config: host_config.clone(), + }) + .collect() +} + +fn prompt_roles() -> Result, InterpretError> { + let options = vec![ + HostRole::Bootstrap, + HostRole::ControlPlane, + HostRole::Worker, + ]; + let labels: Vec = options.iter().map(|r| r.to_string()).collect(); + + let chosen = inquire::MultiSelect::new( + "Which host roles should have their DHCP bindings re-applied from the inventory DB?", + labels.clone(), + ) + .prompt() + .map_err(|e| InterpretError::new(format!("interactive role prompt failed: {e}")))?; + + Ok(options + .into_iter() + .zip(labels) + .filter(|(_, label)| chosen.contains(label)) + .map(|(role, _)| role) + .collect()) +} -- 2.39.5 From a283f9238818ff63421c180da7aceb8b5a4d9a27 Mon Sep 17 00:00:00 2001 From: Sylvain Tremblay Date: Thu, 21 May 2026 14:47:55 -0400 Subject: [PATCH 37/38] feat(okd): extend recovery Score to also re-create byMAC iPXE files Renames OKDReapplyDhcpBindingsScore to OKDReapplyFromInventoryScore (file: reapply_from_inventory.rs) to reflect the broader scope. Per selected role, the Score now: 1. Re-writes dnsmasq Host entries via DhcpHostBindingScore (existing behavior). 2. Re-creates byMAC/01-.ipxe boot files via IPxeMacBootFileScore, rendering BootstrapIpxeTpl with the role-appropriate ignition file (bootstrap.ign / master.ign / worker.ign). Pulls installation_device + MAC from each host's PhysicalHost + HostConfig row in the inventory DB; errors loudly if either is missing rather than silently producing a half-written byMAC tree. Same constructors (interactive / for_roles / all_roles) and same inquire multi-select UX, with the prompt wording broadened from "DHCP bindings" to "firewall config". Co-Authored-By: Claude Opus 4.7 (1M context) --- harmony/src/modules/okd/mod.rs | 4 +- ...pply_dhcp.rs => reapply_from_inventory.rs} | 128 +++++++++++++----- 2 files changed, 95 insertions(+), 37 deletions(-) rename harmony/src/modules/okd/{reapply_dhcp.rs => reapply_from_inventory.rs} (52%) diff --git a/harmony/src/modules/okd/mod.rs b/harmony/src/modules/okd/mod.rs index fd52d50b..6a5ecc2f 100644 --- a/harmony/src/modules/okd/mod.rs +++ b/harmony/src/modules/okd/mod.rs @@ -30,9 +30,9 @@ pub mod disable_dad_score; pub mod host_network; pub mod node_file_score; pub mod os_artifacts; -pub mod reapply_dhcp; +pub mod reapply_from_inventory; pub mod system_reserved_score; pub use add_node::*; pub use os_artifacts::*; -pub use reapply_dhcp::*; +pub use reapply_from_inventory::*; diff --git a/harmony/src/modules/okd/reapply_dhcp.rs b/harmony/src/modules/okd/reapply_from_inventory.rs similarity index 52% rename from harmony/src/modules/okd/reapply_dhcp.rs rename to harmony/src/modules/okd/reapply_from_inventory.rs index 269642ad..42a2ab57 100644 --- a/harmony/src/modules/okd/reapply_dhcp.rs +++ b/harmony/src/modules/okd/reapply_from_inventory.rs @@ -1,25 +1,27 @@ -//! Re-apply DHCP host bindings for already-discovered nodes. +//! Re-apply firewall config for already-discovered nodes. //! -//! Recovery-time tool: when the firewall is reinstalled / re-bootstrapped -//! and the dnsmasq host table is empty, but the harmony inventory database -//! still has the discovered physical hosts, this Score re-writes the -//! DHCP/dnsmasq entries from the DB without doing any network discovery -//! or PXE/reboot work. +//! Recovery tool: when the OPNsense firewall has been reinstalled but the +//! harmony inventory database still has the discovered physical hosts, +//! this Score re-creates the bits that live on the firewall — without +//! running discovery, prompting for reboot, or otherwise touching the +//! installed cluster. +//! +//! What it (re-)writes per selected role: +//! 1. dnsmasq Host entries (DHCP reservation + A record), via +//! `DhcpHostBindingScore` → `DhcpConfigDnsMasq::add_static_mapping`. +//! 2. Per-MAC iPXE boot files (`byMAC/01-.ipxe`) served over +//! HTTP, via `IPxeMacBootFileScore`. Uses the same `BootstrapIpxeTpl` +//! stages 02/03/04 use, parameterized by the role's ignition file +//! (`bootstrap.ign` / `master.ign` / `worker.ign`). //! //! Pick which roles to re-apply via: -//! - `OKDReapplyDhcpBindingsScore::interactive()` — prompts via inquire -//! - `OKDReapplyDhcpBindingsScore::for_roles(vec![...])` — explicit set -//! - `OKDReapplyDhcpBindingsScore::all_roles()` — bootstrap + CP + worker -//! -//! Logic per role: -//! 1. Look up persisted hosts for the role in the inventory DB. -//! 2. Zip them with the matching `HAClusterTopology` slot(s) -//! (`bootstrap_host`, `control_plane`, `workers`). -//! 3. Run `DhcpHostBindingScore` to write `add_static_mapping` entries -//! via the topology's `DhcpServer` (OPNsense → dnsmasq). +//! - `OKDReapplyFromInventoryScore::interactive()` — prompts via inquire +//! - `OKDReapplyFromInventoryScore::for_roles(vec![...])` — explicit set +//! - `OKDReapplyFromInventoryScore::all_roles()` — bootstrap + CP + worker //! //! Skips roles with no DB hosts. Errors when DB count and topology slot -//! count diverge for a role the user explicitly asked for. +//! count diverge for a role the user explicitly asked for, or when a +//! host has no installation_device / MAC recorded in the DB. use async_trait::async_trait; use harmony_types::id::Id; @@ -32,19 +34,22 @@ use crate::{ infra::inventory::InventoryRepositoryFactory, interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome}, inventory::{HostRole, Inventory}, - modules::dhcp::DhcpHostBindingScore, + modules::{ + dhcp::DhcpHostBindingScore, http::IPxeMacBootFileScore, + okd::templates::BootstrapIpxeTpl, + }, score::Score, topology::{HAClusterTopology, HostBinding, HostConfig, LogicalHost}, }; #[derive(Debug, Clone, Serialize)] -pub struct OKDReapplyDhcpBindingsScore { - /// Which roles to re-apply DHCP bindings for. `None` triggers an - /// interactive multi-select prompt at execute time. +pub struct OKDReapplyFromInventoryScore { + /// Which roles to re-apply for. `None` triggers an interactive + /// multi-select prompt at execute time. pub roles: Option>, } -impl OKDReapplyDhcpBindingsScore { +impl OKDReapplyFromInventoryScore { pub fn interactive() -> Self { Self { roles: None } } @@ -64,25 +69,25 @@ impl OKDReapplyDhcpBindingsScore { } } -impl Score for OKDReapplyDhcpBindingsScore { +impl Score for OKDReapplyFromInventoryScore { fn create_interpret(&self) -> Box> { - Box::new(OKDReapplyDhcpBindingsInterpret { + Box::new(OKDReapplyFromInventoryInterpret { score: self.clone(), }) } fn name(&self) -> String { - "OKDReapplyDhcpBindingsScore".to_string() + "OKDReapplyFromInventoryScore".to_string() } } #[derive(Debug)] -struct OKDReapplyDhcpBindingsInterpret { - score: OKDReapplyDhcpBindingsScore, +struct OKDReapplyFromInventoryInterpret { + score: OKDReapplyFromInventoryScore, } #[async_trait] -impl Interpret for OKDReapplyDhcpBindingsInterpret { +impl Interpret for OKDReapplyFromInventoryInterpret { async fn execute( &self, inventory: &Inventory, @@ -99,6 +104,7 @@ impl Interpret for OKDReapplyDhcpBindingsInterpret { let repo = InventoryRepositoryFactory::build().await?; let mut details: Vec = Vec::new(); + let http_ip = topology.http_server.get_ip().to_string(); for role in roles { let hosts = repo.get_hosts_for_role(&role).await?; @@ -123,31 +129,75 @@ impl Interpret for OKDReapplyDhcpBindingsInterpret { ))); } + // 1. DHCP / dnsmasq host entries (DHCP reservation + A record). let bindings = build_bindings(&hosts, &logical); info!( "[{role}] re-applying {} DHCP binding(s) from inventory DB", bindings.len() ); - DhcpHostBindingScore { host_binding: bindings, domain: Some(topology.domain_name.clone()), } - .create_interpret() - .execute(inventory, topology) + .interpret(inventory, topology) .await?; - details.push(format!("[{role}] re-applied {} binding(s)", hosts.len())); + // 2. Per-MAC iPXE boot files (byMAC/01-.ipxe over HTTP). + let ignition_file_name = role_ignition_file(&role); + for (physical, host_config) in &hosts { + let installation_device = + host_config.installation_device.as_deref().ok_or_else(|| { + InterpretError::new(format!( + "[{role}] host {} has no installation_device in DB; \ + cannot render iPXE template", + physical.summary() + )) + })?; + + let content = BootstrapIpxeTpl { + http_ip: &http_ip, + scos_path: "scos", + ignition_http_path: "okd_ignition_files", + installation_device, + ignition_file_name, + } + .to_string(); + + let mac_address = physical.get_mac_address(); + if mac_address.is_empty() { + return Err(InterpretError::new(format!( + "[{role}] host {} has no MAC in DB; cannot write byMAC file", + physical.summary() + ))); + } + + IPxeMacBootFileScore { + mac_address, + content, + } + .interpret(inventory, topology) + .await?; + } + info!( + "[{role}] re-applied {} byMAC iPXE file(s) from inventory DB", + hosts.len() + ); + + details.push(format!( + "[{role}] re-applied {} DHCP binding(s) + {} byMAC iPXE file(s)", + hosts.len(), + hosts.len() + )); } Ok(Outcome::success_with_details( - "DHCP bindings re-applied from inventory database".to_string(), + "Firewall config re-applied from inventory database".to_string(), details, )) } fn get_name(&self) -> InterpretName { - InterpretName::Custom("OKDReapplyDhcpBindings".into()) + InterpretName::Custom("OKDReapplyFromInventory".into()) } fn get_version(&self) -> Version { @@ -171,6 +221,14 @@ fn role_logical_hosts(role: &HostRole, t: &HAClusterTopology) -> Vec &'static str { + match role { + HostRole::Bootstrap => "bootstrap.ign", + HostRole::ControlPlane => "master.ign", + HostRole::Worker => "worker.ign", + } +} + fn build_bindings( nodes: &[(PhysicalHost, HostConfig)], hosts: &[LogicalHost], @@ -195,7 +253,7 @@ fn prompt_roles() -> Result, InterpretError> { let labels: Vec = options.iter().map(|r| r.to_string()).collect(); let chosen = inquire::MultiSelect::new( - "Which host roles should have their DHCP bindings re-applied from the inventory DB?", + "Which host roles should have their firewall config re-applied from the inventory DB?", labels.clone(), ) .prompt() -- 2.39.5 From f2ab47de6e221106ed3dcff7f609a2e0baca9aef Mon Sep 17 00:00:00 2001 From: Sylvain Tremblay Date: Thu, 21 May 2026 15:03:10 -0400 Subject: [PATCH 38/38] fix(opnsense-config): preserve symlinks in upload_folder MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `tokio::fs::DirEntry::file_type()` does not follow symlinks, so the existing is_file()/is_dir() branches both returned false for them and the loop silently skipped every symlink. Caller-visible consequence: uploading ./data/okd/installer_image/ (three versioned SCOS files + three stable-name symlinks pointing at them) ended up with only the versioned files on the firewall under /usr/local/http/scos/. The byMAC iPXE files chainload via the stable names (scos-live-kernel.x86_64 etc.), so PXE boots dangled on a 404 until an operator created the symlinks by hand. Adds a third is_symlink() branch that reads the link target with tokio::fs::read_link and recreates it remotely via `ln -sfn` over SSH. `ln` rather than SFTP SSH_FXP_SYMLINK because OpenSSH's server inverts the (path, target) argument order versus the spec — `ln` is unambiguous across shells (including the firewall's tcsh). Co-Authored-By: Claude Opus 4.7 (1M context) --- opnsense-config/src/config/shell/ssh.rs | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/opnsense-config/src/config/shell/ssh.rs b/opnsense-config/src/config/shell/ssh.rs index 52120b2f..3414256f 100644 --- a/opnsense-config/src/config/shell/ssh.rs +++ b/opnsense-config/src/config/shell/ssh.rs @@ -129,6 +129,28 @@ impl OPNsenseShell for SshOPNSenseShell { format!("{}/{}", destination, entry.file_name().to_string_lossy()); self.upload_folder(sub_source.to_str().unwrap(), &sub_destination) .await?; + } else if entry.file_type().await?.is_symlink() { + // SFTP `create()` would dereference + copy the target, losing + // the link semantics; we instead recreate the symlink on the + // remote. Use `ln -sfn` over SSH rather than the SFTP + // SSH_FXP_SYMLINK opcode — its (path, target) argument order + // is inverted between OpenSSH server and the protocol spec, + // and `ln` has unambiguous semantics across shells. + let local_path = entry.path(); + let target = tokio::fs::read_link(&local_path).await?; + let target_str = target.to_string_lossy().to_string(); + let file_name = local_path + .file_name() + .expect("symlink entry must have a name") + .to_string_lossy(); + let remote_path = format!("{}/{}", destination, file_name); + info!("Creating remote symlink {remote_path} -> {target_str}"); + let cmd = format!( + "ln -sfn '{}' '{}'", + target_str.replace('\'', r"'\''"), + remote_path.replace('\'', r"'\''"), + ); + self.run_command(&cmd).await?; } } -- 2.39.5