wip: OKD Installation full process automation underway, ready to test bootstrapping very soon
Some checks failed
Run Check Script / check (pull_request) Failing after 30s

This commit is contained in:
Jean-Gabriel Gill-Couture 2025-09-01 23:20:38 -04:00
parent f076d36297
commit 35a459f63c
19 changed files with 1922 additions and 147 deletions

1
Cargo.lock generated
View File

@ -2285,6 +2285,7 @@ dependencies = [
"helm-wrapper-rs",
"hex",
"http 1.3.1",
"inquire",
"k3d-rs",
"k8s-openapi",
"kube",

View File

@ -2,7 +2,7 @@ use harmony::{
inventory::Inventory,
modules::{
dummy::{ErrorScore, PanicScore, SuccessScore},
inventory::DiscoverInventoryAgentScore,
inventory::LaunchDiscoverInventoryAgentScore,
},
topology::LocalhostTopology,
};
@ -16,7 +16,7 @@ async fn main() {
Box::new(SuccessScore {}),
Box::new(ErrorScore {}),
Box::new(PanicScore {}),
Box::new(DiscoverInventoryAgentScore {
Box::new(LaunchDiscoverInventoryAgentScore {
discovery_timeout: Some(10),
}),
],

View File

@ -5,12 +5,11 @@ use std::{
use cidr::Ipv4Cidr;
use harmony::{
hardware::{FirewallGroup, HostCategory, Location, PhysicalHost, SwitchGroup},
hardware::{Location, PhysicalHost, SwitchGroup},
infra::opnsense::OPNSenseManagementInterface,
inventory::Inventory,
modules::{
http::StaticFilesHttpScore,
ipxe::IpxeScore,
okd::{
bootstrap_dhcp::OKDBootstrapDhcpScore,
bootstrap_load_balancer::OKDBootstrapLoadBalancerScore, dhcp::OKDDhcpScore,

View File

@ -70,6 +70,7 @@ harmony_inventory_agent = { path = "../harmony_inventory_agent" }
harmony_secret_derive = { version = "0.1.0", path = "../harmony_secret_derive" }
askama.workspace = true
sqlx.workspace = true
inquire.workspace = true
[dev-dependencies]
pretty_assertions.workspace = true

View File

@ -61,3 +61,10 @@ impl Inventory {
}
}
}
pub enum HostRole {
Bootstrap,
ControlPlane,
Worker,
Storage,
}

View File

@ -1,6 +1,6 @@
use async_trait::async_trait;
use crate::hardware::PhysicalHost;
use crate::{hardware::PhysicalHost, interpret::InterpretError, inventory::HostRole};
/// Errors that can occur within the repository layer.
#[derive(thiserror::Error, Debug)]
@ -15,6 +15,12 @@ pub enum RepoError {
ConnectionFailed(String),
}
impl From<RepoError> for InterpretError {
fn from(value: RepoError) -> Self {
InterpretError::new(format!("Interpret error : {value}"))
}
}
// --- Trait and Implementation ---
/// Defines the contract for inventory persistence.
@ -22,4 +28,10 @@ pub enum RepoError {
pub trait InventoryRepository: Send + Sync + 'static {
async fn save(&self, host: &PhysicalHost) -> Result<(), RepoError>;
async fn get_latest_by_id(&self, host_id: &str) -> Result<Option<PhysicalHost>, RepoError>;
async fn get_all_hosts(&self) -> Result<Vec<PhysicalHost>, RepoError>;
async fn save_role_mapping(
&self,
role: &HostRole,
host: &PhysicalHost,
) -> Result<(), RepoError>;
}

View File

@ -161,6 +161,14 @@ impl DhcpServer for HAClusterTopology {
self.dhcp_server.set_pxe_options(options).await
}
async fn set_dhcp_range(
&self,
start: &IpAddress,
end: &IpAddress,
) -> Result<(), ExecutorError> {
self.dhcp_server.set_dhcp_range(start, end).await
}
fn get_ip(&self) -> IpAddress {
self.dhcp_server.get_ip()
}
@ -298,6 +306,13 @@ impl DhcpServer for DummyInfra {
async fn set_pxe_options(&self, _options: PxeOptions) -> Result<(), ExecutorError> {
unimplemented!("{}", UNIMPLEMENTED_DUMMY_INFRA)
}
async fn set_dhcp_range(
&self,
start: &IpAddress,
end: &IpAddress,
) -> Result<(), ExecutorError> {
unimplemented!("{}", UNIMPLEMENTED_DUMMY_INFRA)
}
fn get_ip(&self) -> IpAddress {
unimplemented!("{}", UNIMPLEMENTED_DUMMY_INFRA)
}

View File

@ -59,6 +59,8 @@ pub trait DhcpServer: Send + Sync + std::fmt::Debug {
async fn remove_static_mapping(&self, mac: &MacAddress) -> Result<(), ExecutorError>;
async fn list_static_mappings(&self) -> Vec<(MacAddress, IpAddress)>;
async fn set_pxe_options(&self, pxe_options: PxeOptions) -> Result<(), ExecutorError>;
async fn set_dhcp_range(&self, start: &IpAddress, end: &IpAddress)
-> Result<(), ExecutorError>;
fn get_ip(&self) -> IpAddress;
fn get_host(&self) -> LogicalHost;
async fn commit_config(&self) -> Result<(), ExecutorError>;

View File

@ -68,4 +68,19 @@ impl DhcpServer for OPNSenseFirewall {
ExecutorError::UnexpectedError(format!("Failed to set_pxe_options : {dhcp_error}"))
})
}
async fn set_dhcp_range(
&self,
start: &IpAddress,
end: &IpAddress,
) -> Result<(), ExecutorError> {
let mut writable_opnsense = self.opnsense_config.write().await;
writable_opnsense
.dhcp()
.set_dhcp_range(&start.to_string(), &end.to_string())
.await
.map_err(|dhcp_error| {
ExecutorError::UnexpectedError(format!("Failed to set_dhcp_range : {dhcp_error}"))
})
}
}

View File

@ -22,6 +22,7 @@ pub struct DhcpScore {
pub filename: Option<String>,
pub filename64: Option<String>,
pub filenameipxe: Option<String>,
pub dhcp_range: (IpAddress, IpAddress),
}
impl<T: Topology + DhcpServer> Score<T> for DhcpScore {
@ -58,9 +59,6 @@ impl DhcpInterpret {
_inventory: &Inventory,
dhcp_server: &D,
) -> Result<Outcome, InterpretError> {
todo!(
"I don't think this set_pxe_options function still works since the major dnsmasq refactoring. It certainly is not far off, but we use the dedicated okd ipxe score now. They should work together, this needs refactoring."
);
let pxe_options = PxeOptions {
ipxe_filename: self.score.filenameipxe.clone().unwrap_or_default(),
bios_filename: self.score.filename.clone().unwrap_or_default(),
@ -110,6 +108,9 @@ impl<T: Topology + DhcpServer> Interpret<T> for DhcpInterpret {
info!("Executing DhcpInterpret on inventory {inventory:?}");
self.set_pxe_options(inventory, topology).await?;
topology
.set_dhcp_range(&self.score.dhcp_range.0, &self.score.dhcp_range.1)
.await?;
DhcpHostBindingScore {
host_binding: self.score.host_binding.clone(),
@ -146,7 +147,7 @@ impl<T: Topology + DhcpServer> Score<T> for DhcpHostBindingScore {
// https://docs.opnsense.org/manual/dhcp.html#advanced-settings
#[derive(Debug, Clone)]
pub struct DhcpHostBindingInterpret {
score: DhcpScore,
score: DhcpHostBindingScore,
}
impl DhcpHostBindingInterpret {

View File

@ -18,11 +18,11 @@ use harmony_types::id::Id;
/// This will allow us to register/update hosts running harmony_inventory_agent
/// from LAN in the Harmony inventory
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DiscoverInventoryAgentScore {
pub struct LaunchDiscoverInventoryAgentScore {
pub discovery_timeout: Option<u64>,
}
impl<T: Topology> Score<T> for DiscoverInventoryAgentScore {
impl<T: Topology> Score<T> for LaunchDiscoverInventoryAgentScore {
fn name(&self) -> String {
"DiscoverInventoryAgentScore".to_string()
}
@ -36,7 +36,7 @@ impl<T: Topology> Score<T> for DiscoverInventoryAgentScore {
#[derive(Debug)]
struct DiscoverInventoryAgentInterpret {
score: DiscoverInventoryAgentScore,
score: LaunchDiscoverInventoryAgentScore,
}
#[async_trait]
@ -46,6 +46,13 @@ impl<T: Topology> Interpret<T> for DiscoverInventoryAgentInterpret {
_inventory: &Inventory,
_topology: &T,
) -> Result<Outcome, InterpretError> {
match self.score.discovery_timeout {
Some(timeout) => info!("Discovery agent will wait for {timeout} seconds"),
None => info!(
"Discovery agent will wait forever in the background, go on and enjoy this delicious inventory."
),
};
harmony_inventory_agent::local_presence::discover_agents(
self.score.discovery_timeout,
|event: DiscoveryEvent| -> Result<(), String> {

View File

@ -37,21 +37,23 @@ impl OKDBootstrapDhcpScore {
.clone(),
});
// TODO refactor this so it is not copy pasted from dhcp.rs
Self {
dhcp_score: DhcpScore::new(
host_binding,
// TODO : we should add a tftp server to the topology instead of relying on the
// router address, this is leaking implementation details
Some(topology.router.get_gateway()),
None, // To allow UEFI boot we cannot provide a legacy file
Some("undionly.kpxe".to_string()),
Some("ipxe.efi".to_string()),
Some(format!(
"http://{}:8080/boot.ipxe",
topology.router.get_gateway()
)),
),
}
todo!("Add dhcp range")
// Self {
// dhcp_score: DhcpScore::new(
// host_binding,
// // TODO : we should add a tftp server to the topology instead of relying on the
// // router address, this is leaking implementation details
// Some(topology.router.get_gateway()),
// None, // To allow UEFI boot we cannot provide a legacy file
// Some("undionly.kpxe".to_string()),
// Some("ipxe.efi".to_string()),
// Some(format!(
// "http://{}:8080/boot.ipxe",
// topology.router.get_gateway()
// )),
// (self.),
// ),
// }
}
}

View File

@ -1,3 +1,6 @@
use std::net::Ipv4Addr;
use harmony_types::net::IpAddress;
use serde::Serialize;
use crate::{
@ -44,6 +47,16 @@ impl OKDDhcpScore {
})
});
let dhcp_server_ip = match topology.dhcp_server.get_ip() {
std::net::IpAddr::V4(ipv4_addr) => ipv4_addr,
std::net::IpAddr::V6(_ipv6_addr) => todo!("Support ipv6 someday"),
};
// TODO this could overflow, we should use proper subnet maths here instead of an ip
// address and guessing the subnet size from there
let start = Ipv4Addr::from(u32::from(dhcp_server_ip) + 100);
let end = Ipv4Addr::from(u32::from(dhcp_server_ip) + 150);
Self {
// TODO : we should add a tftp server to the topology instead of relying on the
// router address, this is leaking implementation details
@ -57,6 +70,7 @@ impl OKDDhcpScore {
"http://{}:8080/boot.ipxe",
topology.router.get_gateway()
)),
dhcp_range: (IpAddress::from(start), IpAddress::from(end)),
},
}
}

View File

@ -49,18 +49,23 @@
//! - internal_domain: Internal cluster domain (e.g., cluster.local or harmony.mcd).
use async_trait::async_trait;
use chrono::Duration;
use derive_new::new;
use harmony_types::id::Id;
use log::{error, info};
use log::{error, info, warn};
use serde::{Deserialize, Serialize};
use crate::{
data::Version,
hardware::PhysicalHost,
infra::inventory::InventoryRepositoryFactory,
instrumentation::{HarmonyEvent, instrument},
interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome},
inventory::Inventory,
modules::{dhcp::DhcpHostBindingScore, http::IPxeMacBootFileScore},
inventory::{HostRole, Inventory},
modules::{
dhcp::DhcpHostBindingScore, http::IPxeMacBootFileScore,
inventory::LaunchDiscoverInventoryAgentScore,
},
score::Score,
topology::{HAClusterTopology, HostBinding},
};
@ -116,12 +121,6 @@ impl OKDInstallationInterpret {
topology: &HAClusterTopology,
) -> Result<(), InterpretError> {
// 1) Prepare DNS and DHCP lease registration (optional)
let dns_score = OKDSetup01InventoryDnsScore::new(
self.score.internal_domain.clone(),
self.score.public_domain.clone(),
Some(true), // register_dhcp_leases
);
dns_score.interpret(inventory, topology).await?;
// 2) Serve default iPXE + Kickstart and poll discovery
let discovery_score = OKDSetup01InventoryScore::new(self.score.lan_cidr.clone());
@ -239,79 +238,6 @@ impl Interpret<HAClusterTopology> for OKDInstallationInterpret {
}
}
// -------------------------------------------------------------------------------------------------
// Step 01: Inventory DNS setup
// - Keep DHCP simple; optionally register dynamic leases into DNS.
// - Ensure base records for internal/public domains (api/api-int/apps wildcard).
// -------------------------------------------------------------------------------------------------
#[derive(Debug, Clone, Serialize, new)]
struct OKDSetup01InventoryDnsScore {
internal_domain: String,
public_domain: String,
register_dhcp_leases: Option<bool>,
}
impl Score<HAClusterTopology> for OKDSetup01InventoryDnsScore {
fn create_interpret(&self) -> Box<dyn Interpret<HAClusterTopology>> {
Box::new(OKDSetup01InventoryDnsInterpret::new(self.clone()))
}
fn name(&self) -> String {
"OKDSetup01InventoryDnsScore".to_string()
}
}
#[derive(Debug, Clone)]
struct OKDSetup01InventoryDnsInterpret {
score: OKDSetup01InventoryDnsScore,
version: Version,
status: InterpretStatus,
}
impl OKDSetup01InventoryDnsInterpret {
pub fn new(score: OKDSetup01InventoryDnsScore) -> Self {
let version = Version::from("1.0.0").unwrap();
Self {
version,
score,
status: InterpretStatus::QUEUED,
}
}
}
#[async_trait]
impl Interpret<HAClusterTopology> for OKDSetup01InventoryDnsInterpret {
fn get_name(&self) -> InterpretName {
InterpretName::Custom("OKDSetup01InventoryDns")
}
fn get_version(&self) -> Version {
self.version.clone()
}
fn get_status(&self) -> InterpretStatus {
self.status.clone()
}
fn get_children(&self) -> Vec<Id> {
vec![]
}
async fn execute(
&self,
_inventory: &Inventory,
topology: &HAClusterTopology,
) -> Result<Outcome, InterpretError> {
info!("Ensuring base DNS and DHCP lease registration for discovery phase");
error!("TODO setup ipxe score here and launch inventory agent");
Ok(Outcome::new(
InterpretStatus::SUCCESS,
"Inventory DNS prepared".into(),
))
}
}
// -------------------------------------------------------------------------------------------------
// Step 01: Inventory (default PXE + Kickstart in RAM + Rust agent)
// - This score exposes/ensures the default inventory assets and waits for discoveries.
@ -319,9 +245,7 @@ impl Interpret<HAClusterTopology> for OKDSetup01InventoryDnsInterpret {
// -------------------------------------------------------------------------------------------------
#[derive(Debug, Clone, Serialize, new)]
struct OKDSetup01InventoryScore {
lan_cidr: String,
}
struct OKDSetup01InventoryScore {}
impl Score<HAClusterTopology> for OKDSetup01InventoryScore {
fn create_interpret(&self) -> Box<dyn Interpret<HAClusterTopology>> {
@ -349,31 +273,6 @@ impl OKDSetup01InventoryInterpret {
status: InterpretStatus::QUEUED,
}
}
async fn ensure_inventory_assets(
&self,
topology: &HAClusterTopology,
) -> Result<(), InterpretError> {
// Placeholder: push or verify iPXE default, Kickstart, and Rust inventory agent are hosted.
// Real implementation: publish to the PXE/HTTP server via the topology.
info!(
"[Inventory] Ensuring default iPXE, Kickstart, and inventory agent are available for LAN {}",
self.score.lan_cidr
);
// topology.publish_http_asset(…) ?
Ok(())
}
async fn discover_nodes(&self) -> Result<usize, InterpretError> {
// Placeholder: implement Harmony discovery logic (scan/pull/push mode).
// Returns number of newly discovered nodes.
info!(
"[Inventory] Scanning for inventory agents in {}",
self.score.lan_cidr
);
// In practice, this would query harmony_composer or a local registry store.
Ok(3)
}
}
#[async_trait]
@ -396,16 +295,99 @@ impl Interpret<HAClusterTopology> for OKDSetup01InventoryInterpret {
async fn execute(
&self,
_inventory: &Inventory,
inventory: &Inventory,
topology: &HAClusterTopology,
) -> Result<Outcome, InterpretError> {
self.ensure_inventory_assets(topology).await?;
let count = self.discover_nodes().await?;
info!("[Inventory] Discovered {count} nodes");
info!(
"Launching discovery agent, make sure that your nodes are successfully PXE booted and running inventory agent. They should answer on `http://<node_ip>:8080/inventory`"
);
LaunchDiscoverInventoryAgentScore {
discovery_timeout: Some(60),
}
.interpret(inventory, topology)
.await?;
let bootstrap_host: PhysicalHost;
let host_repo = InventoryRepositoryFactory::build().await?;
loop {
let all_hosts = host_repo.get_all_hosts().await?;
if all_hosts.is_empty() {
warn!("No discovered hosts found yet. Waiting for hosts to appear...");
// Sleep to avoid spamming the user and logs while waiting for nodes.
tokio::time::sleep(std::time::Duration::from_secs(5));
continue;
}
let ans = inquire::Select::new(
"Select the node to be used as the bootstrap node:",
all_hosts,
)
.with_help_message("Press Esc to refresh the list of discovered hosts")
.prompt();
match ans {
Ok(choice) => {
info!("Selected {} as the bootstrap node.", choice.summary());
host_repo
.save_role_mapping(&HostRole::Bootstrap, &choice)
.await?;
bootstrap_host = choice;
break;
}
Err(inquire::InquireError::OperationCanceled) => {
info!("Refresh requested. Fetching list of discovered hosts again...");
continue;
}
Err(e) => {
error!("Failed to select bootstrap node: {}", e);
return Err(InterpretError::new(format!(
"Could not select host : {}",
e.to_string()
)));
}
}
}
Ok(Outcome::new(
InterpretStatus::SUCCESS,
format!("Inventory phase complete. Nodes discovered: {count}"),
format!(
"Found and assigned bootstrap node: {}",
bootstrap_host.summary()
),
))
// info!(
// "Launching discovery agent, make sure that your nodes are successfully PXE booted and running inventory agent. They should answer on `http://<node_ip>:8080/inventory`"
// );
// LaunchDiscoverInventoryAgentScore {
// discovery_timeout: Some(60),
// }
// .interpret(inventory, topology)
// .await?;
//
// // TODO write a loop
// let bootstrap_host: PhysicalHost;
// let mut found_bootstrap_host = false;
// let host_repo = InventoryRepositoryFactory::build().await?;
// while !found_bootstrap_host {
// let all_hosts = host_repo.get_all_hosts().await?;
// // TODO use inquire to select among the current hosts, tell the user to cancel if he
// // wants to update the list. I believe inquire::Select is the correct option here
// //
// // The options are all_hosts, all_hosts is of type Vec<PhysicalHost> and PhysicalHost
// // has a human friendly `summary() -> String` method that is perfect to have the user
// // choose
// //
// // once the user has chosen one, call host_repo.save_role_mapping(Role::Bootstrap,
// // host.id).await?;
// bootstrap_host = todo!();
// }
//
// Ok(Outcome::new(
// InterpretStatus::SUCCESS,
// format!("Found bootstrap node : {}", bootstrap_host.summary()),
// ))
}
}

View File

@ -1,9 +1,9 @@
use askama::Template;
use async_trait::async_trait;
use derive_new::new;
use harmony_types::net::Url;
use harmony_types::net::{IpAddress, Url};
use serde::Serialize;
use std::net::IpAddr;
use std::net::{IpAddr, Ipv4Addr};
use crate::{
data::{FileContent, FilePath, Version},
@ -46,6 +46,16 @@ impl<T: Topology + DhcpServer + TftpServer + HttpServer + Router> Interpret<T> f
) -> Result<Outcome, InterpretError> {
let gateway_ip = topology.get_gateway();
let dhcp_server_ip = match DhcpServer::get_ip(topology) {
std::net::IpAddr::V4(ipv4_addr) => ipv4_addr,
std::net::IpAddr::V6(_ipv6_addr) => todo!("Support ipv6 someday"),
};
// TODO this could overflow, we should use proper subnet maths here instead of an ip
// address and guessing the subnet size from there
let start = Ipv4Addr::from(u32::from(dhcp_server_ip) + 100);
let end = Ipv4Addr::from(u32::from(dhcp_server_ip) + 150);
let scores: Vec<Box<dyn Score<T>>> = vec![
Box::new(DhcpScore {
host_binding: vec![],
@ -54,6 +64,7 @@ impl<T: Topology + DhcpServer + TftpServer + HttpServer + Router> Interpret<T> f
filename: Some("undionly.kpxe".to_string()),
filename64: Some("ipxe.efi".to_string()),
filenameipxe: Some(format!("http://{gateway_ip}:8080/boot.ipxe").to_string()),
dhcp_range: (IpAddress::from(start), IpAddress::from(end)),
}),
Box::new(TftpScore {
files_to_serve: Url::LocalFolder("./data/pxe/okd/tftpboot/".to_string()),

View File

@ -9,6 +9,7 @@ use config::INFISICAL_ENVIRONMENT;
use config::INFISICAL_PROJECT_ID;
use config::INFISICAL_URL;
use config::SECRET_STORE;
use log::debug;
use serde::{Serialize, de::DeserializeOwned};
use std::fmt;
use store::InfisicalSecretStore;
@ -101,6 +102,7 @@ impl SecretManager {
/// Retrieves and deserializes a secret.
pub async fn get<T: Secret>() -> Result<T, SecretStoreError> {
let manager = get_secret_manager().await;
debug!("Getting secret ns {} key {}", &manager.namespace, T::KEY);
let raw_value = manager.store.get_raw(&manager.namespace, T::KEY).await?;
serde_json::from_slice(&raw_value).map_err(|e| SecretStoreError::Deserialization {
key: T::KEY.to_string(),

View File

@ -1,5 +1,5 @@
use async_trait::async_trait;
use log::info;
use log::{debug, info};
use std::path::{Path, PathBuf};
use crate::{SecretStore, SecretStoreError};
@ -24,7 +24,7 @@ impl SecretStore for LocalFileSecretStore {
.join("secrets");
let file_path = Self::get_file_path(&data_dir, ns, key);
info!(
debug!(
"LOCAL_STORE: Getting key '{key}' from namespace '{ns}' at {}",
file_path.display()
);

View File

@ -1,7 +1,7 @@
// dnsmasq.rs
use crate::modules::dhcp::DhcpError;
use log::{debug, info, warn};
use opnsense_config_xml::dnsmasq::{DnsMasq, DnsmasqHost};
use opnsense_config_xml::dnsmasq::{DhcpRange, DnsMasq, DnsmasqHost}; // Assuming DhcpRange is defined in opnsense_config_xml::dnsmasq
use opnsense_config_xml::{MaybeString, StaticMap};
use std::collections::HashSet;
use std::net::Ipv4Addr;
@ -224,6 +224,36 @@ impl<'a> DhcpConfigDnsMasq<'a> {
Ok(static_maps)
}
pub async fn set_dhcp_range(&mut self, start: &str, end: &str) -> Result<(), DhcpError> {
let dnsmasq = self.get_dnsmasq();
let ranges = &mut dnsmasq.dhcp_ranges;
// Assuming DnsMasq has dhcp_ranges: Vec<DhcpRange>
// Find existing range for "lan" interface
if let Some(range) = ranges
.iter_mut()
.find(|r| r.interface == Some("lan".to_string()))
{
// Update existing range
range.start_addr = Some(start.to_string());
range.end_addr = Some(end.to_string());
} else {
// Create new range
let new_range = DhcpRange {
uuid: Some(Uuid::new_v4().to_string()),
interface: Some("lan".to_string()),
start_addr: Some(start.to_string()),
end_addr: Some(end.to_string()),
domain_type: Some("range".to_string()),
nosync: Some(0),
..Default::default()
};
ranges.push(new_range);
}
Ok(())
}
pub async fn set_pxe_options(
&self,
tftp_ip: Option<String>,

File diff suppressed because it is too large Load Diff