diff --git a/Cargo.lock b/Cargo.lock index ed76c0ef..d17166c9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1828,6 +1828,40 @@ dependencies = [ "url", ] +[[package]] +name = "example-k8s-drain-node" +version = "0.1.0" +dependencies = [ + "assert_cmd", + "cidr", + "env_logger", + "harmony", + "harmony_cli", + "harmony_macros", + "harmony_types", + "inquire 0.7.5", + "log", + "tokio", + "url", +] + +[[package]] +name = "example-k8s-write-file-on-node" +version = "0.1.0" +dependencies = [ + "assert_cmd", + "cidr", + "env_logger", + "harmony", + "harmony_cli", + "harmony_macros", + "harmony_types", + "inquire 0.7.5", + "log", + "tokio", + "url", +] + [[package]] name = "example-kube-rs" version = "0.1.0" diff --git a/adr/019-Network-bond-setup.md b/adr/019-Network-bond-setup.md new file mode 100644 index 00000000..2f4cff84 --- /dev/null +++ b/adr/019-Network-bond-setup.md @@ -0,0 +1,65 @@ +# Architecture Decision Record: Network Bonding Configuration via External Automation + +Initial Author: Jean-Gabriel Gill-Couture & Sylvain Tremblay + +Initial Date: 2026-02-13 + +Last Updated Date: 2026-02-13 + +## Status + +Accepted + +## Context + +We need to configure LACP bonds on 10GbE interfaces across all worker nodes in the OpenShift cluster. A significant challenge is that interface names (e.g., `enp1s0f0` vs `ens1f0`) vary across different hardware nodes. + +The standard OpenShift mechanism (MachineConfig) applies identical configurations to all nodes in a MachineConfigPool. Since the interface names differ, a single static MachineConfig cannot target specific physical devices across the entire cluster without complex workarounds. + +## Decision + +We will use the existing "Harmony" automation tool to generate and apply host-specific NetworkManager configuration files directly to the nodes. + +1. Harmony will generate the specific `.nmconnection` files for the bond and slaves based on its inventory of interface names. +2. Files will be pushed to `/etc/NetworkManager/system-connections/` on each node. +3. Configuration will be applied via `nmcli` reload or a node reboot. + +## Rationale + +* **Inventory Awareness:** Harmony already possesses the specific interface mapping data for each host. +* **Persistence:** Fedora CoreOS/SCOS allows writing to `/etc`, and these files persist across reboots and OS upgrades (rpm-ostree updates). +* **Avoids Complexity:** This approach avoids the operational overhead of creating unique MachineConfigPools for every single host or hardware variant. +* **Safety:** Unlike wildcard matching, this ensures explicit interface selection, preventing accidental bonding of reserved interfaces (e.g., future separation of Ceph storage traffic). + +## Consequences + +**Pros:** +* Precise, per-host configuration without polluting the Kubernetes API with hundreds of MachineConfigs. +* Standard Linux networking behavior; easy to debug locally. +* Prevents accidental interface capture (unlike wildcards). + +**Cons:** +* **Loss of Declarative K8s State:** The network config is not managed by the Machine Config Operator (MCO). +* **Node Replacement Friction:** Newly provisioned nodes (replacements) will boot with default config. Harmony must be run against new nodes manually or via a hook before they can fully join the cluster workload. + +## Alternatives considered + +1. **Wildcard Matching in NetworkManager (e.g., `interface-name=enp*`):** + * *Pros:* Single MachineConfig for the whole cluster. + * *Cons:* Rejected because it is too broad. 
It risks capturing interfaces intended for other purposes (e.g., splitting storage and cluster networks later). + +2. **"Kitchen Sink" Configuration:** + * *Pros:* Single file listing every possible interface name as a slave. + * *Cons:* "Dirty" configuration; results in many inactive connections on every host; brittle if new naming schemes appear. + +3. **Per-Host MachineConfig:** + * *Pros:* Fully declarative within OpenShift. + * *Cons:* Requires a unique `MachineConfigPool` per host, which is an anti-pattern and unmaintainable at scale. + +4. **On-boot Generation Script:** + * *Pros:* Dynamic detection. + * *Cons:* Increases boot complexity; harder to debug if the script fails during startup. + +## Additional Notes + +While `/etc` is writable and persistent on CoreOS, this configuration falls outside the "Day 1" Ignition process. Operational runbooks must be updated to ensure Harmony runs on any node replacement events. diff --git a/brocade/examples/main.rs b/brocade/examples/main.rs index ae47de5e..f6c0d086 100644 --- a/brocade/examples/main.rs +++ b/brocade/examples/main.rs @@ -1,7 +1,7 @@ use std::net::{IpAddr, Ipv4Addr}; use brocade::{BrocadeOptions, ssh}; -use harmony_secret::Secret; +use harmony_secret::{Secret, SecretManager}; use harmony_types::switch::PortLocation; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; @@ -21,16 +21,14 @@ async fn main() { // let ip = IpAddr::V4(Ipv4Addr::new(192, 168, 4, 11)); // brocade @ st let switch_addresses = vec![ip]; - // let config = SecretManager::get_or_prompt::() - // .await - // .unwrap(); + let config = SecretManager::get_or_prompt::() + .await + .unwrap(); let brocade = brocade::init( &switch_addresses, - // &config.username, - // &config.password, - "admin", - "password", + &config.username, + &config.password, BrocadeOptions { dry_run: true, ssh: ssh::SshOptions { diff --git a/examples/k8s_drain_node/Cargo.toml b/examples/k8s_drain_node/Cargo.toml new file mode 100644 index 00000000..d8ded7f6 --- /dev/null +++ b/examples/k8s_drain_node/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "example-k8s-drain-node" +edition = "2024" +version.workspace = true +readme.workspace = true +license.workspace = true +publish = false + +[dependencies] +harmony = { path = "../../harmony" } +harmony_cli = { path = "../../harmony_cli" } +harmony_types = { path = "../../harmony_types" } +cidr.workspace = true +tokio.workspace = true +harmony_macros = { path = "../../harmony_macros" } +log.workspace = true +env_logger.workspace = true +url.workspace = true +assert_cmd = "2.0.16" +inquire.workspace = true diff --git a/examples/k8s_drain_node/src/main.rs b/examples/k8s_drain_node/src/main.rs new file mode 100644 index 00000000..71cf4b37 --- /dev/null +++ b/examples/k8s_drain_node/src/main.rs @@ -0,0 +1,61 @@ +use std::time::Duration; + +use harmony::topology::k8s::{DrainOptions, K8sClient}; +use log::{info, trace}; + +#[tokio::main] +async fn main() { + env_logger::init(); + let k8s = K8sClient::try_default().await.unwrap(); + let nodes = k8s.get_nodes(None).await.unwrap(); + trace!("Got nodes : {nodes:#?}"); + let node_names = nodes + .iter() + .map(|n| n.metadata.name.as_ref().unwrap()) + .collect::>(); + + info!("Got nodes : {:?}", node_names); + + let node_name = inquire::Select::new("What node do you want to operate on?", node_names) + .prompt() + .unwrap(); + + let drain = inquire::Confirm::new("Do you wish to drain the node now ?") + .prompt() + .unwrap(); + + if drain { + let mut options = 
DrainOptions::default_ignore_daemonset_delete_emptydir_data(); + options.timeout = Duration::from_secs(1); + k8s.drain_node(&node_name, &options).await.unwrap(); + + info!("Node {node_name} successfully drained"); + } + + let uncordon = + inquire::Confirm::new("Do you wish to uncordon node to resume scheduling workloads now?") + .prompt() + .unwrap(); + + if uncordon { + info!("Uncordoning node {node_name}"); + k8s.uncordon_node(node_name).await.unwrap(); + info!("Node {node_name} uncordoned"); + } + + let reboot = inquire::Confirm::new("Do you wish to reboot node now?") + .prompt() + .unwrap(); + + if reboot { + k8s.reboot_node( + &node_name, + &DrainOptions::default_ignore_daemonset_delete_emptydir_data(), + Duration::from_secs(3600), + ) + .await + .unwrap(); + } + + info!("All done playing with nodes, happy harmonizing!"); +} diff --git a/examples/k8s_write_file_on_node/Cargo.toml b/examples/k8s_write_file_on_node/Cargo.toml new file mode 100644 index 00000000..b7354418 --- /dev/null +++ b/examples/k8s_write_file_on_node/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "example-k8s-write-file-on-node" +edition = "2024" +version.workspace = true +readme.workspace = true +license.workspace = true +publish = false + +[dependencies] +harmony = { path = "../../harmony" } +harmony_cli = { path = "../../harmony_cli" } +harmony_types = { path = "../../harmony_types" } +cidr.workspace = true +tokio.workspace = true +harmony_macros = { path = "../../harmony_macros" } +log.workspace = true +env_logger.workspace = true +url.workspace = true +assert_cmd = "2.0.16" +inquire.workspace = true diff --git a/examples/k8s_write_file_on_node/src/main.rs b/examples/k8s_write_file_on_node/src/main.rs new file mode 100644 index 00000000..f37e1716 --- /dev/null +++ b/examples/k8s_write_file_on_node/src/main.rs @@ -0,0 +1,45 @@ +use harmony::topology::k8s::{DrainOptions, K8sClient, NodeFile}; +use log::{info, trace}; + +#[tokio::main] +async fn main() { + env_logger::init(); + let k8s = K8sClient::try_default().await.unwrap(); + let nodes = k8s.get_nodes(None).await.unwrap(); + trace!("Got nodes : {nodes:#?}"); + let node_names = nodes + .iter() + .map(|n| n.metadata.name.as_ref().unwrap()) + .collect::>(); + + info!("Got nodes : {:?}", node_names); + + let node = inquire::Select::new("What node do you want to write file to?", node_names) + .prompt() + .unwrap(); + + let path = inquire::Text::new("File path on node").prompt().unwrap(); + let content = inquire::Text::new("File content").prompt().unwrap(); + + let node_file = NodeFile { + path: path, + content: content, + mode: 0o600, + }; + + k8s.write_files_to_node(&node, &vec![node_file.clone()]) + .await + .unwrap(); + + let cmd = inquire::Text::new("Command to run on node") + .prompt() + .unwrap(); + k8s.run_privileged_command_on_node(&node, &cmd) + .await + .unwrap(); + + info!( + "File {} mode {} written in node {node}", + node_file.path, node_file.mode + ); +} diff --git a/harmony/src/domain/topology/ha_cluster.rs b/harmony/src/domain/topology/ha_cluster.rs index 7ca3340a..b5a17d2b 100644 --- a/harmony/src/domain/topology/ha_cluster.rs +++ b/harmony/src/domain/topology/ha_cluster.rs @@ -1,5 +1,4 @@ use async_trait::async_trait; -use brocade::PortOperatingMode; use harmony_macros::ip; use harmony_types::{ id::Id, @@ -301,10 +300,10 @@ impl Switch for HAClusterTopology { Ok(()) } - async fn clear_port_channel(&self, ids: &Vec) -> Result<(), SwitchError> { + async fn clear_port_channel(&self, _ids: &Vec) -> Result<(), SwitchError> { todo!() } - async fn 
configure_interface(&self, ports: &Vec) -> Result<(), SwitchError> { + async fn configure_interface(&self, _ports: &Vec) -> Result<(), SwitchError> { todo!() } } @@ -322,7 +321,15 @@ impl NetworkManager for HAClusterTopology { self.network_manager().await.configure_bond(config).await } - //TODO add snmp here + async fn configure_bond_on_primary_interface( + &self, + config: &HostNetworkConfig, + ) -> Result<(), NetworkError> { + self.network_manager() + .await + .configure_bond_on_primary_interface(config) + .await + } } #[async_trait] @@ -562,10 +569,10 @@ impl SwitchClient for DummyInfra { ) -> Result { unimplemented!("{}", UNIMPLEMENTED_DUMMY_INFRA) } - async fn clear_port_channel(&self, ids: &Vec) -> Result<(), SwitchError> { + async fn clear_port_channel(&self, _ids: &Vec) -> Result<(), SwitchError> { todo!() } - async fn configure_interface(&self, ports: &Vec) -> Result<(), SwitchError> { + async fn configure_interface(&self, _ports: &Vec) -> Result<(), SwitchError> { todo!() } } diff --git a/harmony/src/domain/topology/k8s.rs b/harmony/src/domain/topology/k8s.rs deleted file mode 100644 index 5738c68b..00000000 --- a/harmony/src/domain/topology/k8s.rs +++ /dev/null @@ -1,1065 +0,0 @@ -use std::{collections::HashMap, time::Duration}; - -use derive_new::new; -use k8s_openapi::{ - ClusterResourceScope, NamespaceResourceScope, - api::{ - apps::v1::Deployment, - core::v1::{Node, Pod, ServiceAccount}, - }, - apiextensions_apiserver::pkg::apis::apiextensions::v1::CustomResourceDefinition, - apimachinery::pkg::version::Info, -}; -use kube::{ - Client, Config, Discovery, Error, Resource, - api::{ - Api, AttachParams, DeleteParams, ListParams, ObjectList, Patch, PatchParams, ResourceExt, - }, - config::{KubeConfigOptions, Kubeconfig}, - core::{DynamicResourceScope, ErrorResponse}, - discovery::{ApiCapabilities, Scope}, - error::DiscoveryError, - runtime::{reflector::Lookup, wait::Condition}, -}; -use kube::{api::DynamicObject, runtime::conditions}; -use kube::{ - api::{ApiResource, GroupVersionKind}, - runtime::wait::await_condition, -}; -use log::{debug, error, trace, warn}; -use serde::{Serialize, de::DeserializeOwned}; -use serde_json::{Value, json}; -use similar::TextDiff; -use tokio::{io::AsyncReadExt, time::sleep}; -use url::Url; - -#[derive(new, Clone)] -pub struct K8sClient { - client: Client, -} - -impl Serialize for K8sClient { - fn serialize(&self, _serializer: S) -> Result - where - S: serde::Serializer, - { - todo!() - } -} - -impl std::fmt::Debug for K8sClient { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - // This is a poor man's debug implementation for now as kube::Client does not provide much - // useful information - f.write_fmt(format_args!( - "K8sClient {{ kube client using default namespace {} }}", - self.client.default_namespace() - )) - } -} - -impl K8sClient { - pub async fn try_default() -> Result { - Ok(Self { - client: Client::try_default().await?, - }) - } - - /// Returns true if any deployment in the given namespace matching the label selector - /// has status.availableReplicas > 0 (or condition Available=True). 
- pub async fn has_healthy_deployment_with_label( - &self, - namespace: &str, - label_selector: &str, - ) -> Result { - let api: Api = Api::namespaced(self.client.clone(), namespace); - let lp = ListParams::default().labels(label_selector); - let list = api.list(&lp).await?; - for d in list.items { - // Check AvailableReplicas > 0 or Available condition - let available = d - .status - .as_ref() - .and_then(|s| s.available_replicas) - .unwrap_or(0); - if available > 0 { - return Ok(true); - } - // Fallback: scan conditions - if let Some(conds) = d.status.as_ref().and_then(|s| s.conditions.as_ref()) { - if conds - .iter() - .any(|c| c.type_ == "Available" && c.status == "True") - { - return Ok(true); - } - } - } - Ok(false) - } - - /// Cluster-wide: returns namespaces that have at least one healthy deployment - /// matching the label selector (equivalent to kubectl -A -l ...). - pub async fn list_namespaces_with_healthy_deployments( - &self, - label_selector: &str, - ) -> Result, Error> { - let api: Api = Api::all(self.client.clone()); - let lp = ListParams::default().labels(label_selector); - let list = api.list(&lp).await?; - - let mut healthy_ns: HashMap = HashMap::new(); - for d in list.items { - let ns = match d.metadata.namespace.clone() { - Some(n) => n, - None => continue, - }; - let available = d - .status - .as_ref() - .and_then(|s| s.available_replicas) - .unwrap_or(0); - let is_healthy = if available > 0 { - true - } else { - d.status - .as_ref() - .and_then(|s| s.conditions.as_ref()) - .map(|conds| { - conds - .iter() - .any(|c| c.type_ == "Available" && c.status == "True") - }) - .unwrap_or(false) - }; - if is_healthy { - healthy_ns.insert(ns, true); - } - } - - Ok(healthy_ns.into_keys().collect()) - } - - /// Get the application-controller ServiceAccount name (fallback to default) - pub async fn get_controller_service_account_name( - &self, - ns: &str, - ) -> Result, Error> { - let api: Api = Api::namespaced(self.client.clone(), ns); - let lp = ListParams::default().labels("app.kubernetes.io/component=controller"); - let list = api.list(&lp).await?; - if let Some(dep) = list.items.get(0) { - if let Some(sa) = dep - .spec - .as_ref() - .and_then(|ds| ds.template.spec.as_ref()) - .and_then(|ps| ps.service_account_name.clone()) - { - return Ok(Some(sa)); - } - } - Ok(None) - } - - // List ClusterRoleBindings dynamically and return as JSON values - pub async fn list_clusterrolebindings_json(&self) -> Result, Error> { - let gvk = kube::api::GroupVersionKind::gvk( - "rbac.authorization.k8s.io", - "v1", - "ClusterRoleBinding", - ); - let ar = kube::api::ApiResource::from_gvk(&gvk); - let api: Api = Api::all_with(self.client.clone(), &ar); - let crbs = api.list(&ListParams::default()).await?; - let mut out = Vec::new(); - for o in crbs { - let v = serde_json::to_value(&o).unwrap_or(Value::Null); - out.push(v); - } - Ok(out) - } - - /// Determine if Argo controller in ns has cluster-wide permissions via CRBs - // TODO This does not belong in the generic k8s client, should be refactored at some point - pub async fn is_service_account_cluster_wide(&self, sa: &str, ns: &str) -> Result { - let crbs = self.list_clusterrolebindings_json().await?; - let sa_user = format!("system:serviceaccount:{}:{}", ns, sa); - for crb in crbs { - if let Some(subjects) = crb.get("subjects").and_then(|s| s.as_array()) { - for subj in subjects { - let kind = subj.get("kind").and_then(|v| v.as_str()).unwrap_or(""); - let name = subj.get("name").and_then(|v| v.as_str()).unwrap_or(""); - let subj_ns = 
subj.get("namespace").and_then(|v| v.as_str()).unwrap_or(""); - if (kind == "ServiceAccount" && name == sa && subj_ns == ns) - || (kind == "User" && name == sa_user) - { - return Ok(true); - } - } - } - } - Ok(false) - } - - pub async fn has_crd(&self, name: &str) -> Result { - let api: Api = Api::all(self.client.clone()); - let lp = ListParams::default().fields(&format!("metadata.name={}", name)); - let crds = api.list(&lp).await?; - Ok(!crds.items.is_empty()) - } - - pub async fn service_account_api(&self, namespace: &str) -> Api { - let api: Api = Api::namespaced(self.client.clone(), namespace); - api - } - - pub async fn get_apiserver_version(&self) -> Result { - let client: Client = self.client.clone(); - let version_info: Info = client.apiserver_version().await?; - Ok(version_info) - } - - pub async fn discovery(&self) -> Result { - let discovery: Discovery = Discovery::new(self.client.clone()).run().await?; - Ok(discovery) - } - - pub async fn get_resource_json_value( - &self, - name: &str, - namespace: Option<&str>, - gvk: &GroupVersionKind, - ) -> Result { - let gvk = ApiResource::from_gvk(gvk); - let resource: Api = if let Some(ns) = namespace { - Api::namespaced_with(self.client.clone(), ns, &gvk) - } else { - Api::default_namespaced_with(self.client.clone(), &gvk) - }; - - resource.get(name).await - } - - pub async fn get_secret_json_value( - &self, - name: &str, - namespace: Option<&str>, - ) -> Result { - self.get_resource_json_value( - name, - namespace, - &GroupVersionKind { - group: "".to_string(), - version: "v1".to_string(), - kind: "Secret".to_string(), - }, - ) - .await - } - - pub async fn get_deployment( - &self, - name: &str, - namespace: Option<&str>, - ) -> Result, Error> { - let deps: Api = if let Some(ns) = namespace { - debug!("getting namespaced deployment"); - Api::namespaced(self.client.clone(), ns) - } else { - debug!("getting default namespace deployment"); - Api::default_namespaced(self.client.clone()) - }; - - debug!("getting deployment {} in ns {}", name, namespace.unwrap()); - deps.get_opt(name).await - } - - pub async fn get_pod(&self, name: &str, namespace: Option<&str>) -> Result, Error> { - let pods: Api = if let Some(ns) = namespace { - Api::namespaced(self.client.clone(), ns) - } else { - Api::default_namespaced(self.client.clone()) - }; - - pods.get_opt(name).await - } - - pub async fn scale_deployment( - &self, - name: &str, - namespace: Option<&str>, - replicas: u32, - ) -> Result<(), Error> { - let deployments: Api = if let Some(ns) = namespace { - Api::namespaced(self.client.clone(), ns) - } else { - Api::default_namespaced(self.client.clone()) - }; - - let patch = json!({ - "spec": { - "replicas": replicas - } - }); - let pp = PatchParams::default(); - let scale = Patch::Merge(&patch); - deployments.patch_scale(name, &pp, &scale).await?; - Ok(()) - } - - pub async fn delete_deployment( - &self, - name: &str, - namespace: Option<&str>, - ) -> Result<(), Error> { - let deployments: Api = if let Some(ns) = namespace { - Api::namespaced(self.client.clone(), ns) - } else { - Api::default_namespaced(self.client.clone()) - }; - let delete_params = DeleteParams::default(); - deployments.delete(name, &delete_params).await?; - Ok(()) - } - - pub async fn wait_until_deployment_ready( - &self, - name: &str, - namespace: Option<&str>, - timeout: Option, - ) -> Result<(), String> { - let api: Api; - - if let Some(ns) = namespace { - api = Api::namespaced(self.client.clone(), ns); - } else { - api = Api::default_namespaced(self.client.clone()); - } - - 
let establish = await_condition(api, name, conditions::is_deployment_completed()); - let timeout = timeout.unwrap_or(Duration::from_secs(120)); - let res = tokio::time::timeout(timeout, establish).await; - - if res.is_ok() { - Ok(()) - } else { - Err("timed out while waiting for deployment".to_string()) - } - } - - pub async fn wait_for_pod_ready( - &self, - pod_name: &str, - namespace: Option<&str>, - ) -> Result<(), Error> { - let mut elapsed = 0; - let interval = 5; // seconds between checks - let timeout_secs = 120; - loop { - let pod = self.get_pod(pod_name, namespace).await?; - - if let Some(p) = pod { - if let Some(status) = p.status { - if let Some(phase) = status.phase { - if phase.to_lowercase() == "running" { - return Ok(()); - } - } - } - } - - if elapsed >= timeout_secs { - return Err(Error::Discovery(DiscoveryError::MissingResource(format!( - "'{}' in ns '{}' did not become ready within {}s", - pod_name, - namespace.unwrap(), - timeout_secs - )))); - } - - sleep(Duration::from_secs(interval)).await; - elapsed += interval; - } - } - - /// Will execute a commond in the first pod found that matches the specified label - /// '{label}={name}' - pub async fn exec_app_capture_output( - &self, - name: String, - label: String, - namespace: Option<&str>, - command: Vec<&str>, - ) -> Result { - let api: Api; - - if let Some(ns) = namespace { - api = Api::namespaced(self.client.clone(), ns); - } else { - api = Api::default_namespaced(self.client.clone()); - } - let pod_list = api - .list(&ListParams::default().labels(format!("{label}={name}").as_str())) - .await - .expect("couldn't get list of pods"); - - let res = api - .exec( - pod_list - .items - .first() - .expect("couldn't get pod") - .name() - .expect("couldn't get pod name") - .into_owned() - .as_str(), - command, - &AttachParams::default().stdout(true).stderr(true), - ) - .await; - match res { - Err(e) => Err(e.to_string()), - Ok(mut process) => { - let status = process - .take_status() - .expect("Couldn't get status") - .await - .expect("Couldn't unwrap status"); - - if let Some(s) = status.status { - let mut stdout_buf = String::new(); - if let Some(mut stdout) = process.stdout() { - stdout - .read_to_string(&mut stdout_buf) - .await - .map_err(|e| format!("Failed to get status stdout {e}"))?; - } - debug!("Status: {} - {:?}", s, status.details); - if s == "Success" { - Ok(stdout_buf) - } else { - Err(s) - } - } else { - Err("Couldn't get inner status of pod exec".to_string()) - } - } - } - } - - /// Will execute a command in the first pod found that matches the label `app.kubernetes.io/name={name}` - pub async fn exec_app( - &self, - name: String, - namespace: Option<&str>, - command: Vec<&str>, - ) -> Result<(), String> { - let api: Api; - - if let Some(ns) = namespace { - api = Api::namespaced(self.client.clone(), ns); - } else { - api = Api::default_namespaced(self.client.clone()); - } - let pod_list = api - .list(&ListParams::default().labels(format!("app.kubernetes.io/name={name}").as_str())) - .await - .expect("couldn't get list of pods"); - - let res = api - .exec( - pod_list - .items - .first() - .expect("couldn't get pod") - .name() - .expect("couldn't get pod name") - .into_owned() - .as_str(), - command, - &AttachParams::default(), - ) - .await; - - match res { - Err(e) => Err(e.to_string()), - Ok(mut process) => { - let status = process - .take_status() - .expect("Couldn't get status") - .await - .expect("Couldn't unwrap status"); - - if let Some(s) = status.status { - debug!("Status: {} - {:?}", s, 
status.details); - if s == "Success" { Ok(()) } else { Err(s) } - } else { - Err("Couldn't get inner status of pod exec".to_string()) - } - } - } - } - - fn get_api_for_dynamic_object( - &self, - object: &DynamicObject, - ns: Option<&str>, - ) -> Result, Error> { - let api_resource = object - .types - .as_ref() - .and_then(|t| { - let parts: Vec<&str> = t.api_version.split('/').collect(); - match parts.as_slice() { - [version] => Some(ApiResource::from_gvk(&GroupVersionKind::gvk( - "", version, &t.kind, - ))), - [group, version] => Some(ApiResource::from_gvk(&GroupVersionKind::gvk( - group, version, &t.kind, - ))), - _ => None, - } - }) - .ok_or_else(|| { - Error::BuildRequest(kube::core::request::Error::Validation( - "Invalid apiVersion in DynamicObject {object:#?}".to_string(), - )) - })?; - - match ns { - Some(ns) => Ok(Api::namespaced_with(self.client.clone(), ns, &api_resource)), - None => Ok(Api::default_namespaced_with( - self.client.clone(), - &api_resource, - )), - } - } - - pub async fn apply_dynamic_many( - &self, - resource: &[DynamicObject], - namespace: Option<&str>, - force_conflicts: bool, - ) -> Result, Error> { - let mut result = Vec::new(); - for r in resource.iter() { - result.push(self.apply_dynamic(r, namespace, force_conflicts).await?); - } - - Ok(result) - } - - /// Apply DynamicObject resource to the cluster - pub async fn apply_dynamic( - &self, - resource: &DynamicObject, - namespace: Option<&str>, - force_conflicts: bool, - ) -> Result { - // Build API for this dynamic object - let api = self.get_api_for_dynamic_object(resource, namespace)?; - let name = resource - .metadata - .name - .as_ref() - .ok_or_else(|| { - Error::BuildRequest(kube::core::request::Error::Validation( - "DynamicObject must have metadata.name".to_string(), - )) - })? 
- .as_str(); - - debug!( - "Applying dynamic resource kind={:?} apiVersion={:?} name='{}' ns={:?}", - resource.types.as_ref().map(|t| &t.kind), - resource.types.as_ref().map(|t| &t.api_version), - name, - namespace - ); - trace!( - "Dynamic resource payload:\n{:#}", - serde_json::to_value(resource).unwrap_or(serde_json::Value::Null) - ); - - // Using same field manager as in apply() - let mut patch_params = PatchParams::apply("harmony"); - patch_params.force = force_conflicts; - - if *crate::config::DRY_RUN { - // Dry-run path: fetch current, show diff, and return appropriate object - match api.get(name).await { - Ok(current) => { - trace!("Received current dynamic value {current:#?}"); - - println!("\nPerforming dry-run for resource: '{}'", name); - - // Serialize current and new, and strip status from current if present - let mut current_yaml = - serde_yaml::to_value(¤t).unwrap_or_else(|_| serde_yaml::Value::Null); - if let Some(map) = current_yaml.as_mapping_mut() { - if map.contains_key(&serde_yaml::Value::String("status".to_string())) { - let removed = - map.remove(&serde_yaml::Value::String("status".to_string())); - trace!("Removed status from current dynamic object: {:?}", removed); - } else { - trace!( - "Did not find status entry for current dynamic object {}/{}", - current.metadata.namespace.as_deref().unwrap_or(""), - current.metadata.name.as_deref().unwrap_or("") - ); - } - } - - let current_yaml = serde_yaml::to_string(¤t_yaml) - .unwrap_or_else(|_| "Failed to serialize current resource".to_string()); - let new_yaml = serde_yaml::to_string(resource) - .unwrap_or_else(|_| "Failed to serialize new resource".to_string()); - - if current_yaml == new_yaml { - println!("No changes detected."); - return Ok(current); - } - - println!("Changes detected:"); - let diff = TextDiff::from_lines(¤t_yaml, &new_yaml); - for change in diff.iter_all_changes() { - let sign = match change.tag() { - similar::ChangeTag::Delete => "-", - similar::ChangeTag::Insert => "+", - similar::ChangeTag::Equal => " ", - }; - print!("{}{}", sign, change); - } - - // Return the incoming resource as the would-be applied state - Ok(resource.clone()) - } - Err(Error::Api(ErrorResponse { code: 404, .. })) => { - println!("\nPerforming dry-run for new resource: '{}'", name); - println!( - "Resource does not exist. 
It would be created with the following content:" - ); - let new_yaml = serde_yaml::to_string(resource) - .unwrap_or_else(|_| "Failed to serialize new resource".to_string()); - for line in new_yaml.lines() { - println!("+{}", line); - } - Ok(resource.clone()) - } - Err(e) => { - error!("Failed to get dynamic resource '{}': {}", name, e); - Err(e) - } - } - } else { - // Real apply via server-side apply - debug!("Patching (server-side apply) dynamic resource '{}'", name); - api.patch(name, &patch_params, &Patch::Apply(resource)) - .await - .map_err(|e| { - error!("Failed to apply dynamic resource '{}': {}", name, e); - e - }) - } - } - - /// Apply a resource in namespace - /// - /// See `kubectl apply` for more information on the expected behavior of this function - pub async fn apply(&self, resource: &K, namespace: Option<&str>) -> Result - where - K: Resource + Clone + std::fmt::Debug + DeserializeOwned + serde::Serialize, - ::Scope: ApplyStrategy, - ::DynamicType: Default, - { - debug!( - "Applying resource {:?} with ns {:?}", - resource.meta().name, - namespace - ); - trace!( - "{:#}", - serde_json::to_value(resource).unwrap_or(serde_json::Value::Null) - ); - - let api: Api = - <::Scope as ApplyStrategy>::get_api(&self.client, namespace); - // api.create(&PostParams::default(), &resource).await - let patch_params = PatchParams::apply("harmony"); - let name = resource - .meta() - .name - .as_ref() - .expect("K8s Resource should have a name"); - - if *crate::config::DRY_RUN { - match api.get(name).await { - Ok(current) => { - trace!("Received current value {current:#?}"); - // The resource exists, so we calculate and display a diff. - println!("\nPerforming dry-run for resource: '{name}'"); - let mut current_yaml = serde_yaml::to_value(¤t).unwrap_or_else(|_| { - panic!("Could not serialize current value : {current:#?}") - }); - if current_yaml.is_mapping() && current_yaml.get("status").is_some() { - let map = current_yaml.as_mapping_mut().unwrap(); - let removed = map.remove_entry("status"); - trace!("Removed status {removed:?}"); - } else { - trace!( - "Did not find status entry for current object {}/{}", - current.meta().namespace.as_ref().unwrap_or(&"".to_string()), - current.meta().name.as_ref().unwrap_or(&"".to_string()) - ); - } - let current_yaml = serde_yaml::to_string(¤t_yaml) - .unwrap_or_else(|_| "Failed to serialize current resource".to_string()); - let new_yaml = serde_yaml::to_string(resource) - .unwrap_or_else(|_| "Failed to serialize new resource".to_string()); - - if current_yaml == new_yaml { - println!("No changes detected."); - // Return the current resource state as there are no changes. - return Ok(current); - } - - println!("Changes detected:"); - let diff = TextDiff::from_lines(¤t_yaml, &new_yaml); - - // Iterate over the changes and print them in a git-like diff format. - for change in diff.iter_all_changes() { - let sign = match change.tag() { - similar::ChangeTag::Delete => "-", - similar::ChangeTag::Insert => "+", - similar::ChangeTag::Equal => " ", - }; - print!("{sign}{change}"); - } - // In a dry run, we return the new resource state that would have been applied. - Ok(resource.clone()) - } - Err(Error::Api(ErrorResponse { code: 404, .. })) => { - // The resource does not exist, so the "diff" is the entire new resource. - println!("\nPerforming dry-run for new resource: '{name}'"); - println!( - "Resource does not exist. 
It would be created with the following content:" - ); - let new_yaml = serde_yaml::to_string(resource) - .unwrap_or_else(|_| "Failed to serialize new resource".to_string()); - - // Print each line of the new resource with a '+' prefix. - for line in new_yaml.lines() { - println!("+{line}"); - } - // In a dry run, we return the new resource state that would have been created. - Ok(resource.clone()) - } - Err(e) => { - // Another API error occurred. - error!("Failed to get resource '{name}': {e}"); - Err(e) - } - } - } else { - return api - .patch(name, &patch_params, &Patch::Apply(resource)) - .await; - } - } - - pub async fn apply_many(&self, resource: &[K], ns: Option<&str>) -> Result, Error> - where - K: Resource + Clone + std::fmt::Debug + DeserializeOwned + serde::Serialize, - ::Scope: ApplyStrategy, - ::DynamicType: Default, - { - let mut result = Vec::new(); - for r in resource.iter() { - let apply_result = self.apply(r, ns).await; - if apply_result.is_err() { - // NOTE : We should be careful about this one, it may leak sensitive information in - // logs - // Maybe just reducing it to debug would be enough as we already know debug logs - // are unsafe. - // But keeping it at warn makes it much easier to understand what is going on. So be it for now. - warn!( - "Failed to apply k8s resource : {}", - serde_json::to_string_pretty(r).map_err(|e| Error::SerdeError(e))? - ); - } - - result.push(apply_result?); - } - - Ok(result) - } - - pub async fn apply_yaml_many( - &self, - #[allow(clippy::ptr_arg)] yaml: &Vec, - ns: Option<&str>, - ) -> Result<(), Error> { - for y in yaml.iter() { - self.apply_yaml(y, ns).await?; - } - Ok(()) - } - - pub async fn apply_yaml( - &self, - yaml: &serde_yaml::Value, - ns: Option<&str>, - ) -> Result<(), Error> { - let obj: DynamicObject = serde_yaml::from_value(yaml.clone()).expect("TODO do not unwrap"); - let name = obj.metadata.name.as_ref().expect("YAML must have a name"); - - let api_version = yaml - .get("apiVersion") - .expect("couldn't get apiVersion from YAML") - .as_str() - .expect("couldn't get apiVersion as str"); - let kind = yaml - .get("kind") - .expect("couldn't get kind from YAML") - .as_str() - .expect("couldn't get kind as str"); - - let mut it = api_version.splitn(2, '/'); - let first = it.next().unwrap(); - let (g, v) = match it.next() { - Some(second) => (first, second), - None => ("", first), - }; - - let gvk = GroupVersionKind::gvk(g, v, kind); - let api_resource = ApiResource::from_gvk(&gvk); - - let namespace = match ns { - Some(n) => n, - None => obj - .metadata - .namespace - .as_ref() - .expect("YAML must have a namespace"), - }; - - // 5. Create a dynamic API client for this resource type. - let api: Api = - Api::namespaced_with(self.client.clone(), namespace, &api_resource); - - // 6. Apply the object to the cluster using Server-Side Apply. - // This will create the resource if it doesn't exist, or update it if it does. 
- println!("Applying '{name}' in namespace '{namespace}'...",); - let patch_params = PatchParams::apply("harmony"); // Use a unique field manager name - let result = api.patch(name, &patch_params, &Patch::Apply(&obj)).await?; - - println!("Successfully applied '{}'.", result.name_any()); - - Ok(()) - } - - /// Apply a resource from a URL - /// - /// It is the equivalent of `kubectl apply -f ` - pub async fn apply_url(&self, url: Url, ns: Option<&str>) -> Result<(), Error> { - let patch_params = PatchParams::apply("harmony"); - let discovery = kube::Discovery::new(self.client.clone()).run().await?; - - let yaml = reqwest::get(url) - .await - .expect("Could not get URL") - .text() - .await - .expect("Could not get content from URL"); - - for doc in multidoc_deserialize(&yaml).expect("failed to parse YAML from file") { - let obj: DynamicObject = - serde_yaml::from_value(doc).expect("cannot apply without valid YAML"); - let namespace = obj.metadata.namespace.as_deref().or(ns); - let type_meta = obj - .types - .as_ref() - .expect("cannot apply object without valid TypeMeta"); - let gvk = GroupVersionKind::try_from(type_meta) - .expect("cannot apply object without valid GroupVersionKind"); - let name = obj.name_any(); - - if let Some((ar, caps)) = discovery.resolve_gvk(&gvk) { - let api = get_dynamic_api(ar, caps, self.client.clone(), namespace, false); - trace!( - "Applying {}: \n{}", - gvk.kind, - serde_yaml::to_string(&obj).expect("Failed to serialize YAML") - ); - let data: serde_json::Value = - serde_json::to_value(&obj).expect("Failed to serialize JSON"); - let _r = api.patch(&name, &patch_params, &Patch::Apply(data)).await?; - debug!("applied {} {}", gvk.kind, name); - } else { - warn!("Cannot apply document for unknown {gvk:?}"); - } - } - - Ok(()) - } - - /// Gets a single named resource of a specific type `K`. - /// - /// This function uses the `ApplyStrategy` trait to correctly determine - /// whether to look in a specific namespace or in the entire cluster. - /// - /// Returns `Ok(None)` if the resource is not found (404). - pub async fn get_resource( - &self, - name: &str, - namespace: Option<&str>, - ) -> Result, Error> - where - K: Resource + Clone + std::fmt::Debug + DeserializeOwned, - ::Scope: ApplyStrategy, - ::DynamicType: Default, - { - let api: Api = - <::Scope as ApplyStrategy>::get_api(&self.client, namespace); - - api.get_opt(name).await - } - - pub async fn list_all_resources_with_labels(&self, labels: &str) -> Result, Error> - where - K: Resource + Clone + std::fmt::Debug + DeserializeOwned, - ::DynamicType: Default, - { - let api: Api = Api::all(self.client.clone()); - - let lp = ListParams::default().labels(labels); - Ok(api.list(&lp).await?.items) - } - - pub async fn get_all_resource_in_all_namespace(&self) -> Result, Error> - where - K: Resource + Clone + std::fmt::Debug + DeserializeOwned, - ::Scope: ApplyStrategy, - ::DynamicType: Default, - { - let api: Api = Api::all(self.client.clone()); - Ok(api.list(&Default::default()).await?.items) - } - - /// Lists all resources of a specific type `K`. - /// - /// This function uses the `ApplyStrategy` trait to correctly determine - /// whether to list from a specific namespace or from the entire cluster. 
- pub async fn list_resources( - &self, - namespace: Option<&str>, - list_params: Option, - ) -> Result, Error> - where - K: Resource + Clone + std::fmt::Debug + DeserializeOwned, - ::Scope: ApplyStrategy, - ::DynamicType: Default, - { - let api: Api = - <::Scope as ApplyStrategy>::get_api(&self.client, namespace); - - let list_params = list_params.unwrap_or_default(); - api.list(&list_params).await - } - - /// Fetches a list of all Nodes in the cluster. - pub async fn get_nodes( - &self, - list_params: Option, - ) -> Result, Error> { - self.list_resources(None, list_params).await - } - - pub async fn from_kubeconfig(path: &str) -> Option { - Self::from_kubeconfig_with_opts(path, &KubeConfigOptions::default()).await - } - - pub async fn from_kubeconfig_with_context( - path: &str, - context: Option, - ) -> Option { - let mut opts = KubeConfigOptions::default(); - opts.context = context; - - Self::from_kubeconfig_with_opts(path, &opts).await - } - - pub async fn from_kubeconfig_with_opts( - path: &str, - opts: &KubeConfigOptions, - ) -> Option { - let k = match Kubeconfig::read_from(path) { - Ok(k) => k, - Err(e) => { - error!("Failed to load kubeconfig from {path} : {e}"); - return None; - } - }; - - Some(K8sClient::new( - Client::try_from(Config::from_custom_kubeconfig(k, &opts).await.unwrap()).unwrap(), - )) - } -} - -fn get_dynamic_api( - resource: ApiResource, - capabilities: ApiCapabilities, - client: Client, - ns: Option<&str>, - all: bool, -) -> Api { - if capabilities.scope == Scope::Cluster || all { - Api::all_with(client, &resource) - } else if let Some(namespace) = ns { - Api::namespaced_with(client, namespace, &resource) - } else { - Api::default_namespaced_with(client, &resource) - } -} - -fn multidoc_deserialize(data: &str) -> Result, serde_yaml::Error> { - use serde::Deserialize; - let mut docs = vec![]; - for de in serde_yaml::Deserializer::from_str(data) { - docs.push(serde_yaml::Value::deserialize(de)?); - } - Ok(docs) -} - -pub trait ApplyStrategy { - fn get_api(client: &Client, ns: Option<&str>) -> Api; -} - -/// Implementation for all resources that are cluster-scoped. -/// It will always use `Api::all` and ignore the namespace parameter. -impl ApplyStrategy for ClusterResourceScope -where - K: Resource, - ::DynamicType: Default, -{ - fn get_api(client: &Client, _ns: Option<&str>) -> Api { - Api::all(client.clone()) - } -} - -/// Implementation for all resources that are namespace-scoped. -/// It will use `Api::namespaced` if a namespace is provided, otherwise -/// it falls back to the default namespace configured in your kubeconfig. -impl ApplyStrategy for NamespaceResourceScope -where - K: Resource, - ::DynamicType: Default, -{ - fn get_api(client: &Client, ns: Option<&str>) -> Api { - match ns { - Some(ns) => Api::namespaced(client.clone(), ns), - None => Api::default_namespaced(client.clone()), - } - } -} diff --git a/harmony/src/domain/topology/k8s/bundle.rs b/harmony/src/domain/topology/k8s/bundle.rs new file mode 100644 index 00000000..d8262014 --- /dev/null +++ b/harmony/src/domain/topology/k8s/bundle.rs @@ -0,0 +1,133 @@ +//! Resource Bundle Pattern Implementation +//! +//! This module implements the Resource Bundle pattern for managing groups of +//! Kubernetes resources that form a logical unit of work. +//! +//! ## Purpose +//! +//! The ResourceBundle pattern addresses the need to manage ephemeral privileged +//! pods along with their platform-specific security requirements (e.g., OpenShift +//! Security Context Constraints). +//! +//! ## Use Cases +//! +//! 
- Writing files to node filesystems (e.g., NetworkManager configurations for +//! network bonding as described in ADR-019) +//! - Running privileged commands on nodes (e.g., reboots, system configuration) +//! +//! ## Benefits +//! +//! - **Separation of Concerns**: Client code doesn't need to know about +//! platform-specific RBAC requirements +//! - **Atomic Operations**: Resources are applied and deleted as a unit +//! - **Clean Abstractions**: Privileged operations are encapsulated in bundles +//! rather than scattered throughout client methods +//! +//! ## Example +//! +//! ```rust,no_run +//! use harmony::topology::k8s::{K8sClient, helper}; +//! use harmony::topology::KubernetesDistribution; +//! +//! async fn write_network_config(client: &K8sClient, node: &str) { +//! // Create a bundle with platform-specific RBAC +//! let bundle = helper::build_privileged_bundle( +//! helper::PrivilegedPodConfig { +//! name: "network-config".to_string(), +//! namespace: "default".to_string(), +//! node_name: node.to_string(), +//! // ... other config +//! ..Default::default() +//! }, +//! &KubernetesDistribution::OpenshiftFamily, +//! ); +//! +//! // Apply all resources (RBAC + Pod) atomically +//! bundle.apply(client).await.unwrap(); +//! +//! // ... wait for completion ... +//! +//! // Cleanup all resources +//! bundle.delete(client).await.unwrap(); +//! } +//! ``` + +use kube::{Error, Resource, ResourceExt, api::DynamicObject}; +use serde::Serialize; +use serde_json; + +use crate::domain::topology::k8s::K8sClient; + +/// A ResourceBundle represents a logical unit of work consisting of multiple +/// Kubernetes resources that should be applied or deleted together. +/// +/// This pattern is useful for managing ephemeral privileged pods along with +/// their required RBAC bindings (e.g., OpenShift SCC bindings). +#[derive(Debug)] +pub struct ResourceBundle { + pub resources: Vec, +} + +impl ResourceBundle { + pub fn new() -> Self { + Self { + resources: Vec::new(), + } + } + + /// Add a Kubernetes resource to this bundle. + /// The resource is converted to a DynamicObject for generic handling. + pub fn add(&mut self, resource: K) + where + K: Resource + Serialize, + ::DynamicType: Default, + { + // Convert the typed resource to JSON, then to DynamicObject + let json = serde_json::to_value(&resource).expect("Failed to serialize resource"); + let mut obj: DynamicObject = + serde_json::from_value(json).expect("Failed to convert to DynamicObject"); + + // Ensure type metadata is set + if obj.types.is_none() { + let api_version = Default::default(); + let kind = Default::default(); + let gvk = K::api_version(&api_version); + let kind = K::kind(&kind); + obj.types = Some(kube::api::TypeMeta { + api_version: gvk.to_string(), + kind: kind.to_string(), + }); + } + + self.resources.push(obj); + } + + /// Apply all resources in this bundle to the cluster. + /// Resources are applied in the order they were added. + pub async fn apply(&self, client: &K8sClient) -> Result<(), Error> { + for res in &self.resources { + let namespace = res.namespace(); + client + .apply_dynamic(res, namespace.as_deref(), true) + .await?; + } + Ok(()) + } + + /// Delete all resources in this bundle from the cluster. + /// Resources are deleted in reverse order to respect dependencies. 
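+    /// Note: individual delete errors are currently swallowed (see the FIXME below), so a
+    /// failed deletion does not surface to the caller; verify cleanup separately if it matters.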
+ pub async fn delete(&self, client: &K8sClient) -> Result<(), Error> { + // FIXME delete all in parallel and retry using kube::client::retry::RetryPolicy + for res in self.resources.iter().rev() { + let api = client.get_api_for_dynamic_object(res, res.namespace().as_deref())?; + let name = res.name_any(); + // FIXME this swallows all errors. Swallowing a 404 is ok but other errors must be + // handled properly (such as retrying). A normal error case is when we delete a + // resource bundle with dependencies between various resources. Such as a pod with a + // dependency on a ClusterRoleBinding. Trying to delete the ClusterRoleBinding first + // is expected to fail + let _ = api.delete(&name, &kube::api::DeleteParams::default()).await; + } + Ok(()) + } +} diff --git a/harmony/src/domain/topology/k8s/config.rs b/harmony/src/domain/topology/k8s/config.rs new file mode 100644 index 00000000..57cc3f48 --- /dev/null +++ b/harmony/src/domain/topology/k8s/config.rs @@ -0,0 +1 @@ +pub const PRIVILEGED_POD_IMAGE: &str = "hub.nationtech.io/redhat/ubi10:latest"; diff --git a/harmony/src/domain/topology/k8s/helper.rs b/harmony/src/domain/topology/k8s/helper.rs new file mode 100644 index 00000000..d5944ec2 --- /dev/null +++ b/harmony/src/domain/topology/k8s/helper.rs @@ -0,0 +1,601 @@ +use std::collections::BTreeMap; +use std::time::Duration; + +use crate::topology::KubernetesDistribution; + +use super::bundle::ResourceBundle; +use super::config::PRIVILEGED_POD_IMAGE; +use k8s_openapi::api::core::v1::{ + Container, HostPathVolumeSource, Pod, PodSpec, SecurityContext, Volume, VolumeMount, +}; +use k8s_openapi::api::rbac::v1::{ClusterRoleBinding, RoleRef, Subject}; +use k8s_openapi::apimachinery::pkg::apis::meta::v1::ObjectMeta; +use kube::error::DiscoveryError; +use log::{debug, error, info, warn}; + +#[derive(Debug)] +pub struct PrivilegedPodConfig { + pub name: String, + pub namespace: String, + pub node_name: String, + pub container_name: String, + pub command: Vec, + pub volumes: Vec, + pub volume_mounts: Vec, + pub host_pid: bool, + pub host_network: bool, +} + +impl Default for PrivilegedPodConfig { + fn default() -> Self { + Self { + name: "privileged-pod".to_string(), + namespace: "harmony".to_string(), + node_name: "".to_string(), + container_name: "privileged-container".to_string(), + command: vec![], + volumes: vec![], + volume_mounts: vec![], + host_pid: false, + host_network: false, + } + } +} + +pub fn build_privileged_pod( + config: PrivilegedPodConfig, + k8s_distribution: &KubernetesDistribution, +) -> Pod { + let annotations = match k8s_distribution { + KubernetesDistribution::OpenshiftFamily => Some(BTreeMap::from([ + ("openshift.io/scc".to_string(), "privileged".to_string()), + ( + "openshift.io/required-scc".to_string(), + "privileged".to_string(), + ), + ])), + _ => None, + }; + + Pod { + metadata: ObjectMeta { + name: Some(config.name), + namespace: Some(config.namespace), + annotations, + ..Default::default() + }, + spec: Some(PodSpec { + node_name: Some(config.node_name), + restart_policy: Some("Never".to_string()), + host_pid: Some(config.host_pid), + host_network: Some(config.host_network), + containers: vec![Container { + name: config.container_name, + image: Some(PRIVILEGED_POD_IMAGE.to_string()), + command: Some(config.command), + security_context: Some(SecurityContext { + privileged: Some(true), + ..Default::default() + }), + volume_mounts: Some(config.volume_mounts), + ..Default::default() + }], + volumes: Some(config.volumes), + ..Default::default() + }), + 
..Default::default() + } +} + +pub fn host_root_volume() -> (Volume, VolumeMount) { + ( + Volume { + name: "host".to_string(), + host_path: Some(HostPathVolumeSource { + path: "/".to_string(), + ..Default::default() + }), + ..Default::default() + }, + VolumeMount { + name: "host".to_string(), + mount_path: "/host".to_string(), + ..Default::default() + }, + ) +} + +/// Build a ResourceBundle containing a privileged pod and any required RBAC. +/// +/// This function implements the Resource Bundle pattern to encapsulate platform-specific +/// security requirements for running privileged operations on nodes. +/// +/// # Platform-Specific Behavior +/// +/// - **OpenShift**: Creates a ClusterRoleBinding to grant the default ServiceAccount +/// access to the `system:openshift:scc:privileged` ClusterRole, which allows the pod +/// to use the privileged Security Context Constraint (SCC). +/// - **Standard Kubernetes/K3s**: Only creates the Pod resource, as these distributions +/// use standard PodSecurityPolicy or don't enforce additional security constraints. +/// +/// # Arguments +/// +/// * `config` - Configuration for the privileged pod (name, namespace, command, etc.) +/// * `k8s_distribution` - The detected Kubernetes distribution to determine RBAC requirements +/// +/// # Returns +/// +/// A `ResourceBundle` containing 1-2 resources: +/// - ClusterRoleBinding (OpenShift only) +/// - Pod (all distributions) +/// +/// # Example +/// +/// ```rust,no_run +/// # use harmony::topology::k8s::helper::{build_privileged_bundle, PrivilegedPodConfig}; +/// # use harmony::topology::KubernetesDistribution; +/// let bundle = build_privileged_bundle( +/// PrivilegedPodConfig { +/// name: "network-setup".to_string(), +/// namespace: "default".to_string(), +/// node_name: "worker-01".to_string(), +/// container_name: "setup".to_string(), +/// command: vec!["nmcli".to_string(), "connection".to_string(), "reload".to_string()], +/// ..Default::default() +/// }, +/// &KubernetesDistribution::OpenshiftFamily, +/// ); +/// // Bundle now contains ClusterRoleBinding + Pod +/// ``` +pub fn build_privileged_bundle( + config: PrivilegedPodConfig, + k8s_distribution: &KubernetesDistribution, +) -> ResourceBundle { + debug!( + "Building privileged bundle for config {config:#?} on distribution {k8s_distribution:?}" + ); + let mut bundle = ResourceBundle::new(); + let pod_name = config.name.clone(); + let namespace = config.namespace.clone(); + + // 1. On OpenShift, create RBAC binding to privileged SCC + if let KubernetesDistribution::OpenshiftFamily = k8s_distribution { + // The default ServiceAccount needs to be bound to the privileged SCC + // via the system:openshift:scc:privileged ClusterRole + let crb = ClusterRoleBinding { + metadata: ObjectMeta { + name: Some(format!("{}-scc-binding", pod_name)), + ..Default::default() + }, + role_ref: RoleRef { + api_group: "rbac.authorization.k8s.io".to_string(), + kind: "ClusterRole".to_string(), + name: "system:openshift:scc:privileged".to_string(), + }, + subjects: Some(vec![Subject { + kind: "ServiceAccount".to_string(), + name: "default".to_string(), + namespace: Some(namespace.clone()), + api_group: None, + ..Default::default() + }]), + }; + bundle.add(crb); + } + + // 2. Build the privileged pod + let pod = build_privileged_pod(config, k8s_distribution); + bundle.add(pod); + + bundle +} + +/// Action to take when a drain operation times out. 
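+/// Returned by [`prompt_drain_timeout_action`] after a drain exceeds its timeout, letting the
+/// caller choose between accepting the partial drain, retrying, or aborting.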
+pub enum DrainTimeoutAction { + /// Accept the partial drain and continue + Accept, + /// Retry the drain for another timeout period + Retry, + /// Abort the drain operation + Abort, +} + +/// Prompts the user to confirm acceptance of a partial drain. +/// +/// Returns `Ok(true)` if the user confirms acceptance, `Ok(false)` if the user +/// chooses to retry or abort, and `Err` if the prompt system fails entirely. +pub fn prompt_drain_timeout_action( + node_name: &str, + pending_count: usize, + timeout_duration: Duration, +) -> Result { + let prompt_msg = format!( + "Drain operation timed out on node '{}' with {} pod(s) remaining. What would you like to do?", + node_name, pending_count + ); + + loop { + let choices = vec![ + "Accept drain failure (requires confirmation)".to_string(), + format!("Retry drain for another {:?}", timeout_duration), + "Abort operation".to_string(), + ]; + + let selection = inquire::Select::new(&prompt_msg, choices) + .with_help_message("Use arrow keys to navigate, Enter to select") + .prompt() + .map_err(|e| { + kube::Error::Discovery(DiscoveryError::MissingResource(format!( + "Prompt failed: {}", + e + ))) + })?; + + if selection.starts_with("Accept") { + // Require typed confirmation - retry until correct or user cancels + let required_confirmation = format!("yes-accept-drain:{}={}", node_name, pending_count); + + let confirmation_prompt = format!( + "To accept this partial drain, type exactly: {}", + required_confirmation + ); + + match inquire::Text::new(&confirmation_prompt) + .with_help_message(&format!( + "This action acknowledges {} pods will remain on the node", + pending_count + )) + .prompt() + { + Ok(input) if input == required_confirmation => { + warn!( + "User accepted partial drain of node '{}' with {} pods remaining (confirmation: {})", + node_name, pending_count, required_confirmation + ); + return Ok(DrainTimeoutAction::Accept); + } + Ok(input) => { + warn!( + "Confirmation failed. Expected '{}', got '{}'. 
Please try again.", + required_confirmation, input + ); + } + Err(e) => { + // User cancelled (Ctrl+C) or prompt system failed + error!("Confirmation prompt cancelled or failed: {}", e); + return Ok(DrainTimeoutAction::Abort); + } + } + } else if selection.starts_with("Retry") { + info!( + "User chose to retry drain operation for another {:?}", + timeout_duration + ); + return Ok(DrainTimeoutAction::Retry); + } else { + error!("Drain operation aborted by user"); + return Ok(DrainTimeoutAction::Abort); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use pretty_assertions::assert_eq; + + #[test] + fn test_host_root_volume() { + let (volume, mount) = host_root_volume(); + + assert_eq!(volume.name, "host"); + assert_eq!(volume.host_path.as_ref().unwrap().path, "/"); + + assert_eq!(mount.name, "host"); + assert_eq!(mount.mount_path, "/host"); + } + + #[test] + fn test_build_privileged_pod_minimal() { + let pod = build_privileged_pod( + PrivilegedPodConfig { + name: "minimal-pod".to_string(), + namespace: "kube-system".to_string(), + node_name: "node-123".to_string(), + container_name: "debug-container".to_string(), + command: vec!["sleep".to_string(), "3600".to_string()], + ..Default::default() + }, + &KubernetesDistribution::Default, + ); + + assert_eq!(pod.metadata.name, Some("minimal-pod".to_string())); + assert_eq!(pod.metadata.namespace, Some("kube-system".to_string())); + + let spec = pod.spec.as_ref().expect("Pod spec should be present"); + assert_eq!(spec.node_name, Some("node-123".to_string())); + assert_eq!(spec.restart_policy, Some("Never".to_string())); + assert_eq!(spec.host_pid, Some(false)); + assert_eq!(spec.host_network, Some(false)); + + assert_eq!(spec.containers.len(), 1); + let container = &spec.containers[0]; + assert_eq!(container.name, "debug-container"); + assert_eq!(container.image, Some(PRIVILEGED_POD_IMAGE.to_string())); + assert_eq!( + container.command, + Some(vec!["sleep".to_string(), "3600".to_string()]) + ); + + // Security context check + let sec_ctx = container + .security_context + .as_ref() + .expect("Security context missing"); + assert_eq!(sec_ctx.privileged, Some(true)); + } + + #[test] + fn test_build_privileged_pod_with_volumes_and_host_access() { + let (host_vol, host_mount) = host_root_volume(); + + let pod = build_privileged_pod( + PrivilegedPodConfig { + name: "full-pod".to_string(), + namespace: "default".to_string(), + node_name: "node-1".to_string(), + container_name: "runner".to_string(), + command: vec!["/bin/sh".to_string()], + volumes: vec![host_vol.clone()], + volume_mounts: vec![host_mount.clone()], + host_pid: true, + host_network: true, + }, + &KubernetesDistribution::Default, + ); + + let spec = pod.spec.as_ref().expect("Pod spec should be present"); + assert_eq!(spec.host_pid, Some(true)); + assert_eq!(spec.host_network, Some(true)); + + // Check volumes in Spec + let volumes = spec.volumes.as_ref().expect("Volumes should be present"); + assert_eq!(volumes.len(), 1); + assert_eq!(volumes[0].name, "host"); + + // Check mounts in Container + let container = &spec.containers[0]; + let mounts = container + .volume_mounts + .as_ref() + .expect("Mounts should be present"); + assert_eq!(mounts.len(), 1); + assert_eq!(mounts[0].name, "host"); + assert_eq!(mounts[0].mount_path, "/host"); + } + + #[test] + fn test_build_privileged_pod_structure_correctness() { + // This test validates that the construction logic puts things in the right places + // effectively validating the "template". 
+ + let custom_vol = Volume { + name: "custom-vol".to_string(), + ..Default::default() + }; + let custom_mount = VolumeMount { + name: "custom-vol".to_string(), + mount_path: "/custom".to_string(), + ..Default::default() + }; + + let pod = build_privileged_pod( + PrivilegedPodConfig { + name: "structure-test".to_string(), + namespace: "test-ns".to_string(), + node_name: "test-node".to_string(), + container_name: "test-container".to_string(), + command: vec!["cmd".to_string()], + volumes: vec![custom_vol], + volume_mounts: vec![custom_mount], + ..Default::default() + }, + &KubernetesDistribution::Default, + ); + + // Validate structure depth + let spec = pod.spec.as_ref().unwrap(); + + // 1. Spec level fields + assert!(spec.node_name.is_some()); + assert!(spec.volumes.is_some()); + + // 2. Container level fields + let container = &spec.containers[0]; + assert!(container.security_context.is_some()); + assert!(container.volume_mounts.is_some()); + + // 3. Nested fields + assert!( + container + .security_context + .as_ref() + .unwrap() + .privileged + .unwrap() + ); + assert_eq!(spec.volumes.as_ref().unwrap()[0].name, "custom-vol"); + assert_eq!( + container.volume_mounts.as_ref().unwrap()[0].mount_path, + "/custom" + ); + } + + #[test] + fn test_build_privileged_bundle_default_distribution() { + let bundle = build_privileged_bundle( + PrivilegedPodConfig { + name: "test-bundle".to_string(), + namespace: "test-ns".to_string(), + node_name: "node-1".to_string(), + container_name: "test-container".to_string(), + command: vec!["echo".to_string(), "hello".to_string()], + ..Default::default() + }, + &KubernetesDistribution::Default, + ); + + // For Default distribution, only the Pod should be in the bundle + assert_eq!(bundle.resources.len(), 1); + + let pod_obj = &bundle.resources[0]; + assert_eq!(pod_obj.metadata.name.as_deref(), Some("test-bundle")); + assert_eq!(pod_obj.metadata.namespace.as_deref(), Some("test-ns")); + } + + #[test] + fn test_build_privileged_bundle_openshift_distribution() { + let bundle = build_privileged_bundle( + PrivilegedPodConfig { + name: "test-bundle-ocp".to_string(), + namespace: "test-ns".to_string(), + node_name: "node-1".to_string(), + container_name: "test-container".to_string(), + command: vec!["echo".to_string(), "hello".to_string()], + ..Default::default() + }, + &KubernetesDistribution::OpenshiftFamily, + ); + + // For OpenShift, both ClusterRoleBinding and Pod should be in the bundle + assert_eq!(bundle.resources.len(), 2); + + // First resource should be the ClusterRoleBinding + let crb_obj = &bundle.resources[0]; + assert_eq!( + crb_obj.metadata.name.as_deref(), + Some("test-bundle-ocp-scc-binding") + ); + + // Verify it's targeting the privileged SCC + if let Some(role_ref) = crb_obj.data.get("roleRef") { + assert_eq!( + role_ref.get("name").and_then(|v| v.as_str()), + Some("system:openshift:scc:privileged") + ); + } + + // Second resource should be the Pod + let pod_obj = &bundle.resources[1]; + assert_eq!(pod_obj.metadata.name.as_deref(), Some("test-bundle-ocp")); + assert_eq!(pod_obj.metadata.namespace.as_deref(), Some("test-ns")); + } + + #[test] + fn test_build_privileged_bundle_k3s_distribution() { + let bundle = build_privileged_bundle( + PrivilegedPodConfig { + name: "test-bundle-k3s".to_string(), + namespace: "test-ns".to_string(), + node_name: "node-1".to_string(), + container_name: "test-container".to_string(), + command: vec!["echo".to_string(), "hello".to_string()], + ..Default::default() + }, + &KubernetesDistribution::K3sFamily, + ); + + // 
For K3s, only the Pod should be in the bundle (no special SCC) + assert_eq!(bundle.resources.len(), 1); + + let pod_obj = &bundle.resources[0]; + assert_eq!(pod_obj.metadata.name.as_deref(), Some("test-bundle-k3s")); + } + + #[test] + fn test_pod_yaml_rendering_expected() { + let pod = build_privileged_pod( + PrivilegedPodConfig { + name: "pod_name".to_string(), + namespace: "pod_namespace".to_string(), + node_name: "node name".to_string(), + container_name: "container name".to_string(), + command: vec!["command".to_string(), "argument".to_string()], + host_pid: true, + host_network: true, + ..Default::default() + }, + &KubernetesDistribution::Default, + ); + + assert_eq!( + &serde_yaml::to_string(&pod).unwrap(), + "apiVersion: v1 +kind: Pod +metadata: + name: pod_name + namespace: pod_namespace +spec: + containers: + - command: + - command + - argument + image: hub.nationtech.io/redhat/ubi10:latest + name: container name + securityContext: + privileged: true + volumeMounts: [] + hostNetwork: true + hostPID: true + nodeName: node name + restartPolicy: Never + volumes: [] +" + ); + } + + #[test] + fn test_pod_yaml_rendering_openshift() { + let pod = build_privileged_pod( + PrivilegedPodConfig { + name: "pod_name".to_string(), + namespace: "pod_namespace".to_string(), + node_name: "node name".to_string(), + container_name: "container name".to_string(), + command: vec!["command".to_string(), "argument".to_string()], + host_pid: true, + host_network: true, + ..Default::default() + }, + &KubernetesDistribution::OpenshiftFamily, + ); + + assert_eq!( + &serde_yaml::to_string(&pod).unwrap(), + "apiVersion: v1 +kind: Pod +metadata: + annotations: + openshift.io/required-scc: privileged + openshift.io/scc: privileged + name: pod_name + namespace: pod_namespace +spec: + containers: + - command: + - command + - argument + image: hub.nationtech.io/redhat/ubi10:latest + name: container name + securityContext: + privileged: true + volumeMounts: [] + hostNetwork: true + hostPID: true + nodeName: node name + restartPolicy: Never + volumes: [] +" + ); + } +} diff --git a/harmony/src/domain/topology/k8s/mod.rs b/harmony/src/domain/topology/k8s/mod.rs new file mode 100644 index 00000000..b12a9ed5 --- /dev/null +++ b/harmony/src/domain/topology/k8s/mod.rs @@ -0,0 +1,2586 @@ +pub mod bundle; +pub mod config; +pub mod helper; + +use std::{ + collections::{BTreeMap, HashMap}, + sync::Arc, + time::{Duration, SystemTime, UNIX_EPOCH}, +}; + +use k8s_openapi::{ + ClusterResourceScope, NamespaceResourceScope, + api::{ + apps::v1::Deployment, + core::v1::{ + ConfigMap, ConfigMapVolumeSource, Node, Pod, ServiceAccount, Volume, VolumeMount, + }, + }, + apiextensions_apiserver::pkg::apis::apiextensions::v1::CustomResourceDefinition, + apimachinery::pkg::{apis::meta::v1::ObjectMeta, version::Info}, +}; +use kube::{ + Client, Config, Discovery, Error, Resource, + api::{ + Api, AttachParams, DeleteParams, EvictParams, ListParams, ObjectList, Patch, PatchParams, + PostParams, ResourceExt, + }, + config::{KubeConfigOptions, Kubeconfig}, + core::ErrorResponse, + discovery::{ApiCapabilities, Scope}, + error::DiscoveryError, + runtime::reflector::Lookup, +}; +use kube::{api::DynamicObject, runtime::conditions}; +use kube::{ + api::{ApiResource, GroupVersionKind}, + runtime::wait::await_condition, +}; +use log::{debug, error, info, trace, warn}; +use serde::{Serialize, de::DeserializeOwned}; +use serde_json::{Value, json}; +use similar::TextDiff; +use tokio::{ + io::AsyncReadExt, + sync::{Mutex, OnceCell}, + time::sleep, +}; +use 
tokio_retry::{Retry, strategy::ExponentialBackoff};
+use url::Url;
+
+use crate::topology::{KubernetesDistribution, k8s::helper::PrivilegedPodConfig};
+
+#[derive(Clone)]
+pub struct K8sClient {
+    client: Client,
+    k8s_distribution: Arc<OnceCell<KubernetesDistribution>>,
+    discovery: Arc<OnceCell<Discovery>>,
+}
+
+impl Serialize for K8sClient {
+    fn serialize<S>(&self, _serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        todo!()
+    }
+}
+
+impl std::fmt::Debug for K8sClient {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        // This is a poor man's Debug implementation for now, as kube::Client does not expose much
+        // useful information
+        f.write_fmt(format_args!(
+            "K8sClient {{ kube client using default namespace {} }}",
+            self.client.default_namespace()
+        ))
+    }
+}
+
+/// A file to be written to a node's filesystem.
+#[derive(Debug, Clone)]
+pub struct NodeFile {
+    /// The absolute path on the host where the file should be written.
+    pub path: String,
+    /// The content of the file.
+    pub content: String,
+    /// The file permissions (e.g. 0o600).
+    pub mode: u32,
+}
+
+/// Options controlling the behavior of a [`K8sClient::drain_node`] operation.
+#[derive(Debug, Clone)]
+pub struct DrainOptions {
+    /// If `true`, pods that use `emptyDir` volumes will be evicted (their
+    /// ephemeral data is lost). Equivalent to `kubectl drain
+    /// --delete-emptydir-data`.
+    pub delete_emptydir_data: bool,
+    /// If `true`, DaemonSet-managed pods are silently skipped instead of
+    /// blocking the drain. Equivalent to `kubectl drain --ignore-daemonsets`.
+    pub ignore_daemonsets: bool,
+    /// Maximum wall-clock time to wait for all evictions to complete before
+    /// returning an error.
+    pub timeout: Duration,
+}
+
+impl Default for DrainOptions {
+    fn default() -> Self {
+        Self {
+            delete_emptydir_data: false,
+            ignore_daemonsets: true,
+            // TODO sane timeout
+            timeout: Duration::from_secs(1),
+        }
+    }
+}
+
+impl DrainOptions {
+    pub fn default_ignore_daemonset_delete_emptydir_data() -> DrainOptions {
+        let mut drain_opts = DrainOptions::default();
+        drain_opts.delete_emptydir_data = true;
+        drain_opts.ignore_daemonsets = true;
+        drain_opts
+    }
+}
+
+impl K8sClient {
+    pub fn new(client: Client) -> Self {
+        Self {
+            client,
+            k8s_distribution: Arc::new(OnceCell::new()),
+            discovery: Arc::new(OnceCell::new()),
+        }
+    }
+
+    pub async fn try_default() -> Result<Self, Error> {
+        let client = Self {
+            client: Client::try_default().await?,
+            k8s_distribution: Arc::new(OnceCell::new()),
+            discovery: Arc::new(OnceCell::new()),
+        };
+
+        Ok(client)
+    }
+
+    /// Returns true if any deployment in the given namespace matching the label selector
+    /// has status.availableReplicas > 0 (or condition Available=True).
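+    ///
+    /// Example (a minimal sketch; the namespace and label selector are
+    /// illustrative values, not names defined by this crate):
+    ///
+    /// ```rust,no_run
+    /// # use harmony::topology::k8s::K8sClient;
+    /// # async fn example(client: K8sClient) {
+    /// let healthy = client
+    ///     .has_healthy_deployment_with_label("argocd", "app.kubernetes.io/part-of=argocd")
+    ///     .await
+    ///     .unwrap();
+    /// println!("healthy: {healthy}");
+    /// # }
+    /// ```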
+ pub async fn has_healthy_deployment_with_label( + &self, + namespace: &str, + label_selector: &str, + ) -> Result { + let api: Api = Api::namespaced(self.client.clone(), namespace); + let lp = ListParams::default().labels(label_selector); + let list = api.list(&lp).await?; + for d in list.items { + // Check AvailableReplicas > 0 or Available condition + let available = d + .status + .as_ref() + .and_then(|s| s.available_replicas) + .unwrap_or(0); + if available > 0 { + return Ok(true); + } + // Fallback: scan conditions + if let Some(conds) = d.status.as_ref().and_then(|s| s.conditions.as_ref()) { + if conds + .iter() + .any(|c| c.type_ == "Available" && c.status == "True") + { + return Ok(true); + } + } + } + Ok(false) + } + + /// Cluster-wide: returns namespaces that have at least one healthy deployment + /// matching the label selector (equivalent to kubectl -A -l ...). + pub async fn list_namespaces_with_healthy_deployments( + &self, + label_selector: &str, + ) -> Result, Error> { + let api: Api = Api::all(self.client.clone()); + let lp = ListParams::default().labels(label_selector); + let list = api.list(&lp).await?; + + let mut healthy_ns: HashMap = HashMap::new(); + for d in list.items { + let ns = match d.metadata.namespace.clone() { + Some(n) => n, + None => continue, + }; + let available = d + .status + .as_ref() + .and_then(|s| s.available_replicas) + .unwrap_or(0); + let is_healthy = if available > 0 { + true + } else { + d.status + .as_ref() + .and_then(|s| s.conditions.as_ref()) + .map(|conds| { + conds + .iter() + .any(|c| c.type_ == "Available" && c.status == "True") + }) + .unwrap_or(false) + }; + if is_healthy { + healthy_ns.insert(ns, true); + } + } + + Ok(healthy_ns.into_keys().collect()) + } + + /// Get the application-controller ServiceAccount name (fallback to default) + pub async fn get_controller_service_account_name( + &self, + ns: &str, + ) -> Result, Error> { + let api: Api = Api::namespaced(self.client.clone(), ns); + let lp = ListParams::default().labels("app.kubernetes.io/component=controller"); + let list = api.list(&lp).await?; + if let Some(dep) = list.items.get(0) { + if let Some(sa) = dep + .spec + .as_ref() + .and_then(|ds| ds.template.spec.as_ref()) + .and_then(|ps| ps.service_account_name.clone()) + { + return Ok(Some(sa)); + } + } + Ok(None) + } + + // List ClusterRoleBindings dynamically and return as JSON values + pub async fn list_clusterrolebindings_json(&self) -> Result, Error> { + let gvk = kube::api::GroupVersionKind::gvk( + "rbac.authorization.k8s.io", + "v1", + "ClusterRoleBinding", + ); + let ar = kube::api::ApiResource::from_gvk(&gvk); + let api: Api = Api::all_with(self.client.clone(), &ar); + let crbs = api.list(&ListParams::default()).await?; + let mut out = Vec::new(); + for o in crbs { + let v = serde_json::to_value(&o).unwrap_or(Value::Null); + out.push(v); + } + Ok(out) + } + + /// Determine if Argo controller in ns has cluster-wide permissions via CRBs + // TODO This does not belong in the generic k8s client, should be refactored at some point + pub async fn is_service_account_cluster_wide(&self, sa: &str, ns: &str) -> Result { + let crbs = self.list_clusterrolebindings_json().await?; + let sa_user = format!("system:serviceaccount:{}:{}", ns, sa); + for crb in crbs { + if let Some(subjects) = crb.get("subjects").and_then(|s| s.as_array()) { + for subj in subjects { + let kind = subj.get("kind").and_then(|v| v.as_str()).unwrap_or(""); + let name = subj.get("name").and_then(|v| v.as_str()).unwrap_or(""); + let subj_ns = 
subj.get("namespace").and_then(|v| v.as_str()).unwrap_or(""); + if (kind == "ServiceAccount" && name == sa && subj_ns == ns) + || (kind == "User" && name == sa_user) + { + return Ok(true); + } + } + } + } + Ok(false) + } + + pub async fn has_crd(&self, name: &str) -> Result { + let api: Api = Api::all(self.client.clone()); + let lp = ListParams::default().fields(&format!("metadata.name={}", name)); + let crds = api.list(&lp).await?; + Ok(!crds.items.is_empty()) + } + + pub async fn service_account_api(&self, namespace: &str) -> Api { + let api: Api = Api::namespaced(self.client.clone(), namespace); + api + } + + pub async fn get_apiserver_version(&self) -> Result { + let client: Client = self.client.clone(); + let version_info: Info = client.apiserver_version().await?; + Ok(version_info) + } + + pub async fn discovery(&self) -> Result<&Discovery, Error> { + // Retry with exponential backoff in case of API server load + let retry_strategy = ExponentialBackoff::from_millis(1000) + .max_delay(Duration::from_secs(32)) + .take(6); + + let attempt = Mutex::new(0); + Retry::spawn(retry_strategy, || async { + let mut alock = attempt.lock().await; + *alock += 1; + match self + .discovery + .get_or_try_init(async || { + debug!("Running Kubernetes API discovery (attempt {})", *alock); + let discovery = Discovery::new(self.client.clone()).run().await?; + debug!("Kubernetes API discovery completed"); + Ok(discovery) + }) + .await + { + Ok(discovery) => Ok(discovery), + Err(e) => { + warn!( + "Kubernetes API discovery failed (attempt {}): {}", + *alock, e + ); + Err(e) + } + } + }) + .await + .map_err(|e| { + error!("Kubernetes API discovery failed after all retries: {}", e); + e + }) + } + + pub async fn get_resource_json_value( + &self, + name: &str, + namespace: Option<&str>, + gvk: &GroupVersionKind, + ) -> Result { + let gvk = ApiResource::from_gvk(gvk); + let resource: Api = if let Some(ns) = namespace { + Api::namespaced_with(self.client.clone(), ns, &gvk) + } else { + Api::default_namespaced_with(self.client.clone(), &gvk) + }; + + resource.get(name).await + } + + pub async fn get_secret_json_value( + &self, + name: &str, + namespace: Option<&str>, + ) -> Result { + self.get_resource_json_value( + name, + namespace, + &GroupVersionKind { + group: "".to_string(), + version: "v1".to_string(), + kind: "Secret".to_string(), + }, + ) + .await + } + + pub async fn get_deployment( + &self, + name: &str, + namespace: Option<&str>, + ) -> Result, Error> { + let deps: Api = if let Some(ns) = namespace { + debug!("getting namespaced deployment"); + Api::namespaced(self.client.clone(), ns) + } else { + debug!("getting default namespace deployment"); + Api::default_namespaced(self.client.clone()) + }; + + debug!("getting deployment {} in ns {}", name, namespace.unwrap()); + deps.get_opt(name).await + } + + pub async fn get_pod(&self, name: &str, namespace: Option<&str>) -> Result, Error> { + let pods: Api = if let Some(ns) = namespace { + Api::namespaced(self.client.clone(), ns) + } else { + Api::default_namespaced(self.client.clone()) + }; + + pods.get_opt(name).await + } + + pub async fn scale_deployment( + &self, + name: &str, + namespace: Option<&str>, + replicas: u32, + ) -> Result<(), Error> { + let deployments: Api = if let Some(ns) = namespace { + Api::namespaced(self.client.clone(), ns) + } else { + Api::default_namespaced(self.client.clone()) + }; + + let patch = json!({ + "spec": { + "replicas": replicas + } + }); + let pp = PatchParams::default(); + let scale = Patch::Merge(&patch); + 
deployments.patch_scale(name, &pp, &scale).await?; + Ok(()) + } + + pub async fn delete_deployment( + &self, + name: &str, + namespace: Option<&str>, + ) -> Result<(), Error> { + let deployments: Api = if let Some(ns) = namespace { + Api::namespaced(self.client.clone(), ns) + } else { + Api::default_namespaced(self.client.clone()) + }; + let delete_params = DeleteParams::default(); + deployments.delete(name, &delete_params).await?; + Ok(()) + } + + pub async fn wait_until_deployment_ready( + &self, + name: &str, + namespace: Option<&str>, + timeout: Option, + ) -> Result<(), String> { + let api: Api; + + if let Some(ns) = namespace { + api = Api::namespaced(self.client.clone(), ns); + } else { + api = Api::default_namespaced(self.client.clone()); + } + + let establish = await_condition(api, name, conditions::is_deployment_completed()); + let timeout = timeout.unwrap_or(Duration::from_secs(120)); + let res = tokio::time::timeout(timeout, establish).await; + + if res.is_ok() { + Ok(()) + } else { + Err("timed out while waiting for deployment".to_string()) + } + } + + pub async fn wait_for_pod_ready( + &self, + pod_name: &str, + namespace: Option<&str>, + ) -> Result<(), Error> { + let mut elapsed = 0; + let interval = 5; // seconds between checks + let timeout_secs = 120; + loop { + let pod = self.get_pod(pod_name, namespace).await?; + + if let Some(p) = pod { + if let Some(status) = p.status { + if let Some(phase) = status.phase { + if phase.to_lowercase() == "running" { + return Ok(()); + } + } + } + } + + if elapsed >= timeout_secs { + return Err(Error::Discovery(DiscoveryError::MissingResource(format!( + "'{}' in ns '{}' did not become ready within {}s", + pod_name, + namespace.unwrap(), + timeout_secs + )))); + } + + sleep(Duration::from_secs(interval)).await; + elapsed += interval; + } + } + + /// Will execute a commond in the first pod found that matches the specified label + /// '{label}={name}' + pub async fn exec_app_capture_output( + &self, + name: String, + label: String, + namespace: Option<&str>, + command: Vec<&str>, + ) -> Result { + let api: Api; + + if let Some(ns) = namespace { + api = Api::namespaced(self.client.clone(), ns); + } else { + api = Api::default_namespaced(self.client.clone()); + } + let pod_list = api + .list(&ListParams::default().labels(format!("{label}={name}").as_str())) + .await + .expect("couldn't get list of pods"); + + let res = api + .exec( + pod_list + .items + .first() + .expect("couldn't get pod") + .name() + .expect("couldn't get pod name") + .into_owned() + .as_str(), + command, + &AttachParams::default().stdout(true).stderr(true), + ) + .await; + match res { + Err(e) => Err(e.to_string()), + Ok(mut process) => { + let status = process + .take_status() + .expect("Couldn't get status") + .await + .expect("Couldn't unwrap status"); + + if let Some(s) = status.status { + let mut stdout_buf = String::new(); + if let Some(mut stdout) = process.stdout() { + stdout + .read_to_string(&mut stdout_buf) + .await + .map_err(|e| format!("Failed to get status stdout {e}"))?; + } + debug!("Status: {} - {:?}", s, status.details); + if s == "Success" { + Ok(stdout_buf) + } else { + Err(s) + } + } else { + Err("Couldn't get inner status of pod exec".to_string()) + } + } + } + } + + /// Will execute a command in the first pod found that matches the label `app.kubernetes.io/name={name}` + pub async fn exec_app( + &self, + name: String, + namespace: Option<&str>, + command: Vec<&str>, + ) -> Result<(), String> { + let api: Api; + + if let Some(ns) = namespace { + 
api = Api::namespaced(self.client.clone(), ns); + } else { + api = Api::default_namespaced(self.client.clone()); + } + let pod_list = api + .list(&ListParams::default().labels(format!("app.kubernetes.io/name={name}").as_str())) + .await + .expect("couldn't get list of pods"); + + let res = api + .exec( + pod_list + .items + .first() + .expect("couldn't get pod") + .name() + .expect("couldn't get pod name") + .into_owned() + .as_str(), + command, + &AttachParams::default(), + ) + .await; + + match res { + Err(e) => Err(e.to_string()), + Ok(mut process) => { + let status = process + .take_status() + .expect("Couldn't get status") + .await + .expect("Couldn't unwrap status"); + + if let Some(s) = status.status { + debug!("Status: {} - {:?}", s, status.details); + if s == "Success" { Ok(()) } else { Err(s) } + } else { + Err("Couldn't get inner status of pod exec".to_string()) + } + } + } + } + + pub(crate) fn get_api_for_dynamic_object( + &self, + object: &DynamicObject, + ns: Option<&str>, + ) -> Result, Error> { + let api_resource = object + .types + .as_ref() + .and_then(|t| { + let parts: Vec<&str> = t.api_version.split('/').collect(); + match parts.as_slice() { + [version] => Some(ApiResource::from_gvk(&GroupVersionKind::gvk( + "", version, &t.kind, + ))), + [group, version] => Some(ApiResource::from_gvk(&GroupVersionKind::gvk( + group, version, &t.kind, + ))), + _ => None, + } + }) + .ok_or_else(|| { + Error::BuildRequest(kube::core::request::Error::Validation( + "Invalid apiVersion in DynamicObject {object:#?}".to_string(), + )) + })?; + + match ns { + Some(ns) => Ok(Api::namespaced_with(self.client.clone(), ns, &api_resource)), + None => Ok(Api::default_namespaced_with( + self.client.clone(), + &api_resource, + )), + } + } + + pub async fn apply_dynamic_many( + &self, + resource: &[DynamicObject], + namespace: Option<&str>, + force_conflicts: bool, + ) -> Result, Error> { + let mut result = Vec::new(); + for r in resource.iter() { + result.push(self.apply_dynamic(r, namespace, force_conflicts).await?); + } + + Ok(result) + } + + /// Apply DynamicObject resource to the cluster + pub async fn apply_dynamic( + &self, + resource: &DynamicObject, + namespace: Option<&str>, + force_conflicts: bool, + ) -> Result { + // Use discovery to determine the correct API scope + trace!( + "Apply dynamic resource {resource:#?} \n namespace :{namespace:?} force_conflicts {force_conflicts}" + ); + let discovery = self.discovery().await?; + + let type_meta = resource.types.as_ref().ok_or_else(|| { + Error::BuildRequest(kube::core::request::Error::Validation( + "DynamicObject must have types (apiVersion and kind)".to_string(), + )) + })?; + + let gvk = GroupVersionKind::try_from(type_meta).map_err(|_| { + Error::BuildRequest(kube::core::request::Error::Validation(format!( + "Invalid GroupVersionKind in DynamicObject: {:?}", + type_meta + ))) + })?; + + let (ar, caps) = discovery.resolve_gvk(&gvk).ok_or_else(|| { + Error::Discovery(DiscoveryError::MissingResource(format!( + "Cannot resolve GVK: {:?}", + gvk + ))) + })?; + + // Determine namespace based on resource scope + let effective_namespace = if caps.scope == Scope::Cluster { + None + } else { + namespace.or_else(|| resource.metadata.namespace.as_deref()) + }; + + trace!( + "Discovered information ar {ar:?}, caps {caps:?}, effective_namespace {effective_namespace:?}" + ); + + // Build API using discovered resource and capabilities + let api = get_dynamic_api(ar, caps, self.client.clone(), effective_namespace, false); + let name = resource + .metadata 
+ .name + .as_ref() + .ok_or_else(|| { + Error::BuildRequest(kube::core::request::Error::Validation( + "DynamicObject must have metadata.name".to_string(), + )) + })? + .as_str(); + + debug!( + "Applying dynamic resource kind={:?} apiVersion={:?} name='{}' ns={:?}", + resource.types.as_ref().map(|t| &t.kind), + resource.types.as_ref().map(|t| &t.api_version), + name, + namespace + ); + trace!( + "Dynamic resource payload:\n{:#}", + serde_json::to_value(resource).unwrap_or(serde_json::Value::Null) + ); + + // Using same field manager as in apply() + let mut patch_params = PatchParams::apply("harmony"); + patch_params.force = force_conflicts; + + if *crate::config::DRY_RUN { + // Dry-run path: fetch current, show diff, and return appropriate object + match api.get(name).await { + Ok(current) => { + trace!("Received current dynamic value {current:#?}"); + + println!("\nPerforming dry-run for resource: '{}'", name); + + // Serialize current and new, and strip status from current if present + let mut current_yaml = + serde_yaml::to_value(¤t).unwrap_or_else(|_| serde_yaml::Value::Null); + if let Some(map) = current_yaml.as_mapping_mut() { + if map.contains_key(&serde_yaml::Value::String("status".to_string())) { + let removed = + map.remove(&serde_yaml::Value::String("status".to_string())); + trace!("Removed status from current dynamic object: {:?}", removed); + } else { + trace!( + "Did not find status entry for current dynamic object {}/{}", + current.metadata.namespace.as_deref().unwrap_or(""), + current.metadata.name.as_deref().unwrap_or("") + ); + } + } + + let current_yaml = serde_yaml::to_string(¤t_yaml) + .unwrap_or_else(|_| "Failed to serialize current resource".to_string()); + let new_yaml = serde_yaml::to_string(resource) + .unwrap_or_else(|_| "Failed to serialize new resource".to_string()); + + if current_yaml == new_yaml { + println!("No changes detected."); + return Ok(current); + } + + println!("Changes detected:"); + let diff = TextDiff::from_lines(¤t_yaml, &new_yaml); + for change in diff.iter_all_changes() { + let sign = match change.tag() { + similar::ChangeTag::Delete => "-", + similar::ChangeTag::Insert => "+", + similar::ChangeTag::Equal => " ", + }; + print!("{}{}", sign, change); + } + + // Return the incoming resource as the would-be applied state + Ok(resource.clone()) + } + Err(Error::Api(ErrorResponse { code: 404, .. })) => { + println!("\nPerforming dry-run for new resource: '{}'", name); + println!( + "Resource does not exist. It would be created with the following content:" + ); + let new_yaml = serde_yaml::to_string(resource) + .unwrap_or_else(|_| "Failed to serialize new resource".to_string()); + for line in new_yaml.lines() { + println!("+{}", line); + } + Ok(resource.clone()) + } + Err(e) => { + error!("Failed to get dynamic resource '{}': {}", name, e); + Err(e) + } + } + } else { + // Real apply via server-side apply + // Server-side apply works for both create and update operations + debug!("Applying (server-side apply) dynamic resource '{}'", name); + match api + .patch(name, &patch_params, &Patch::Apply(resource)) + .await + { + Ok(obj) => Ok(obj), + Err(Error::Api(ErrorResponse { code: 404, .. 
})) => { + // Resource doesn't exist, server-side apply should create it + // This can happen with some API servers, so we explicitly create + debug!("Resource '{}' not found, creating via POST", name); + trace!("{resource:#?}"); + api.create(&PostParams::default(), resource) + .await + .map_err(|e| { + error!("Failed to create dynamic resource '{}': {}", name, e); + e + }) + } + Err(e) => { + error!("Failed to apply dynamic resource '{}': {}", name, e); + Err(e) + } + } + } + } + + /// Apply a resource in namespace + /// + /// See `kubectl apply` for more information on the expected behavior of this function + pub async fn apply(&self, resource: &K, namespace: Option<&str>) -> Result + where + K: Resource + Clone + std::fmt::Debug + DeserializeOwned + serde::Serialize, + ::Scope: ApplyStrategy, + ::DynamicType: Default, + { + debug!( + "Applying resource {:?} with ns {:?}", + resource.meta().name, + namespace + ); + trace!( + "{:#}", + serde_json::to_value(resource).unwrap_or(serde_json::Value::Null) + ); + + let api: Api = + <::Scope as ApplyStrategy>::get_api(&self.client, namespace); + // api.create(&PostParams::default(), &resource).await + let patch_params = PatchParams::apply("harmony"); + let name = resource + .meta() + .name + .as_ref() + .expect("K8s Resource should have a name"); + + if *crate::config::DRY_RUN { + match api.get(name).await { + Ok(current) => { + trace!("Received current value {current:#?}"); + // The resource exists, so we calculate and display a diff. + println!("\nPerforming dry-run for resource: '{name}'"); + let mut current_yaml = serde_yaml::to_value(¤t).unwrap_or_else(|_| { + panic!("Could not serialize current value : {current:#?}") + }); + if current_yaml.is_mapping() && current_yaml.get("status").is_some() { + let map = current_yaml.as_mapping_mut().unwrap(); + let removed = map.remove_entry("status"); + trace!("Removed status {removed:?}"); + } else { + trace!( + "Did not find status entry for current object {}/{}", + current.meta().namespace.as_ref().unwrap_or(&"".to_string()), + current.meta().name.as_ref().unwrap_or(&"".to_string()) + ); + } + let current_yaml = serde_yaml::to_string(¤t_yaml) + .unwrap_or_else(|_| "Failed to serialize current resource".to_string()); + let new_yaml = serde_yaml::to_string(resource) + .unwrap_or_else(|_| "Failed to serialize new resource".to_string()); + + if current_yaml == new_yaml { + println!("No changes detected."); + // Return the current resource state as there are no changes. + return Ok(current); + } + + println!("Changes detected:"); + let diff = TextDiff::from_lines(¤t_yaml, &new_yaml); + + // Iterate over the changes and print them in a git-like diff format. + for change in diff.iter_all_changes() { + let sign = match change.tag() { + similar::ChangeTag::Delete => "-", + similar::ChangeTag::Insert => "+", + similar::ChangeTag::Equal => " ", + }; + print!("{sign}{change}"); + } + // In a dry run, we return the new resource state that would have been applied. + Ok(resource.clone()) + } + Err(Error::Api(ErrorResponse { code: 404, .. })) => { + // The resource does not exist, so the "diff" is the entire new resource. + println!("\nPerforming dry-run for new resource: '{name}'"); + println!( + "Resource does not exist. It would be created with the following content:" + ); + let new_yaml = serde_yaml::to_string(resource) + .unwrap_or_else(|_| "Failed to serialize new resource".to_string()); + + // Print each line of the new resource with a '+' prefix. 
+ for line in new_yaml.lines() { + println!("+{line}"); + } + // In a dry run, we return the new resource state that would have been created. + Ok(resource.clone()) + } + Err(e) => { + // Another API error occurred. + error!("Failed to get resource '{name}': {e}"); + Err(e) + } + } + } else { + // Real apply via server-side apply + // Server-side apply works for both create and update operations + match api + .patch(name, &patch_params, &Patch::Apply(resource)) + .await + { + Ok(obj) => Ok(obj), + Err(Error::Api(ErrorResponse { code: 404, .. })) => { + // Resource doesn't exist, server-side apply should create it + // This can happen with some API servers, so we explicitly create + debug!("Resource '{}' not found, creating via POST", name); + api.create(&PostParams::default(), resource) + .await + .map_err(|e| { + error!("Failed to create resource '{}': {}", name, e); + e + }) + } + Err(e) => { + error!("Failed to apply resource '{}': {}", name, e); + Err(e) + } + } + } + } + + pub async fn apply_many(&self, resource: &[K], ns: Option<&str>) -> Result, Error> + where + K: Resource + Clone + std::fmt::Debug + DeserializeOwned + serde::Serialize, + ::Scope: ApplyStrategy, + ::DynamicType: Default, + { + let mut result = Vec::new(); + for r in resource.iter() { + let apply_result = self.apply(r, ns).await; + if apply_result.is_err() { + // NOTE : We should be careful about this one, it may leak sensitive information in + // logs + // Maybe just reducing it to debug would be enough as we already know debug logs + // are unsafe. + // But keeping it at warn makes it much easier to understand what is going on. So be it for now. + warn!( + "Failed to apply k8s resource : {}", + serde_json::to_string_pretty(r).map_err(|e| Error::SerdeError(e))? + ); + } + + result.push(apply_result?); + } + + Ok(result) + } + + pub async fn apply_yaml_many( + &self, + #[allow(clippy::ptr_arg)] yaml: &Vec, + ns: Option<&str>, + ) -> Result<(), Error> { + for y in yaml.iter() { + self.apply_yaml(y, ns).await?; + } + Ok(()) + } + + pub async fn apply_yaml( + &self, + yaml: &serde_yaml::Value, + ns: Option<&str>, + ) -> Result<(), Error> { + let obj: DynamicObject = serde_yaml::from_value(yaml.clone()).expect("TODO do not unwrap"); + let name = obj.metadata.name.as_ref().expect("YAML must have a name"); + + let api_version = yaml + .get("apiVersion") + .expect("couldn't get apiVersion from YAML") + .as_str() + .expect("couldn't get apiVersion as str"); + let kind = yaml + .get("kind") + .expect("couldn't get kind from YAML") + .as_str() + .expect("couldn't get kind as str"); + + let mut it = api_version.splitn(2, '/'); + let first = it.next().unwrap(); + let (g, v) = match it.next() { + Some(second) => (first, second), + None => ("", first), + }; + + let gvk = GroupVersionKind::gvk(g, v, kind); + let api_resource = ApiResource::from_gvk(&gvk); + + let namespace = match ns { + Some(n) => n, + None => obj + .metadata + .namespace + .as_ref() + .expect("YAML must have a namespace"), + }; + + // 5. Create a dynamic API client for this resource type. + let api: Api = + Api::namespaced_with(self.client.clone(), namespace, &api_resource); + + // 6. Apply the object to the cluster using Server-Side Apply. + // This will create the resource if it doesn't exist, or update it if it does. 
+ println!("Applying '{name}' in namespace '{namespace}'...",); + let patch_params = PatchParams::apply("harmony"); // Use a unique field manager name + let result = api.patch(name, &patch_params, &Patch::Apply(&obj)).await?; + + println!("Successfully applied '{}'.", result.name_any()); + + Ok(()) + } + + /// Apply a resource from a URL + /// + /// It is the equivalent of `kubectl apply -f ` + pub async fn apply_url(&self, url: Url, ns: Option<&str>) -> Result<(), Error> { + let patch_params = PatchParams::apply("harmony"); + let discovery = self.discovery().await?; + + let yaml = reqwest::get(url) + .await + .expect("Could not get URL") + .text() + .await + .expect("Could not get content from URL"); + + for doc in multidoc_deserialize(&yaml).expect("failed to parse YAML from file") { + let obj: DynamicObject = + serde_yaml::from_value(doc).expect("cannot apply without valid YAML"); + let namespace = obj.metadata.namespace.as_deref().or(ns); + let type_meta = obj + .types + .as_ref() + .expect("cannot apply object without valid TypeMeta"); + let gvk = GroupVersionKind::try_from(type_meta) + .expect("cannot apply object without valid GroupVersionKind"); + let name = obj.name_any(); + + if let Some((ar, caps)) = discovery.resolve_gvk(&gvk) { + let api = get_dynamic_api(ar, caps, self.client.clone(), namespace, false); + trace!( + "Applying {}: \n{}", + gvk.kind, + serde_yaml::to_string(&obj).expect("Failed to serialize YAML") + ); + let data: serde_json::Value = + serde_json::to_value(&obj).expect("Failed to serialize JSON"); + let _r = api.patch(&name, &patch_params, &Patch::Apply(data)).await?; + debug!("applied {} {}", gvk.kind, name); + } else { + warn!("Cannot apply document for unknown {gvk:?}"); + } + } + + Ok(()) + } + + /// Gets a single named resource of a specific type `K`. + /// + /// This function uses the `ApplyStrategy` trait to correctly determine + /// whether to look in a specific namespace or in the entire cluster. + /// + /// Returns `Ok(None)` if the resource is not found (404). + pub async fn get_resource( + &self, + name: &str, + namespace: Option<&str>, + ) -> Result, Error> + where + K: Resource + Clone + std::fmt::Debug + DeserializeOwned, + ::Scope: ApplyStrategy, + ::DynamicType: Default, + { + let api: Api = + <::Scope as ApplyStrategy>::get_api(&self.client, namespace); + + api.get_opt(name).await + } + + pub async fn list_all_resources_with_labels(&self, labels: &str) -> Result, Error> + where + K: Resource + Clone + std::fmt::Debug + DeserializeOwned, + ::DynamicType: Default, + { + let api: Api = Api::all(self.client.clone()); + + let lp = ListParams::default().labels(labels); + Ok(api.list(&lp).await?.items) + } + + pub async fn get_all_resource_in_all_namespace(&self) -> Result, Error> + where + K: Resource + Clone + std::fmt::Debug + DeserializeOwned, + ::Scope: ApplyStrategy, + ::DynamicType: Default, + { + let api: Api = Api::all(self.client.clone()); + Ok(api.list(&Default::default()).await?.items) + } + + /// Lists all resources of a specific type `K`. + /// + /// This function uses the `ApplyStrategy` trait to correctly determine + /// whether to list from a specific namespace or from the entire cluster. 
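+    ///
+    /// Example (a minimal sketch; the `kube-system` namespace is just an
+    /// illustrative choice):
+    ///
+    /// ```rust,no_run
+    /// # use harmony::topology::k8s::K8sClient;
+    /// # use k8s_openapi::api::core::v1::Pod;
+    /// # async fn example(client: K8sClient) {
+    /// let pods = client
+    ///     .list_resources::<Pod>(Some("kube-system"), None)
+    ///     .await
+    ///     .unwrap();
+    /// for pod in pods.items {
+    ///     println!("{:?}", pod.metadata.name);
+    /// }
+    /// # }
+    /// ```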
+ pub async fn list_resources( + &self, + namespace: Option<&str>, + list_params: Option, + ) -> Result, Error> + where + K: Resource + Clone + std::fmt::Debug + DeserializeOwned, + ::Scope: ApplyStrategy, + ::DynamicType: Default, + { + let api: Api = + <::Scope as ApplyStrategy>::get_api(&self.client, namespace); + + let list_params = list_params.unwrap_or_default(); + api.list(&list_params).await + } + + /// Fetches a list of all Nodes in the cluster. + pub async fn get_nodes( + &self, + list_params: Option, + ) -> Result, Error> { + self.list_resources(None, list_params).await + } + + pub async fn from_kubeconfig(path: &str) -> Option { + Self::from_kubeconfig_with_opts(path, &KubeConfigOptions::default()).await + } + + pub async fn from_kubeconfig_with_context( + path: &str, + context: Option, + ) -> Option { + let mut opts = KubeConfigOptions::default(); + opts.context = context; + + Self::from_kubeconfig_with_opts(path, &opts).await + } + + pub async fn from_kubeconfig_with_opts( + path: &str, + opts: &KubeConfigOptions, + ) -> Option { + let k = match Kubeconfig::read_from(path) { + Ok(k) => k, + Err(e) => { + error!("Failed to load kubeconfig from {path} : {e}"); + return None; + } + }; + + Some(K8sClient::new( + Client::try_from(Config::from_custom_kubeconfig(k, &opts).await.unwrap()).unwrap(), + )) + } + + pub async fn cordon_node(&self, node_name: &str) -> Result<(), Error> { + let api: Api = Api::all(self.client.clone()); + + api.cordon(node_name).await?; + Ok(()) + } + + pub async fn uncordon_node(&self, node_name: &str) -> Result<(), Error> { + let api: Api = Api::all(self.client.clone()); + + api.uncordon(node_name).await?; + + Ok(()) + } + + /// Lists every pod currently scheduled on `node_name`. + async fn list_pods_on_node(&self, node_name: &str) -> Result, Error> { + let api: Api = Api::all(self.client.clone()); + let lp = ListParams::default().fields(&format!("spec.nodeName={}", node_name)); + Ok(api.list(&lp).await?.items) + } + + /// Returns `true` when the pod is a *mirror pod* (a static manifest + /// managed directly by the kubelet). + fn is_mirror_pod(pod: &Pod) -> bool { + pod.metadata + .annotations + .as_ref() + .map(|a| a.contains_key("kubernetes.io/config.mirror")) + .unwrap_or(false) + } + + /// Returns `true` when the pod is owned by a `DaemonSet`. + fn is_daemonset_pod(pod: &Pod) -> bool { + pod.metadata + .owner_references + .as_ref() + .map(|refs| refs.iter().any(|r| r.kind == "DaemonSet")) + .unwrap_or(false) + } + + /// Returns `true` when the pod spec contains at least one `emptyDir` + /// volume. + fn has_emptydir_volume(pod: &Pod) -> bool { + pod.spec + .as_ref() + .and_then(|s| s.volumes.as_ref()) + .map(|vols| vols.iter().any(|v| v.empty_dir.is_some())) + .unwrap_or(false) + } + + /// Returns `true` when the pod has already terminated (`Succeeded` or + /// `Failed`). + fn is_completed_pod(pod: &Pod) -> bool { + pod.status + .as_ref() + .and_then(|s| s.phase.as_deref()) + .map(|phase| phase == "Succeeded" || phase == "Failed") + .unwrap_or(false) + } + + /// Partitions `pods` into *(evictable, skipped_descriptions)*. + /// + /// Returns `Err` with a human-readable message when one or more pods would + /// block the drain (e.g. a `DaemonSet` pod with `ignore_daemonsets = + /// false`). 
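+    ///
+    /// Classification rules (as implemented below, mirroring `kubectl drain`):
+    /// - mirror pods: skipped (kubelet-managed, cannot be evicted)
+    /// - completed pods (`Succeeded`/`Failed`): skipped
+    /// - DaemonSet pods: skipped when `ignore_daemonsets`, blocking otherwise
+    /// - pods with `emptyDir` volumes: blocking unless `delete_emptydir_data`
+    /// - everything else: evictable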
+ fn classify_pods_for_drain( + pods: &[Pod], + options: &DrainOptions, + ) -> Result<(Vec, Vec), String> { + let mut evictable: Vec = Vec::new(); + let mut skipped: Vec = Vec::new(); + let mut blocking: Vec = Vec::new(); + + for pod in pods { + let name = pod.metadata.name.as_deref().unwrap_or(""); + let ns = pod.metadata.namespace.as_deref().unwrap_or(""); + let qualified = format!("{}/{}", ns, name); + + // Mirror pods are managed by the kubelet — never evict. + if Self::is_mirror_pod(pod) { + skipped.push(format!("{} (mirror pod)", qualified)); + continue; + } + + // Already-terminated pods do not need eviction. + if Self::is_completed_pod(pod) { + skipped.push(format!("{} (completed)", qualified)); + continue; + } + + // DaemonSet pods: skip or block depending on options. + if Self::is_daemonset_pod(pod) { + if options.ignore_daemonsets { + skipped.push(format!("{} (DaemonSet-managed)", qualified)); + } else { + blocking.push(format!( + "{} is managed by a DaemonSet (set ignore_daemonsets to skip)", + qualified + )); + } + continue; + } + + // Pods with emptyDir data: block unless explicitly allowed. + if Self::has_emptydir_volume(pod) && !options.delete_emptydir_data { + blocking.push(format!( + "{} uses emptyDir volumes (set delete_emptydir_data to allow eviction)", + qualified + )); + continue; + } + + evictable.push(pod.clone()); + } + + if !blocking.is_empty() { + return Err(format!( + "Cannot drain node — the following pods block eviction:\n - {}", + blocking.join("\n - ") + )); + } + + Ok((evictable, skipped)) + } + + async fn wait_for_pod_completion(&self, name: &str, namespace: &str) -> Result { + let pod_api: Api = Api::namespaced(self.client.clone(), namespace); + let poll_interval = Duration::from_secs(2); + for _ in 0..60 { + // 2 minutes timeout + sleep(poll_interval).await; + let p = pod_api.get(name).await?; + if let Some(status) = p.status { + match status.phase.as_deref() { + Some("Succeeded") => { + // Capture pod logs as output + let logs = pod_api + .logs(name, &Default::default()) + .await + .unwrap_or_else(|_| String::new()); + + debug!("Retrieved pod {namespace}/{name} logs {logs}"); + + return Ok(logs); + } + Some("Failed") => { + let logs = pod_api + .logs(name, &Default::default()) + .await + .unwrap_or_else(|_| String::new()); + + debug!("Retrieved failed pod {namespace}/{name} logs {logs}"); + + return Err(Error::Discovery(DiscoveryError::MissingResource(format!( + "Pod {} failed. 
Logs:\n{}", + name, logs + )))); + } + _ => {} + } + } + } + Err(Error::Discovery(DiscoveryError::MissingResource(format!( + "Timed out waiting for pod {}", + name + )))) + } + + pub async fn get_k8s_distribution(&self) -> Result { + self.k8s_distribution + .get_or_try_init(async || { + debug!("Trying to detect k8s distribution"); + let api_groups = self.client.list_api_groups().await?; + trace!("list_api_groups {:?}", api_groups); + debug!("K8s discovery completed"); + + let version = self.get_apiserver_version().await?; + + // OpenShift / OKD + if api_groups + .groups + .iter() + .any(|g| g.name == "project.openshift.io") + { + info!("Found KubernetesDistribution OpenshiftFamily"); + return Ok(KubernetesDistribution::OpenshiftFamily); + } + + // K3d / K3s + if version.git_version.contains("k3s") { + info!("Found KubernetesDistribution K3sFamily"); + return Ok(KubernetesDistribution::K3sFamily); + } + + info!("Could not identify KubernetesDistribution, using Default"); + return Ok(KubernetesDistribution::Default); + }) + .await + .map(|k| k.clone()) + } + + /// Writes a set of files to a node's filesystem using a privileged ephemeral pod. + /// + /// This method creates a ConfigMap containing the file contents and a privileged Pod + /// that mounts the host filesystem. It then copies the files from the ConfigMap + /// to the specified paths on the host and sets the requested permissions. + /// + /// On OpenShift clusters, the required SCC binding is automatically created via + /// the ResourceBundle pattern. + /// + /// ## Use Case: Network Bonding Configuration (ADR-019) + /// + /// This method is designed to support operations like writing NetworkManager + /// configuration files to `/etc/NetworkManager/system-connections/` for + /// setting up LACP bonds on worker nodes, where interface names vary across + /// hardware. + /// + /// Files written via this method persist across reboots on Fedora CoreOS/SCOS. + /// + /// # Arguments + /// + /// * `node_name` - The name of the node to write files to + /// * `files` - A slice of [`NodeFile`] structs containing path, content, and permissions + /// + /// # Example + /// + /// ```rust,no_run + /// # use harmony::topology::k8s::{K8sClient, NodeFile}; + /// # async fn example(client: K8sClient) { + /// let bond_config = NodeFile { + /// path: "/etc/NetworkManager/system-connections/bond0.nmconnection".to_string(), + /// content: "[connection]\nid=bond0\n...".to_string(), + /// mode: 0o600, + /// }; + /// + /// client.write_files_to_node("worker-01", &[bond_config]).await.unwrap(); + /// # } + /// ``` + pub async fn write_files_to_node( + &self, + node_name: &str, + files: &[NodeFile], + ) -> Result { + let ns = self.client.default_namespace(); + let suffix = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_millis(); + let name = format!("harmony-writer-{}", suffix); + + debug!( + "Preparing to write {} files to node '{}'", + files.len(), + node_name + ); + + // 1. 
Prepare ConfigMap data & Script + let mut data = BTreeMap::new(); + let mut script = String::from("set -e\n"); + + for (i, file) in files.iter().enumerate() { + let key = format!("f{}", i); + data.insert(key.clone(), file.content.clone()); + + // Ensure parent dir exists + script.push_str(&format!("mkdir -p \"$(dirname \"/host{}\")\"\n", file.path)); + // Copy file + script.push_str(&format!("cp \"/payload/{}\" \"/host{}\"\n", key, file.path)); + // Chmod (format as octal) + script.push_str(&format!("chmod {:o} \"/host{}\"\n", file.mode, file.path)); + } + + let cm = ConfigMap { + metadata: ObjectMeta { + name: Some(name.clone()), + namespace: Some(ns.to_string()), + ..Default::default() + }, + data: Some(data), + ..Default::default() + }; + + let cm_api: Api = Api::namespaced(self.client.clone(), ns); + cm_api.create(&PostParams::default(), &cm).await?; + debug!("Created ConfigMap {}", name); + + // 2. Build resource bundle with Pod and RBAC + let (host_vol, host_mount) = helper::host_root_volume(); + + let payload_vol = Volume { + name: "payload".to_string(), + config_map: Some(ConfigMapVolumeSource { + name: name.clone(), + ..Default::default() + }), + ..Default::default() + }; + + let payload_mount = VolumeMount { + name: "payload".to_string(), + mount_path: "/payload".to_string(), + ..Default::default() + }; + + let bundle = helper::build_privileged_bundle( + PrivilegedPodConfig { + name: name.clone(), + namespace: ns.to_string(), + node_name: node_name.to_string(), + container_name: "writer".to_string(), + command: vec!["/bin/bash".to_string(), "-c".to_string(), script], + volumes: vec![payload_vol, host_vol], + volume_mounts: vec![payload_mount, host_mount], + host_pid: false, + host_network: false, + }, + &self.get_k8s_distribution().await?, + ); + + // 3. Apply bundle (RBAC + Pod) + bundle.apply(self).await?; + debug!("Created privileged pod bundle {}", name); + + // 4. Wait for completion + let result = self.wait_for_pod_completion(&name, ns).await; + + // 5. Cleanup + debug!("Cleaning up resources for {}", name); + let _ = bundle.delete(self).await; + let _ = cm_api.delete(&name, &DeleteParams::default()).await; + + result + } + + /// Runs a privileged command on a specific node using an ephemeral pod. + /// + /// This method creates a privileged pod with host PID and network namespaces + /// enabled, along with the host filesystem mounted at `/host`. The pod runs + /// the specified command and waits for completion. + /// + /// On OpenShift clusters, the required SCC binding is automatically created via + /// the ResourceBundle pattern. + /// + /// # Arguments + /// + /// * `node_name` - The name of the node to run the command on + /// * `command` - The shell command to execute (runs in `/bin/bash -c`) + /// + /// # Returns + /// + /// The stdout output from the command execution. 
+ /// + /// # Example + /// + /// ```rust,no_run + /// # use harmony::topology::k8s::K8sClient; + /// # async fn example(client: K8sClient) { + /// // Reload NetworkManager configuration after writing .nmconnection files + /// let output = client.run_privileged_command_on_node( + /// "worker-01", + /// "nmcli connection reload" + /// ).await.unwrap(); + /// # } + /// ``` + pub async fn run_privileged_command_on_node( + &self, + node_name: &str, + command: &str, + ) -> Result { + let namespace = self.client.default_namespace(); + let suffix = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_millis(); + let name = format!("harmony-cmd-{}", suffix); + + debug!( + "Running privileged command on node '{}': {}", + node_name, command + ); + + // Build resource bundle with Pod and RBAC + let (host_vol, host_mount) = helper::host_root_volume(); + trace!("Got host volume {host_vol:#?}"); + trace!("Got host volume mount {host_mount:#?}"); + let bundle = helper::build_privileged_bundle( + PrivilegedPodConfig { + name: name.clone(), + namespace: namespace.to_string(), + node_name: node_name.to_string(), + container_name: "runner".to_string(), + command: vec![ + "/bin/bash".to_string(), + "-c".to_string(), + command.to_string(), + ], + volumes: vec![host_vol], + volume_mounts: vec![host_mount], + host_pid: true, + host_network: true, + }, + &self.get_k8s_distribution().await?, + ); + + debug!("Built privileged bundle {bundle:#?}"); + debug!("Built privileged bundle for command : {command}"); + + // Apply bundle (RBAC + Pod) + bundle.apply(self).await?; + debug!("Created privileged pod bundle {}", name); + + // Wait for completion + let result = self.wait_for_pod_completion(&name, namespace).await; + + // Cleanup + debug!("Cleaning up resources for {}", name); + let _ = bundle.delete(self).await; + + result + } + + /// Reboots a Kubernetes node safely with proper drain/uncordon cycle. + /// + /// This method implements a robust node reboot procedure: + /// 1. Records the current boot ID from node status + /// 2. Drains the node (cordons + evicts all pods) + /// 3. Issues a delayed reboot command (fire-and-forget) + /// 4. Waits for the node to go NotReady (confirms shutdown started) + /// 5. Waits for the node to become Ready again + /// 6. Verifies the boot ID changed (confirms actual reboot occurred) + /// 7. Uncordons the node + /// + /// # Arguments + /// + /// * `node_name` - The name of the node to reboot + /// * `drain_options` - Options controlling pod eviction behavior + /// * `timeout` - Maximum time to wait for the entire reboot cycle + /// + /// # Example + /// + /// ```rust,no_run + /// # use harmony::topology::k8s::{K8sClient, DrainOptions}; + /// # use std::time::Duration; + /// # async fn example(client: K8sClient) { + /// client.reboot_node( + /// "worker-01", + /// &DrainOptions::default_ignore_daemonset_delete_emptydir_data(), + /// Duration::from_secs(3600) // 1 hour timeout + /// ).await.unwrap(); + /// # } + /// ``` + pub async fn reboot_node( + &self, + node_name: &str, + drain_options: &DrainOptions, + timeout: Duration, + ) -> Result<(), Error> { + info!("Starting reboot procedure for node '{}'", node_name); + + // 1. 
Get current boot ID from node status + let node_api: Api = Api::all(self.client.clone()); + let node = node_api.get(node_name).await?; + let boot_id_before = node + .status + .as_ref() + .and_then(|s| s.node_info.as_ref()) + .and_then(|ni| Some(ni.boot_id.clone())) + .ok_or_else(|| { + Error::Discovery(DiscoveryError::MissingResource(format!( + "Node '{}' does not have boot_id in status", + node_name + ))) + })?; + debug!( + "Current boot_id for node '{}': {}", + node_name, boot_id_before + ); + + // 2. Drain the node + info!("Draining node '{}'...", node_name); + self.drain_node(node_name, drain_options).await?; + + let start_time = tokio::time::Instant::now(); + + // 3. Issue delayed reboot command (fire-and-forget) + info!("Scheduling reboot for node '{}'...", node_name); + let reboot_cmd = + "echo rebooting ; nohup bash -c 'sleep 5 && nsenter -t 1 -m -- systemctl reboot'"; + + // Ignore errors - the pod will die during shutdown and we can't wait for completion + match self + .run_privileged_command_on_node(node_name, reboot_cmd) + .await + { + Ok(_) => debug!("Reboot command scheduled successfully"), + Err(e) => { + // This is expected - the node may start shutting down before we can read the pod status + debug!( + "Reboot command scheduling completed with error (expected): {}", + e + ); + } + } + + // 4. Wait for node to go NotReady (proves shutdown started) + info!("Waiting for node '{}' to begin shutdown...", node_name); + let remaining_timeout = timeout.saturating_sub(start_time.elapsed()); + self.wait_for_node_not_ready(node_name, remaining_timeout) + .await?; + + if start_time.elapsed() > timeout { + return Err(Error::Discovery(DiscoveryError::MissingResource(format!( + "Timeout during node '{}' reboot (shutdown detection phase)", + node_name + )))); + } + + // 5. Wait for node to become Ready again + info!("Waiting for node '{}' to come back online...", node_name); + let remaining_timeout = timeout.saturating_sub(start_time.elapsed()); + self.wait_for_node_ready_with_timeout(node_name, remaining_timeout) + .await?; + + if start_time.elapsed() > timeout { + return Err(Error::Discovery(DiscoveryError::MissingResource(format!( + "Timeout during node '{}' reboot (ready phase)", + node_name + )))); + } + + // 6. Verify boot ID changed (confirms actual reboot) + info!("Verifying node '{}' actually rebooted...", node_name); + let node = node_api.get(node_name).await?; + let boot_id_after = node + .status + .as_ref() + .and_then(|s| s.node_info.as_ref()) + .and_then(|ni| Some(ni.boot_id.clone())) + .ok_or_else(|| { + Error::Discovery(DiscoveryError::MissingResource(format!( + "Node '{}' does not have boot_id in status after reboot", + node_name + ))) + })?; + + if boot_id_before == boot_id_after { + return Err(Error::Discovery(DiscoveryError::MissingResource(format!( + "Node '{}' did not actually reboot (boot_id unchanged: {})", + node_name, boot_id_before + )))); + } + + debug!( + "Node '{}' boot_id changed: {} -> {}", + node_name, boot_id_before, boot_id_after + ); + + // 7. Uncordon the node + info!("Uncordoning node '{}'...", node_name); + self.uncordon_node(node_name).await?; + + info!( + "Successfully rebooted node '{}' (took {:?})", + node_name, + start_time.elapsed() + ); + + Ok(()) + } + + /// Waits for a node to transition to NotReady status. + /// + /// This is useful for detecting when a node shutdown has begun. 
+ async fn wait_for_node_not_ready( + &self, + node_name: &str, + timeout: Duration, + ) -> Result<(), Error> { + let api: Api = Api::all(self.client.clone()); + let poll_interval = Duration::from_secs(5); + let start = tokio::time::Instant::now(); + + loop { + if start.elapsed() > timeout { + return Err(Error::Discovery(DiscoveryError::MissingResource(format!( + "Node '{}' did not become NotReady within {:?}", + node_name, timeout + )))); + } + + match api.get(node_name).await { + Ok(node) => { + if let Some(status) = node.status { + if let Some(conditions) = status.conditions { + let is_ready = conditions + .iter() + .any(|cond| cond.type_ == "Ready" && cond.status == "True"); + + if !is_ready { + debug!("Node '{}' is now NotReady", node_name); + return Ok(()); + } + } + } + } + Err(e) => { + debug!("Error checking node '{}' status: {}", node_name, e); + } + } + + sleep(poll_interval).await; + } + } + + pub async fn wait_for_node_ready(&self, node_name: &str) -> Result<(), Error> { + // Default 10 minute timeout for backwards compatibility + self.wait_for_node_ready_with_timeout(node_name, Duration::from_secs(600)) + .await + } + + /// Waits for a node to become Ready with a custom timeout. + async fn wait_for_node_ready_with_timeout( + &self, + node_name: &str, + timeout: Duration, + ) -> Result<(), Error> { + let api: Api = Api::all(self.client.clone()); + let poll_interval = Duration::from_secs(5); + let start = tokio::time::Instant::now(); + + loop { + if start.elapsed() > timeout { + return Err(Error::Discovery(DiscoveryError::MissingResource(format!( + "Node '{}' did not become ready within {:?}", + node_name, timeout + )))); + } + + match api.get(node_name).await { + Ok(node) => { + if let Some(status) = node.status { + if let Some(conditions) = status.conditions { + for cond in conditions { + if cond.type_ == "Ready" && cond.status == "True" { + debug!("Node '{}' is now Ready", node_name); + return Ok(()); + } + } + } + } + } + Err(e) => { + debug!("Failed to get node '{}': {}", node_name, e); + } + } + + sleep(poll_interval).await; + } + } + + /// Sends a single eviction request for `pod`. + async fn evict_pod(&self, pod: &Pod) -> Result<(), Error> { + let name = pod.metadata.name.as_deref().unwrap_or_default(); + let ns = pod.metadata.namespace.as_deref().unwrap_or_default(); + let api: Api = Api::namespaced(self.client.clone(), ns); + debug!("Sending eviction for pod {}/{}", ns, name); + api.evict(name, &EvictParams::default()).await.map(|_| ()) + } + + /// Drains a node by cordoning it, evicting eligible pods, and waiting for + /// them to terminate. + /// + /// The operation mirrors `kubectl drain`: + /// 1. **Cordon** — marks the node as unschedulable. + /// 2. **Classify** — separates pods into evictable / skipped / blocking. + /// 3. **Evict & wait** — sends eviction requests and re-tries on each + /// polling interval until every pod is gone or the timeout expires. + /// + /// Re-sending eviction requests each iteration ensures that pods + /// previously blocked by a `PodDisruptionBudget` are retried once budget + /// becomes available. + /// + /// # Errors + /// Returns an error if the node cannot be cordoned, if any pod blocks + /// eviction (see [`DrainOptions`]), or if evictions do not complete within + /// the configured timeout. + pub async fn drain_node(&self, node_name: &str, options: &DrainOptions) -> Result<(), Error> { + // ── 1. 
Cordon ──────────────────────────────────────────────
+        debug!("Cordoning node '{}'", node_name);
+        self.cordon_node(node_name).await?;
+
+        // ── 2. List & classify pods ──────────────────────────────────
+        let pods = self.list_pods_on_node(node_name).await?;
+        debug!("Found {} pod(s) on node '{}'", pods.len(), node_name);
+
+        let (evictable, skipped) =
+            Self::classify_pods_for_drain(&pods, options).map_err(|msg| {
+                error!("{}", msg);
+                Error::Discovery(DiscoveryError::MissingResource(msg))
+            })?;
+
+        for s in &skipped {
+            info!("Skipping pod: {}", s);
+        }
+
+        if evictable.is_empty() {
+            info!("No pods to evict on node '{}'", node_name);
+            return Ok(());
+        }
+
+        info!(
+            "Evicting {} pod(s) from node '{}'",
+            evictable.len(),
+            node_name
+        );
+
+        // ── 3. Evict & wait loop ─────────────────────────────────────
+        let mut start = tokio::time::Instant::now();
+        let poll_interval = Duration::from_secs(5);
+        let mut pending = evictable;
+
+        loop {
+            // Send (or re-send) eviction requests for all pending pods.
+            for pod in &pending {
+                match self.evict_pod(pod).await {
+                    Ok(()) => {}
+                    // Pod already gone — will be filtered out below.
+                    Err(Error::Api(ErrorResponse { code: 404, .. })) => {}
+                    // PDB is blocking — will retry next iteration.
+                    Err(Error::Api(ErrorResponse { code: 429, .. })) => {
+                        warn!(
+                            "PDB prevented eviction of {}/{}; will retry",
+                            pod.metadata.namespace.as_deref().unwrap_or(""),
+                            pod.metadata.name.as_deref().unwrap_or("")
+                        );
+                    }
+                    Err(e) => {
+                        error!(
+                            "Failed to evict pod {}/{}: {}",
+                            pod.metadata.namespace.as_deref().unwrap_or(""),
+                            pod.metadata.name.as_deref().unwrap_or(""),
+                            e
+                        );
+                        return Err(e);
+                    }
+                }
+            }
+
+            // Wait before polling pod presence.
+            sleep(poll_interval).await;
+
+            // Check which pods are still present on the API server.
+            let mut still_present: Vec<Pod> = Vec::new();
+            for pod in pending {
+                let ns = pod.metadata.namespace.as_deref().unwrap_or_default();
+                let name = pod.metadata.name.as_deref().unwrap_or_default();
+                match self.get_pod(name, Some(ns)).await? {
+                    Some(_) => still_present.push(pod),
+                    None => debug!("Pod {}/{} evicted successfully", ns, name),
+                }
+            }
+
+            pending = still_present;
+
+            if pending.is_empty() {
+                break;
+            }
+
+            if start.elapsed() > options.timeout {
+                let names: Vec<String> = pending
+                    .iter()
+                    .map(|p| {
+                        format!(
+                            "{}/{}",
+                            p.metadata.namespace.as_deref().unwrap_or(""),
+                            p.metadata.name.as_deref().unwrap_or("")
+                        )
+                    })
+                    .collect();
+                let msg = format!(
+                    "Timed out after {:?} waiting for pod evictions on node '{}'. Remaining:\n - {}",
+                    options.timeout,
+                    node_name,
+                    names.join("\n - ")
+                );
+
+                warn!("{}", msg);
+
+                // Prompt user for action
+                match helper::prompt_drain_timeout_action(
+                    node_name,
+                    pending.len(),
+                    options.timeout,
+                )? {
+                    helper::DrainTimeoutAction::Accept => {
+                        // User confirmed acceptance - break the loop and continue
+                        break;
+                    }
+                    helper::DrainTimeoutAction::Retry => {
+                        // Reset the start time to retry for another full timeout period
+                        start = tokio::time::Instant::now();
+                        continue;
+                    }
+                    helper::DrainTimeoutAction::Abort => {
+                        return Err(Error::Discovery(DiscoveryError::MissingResource(format!(
+                            "Drain operation aborted. 
{} pods remaining on node '{}'", + pending.len(), + node_name + )))); + } + } + } + + debug!( + "Waiting for {} pod(s) to terminate on node '{}'", + pending.len(), + node_name + ); + } + + debug!("Node '{}' drained successfully", node_name); + Ok(()) + } +} + +fn get_dynamic_api( + resource: ApiResource, + capabilities: ApiCapabilities, + client: Client, + ns: Option<&str>, + all: bool, +) -> Api { + if capabilities.scope == Scope::Cluster || all { + Api::all_with(client, &resource) + } else if let Some(namespace) = ns { + Api::namespaced_with(client, namespace, &resource) + } else { + Api::default_namespaced_with(client, &resource) + } +} + +fn multidoc_deserialize(data: &str) -> Result, serde_yaml::Error> { + use serde::Deserialize; + let mut docs = vec![]; + for de in serde_yaml::Deserializer::from_str(data) { + docs.push(serde_yaml::Value::deserialize(de)?); + } + Ok(docs) +} + +pub trait ApplyStrategy { + fn get_api(client: &Client, ns: Option<&str>) -> Api; +} + +/// Implementation for all resources that are cluster-scoped. +/// It will always use `Api::all` and ignore the namespace parameter. +impl ApplyStrategy for ClusterResourceScope +where + K: Resource, + ::DynamicType: Default, +{ + fn get_api(client: &Client, _ns: Option<&str>) -> Api { + Api::all(client.clone()) + } +} + +/// Implementation for all resources that are namespace-scoped. +/// It will use `Api::namespaced` if a namespace is provided, otherwise +/// it falls back to the default namespace configured in your kubeconfig. +impl ApplyStrategy for NamespaceResourceScope +where + K: Resource, + ::DynamicType: Default, +{ + fn get_api(client: &Client, ns: Option<&str>) -> Api { + match ns { + Some(ns) => Api::namespaced(client.clone(), ns), + None => Api::default_namespaced(client.clone()), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use k8s_openapi::api::core::v1::{EmptyDirVolumeSource, Pod, PodSpec, PodStatus, Volume}; + use k8s_openapi::apimachinery::pkg::apis::meta::v1::{ObjectMeta, OwnerReference}; + use std::collections::BTreeMap; + + // ── Test helpers ──────────────────────────────────────────────────── + + /// Builds a minimal pod with the given name/namespace and no special + /// annotations, owner refs, volumes, or status. 
+ fn base_pod(name: &str, ns: &str) -> Pod { + Pod { + metadata: ObjectMeta { + name: Some(name.to_string()), + namespace: Some(ns.to_string()), + ..Default::default() + }, + spec: Some(PodSpec::default()), + status: Some(PodStatus { + phase: Some("Running".to_string()), + ..Default::default() + }), + } + } + + fn mirror_pod(name: &str, ns: &str) -> Pod { + let mut pod = base_pod(name, ns); + let mut annotations = BTreeMap::new(); + annotations.insert( + "kubernetes.io/config.mirror".to_string(), + "abc123".to_string(), + ); + pod.metadata.annotations = Some(annotations); + pod + } + + fn daemonset_pod(name: &str, ns: &str) -> Pod { + let mut pod = base_pod(name, ns); + pod.metadata.owner_references = Some(vec![OwnerReference { + api_version: "apps/v1".to_string(), + kind: "DaemonSet".to_string(), + name: "some-ds".to_string(), + uid: "uid-ds".to_string(), + ..Default::default() + }]); + pod + } + + fn emptydir_pod(name: &str, ns: &str) -> Pod { + let mut pod = base_pod(name, ns); + pod.spec = Some(PodSpec { + volumes: Some(vec![Volume { + name: "scratch".to_string(), + empty_dir: Some(EmptyDirVolumeSource::default()), + ..Default::default() + }]), + ..Default::default() + }); + pod + } + + fn completed_pod(name: &str, ns: &str, phase: &str) -> Pod { + let mut pod = base_pod(name, ns); + pod.status = Some(PodStatus { + phase: Some(phase.to_string()), + ..Default::default() + }); + pod + } + + fn default_opts() -> DrainOptions { + DrainOptions::default() + } + + // ── Tests ─────────────────────────────────────────────────────────── + + #[test] + fn empty_pod_list_returns_empty_vecs() { + let (evictable, skipped) = + K8sClient::classify_pods_for_drain(&[], &default_opts()).unwrap(); + assert!(evictable.is_empty()); + assert!(skipped.is_empty()); + } + + #[test] + fn normal_pod_is_evictable() { + let pods = vec![base_pod("web", "default")]; + let (evictable, skipped) = + K8sClient::classify_pods_for_drain(&pods, &default_opts()).unwrap(); + assert_eq!(evictable.len(), 1); + assert_eq!(evictable[0].metadata.name.as_deref(), Some("web")); + assert!(skipped.is_empty()); + } + + #[test] + fn mirror_pod_is_skipped() { + let pods = vec![mirror_pod("kube-apiserver", "kube-system")]; + let (evictable, skipped) = + K8sClient::classify_pods_for_drain(&pods, &default_opts()).unwrap(); + assert!(evictable.is_empty()); + assert_eq!(skipped.len(), 1); + assert!(skipped[0].contains("mirror pod")); + } + + #[test] + fn completed_succeeded_pod_is_skipped() { + let pods = vec![completed_pod("job-xyz", "batch", "Succeeded")]; + let (evictable, skipped) = + K8sClient::classify_pods_for_drain(&pods, &default_opts()).unwrap(); + assert!(evictable.is_empty()); + assert_eq!(skipped.len(), 1); + assert!(skipped[0].contains("completed")); + } + + #[test] + fn completed_failed_pod_is_skipped() { + let pods = vec![completed_pod("job-fail", "batch", "Failed")]; + let (evictable, skipped) = + K8sClient::classify_pods_for_drain(&pods, &default_opts()).unwrap(); + assert!(evictable.is_empty()); + assert_eq!(skipped.len(), 1); + assert!(skipped[0].contains("completed")); + } + + #[test] + fn daemonset_pod_skipped_when_ignore_daemonsets_true() { + let pods = vec![daemonset_pod("fluentd", "logging")]; + let opts = DrainOptions { + ignore_daemonsets: true, + ..default_opts() + }; + let (evictable, skipped) = K8sClient::classify_pods_for_drain(&pods, &opts).unwrap(); + assert!(evictable.is_empty()); + assert_eq!(skipped.len(), 1); + assert!(skipped[0].contains("DaemonSet-managed")); + } + + #[test] + fn 
daemonset_pod_blocks_when_ignore_daemonsets_false() { + let pods = vec![daemonset_pod("fluentd", "logging")]; + let opts = DrainOptions { + ignore_daemonsets: false, + ..default_opts() + }; + let err = K8sClient::classify_pods_for_drain(&pods, &opts).unwrap_err(); + assert!(err.contains("DaemonSet")); + assert!(err.contains("logging/fluentd")); + } + + #[test] + fn emptydir_pod_blocks_when_delete_emptydir_data_false() { + let pods = vec![emptydir_pod("cache", "default")]; + let opts = DrainOptions { + delete_emptydir_data: false, + ..default_opts() + }; + let err = K8sClient::classify_pods_for_drain(&pods, &opts).unwrap_err(); + assert!(err.contains("emptyDir")); + assert!(err.contains("default/cache")); + } + + #[test] + fn emptydir_pod_evictable_when_delete_emptydir_data_true() { + let pods = vec![emptydir_pod("cache", "default")]; + let opts = DrainOptions { + delete_emptydir_data: true, + ..default_opts() + }; + let (evictable, skipped) = K8sClient::classify_pods_for_drain(&pods, &opts).unwrap(); + assert_eq!(evictable.len(), 1); + assert_eq!(evictable[0].metadata.name.as_deref(), Some("cache")); + assert!(skipped.is_empty()); + } + + #[test] + fn multiple_blocking_pods_all_reported() { + let pods = vec![daemonset_pod("ds-a", "ns1"), emptydir_pod("ed-b", "ns2")]; + let opts = DrainOptions { + ignore_daemonsets: false, + delete_emptydir_data: false, + ..default_opts() + }; + let err = K8sClient::classify_pods_for_drain(&pods, &opts).unwrap_err(); + assert!(err.contains("ns1/ds-a")); + assert!(err.contains("ns2/ed-b")); + } + + #[test] + fn mixed_pods_classified_correctly() { + let pods = vec![ + base_pod("web", "default"), + mirror_pod("kube-apiserver", "kube-system"), + daemonset_pod("fluentd", "logging"), + completed_pod("job-done", "batch", "Succeeded"), + base_pod("api", "default"), + ]; + let (evictable, skipped) = + K8sClient::classify_pods_for_drain(&pods, &default_opts()).unwrap(); + + let evict_names: Vec<&str> = evictable + .iter() + .map(|p| p.metadata.name.as_deref().unwrap()) + .collect(); + assert_eq!(evict_names, vec!["web", "api"]); + assert_eq!(skipped.len(), 3); + } + + #[test] + fn classification_priority_mirror_before_completed() { + // A mirror pod that also has phase=Succeeded should still be + // classified as "mirror pod" (the first check wins). + let mut pod = mirror_pod("static-etcd", "kube-system"); + pod.status = Some(PodStatus { + phase: Some("Succeeded".to_string()), + ..Default::default() + }); + let (evictable, skipped) = + K8sClient::classify_pods_for_drain(&[pod], &default_opts()).unwrap(); + assert!(evictable.is_empty()); + assert_eq!(skipped.len(), 1); + assert!( + skipped[0].contains("mirror pod"), + "expected mirror-pod label, got: {}", + skipped[0] + ); + } + + #[test] + fn classification_priority_completed_before_daemonset() { + // A completed DaemonSet pod should be skipped as "completed", + // not as "DaemonSet-managed". 
+ let mut pod = daemonset_pod("collector", "monitoring"); + pod.status = Some(PodStatus { + phase: Some("Failed".to_string()), + ..Default::default() + }); + let (evictable, skipped) = + K8sClient::classify_pods_for_drain(&[pod], &default_opts()).unwrap(); + assert!(evictable.is_empty()); + assert_eq!(skipped.len(), 1); + assert!( + skipped[0].contains("completed"), + "expected completed label, got: {}", + skipped[0] + ); + } + + #[test] + fn pod_with_no_metadata_names_uses_unknown_placeholder() { + let pod = Pod { + metadata: ObjectMeta::default(), + spec: Some(PodSpec::default()), + status: Some(PodStatus { + phase: Some("Running".to_string()), + ..Default::default() + }), + }; + let (evictable, _) = K8sClient::classify_pods_for_drain(&[pod], &default_opts()).unwrap(); + assert_eq!(evictable.len(), 1); + } +} + +#[cfg(test)] +mod apply_tests { + //! Integration tests for apply() and apply_dynamic() functions. + //! + //! ## Testing Strategy + //! + //! These functions interact with the Kubernetes API server, making them difficult + //! to unit test. We recommend a multi-layered testing approach: + //! + //! ### 1. **Integration Tests with Real Cluster (Recommended)** + //! - Use a local development cluster (kind, k3d, minikube) + //! - Place tests in `tests/` directory for optional execution + //! - Run with: `cargo test --test k8s_apply_integration -- --ignored` + //! + //! ### 2. **Contract Tests with Mock Server** + //! - Use `wiremock` or `mockito` to simulate Kubernetes API responses + //! - Test specific scenarios: 404 → create, 200 → update, error cases + //! - Fast, deterministic, no cluster required + //! + //! ### 3. **Property-Based Tests** + //! - Use `proptest` to generate various resource configurations + //! - Verify idempotency: apply(x) → apply(x) should not error + //! + //! ### 4. **Example Tests Below** + //! - These demonstrate the testing patterns + //! - Marked with `#[ignore]` to require opt-in execution + //! - Can be run in CI with proper cluster setup + //! + //! ## Running Tests + //! + //! ```bash + //! # Setup test cluster + //! kind create cluster --name harmony-test + //! + //! # Run integration tests + //! cargo test --test k8s_apply_integration + //! + //! # Or run ignored tests in this module + //! cargo test apply_tests -- --ignored --nocapture + //! ``` + + use kube::api::TypeMeta; + + use super::*; + + /// Example integration test for apply() with ConfigMap creation. + /// + /// This test requires a real Kubernetes cluster and is marked as ignored. 
+ /// Run with: `cargo test apply_creates_new_configmap -- --ignored` + #[tokio::test] + #[ignore = "requires kubernetes cluster"] + async fn apply_creates_new_configmap() { + let client = K8sClient::try_default() + .await + .expect("failed to create client"); + let test_ns = "default"; + let cm_name = format!( + "test-cm-{}", + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_millis() + ); + + let mut data = BTreeMap::new(); + data.insert("key1".to_string(), "value1".to_string()); + + let configmap = ConfigMap { + metadata: ObjectMeta { + name: Some(cm_name.clone()), + namespace: Some(test_ns.to_string()), + ..Default::default() + }, + data: Some(data), + ..Default::default() + }; + + // Apply should create the resource + let result = client.apply(&configmap, Some(test_ns)).await; + assert!( + result.is_ok(), + "failed to apply new ConfigMap: {:?}", + result.err() + ); + + // Verify it exists + let fetched: Option = + client.get_resource(&cm_name, Some(test_ns)).await.unwrap(); + assert!(fetched.is_some(), "ConfigMap was not created"); + assert_eq!( + fetched.unwrap().data.unwrap().get("key1").unwrap(), + "value1" + ); + + // Cleanup + let api: Api = Api::namespaced(client.client.clone(), test_ns); + let _ = api.delete(&cm_name, &DeleteParams::default()).await; + } + + /// Example integration test for apply() updating an existing resource. + #[tokio::test] + #[ignore = "requires kubernetes cluster"] + async fn apply_updates_existing_configmap() { + let client = K8sClient::try_default() + .await + .expect("failed to create client"); + let test_ns = "default"; + let cm_name = format!( + "test-cm-{}", + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_millis() + ); + + // Create initial ConfigMap + let mut data = BTreeMap::new(); + data.insert("key1".to_string(), "value1".to_string()); + let configmap = ConfigMap { + metadata: ObjectMeta { + name: Some(cm_name.clone()), + namespace: Some(test_ns.to_string()), + ..Default::default() + }, + data: Some(data.clone()), + ..Default::default() + }; + + client.apply(&configmap, Some(test_ns)).await.unwrap(); + + // Update the ConfigMap + data.insert("key2".to_string(), "value2".to_string()); + let updated_cm = ConfigMap { + metadata: ObjectMeta { + name: Some(cm_name.clone()), + namespace: Some(test_ns.to_string()), + ..Default::default() + }, + data: Some(data), + ..Default::default() + }; + + let result = client.apply(&updated_cm, Some(test_ns)).await; + assert!( + result.is_ok(), + "failed to update ConfigMap: {:?}", + result.err() + ); + + // Verify the update + let fetched: Option = + client.get_resource(&cm_name, Some(test_ns)).await.unwrap(); + let fetched_data = fetched.unwrap().data.unwrap(); + assert_eq!(fetched_data.get("key1").unwrap(), "value1"); + assert_eq!(fetched_data.get("key2").unwrap(), "value2"); + + // Cleanup + let api: Api = Api::namespaced(client.client.clone(), test_ns); + let _ = api.delete(&cm_name, &DeleteParams::default()).await; + } + + /// Example integration test for apply_dynamic() with new resource. 
+ #[tokio::test] + #[ignore = "requires kubernetes cluster"] + async fn apply_dynamic_creates_new_resource() { + let client = K8sClient::try_default() + .await + .expect("failed to create client"); + let test_ns = "default"; + let cm_name = format!( + "test-dyn-cm-{}", + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_millis() + ); + + let mut data = BTreeMap::new(); + data.insert("foo".to_string(), serde_json::json!("bar")); + + let dynamic_obj = DynamicObject { + types: Some(TypeMeta { + api_version: "v1".to_string(), + kind: "ConfigMap".to_string(), + }), + metadata: ObjectMeta { + name: Some(cm_name.clone()), + namespace: Some(test_ns.to_string()), + ..Default::default() + }, + data: serde_json::json!(data), + }; + + let result = client + .apply_dynamic(&dynamic_obj, Some(test_ns), false) + .await; + assert!( + result.is_ok(), + "failed to apply dynamic object: {:?}", + result.err() + ); + + // Verify it exists + let api: Api = Api::namespaced(client.client.clone(), test_ns); + let fetched = api.get_opt(&cm_name).await.unwrap(); + assert!(fetched.is_some(), "Dynamic resource was not created"); + + // Cleanup + let _ = api.delete(&cm_name, &DeleteParams::default()).await; + } + + /// Example showing idempotency: applying same resource twice should succeed. + #[tokio::test] + #[ignore = "requires kubernetes cluster"] + async fn apply_is_idempotent() { + let client = K8sClient::try_default() + .await + .expect("failed to create client"); + let test_ns = "default"; + let cm_name = format!( + "test-idem-{}", + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_millis() + ); + + let configmap = ConfigMap { + metadata: ObjectMeta { + name: Some(cm_name.clone()), + namespace: Some(test_ns.to_string()), + ..Default::default() + }, + data: Some(BTreeMap::from([("key".to_string(), "value".to_string())])), + ..Default::default() + }; + + // Apply twice + let result1 = client.apply(&configmap, Some(test_ns)).await; + let result2 = client.apply(&configmap, Some(test_ns)).await; + + assert!(result1.is_ok(), "first apply failed"); + assert!(result2.is_ok(), "second apply failed (not idempotent)"); + + // Cleanup + let api: Api = Api::namespaced(client.client.clone(), test_ns); + let _ = api.delete(&cm_name, &DeleteParams::default()).await; + } +} diff --git a/harmony/src/domain/topology/k8s_anywhere/k8s_anywhere.rs b/harmony/src/domain/topology/k8s_anywhere/k8s_anywhere.rs index a188a922..55091d23 100644 --- a/harmony/src/domain/topology/k8s_anywhere/k8s_anywhere.rs +++ b/harmony/src/domain/topology/k8s_anywhere/k8s_anywhere.rs @@ -3,18 +3,12 @@ use std::{collections::BTreeMap, process::Command, sync::Arc, time::Duration}; use async_trait::async_trait; use base64::{Engine, engine::general_purpose}; use harmony_types::rfc1123::Rfc1123Name; -use k8s_openapi::{ - ByteString, - api::{ - core::v1::{Pod, Secret}, - rbac::v1::{ClusterRoleBinding, RoleRef, Subject}, - }, +use k8s_openapi::api::{ + core::v1::{Pod, Secret}, + rbac::v1::{ClusterRoleBinding, RoleRef, Subject}, }; -use kube::{ - api::{DynamicObject, GroupVersionKind, ObjectMeta}, - runtime::conditions, -}; -use log::{debug, error, info, trace, warn}; +use kube::api::{DynamicObject, GroupVersionKind, ObjectMeta}; +use log::{debug, info, trace, warn}; use serde::Serialize; use tokio::sync::OnceCell; @@ -34,10 +28,7 @@ use crate::{ score_cert_management::CertificateManagementScore, }, k3d::K3DInstallationScore, - k8s::{ - ingress::{K8sIngressScore, PathType}, - resource::K8sResourceScore, - }, + 
k8s::ingress::{K8sIngressScore, PathType}, monitoring::{ grafana::{grafana::Grafana, helm::helm_grafana::grafana_helm_chart_score}, kube_prometheus::crd::{ @@ -54,7 +45,6 @@ use crate::{ service_monitor::ServiceMonitor, }, }, - nats::capability::NatsCluster, okd::{crd::ingresses_config::Ingress as IngressResource, route::OKDTlsPassthroughScore}, prometheus::{ k8s_prometheus_alerting_score::K8sPrometheusCRDAlertingScore, @@ -103,7 +93,6 @@ enum K8sSource { pub struct K8sAnywhereTopology { k8s_state: Arc>>, tenant_manager: Arc>, - k8s_distribution: Arc>, config: Arc, } @@ -554,7 +543,6 @@ impl K8sAnywhereTopology { Self { k8s_state: Arc::new(OnceCell::new()), tenant_manager: Arc::new(OnceCell::new()), - k8s_distribution: Arc::new(OnceCell::new()), config: Arc::new(K8sAnywhereConfig::from_env()), } } @@ -563,7 +551,6 @@ impl K8sAnywhereTopology { Self { k8s_state: Arc::new(OnceCell::new()), tenant_manager: Arc::new(OnceCell::new()), - k8s_distribution: Arc::new(OnceCell::new()), config: Arc::new(config), } } @@ -600,41 +587,6 @@ impl K8sAnywhereTopology { } } - pub async fn get_k8s_distribution(&self) -> Result<&KubernetesDistribution, PreparationError> { - self.k8s_distribution - .get_or_try_init(async || { - debug!("Trying to detect k8s distribution"); - let client = self.k8s_client().await.unwrap(); - - let discovery = client.discovery().await.map_err(|e| { - PreparationError::new(format!("Could not discover API groups: {}", e)) - })?; - - let version = client.get_apiserver_version().await.map_err(|e| { - PreparationError::new(format!("Could not get server version: {}", e)) - })?; - - // OpenShift / OKD - if discovery - .groups() - .any(|g| g.name() == "project.openshift.io") - { - info!("Found KubernetesDistribution OpenshiftFamily"); - return Ok(KubernetesDistribution::OpenshiftFamily); - } - - // K3d / K3s - if version.git_version.contains("k3s") { - info!("Found KubernetesDistribution K3sFamily"); - return Ok(KubernetesDistribution::K3sFamily); - } - - info!("Could not identify KubernetesDistribution, using Default"); - return Ok(KubernetesDistribution::Default); - }) - .await - } - fn extract_and_normalize_token(&self, secret: &DynamicObject) -> Option { let token_b64 = secret .data @@ -652,6 +604,16 @@ impl K8sAnywhereTopology { Some(cleaned) } + pub async fn get_k8s_distribution(&self) -> Result { + self.k8s_client() + .await? 
+ .get_k8s_distribution() + .await + .map_err(|e| { + PreparationError::new(format!("Failed to get k8s distribution from client : {e}")) + }) + } + pub fn build_cluster_rolebinding( &self, service_account_name: &str, diff --git a/harmony/src/domain/topology/load_balancer.rs b/harmony/src/domain/topology/load_balancer.rs index 59c5adda..3f593458 100644 --- a/harmony/src/domain/topology/load_balancer.rs +++ b/harmony/src/domain/topology/load_balancer.rs @@ -1,7 +1,6 @@ use std::{net::SocketAddr, str::FromStr}; use async_trait::async_trait; -use log::debug; use serde::Serialize; use super::LogicalHost; diff --git a/harmony/src/domain/topology/network.rs b/harmony/src/domain/topology/network.rs index ad2efe01..1ec3cf3c 100644 --- a/harmony/src/domain/topology/network.rs +++ b/harmony/src/domain/topology/network.rs @@ -188,6 +188,10 @@ impl FromStr for DnsRecordType { pub trait NetworkManager: Debug + Send + Sync { async fn ensure_network_manager_installed(&self) -> Result<(), NetworkError>; async fn configure_bond(&self, config: &HostNetworkConfig) -> Result<(), NetworkError>; + async fn configure_bond_on_primary_interface( + &self, + config: &HostNetworkConfig, + ) -> Result<(), NetworkError>; } #[derive(Debug, Clone, new)] diff --git a/harmony/src/infra/network_manager.rs b/harmony/src/infra/network_manager.rs index a5a2f77e..6b7a3428 100644 --- a/harmony/src/infra/network_manager.rs +++ b/harmony/src/infra/network_manager.rs @@ -3,6 +3,7 @@ use std::{ sync::Arc, }; +use askama::Template; use async_trait::async_trait; use harmony_types::id::Id; use k8s_openapi::api::core::v1::Node; @@ -10,13 +11,71 @@ use kube::{ ResourceExt, api::{ObjectList, ObjectMeta}, }; -use log::{debug, info}; +use log::{debug, info, warn}; use crate::{ modules::okd::crd::nmstate, - topology::{HostNetworkConfig, NetworkError, NetworkManager, k8s::K8sClient}, + topology::{ + HostNetworkConfig, NetworkError, NetworkManager, + k8s::{DrainOptions, K8sClient, NodeFile}, + }, }; +/// NetworkManager bond configuration template +#[derive(Template)] +#[template( + source = r#"[connection] +id={{ bond_name }} +uuid={{ bond_uuid }} +type=bond +autoconnect-slaves=1 +interface-name={{ bond_name }} + +[bond] +lacp_rate=fast +mode=802.3ad +xmit_hash_policy=layer2 + +[ipv4] +method=auto + +[ipv6] +addr-gen-mode=default +method=auto + +[proxy] +"#, + ext = "txt" +)] +struct BondConfigTemplate { + bond_name: String, + bond_uuid: String, +} + +/// NetworkManager bond slave configuration template +#[derive(Template)] +#[template( + source = r#"[connection] +id={{ slave_id }} +uuid={{ slave_uuid }} +type=ethernet +interface-name={{ interface_name }} +master={{ bond_name }} +slave-type=bond + +[ethernet] + +[bond-port] +"#, + ext = "txt" +)] +struct BondSlaveConfigTemplate { + slave_id: String, + slave_uuid: String, + interface_name: String, + bond_name: String, +} + /// TODO document properly the non-intuitive behavior or "roll forward only" of nmstate in general /// It is documented in nmstate official doc, but worth mentionning here : /// @@ -87,6 +146,117 @@ impl NetworkManager for OpenShiftNmStateNetworkManager { Ok(()) } + /// Configures bonding on the primary network interface of a node. + /// + /// Changing the *primary* network interface (making it a bond + /// slave) will disrupt node connectivity mid-change, so the + /// procedure is: + /// + /// 1. Generate NetworkManager .nmconnection files + /// 2. Drain the node (includes cordon) + /// 3. Write configuration files to `/etc/NetworkManager/system-connections/` + /// 4. 
Attempt to reload NetworkManager (optional, best-effort) + /// 5. Reboot the node with full verification (drain, boot_id check, uncordon) + /// + /// The reboot procedure includes: + /// - Recording boot_id before reboot + /// - Fire-and-forget reboot command + /// - Waiting for NotReady status + /// - Waiting for Ready status + /// - Verifying boot_id changed + /// - Uncordoning the node + /// + /// See ADR-019 for context and rationale. + async fn configure_bond_on_primary_interface( + &self, + config: &HostNetworkConfig, + ) -> Result<(), NetworkError> { + use std::time::Duration; + + let node_name = self.get_node_name_for_id(&config.host_id).await?; + let hostname = self.get_hostname(&config.host_id).await?; + + info!( + "Configuring bond on primary interface for host '{}' (node '{}')", + config.host_id, node_name + ); + + // 1. Generate .nmconnection files + let files = self.generate_nmconnection_files(&hostname, config)?; + debug!( + "Generated {} NetworkManager configuration files", + files.len() + ); + + // 2. Write configuration files to the node (before draining) + // We do this while the node is still running for faster operation + info!( + "Writing NetworkManager configuration files to node '{}'...", + node_name + ); + self.k8s_client + .write_files_to_node(&node_name, &files) + .await + .map_err(|e| { + NetworkError::new(format!( + "Failed to write configuration files to node '{}': {}", + node_name, e + )) + })?; + + // 3. Reload NetworkManager configuration (best-effort) + // This won't activate the bond yet since the primary interface would lose connectivity, + // but it validates the configuration files are correct + info!( + "Reloading NetworkManager configuration on node '{}'...", + node_name + ); + match self + .k8s_client + .run_privileged_command_on_node(&node_name, "chroot /host nmcli connection reload") + .await + { + Ok(output) => { + debug!("NetworkManager reload output: {}", output.trim()); + } + Err(e) => { + warn!( + "Failed to reload NetworkManager configuration: {}. Proceeding with reboot.", + e + ); + // Don't fail here - reboot will pick up the config anyway + } + } + + // 4. 
Reboot the node with full verification + // The reboot_node function handles: drain, boot_id capture, reboot, NotReady wait, + // Ready wait, boot_id verification, and uncordon + // 60 minutes timeout for bare-metal environments (drain can take 20-30 mins) + let reboot_timeout = Duration::from_secs(3600); + info!( + "Rebooting node '{}' to apply network configuration (timeout: {:?})...", + node_name, reboot_timeout + ); + + self.k8s_client + .reboot_node( + &node_name, + &DrainOptions::default_ignore_daemonset_delete_emptydir_data(), + reboot_timeout, + ) + .await + .map_err(|e| { + NetworkError::new(format!("Failed to reboot node '{}': {}", node_name, e)) + })?; + + info!( + "Successfully configured bond on primary interface for host '{}' (node '{}')", + config.host_id, node_name + ); + + Ok(()) + } + async fn configure_bond(&self, config: &HostNetworkConfig) -> Result<(), NetworkError> { let hostname = self.get_hostname(&config.host_id).await.map_err(|e| { NetworkError::new(format!( @@ -208,14 +378,14 @@ impl OpenShiftNmStateNetworkManager { } } - async fn get_hostname(&self, host_id: &Id) -> Result { + async fn get_node_for_id(&self, host_id: &Id) -> Result { let nodes: ObjectList = self .k8s_client .list_resources(None, None) .await .map_err(|e| format!("Failed to list nodes: {e}"))?; - let Some(node) = nodes.iter().find(|n| { + let Some(node) = nodes.into_iter().find(|n| { n.status .as_ref() .and_then(|s| s.node_info.as_ref()) @@ -225,6 +395,20 @@ impl OpenShiftNmStateNetworkManager { return Err(format!("No node found for host '{host_id}'")); }; + Ok(node) + } + + async fn get_node_name_for_id(&self, host_id: &Id) -> Result { + let node = self.get_node_for_id(host_id).await?; + + node.metadata.name.ok_or(format!( + "A node should always have a name, node for host_id {host_id} has no name" + )) + } + + async fn get_hostname(&self, host_id: &Id) -> Result { + let node = self.get_node_for_id(host_id).await?; + node.labels() .get("kubernetes.io/hostname") .ok_or(format!( @@ -261,4 +445,82 @@ impl OpenShiftNmStateNetworkManager { let next_id = (0..).find(|id| !used_ids.contains(id)).unwrap(); Ok(format!("bond{next_id}")) } + + /// Generates NetworkManager .nmconnection files for bonding configuration. + /// + /// Creates: + /// - One bond master configuration file (bond0.nmconnection) + /// - One slave configuration file per interface (bond0-.nmconnection) + /// + /// All files are placed in `/etc/NetworkManager/system-connections/` with + /// mode 0o600 (required by NetworkManager). 
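+    ///
+    /// For reference, the bond master file rendered from `BondConfigTemplate`
+    /// looks like this (the UUID is shown as a placeholder; a fresh one is
+    /// generated on every run):
+    ///
+    /// ```text
+    /// [connection]
+    /// id=bond0
+    /// uuid=<generated-per-run>
+    /// type=bond
+    /// autoconnect-slaves=1
+    /// interface-name=bond0
+    ///
+    /// [bond]
+    /// lacp_rate=fast
+    /// mode=802.3ad
+    /// xmit_hash_policy=layer2
+    ///
+    /// [ipv4]
+    /// method=auto
+    ///
+    /// [ipv6]
+    /// addr-gen-mode=default
+    /// method=auto
+    ///
+    /// [proxy]
+    /// ```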
+ fn generate_nmconnection_files( + &self, + hostname: &str, + config: &HostNetworkConfig, + ) -> Result, NetworkError> { + let mut files = Vec::new(); + let bond_name = "bond0"; + let bond_uuid = uuid::Uuid::new_v4().to_string(); + + // Generate bond master configuration + let bond_template = BondConfigTemplate { + bond_name: bond_name.to_string(), + bond_uuid: bond_uuid.clone(), + }; + + let bond_content = bond_template.render().map_err(|e| { + NetworkError::new(format!( + "Failed to render bond configuration template: {}", + e + )) + })?; + + files.push(NodeFile { + path: format!( + "/etc/NetworkManager/system-connections/{}.nmconnection", + bond_name + ), + content: bond_content, + mode: 0o600, + }); + + // Generate slave configurations for each interface + for switch_port in &config.switch_ports { + let interface_name = &switch_port.interface.name; + let slave_id = format!("{}-{}", bond_name, interface_name); + let slave_uuid = uuid::Uuid::new_v4().to_string(); + + let slave_template = BondSlaveConfigTemplate { + slave_id: slave_id.clone(), + slave_uuid, + interface_name: interface_name.clone(), + bond_name: bond_name.to_string(), + }; + + let slave_content = slave_template.render().map_err(|e| { + NetworkError::new(format!( + "Failed to render slave configuration template for interface '{}': {}", + interface_name, e + )) + })?; + + files.push(NodeFile { + path: format!( + "/etc/NetworkManager/system-connections/{}.nmconnection", + slave_id + ), + content: slave_content, + mode: 0o600, + }); + } + + debug!( + "Generated {} NetworkManager configuration files for host '{}'", + files.len(), + hostname + ); + + Ok(files) + } } diff --git a/harmony/src/modules/okd/host_network.rs b/harmony/src/modules/okd/host_network.rs index 2a1bdace..9bb65a33 100644 --- a/harmony/src/modules/okd/host_network.rs +++ b/harmony/src/modules/okd/host_network.rs @@ -142,9 +142,13 @@ impl HostNetworkConfigurationInterpret { ); info!("[Host {current_host}/{total_hosts}] Configuring host network..."); - topology.configure_bond(&config).await.map_err(|e| { - InterpretError::new(format!("Failed to configure host network: {e}")) - })?; + topology + .configure_bond_on_primary_interface(&config) + .await + .map_err(|e| { + InterpretError::new(format!("Failed to configure host network: {e}")) + })?; + topology .configure_port_channel(&config) .await @@ -731,6 +735,16 @@ mod tests { Ok(()) } + + async fn configure_bond_on_primary_interface( + &self, + config: &HostNetworkConfig, + ) -> Result<(), NetworkError> { + let mut configured_bonds = self.configured_bonds.lock().unwrap(); + configured_bonds.push((config.host_id.clone(), config.clone())); + + Ok(()) + } } #[async_trait] diff --git a/opnsense-config/src/modules/dnsmasq.rs b/opnsense-config/src/modules/dnsmasq.rs index c762f82e..dff4de1c 100644 --- a/opnsense-config/src/modules/dnsmasq.rs +++ b/opnsense-config/src/modules/dnsmasq.rs @@ -68,7 +68,7 @@ impl<'a> DhcpConfigDnsMasq<'a> { /// /// This function implements specific logic to handle existing entries: /// - If no host exists for the given IP or hostname, a new entry is created. - /// - If exactly one host exists for the IP and/or hostname, the new MAC is appended to it. + /// - If exactly one host exists for the IP and/or hostname, the new MAC is set. Old MAC addresses are dropped. /// - It will error if the IP and hostname exist but point to two different host entries, /// as this represents an unresolvable conflict. 
/// - It will also error if multiple entries are found for the IP or hostname, indicating an @@ -146,40 +146,24 @@ impl<'a> DhcpConfigDnsMasq<'a> { let host_to_modify_ip = host_to_modify.ip.content_string(); if host_to_modify_ip != ip_str { warn!( - "Hostname '{}' already exists with a different IP ({}). Setting new IP {ip_str}. Appending MAC {}.", - hostname, host_to_modify_ip, mac_list + "Hostname '{}' already exists with a different IP ({}). Setting new IP {ip_str}.", + hostname, host_to_modify_ip, ); host_to_modify.ip.content = Some(ip_str); } else if host_to_modify.host != hostname { warn!( - "IP {} already exists with a different hostname ('{}'). Setting hostname to {hostname}. Appending MAC {}.", - ipaddr, host_to_modify.host, mac_list + "IP {} already exists with a different hostname ('{}'). Setting hostname to {hostname}", + ipaddr, host_to_modify.host ); host_to_modify.host = hostname.to_string(); } - for single_mac in mac.iter() { - if !host_to_modify - .hwaddr - .content_string() - .split(',') - .any(|m| m.eq_ignore_ascii_case(single_mac)) - { - info!( - "Appending MAC {} to existing static host for {} ({})", - single_mac, host_to_modify.host, host_to_modify_ip - ); - let mut updated_macs = host_to_modify.hwaddr.content_string().to_string(); - updated_macs.push(','); - updated_macs.push_str(single_mac); - host_to_modify.hwaddr.content = updated_macs.into(); - } else { - debug!( - "MAC {} already present in static host entry for {} ({}). No changes made.", - single_mac, host_to_modify.host, host_to_modify_ip - ); - } - } + info!( + "Replacing previous mac adresses {:?} with new {}", + host_to_modify.hwaddr, mac_list + ); + + host_to_modify.hwaddr.content = Some(mac_list); } _ => { return Err(DhcpError::Configuration(format!( @@ -397,7 +381,7 @@ mod test { } #[test] - fn test_add_mac_to_existing_host_by_ip_and_hostname() { + fn test_replace_mac_on_existing_host_by_ip_and_hostname() { let initial_host = create_host( "uuid-1", "existing-host", @@ -416,14 +400,11 @@ mod test { let hosts = &dhcp_config.opnsense.dnsmasq.as_ref().unwrap().hosts; assert_eq!(hosts.len(), 1); let host = &hosts[0]; - assert_eq!( - host.hwaddr.content_string(), - "AA:BB:CC:DD:EE:FF,00:11:22:33:44:55" - ); + assert_eq!(host.hwaddr.content_string(), "00:11:22:33:44:55"); } #[test] - fn test_add_mac_to_existing_host_by_ip_only() { + fn test_replace_mac_on_existing_host_by_ip_only() { let initial_host = create_host( "uuid-1", "existing-host", @@ -443,10 +424,7 @@ mod test { let hosts = &dhcp_config.opnsense.dnsmasq.as_ref().unwrap().hosts; assert_eq!(hosts.len(), 1); let host = &hosts[0]; - assert_eq!( - host.hwaddr.content_string(), - "AA:BB:CC:DD:EE:FF,00:11:22:33:44:55" - ); + assert_eq!(host.hwaddr.content_string(), "00:11:22:33:44:55"); assert_eq!(host.host, new_hostname); // hostname should be updated } @@ -474,10 +452,7 @@ mod test { let hosts = &dhcp_config.opnsense.dnsmasq.as_ref().unwrap().hosts; assert_eq!(hosts.len(), 1); let host = &hosts[0]; - assert_eq!( - host.hwaddr.content_string(), - "AA:BB:CC:DD:EE:FF,00:11:22:33:44:55" - ); + assert_eq!(host.hwaddr.content_string(), "00:11:22:33:44:55"); assert_eq!(host.ip.content_string(), "192.168.1.99"); // Original IP should be preserved. }