Compare commits


16 Commits

Author SHA1 Message Date
f463cd1e94 Fix merge conflict between master and refactor/k8sclient
All checks were successful
Run Check Script / check (pull_request) Successful in 1m28s
2026-03-07 17:56:26 -05:00
0e9b23a320 Merge branch 'feat/change-node-readiness-strategy'
Some checks failed
Run Check Script / check (push) Successful in 1m26s
Compile and package harmony_composer / package_harmony_composer (push) Failing after 2m11s
2026-03-07 16:35:14 -05:00
f532ba2b40 doc: Update node readiness readme and deployed port to 25001
All checks were successful
Run Check Script / check (pull_request) Successful in 1m27s
2026-03-07 16:33:28 -05:00
fafca31798 fix: formatting and check script
All checks were successful
Run Check Script / check (pull_request) Successful in 1m28s
2026-03-07 16:08:52 -05:00
5412c34957 Merge pull request 'fix: change vlan definition from MaybeString to RawXml' (#245) from feat/opnsense-config-xml-support-vlan into master
Some checks failed
Run Check Script / check (push) Successful in 1m47s
Compile and package harmony_composer / package_harmony_composer (push) Failing after 2m7s
Reviewed-on: #245
2026-03-07 20:59:28 +00:00
787cc8feab Fix doc tests for harmony-k8s crate refactoring
All checks were successful
Run Check Script / check (pull_request) Successful in 2m6s
- Updated harmony-k8s doc tests to import from harmony_k8s instead of harmony
- Changed CloudNativePgOperatorScore::default() to default_openshift()

This ensures doc tests work correctly after moving K8sClient to the harmony-k8s crate.
2026-03-07 15:50:39 -05:00
ce041f495b fix(zitadel): include admin@zitadel.{host} username, secure password with symbol/number, and cert-manager TLS configuration
Some checks failed
Run Check Script / check (pull_request) Failing after 26s
Update Zitadel deployment to use correct username format (admin@zitadel.{host}), generate secure passwords with required complexity (uppercase, lowercase, digit, symbol), configure edge TLS termination for OpenShift, and add cert-manager annotations. Also refactor password generation to ensure all complexity requirements are met.
2026-03-07 15:29:26 -05:00
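The complexity requirement described above boils down to guaranteeing at least one character from each required class before filling and shuffling the rest. A minimal sketch of that approach using the rand crate; this is illustrative only, not the actual Harmony password generator:

```rust
use rand::seq::SliceRandom;

/// Illustrative sketch: build a password containing at least one
/// uppercase letter, lowercase letter, digit and symbol, as Zitadel requires.
fn generate_complex_password(len: usize) -> String {
    const UPPER: &[u8] = b"ABCDEFGHIJKLMNOPQRSTUVWXYZ";
    const LOWER: &[u8] = b"abcdefghijklmnopqrstuvwxyz";
    const DIGIT: &[u8] = b"0123456789";
    const SYMBOL: &[u8] = b"!@#$%^&*-_";
    let mut rng = rand::thread_rng();

    // Start with one character from each required class...
    let mut chars: Vec<u8> = vec![
        *UPPER.choose(&mut rng).unwrap(),
        *LOWER.choose(&mut rng).unwrap(),
        *DIGIT.choose(&mut rng).unwrap(),
        *SYMBOL.choose(&mut rng).unwrap(),
    ];
    // ...then fill the remainder from the combined alphabet and shuffle
    // so the mandatory characters do not always sit at the front.
    let all: Vec<u8> = [UPPER, LOWER, DIGIT, SYMBOL].concat();
    while chars.len() < len.max(4) {
        chars.push(*all.choose(&mut rng).unwrap());
    }
    chars.shuffle(&mut rng);
    String::from_utf8(chars).expect("alphabet is ASCII")
}
```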
55de206523 fix: change vlan definition from MaybeString to RawXml
All checks were successful
Run Check Script / check (pull_request) Successful in 1m29s
2026-03-07 10:03:03 -05:00
64893a84f5 fix(node health endpoint): Set up sane timeouts for use as a load balancer health check. The default k8s client timeout of 30 seconds caused the haproxy health check to fail even though we still returned 200 OK after 30 seconds
Some checks failed
Run Check Script / check (pull_request) Failing after 25s
2026-03-06 16:28:13 -05:00
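The fix in 64893a84f5 amounts to bounding the node-status lookup so the endpoint answers before haproxy's deadline instead of inheriting the kube client's 30-second default. A hedged sketch of that pattern (the 2-second bound is illustrative, not the value Harmony uses):

```rust
use std::time::Duration;
use k8s_openapi::api::core::v1::Node;
use kube::{Api, Client};

/// Sketch: bound the node-status lookup so the health endpoint responds
/// well before the load balancer's check deadline.
async fn get_node_status(client: Client, node_name: &str) -> Option<Node> {
    let nodes: Api<Node> = Api::all(client);
    // Illustrative 2-second bound; any value below the LB timeout works.
    match tokio::time::timeout(Duration::from_secs(2), nodes.get(node_name)).await {
        Ok(Ok(node)) => Some(node),
        Ok(Err(e)) => {
            log::warn!("node status call failed: {e}");
            None
        }
        Err(_) => {
            log::warn!("node status call timed out");
            None
        }
    }
}
```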
f941672662 fix: Node readiness always fails open when the kube API call fails on node status check
Some checks failed
Run Check Script / check (pull_request) Failing after 1m54s
2026-03-06 15:45:38 -05:00
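The fail-open rule from f941672662 can be summarised in a few lines: only a positive NotReady answer from the API server marks the node unhealthy, and any error or timeout while asking is treated as healthy so the check itself cannot knock a working node out of rotation. An illustrative sketch, not the actual handler:

```rust
/// Sketch of the fail-open rule used by the readiness endpoint.
/// `status` is Some(true/false) when the API answered, None on error/timeout.
fn readiness_response(status: Option<bool>) -> (u16, &'static str) {
    match status {
        Some(false) => (503, "node NotReady"),
        // Some(true) or None (API error / timeout): fail open.
        _ => (200, "OK"),
    }
}
```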
a98113dd40 wip: zitadel ingress https not working yet
Some checks failed
Run Check Script / check (pull_request) Failing after 28s
2026-03-06 15:28:21 -05:00
5db1a31d33 ... 2026-03-06 15:24:33 -05:00
f5aac67af8 feat: k8s client works fine, added version config in zitadel and fixed master key secret existence handling
Some checks failed
Run Check Script / check (pull_request) Failing after 32s
2026-03-06 15:15:35 -05:00
d7e5bf11d5 removing bad stuff I did this morning and trying to make it simple, and adding a couple tests 2026-03-06 14:41:08 -05:00
2e1f1b8447 feat: Refactor K8sClient into separate, publishable crate, and add zitadel example 2026-03-06 14:21:15 -05:00
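With K8sClient living in its own publishable crate (2e1f1b8447), downstream code depends on harmony-k8s directly. A minimal usage sketch assembled from the API shown in the diffs below (constructor plus distribution detection); the printed messages are illustrative:

```rust
use harmony_k8s::{K8sClient, KubernetesDistribution};

#[tokio::main]
async fn main() -> Result<(), kube::Error> {
    // Build a client from the ambient kubeconfig / in-cluster config.
    let client = K8sClient::try_default().await?;
    // Detect which distribution we are talking to (cached by the client).
    match client.get_k8s_distribution().await? {
        KubernetesDistribution::OpenshiftFamily => println!("OpenShift-family cluster"),
        KubernetesDistribution::K3sFamily => println!("k3s-family cluster"),
        KubernetesDistribution::Default => println!("vanilla Kubernetes"),
    }
    Ok(())
}
```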
2b157ad7fd feat: add a background loop checking the node status every X seconds. If NotReady for Y seconds, kill the router pod if there's one 2026-03-06 11:57:39 -05:00
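2b157ad7fd describes a watchdog pattern; a hedged sketch of such a loop follows (the intervals and the app=router label are illustrative placeholders, not Harmony's actual values):

```rust
use std::time::Duration;
use k8s_openapi::api::core::v1::{Node, Pod};
use kube::api::{Api, DeleteParams, ListParams};
use kube::Client;

/// Sketch: poll the node every `check_interval`; if it has been NotReady
/// continuously for `not_ready_grace`, delete the router pod (if one runs
/// on this node) so it can be rescheduled elsewhere.
async fn node_watchdog(client: Client, node_name: &str) -> Result<(), kube::Error> {
    let check_interval = Duration::from_secs(10); // "X seconds" (illustrative)
    let not_ready_grace = Duration::from_secs(60); // "Y seconds" (illustrative)
    let nodes: Api<Node> = Api::all(client.clone());
    let mut not_ready_since: Option<tokio::time::Instant> = None;

    loop {
        tokio::time::sleep(check_interval).await;
        let ready = match nodes.get(node_name).await {
            Ok(node) => node
                .status
                .and_then(|s| s.conditions)
                .map(|cs| cs.iter().any(|c| c.type_ == "Ready" && c.status == "True"))
                .unwrap_or(false),
            // Fail open: an API error is not evidence the node is unhealthy.
            Err(_) => true,
        };

        if ready {
            not_ready_since = None;
            continue;
        }
        let since = *not_ready_since.get_or_insert_with(tokio::time::Instant::now);
        if since.elapsed() >= not_ready_grace {
            // Illustrative label selector; the real pod selection may differ.
            let pods: Api<Pod> = Api::all(client.clone());
            let router_pods = pods
                .list(
                    &ListParams::default()
                        .labels("app=router")
                        .fields(&format!("spec.nodeName={node_name}")),
                )
                .await?;
            for pod in router_pods.items {
                if let (Some(name), Some(ns)) = (pod.metadata.name, pod.metadata.namespace) {
                    let ns_pods: Api<Pod> = Api::namespaced(client.clone(), &ns);
                    let _ = ns_pods.delete(&name, &DeleteParams::default()).await;
                }
            }
            not_ready_since = None;
        }
    }
}
```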
64 changed files with 3739 additions and 4221 deletions

Cargo.lock (generated, 2493 changed lines)

File diff suppressed because it is too large.

View File

@@ -2,7 +2,6 @@
resolver = "2"
members = [
"private_repos/*",
"examples/*",
"harmony",
"harmony_types",
"harmony_macros",
@@ -17,9 +16,9 @@ members = [
"harmony_secret_derive",
"harmony_secret",
"adr/agent_discovery/mdns",
"brocade",
"harmony_agent",
"harmony_agent/deploy", "harmony_node_readiness",
"brocade",
"harmony_agent",
"harmony_agent/deploy", "harmony_node_readiness", "harmony-k8s",
]
[workspace.package]
@@ -38,6 +37,8 @@ tokio = { version = "1.40", features = [
"macros",
"rt-multi-thread",
] }
tokio-retry = "0.3.0"
tokio-util = "0.7.15"
cidr = { features = ["serde"], version = "0.2" }
russh = "0.45"
russh-keys = "0.45"

View File

@@ -1,8 +1,7 @@
use super::BrocadeClient;
use crate::{
BrocadeInfo, Error, ExecutionMode, InterSwitchLink, InterfaceInfo, MacAddressEntry,
PortChannelId, PortOperatingMode, SecurityLevel, parse_brocade_mac_address,
shell::BrocadeShell,
PortChannelId, PortOperatingMode, parse_brocade_mac_address, shell::BrocadeShell,
};
use async_trait::async_trait;

View File

@@ -8,7 +8,7 @@ use regex::Regex;
use crate::{
BrocadeClient, BrocadeInfo, Error, ExecutionMode, InterSwitchLink, InterfaceInfo,
InterfaceStatus, InterfaceType, MacAddressEntry, PortChannelId, PortOperatingMode,
SecurityLevel, parse_brocade_mac_address, shell::BrocadeShell,
parse_brocade_mac_address, shell::BrocadeShell,
};
#[derive(Debug)]

View File

@@ -1,8 +1,8 @@
use harmony::{
inventory::Inventory,
modules::cert_manager::{
capability::CertificateManagementConfig, score_cert_management::CertificateManagementScore,
score_certificate::CertificateScore, score_issuer::CertificateIssuerScore,
capability::CertificateManagementConfig, score_certificate::CertificateScore,
score_issuer::CertificateIssuerScore,
},
topology::K8sAnywhereTopology,
};

View File

@@ -10,9 +10,10 @@ publish = false
harmony = { path = "../../harmony" }
harmony_cli = { path = "../../harmony_cli" }
harmony_types = { path = "../../harmony_types" }
harmony_macros = { path = "../../harmony_macros" }
harmony-k8s = { path = "../../harmony-k8s" }
cidr.workspace = true
tokio.workspace = true
harmony_macros = { path = "../../harmony_macros" }
log.workspace = true
env_logger.workspace = true
url.workspace = true

View File

@@ -1,6 +1,6 @@
use std::time::Duration;
use harmony::topology::k8s::{DrainOptions, K8sClient};
use harmony_k8s::{DrainOptions, K8sClient};
use log::{info, trace};
#[tokio::main]

View File

@@ -10,9 +10,10 @@ publish = false
harmony = { path = "../../harmony" }
harmony_cli = { path = "../../harmony_cli" }
harmony_types = { path = "../../harmony_types" }
harmony_macros = { path = "../../harmony_macros" }
harmony-k8s = { path = "../../harmony-k8s" }
cidr.workspace = true
tokio.workspace = true
harmony_macros = { path = "../../harmony_macros" }
log.workspace = true
env_logger.workspace = true
url.workspace = true

View File

@@ -1,4 +1,4 @@
use harmony::topology::k8s::{DrainOptions, K8sClient, NodeFile};
use harmony_k8s::{K8sClient, NodeFile};
use log::{info, trace};
#[tokio::main]

View File

@@ -5,7 +5,7 @@ use harmony::{
#[tokio::main]
async fn main() {
let openbao = OpenbaoScore {
host: String::new(),
host: "openbao.sebastien.sto1.nationtech.io".to_string(),
};
harmony_cli::run(

View File

@@ -1,5 +1,3 @@
use std::str::FromStr;
use harmony::{
inventory::Inventory,
modules::{k8s::apps::OperatorHubCatalogSourceScore, postgresql::CloudNativePgOperatorScore},
@@ -9,7 +7,7 @@ use harmony::{
#[tokio::main]
async fn main() {
let operatorhub_catalog = OperatorHubCatalogSourceScore::default();
let cnpg_operator = CloudNativePgOperatorScore::default();
let cnpg_operator = CloudNativePgOperatorScore::default_openshift();
harmony_cli::run(
Inventory::autoload(),

View File

@@ -1,22 +1,13 @@
use std::{
net::{IpAddr, Ipv4Addr},
sync::Arc,
};
use std::sync::Arc;
use async_trait::async_trait;
use cidr::Ipv4Cidr;
use harmony::{
executors::ExecutorError,
hardware::{HostCategory, Location, PhysicalHost, SwitchGroup},
infra::opnsense::OPNSenseManagementInterface,
inventory::Inventory,
modules::opnsense::node_exporter::NodeExporterScore,
topology::{
HAClusterTopology, LogicalHost, PreparationError, PreparationOutcome, Topology,
UnmanagedRouter, node_exporter::NodeExporter,
},
topology::{PreparationError, PreparationOutcome, Topology, node_exporter::NodeExporter},
};
use harmony_macros::{ip, ipv4, mac_address};
use harmony_macros::ip;
#[derive(Debug)]
struct OpnSenseTopology {

View File

@@ -1,8 +1,7 @@
use harmony::{
inventory::Inventory,
modules::postgresql::{
K8sPostgreSQLScore, PostgreSQLConnectionScore, PublicPostgreSQLScore,
capability::PostgreSQLConfig,
PostgreSQLConnectionScore, PublicPostgreSQLScore, capability::PostgreSQLConfig,
},
topology::K8sAnywhereTopology,
};

View File

@@ -1,4 +1,4 @@
use std::{collections::HashMap, path::PathBuf, sync::Arc};
use std::{path::PathBuf, sync::Arc};
use harmony::{
inventory::Inventory,

View File

@@ -1,4 +1,4 @@
use std::{collections::HashMap, path::PathBuf, sync::Arc};
use std::{path::PathBuf, sync::Arc};
use harmony::{
inventory::Inventory,

View File

@@ -0,0 +1,14 @@
[package]
name = "example-zitadel"
edition = "2024"
version.workspace = true
readme.workspace = true
license.workspace = true
[dependencies]
harmony = { path = "../../harmony" }
harmony_cli = { path = "../../harmony_cli" }
harmony_macros = { path = "../../harmony_macros" }
harmony_types = { path = "../../harmony_types" }
tokio.workspace = true
url.workspace = true

View File

@@ -0,0 +1,20 @@
use harmony::{
inventory::Inventory, modules::zitadel::ZitadelScore, topology::K8sAnywhereTopology,
};
#[tokio::main]
async fn main() {
let zitadel = ZitadelScore {
host: "sso.sto1.nationtech.io".to_string(),
zitadel_version: "v4.12.1".to_string(),
};
harmony_cli::run(
Inventory::autoload(),
K8sAnywhereTopology::from_env(),
vec![Box::new(zitadel)],
None,
)
.await
.unwrap();
}

Binary file not shown.

harmony-k8s/Cargo.toml (new file, 23 lines)

@@ -0,0 +1,23 @@
[package]
name = "harmony-k8s"
edition = "2024"
version.workspace = true
readme.workspace = true
license.workspace = true
[dependencies]
kube.workspace = true
k8s-openapi.workspace = true
tokio.workspace = true
tokio-retry.workspace = true
serde.workspace = true
serde_json.workspace = true
serde_yaml.workspace = true
log.workspace = true
similar.workspace = true
reqwest.workspace = true
url.workspace = true
inquire.workspace = true
[dev-dependencies]
pretty_assertions.workspace = true

harmony-k8s/src/apply.rs (new file, 593 lines)

@@ -0,0 +1,593 @@
use kube::{
Client, Error, Resource,
api::{
Api, ApiResource, DynamicObject, GroupVersionKind, Patch, PatchParams, PostParams,
ResourceExt,
},
core::ErrorResponse,
discovery::Scope,
error::DiscoveryError,
};
use log::{debug, error, trace, warn};
use serde::{Serialize, de::DeserializeOwned};
use serde_json::Value;
use similar::TextDiff;
use url::Url;
use crate::client::K8sClient;
use crate::helper;
use crate::types::WriteMode;
/// The field-manager token sent with every server-side apply request.
pub const FIELD_MANAGER: &str = "harmony-k8s";
// ── Private helpers ──────────────────────────────────────────────────────────
/// Serialise any `Serialize` payload to a [`DynamicObject`] via JSON.
fn to_dynamic<T: Serialize>(payload: &T) -> Result<DynamicObject, Error> {
serde_json::from_value(serde_json::to_value(payload).map_err(Error::SerdeError)?)
.map_err(Error::SerdeError)
}
/// Fetch the current resource, display a unified diff against `payload`, and
/// return `()`. All output goes to stdout (same behaviour as before).
///
/// A 404 is treated as "resource would be created" — not an error.
async fn show_dry_run<T: Serialize>(
api: &Api<DynamicObject>,
name: &str,
payload: &T,
) -> Result<(), Error> {
let new_yaml = serde_yaml::to_string(payload)
.unwrap_or_else(|_| "Failed to serialize new resource".to_string());
match api.get(name).await {
Ok(current) => {
println!("\nDry-run for resource: '{name}'");
let mut current_val = serde_yaml::to_value(&current).unwrap_or(serde_yaml::Value::Null);
if let Some(map) = current_val.as_mapping_mut() {
map.remove(&serde_yaml::Value::String("status".to_string()));
}
let current_yaml = serde_yaml::to_string(&current_val)
.unwrap_or_else(|_| "Failed to serialize current resource".to_string());
if current_yaml == new_yaml {
println!("No changes detected.");
} else {
println!("Changes detected:");
let diff = TextDiff::from_lines(&current_yaml, &new_yaml);
for change in diff.iter_all_changes() {
let sign = match change.tag() {
similar::ChangeTag::Delete => "-",
similar::ChangeTag::Insert => "+",
similar::ChangeTag::Equal => " ",
};
print!("{sign}{change}");
}
}
Ok(())
}
Err(Error::Api(ErrorResponse { code: 404, .. })) => {
println!("\nDry-run for new resource: '{name}'");
println!("Resource does not exist. Would be created:");
for line in new_yaml.lines() {
println!("+{line}");
}
Ok(())
}
Err(e) => {
error!("Failed to fetch resource '{name}' for dry-run: {e}");
Err(e)
}
}
}
/// Execute the real (non-dry-run) apply, respecting [`WriteMode`].
async fn do_apply<T: Serialize + std::fmt::Debug>(
api: &Api<DynamicObject>,
name: &str,
payload: &T,
patch_params: &PatchParams,
write_mode: &WriteMode,
) -> Result<DynamicObject, Error> {
match write_mode {
WriteMode::CreateOrUpdate => {
// TODO refactor this arm to perform self.update and if fail with 404 self.create
// This will avoid the repetition of the api.patch and api.create calls within this
// function body. This makes the code more maintainable
match api.patch(name, patch_params, &Patch::Apply(payload)).await {
Ok(obj) => Ok(obj),
Err(Error::Api(ErrorResponse { code: 404, .. })) => {
debug!("Resource '{name}' not found via SSA, falling back to POST");
let dyn_obj = to_dynamic(payload)?;
api.create(&PostParams::default(), &dyn_obj)
.await
.map_err(|e| {
error!("Failed to create '{name}': {e}");
e
})
}
Err(e) => {
error!("Failed to apply '{name}': {e}");
Err(e)
}
}
}
WriteMode::Create => {
let dyn_obj = to_dynamic(payload)?;
api.create(&PostParams::default(), &dyn_obj)
.await
.map_err(|e| {
error!("Failed to create '{name}': {e}");
e
})
}
WriteMode::Update => match api.patch(name, patch_params, &Patch::Apply(payload)).await {
Ok(obj) => Ok(obj),
Err(Error::Api(ErrorResponse { code: 404, .. })) => Err(Error::Api(ErrorResponse {
code: 404,
message: format!("Resource '{name}' not found and WriteMode is UpdateOnly"),
reason: "NotFound".to_string(),
status: "Failure".to_string(),
})),
Err(e) => {
error!("Failed to update '{name}': {e}");
Err(e)
}
},
}
}
// ── Public API ───────────────────────────────────────────────────────────────
impl K8sClient {
/// Server-side apply: create if absent, update if present.
/// Equivalent to `kubectl apply`.
pub async fn apply<K>(&self, resource: &K, namespace: Option<&str>) -> Result<K, Error>
where
K: Resource + Clone + std::fmt::Debug + DeserializeOwned + Serialize,
<K as Resource>::DynamicType: Default,
{
self.apply_with_strategy(resource, namespace, WriteMode::CreateOrUpdate)
.await
}
/// POST only — returns an error if the resource already exists.
pub async fn create<K>(&self, resource: &K, namespace: Option<&str>) -> Result<K, Error>
where
K: Resource + Clone + std::fmt::Debug + DeserializeOwned + Serialize,
<K as Resource>::DynamicType: Default,
{
self.apply_with_strategy(resource, namespace, WriteMode::Create)
.await
}
/// Server-side apply only — returns an error if the resource does not exist.
pub async fn update<K>(&self, resource: &K, namespace: Option<&str>) -> Result<K, Error>
where
K: Resource + Clone + std::fmt::Debug + DeserializeOwned + Serialize,
<K as Resource>::DynamicType: Default,
{
self.apply_with_strategy(resource, namespace, WriteMode::Update)
.await
}
pub async fn apply_with_strategy<K>(
&self,
resource: &K,
namespace: Option<&str>,
write_mode: WriteMode,
) -> Result<K, Error>
where
K: Resource + Clone + std::fmt::Debug + DeserializeOwned + Serialize,
<K as Resource>::DynamicType: Default,
{
debug!(
"apply_with_strategy: {:?} ns={:?}",
resource.meta().name,
namespace
);
trace!("{:#}", serde_json::to_value(resource).unwrap_or_default());
let dyntype = K::DynamicType::default();
let gvk = GroupVersionKind {
group: K::group(&dyntype).to_string(),
version: K::version(&dyntype).to_string(),
kind: K::kind(&dyntype).to_string(),
};
let discovery = self.discovery().await?;
let (ar, caps) = discovery.resolve_gvk(&gvk).ok_or_else(|| {
Error::Discovery(DiscoveryError::MissingResource(format!(
"Cannot resolve GVK: {gvk:?}"
)))
})?;
let effective_ns = if caps.scope == Scope::Cluster {
None
} else {
namespace.or_else(|| resource.meta().namespace.as_deref())
};
let api: Api<DynamicObject> =
get_dynamic_api(ar, caps, self.client.clone(), effective_ns, false);
let name = resource
.meta()
.name
.as_deref()
.expect("Kubernetes resource must have a name");
if self.dry_run {
show_dry_run(&api, name, resource).await?;
return Ok(resource.clone());
}
let patch_params = PatchParams::apply(FIELD_MANAGER);
do_apply(&api, name, resource, &patch_params, &write_mode)
.await
.and_then(helper::dyn_to_typed)
}
/// Applies resources in order, one at a time
pub async fn apply_many<K>(&self, resources: &[K], ns: Option<&str>) -> Result<Vec<K>, Error>
where
K: Resource + Clone + std::fmt::Debug + DeserializeOwned + Serialize,
<K as Resource>::DynamicType: Default,
{
let mut result = Vec::new();
for r in resources.iter() {
let res = self.apply(r, ns).await;
if res.is_err() {
// NOTE: this may log sensitive data; downgrade to debug if needed.
warn!(
"Failed to apply k8s resource: {}",
serde_json::to_string_pretty(r).map_err(Error::SerdeError)?
);
}
result.push(res?);
}
Ok(result)
}
/// Apply a [`DynamicObject`] resource using server-side apply.
pub async fn apply_dynamic(
&self,
resource: &DynamicObject,
namespace: Option<&str>,
force_conflicts: bool,
) -> Result<DynamicObject, Error> {
trace!("apply_dynamic {resource:#?} ns={namespace:?} force={force_conflicts}");
let discovery = self.discovery().await?;
let type_meta = resource.types.as_ref().ok_or_else(|| {
Error::BuildRequest(kube::core::request::Error::Validation(
"DynamicObject must have types (apiVersion and kind)".to_string(),
))
})?;
let gvk = GroupVersionKind::try_from(type_meta).map_err(|_| {
Error::BuildRequest(kube::core::request::Error::Validation(format!(
"Invalid GVK in DynamicObject: {type_meta:?}"
)))
})?;
let (ar, caps) = discovery.resolve_gvk(&gvk).ok_or_else(|| {
Error::Discovery(DiscoveryError::MissingResource(format!(
"Cannot resolve GVK: {gvk:?}"
)))
})?;
let effective_ns = if caps.scope == Scope::Cluster {
None
} else {
namespace.or_else(|| resource.metadata.namespace.as_deref())
};
let api = get_dynamic_api(ar, caps, self.client.clone(), effective_ns, false);
let name = resource.metadata.name.as_deref().ok_or_else(|| {
Error::BuildRequest(kube::core::request::Error::Validation(
"DynamicObject must have metadata.name".to_string(),
))
})?;
debug!(
"apply_dynamic kind={:?} name='{name}' ns={effective_ns:?}",
resource.types.as_ref().map(|t| &t.kind),
);
// NOTE would be nice to improve cohesion between the dynamic and typed apis and avoid copy
// pasting the dry_run and some more logic
if self.dry_run {
show_dry_run(&api, name, resource).await?;
return Ok(resource.clone());
}
let mut patch_params = PatchParams::apply(FIELD_MANAGER);
patch_params.force = force_conflicts;
do_apply(
&api,
name,
resource,
&patch_params,
&WriteMode::CreateOrUpdate,
)
.await
}
pub async fn apply_dynamic_many(
&self,
resources: &[DynamicObject],
namespace: Option<&str>,
force_conflicts: bool,
) -> Result<Vec<DynamicObject>, Error> {
let mut result = Vec::new();
for r in resources.iter() {
result.push(self.apply_dynamic(r, namespace, force_conflicts).await?);
}
Ok(result)
}
pub async fn apply_yaml_many(
&self,
#[allow(clippy::ptr_arg)] yaml: &Vec<serde_yaml::Value>,
ns: Option<&str>,
) -> Result<(), Error> {
for y in yaml.iter() {
self.apply_yaml(y, ns).await?;
}
Ok(())
}
pub async fn apply_yaml(
&self,
yaml: &serde_yaml::Value,
ns: Option<&str>,
) -> Result<(), Error> {
// NOTE wouldn't it be possible to parse this into a DynamicObject and simply call
// apply_dynamic instead of reimplementing api interactions?
let obj: DynamicObject =
serde_yaml::from_value(yaml.clone()).expect("YAML must deserialise to DynamicObject");
let name = obj.metadata.name.as_ref().expect("YAML must have a name");
let api_version = yaml["apiVersion"].as_str().expect("missing apiVersion");
let kind = yaml["kind"].as_str().expect("missing kind");
let mut it = api_version.splitn(2, '/');
let first = it.next().unwrap();
let (g, v) = match it.next() {
Some(second) => (first, second),
None => ("", first),
};
let api_resource = ApiResource::from_gvk(&GroupVersionKind::gvk(g, v, kind));
let namespace = ns.unwrap_or_else(|| {
obj.metadata
.namespace
.as_deref()
.expect("YAML must have a namespace when ns is not provided")
});
let api: Api<DynamicObject> =
Api::namespaced_with(self.client.clone(), namespace, &api_resource);
println!("Applying '{name}' in namespace '{namespace}'...");
let patch_params = PatchParams::apply(FIELD_MANAGER);
let result = api.patch(name, &patch_params, &Patch::Apply(&obj)).await?;
println!("Successfully applied '{}'.", result.name_any());
Ok(())
}
/// Equivalent to `kubectl apply -f <url>`.
pub async fn apply_url(&self, url: Url, ns: Option<&str>) -> Result<(), Error> {
let patch_params = PatchParams::apply(FIELD_MANAGER);
let discovery = self.discovery().await?;
let yaml = reqwest::get(url)
.await
.expect("Could not fetch URL")
.text()
.await
.expect("Could not read response body");
for doc in multidoc_deserialize(&yaml).expect("Failed to parse YAML from URL") {
let obj: DynamicObject =
serde_yaml::from_value(doc).expect("YAML document is not a valid object");
let namespace = obj.metadata.namespace.as_deref().or(ns);
let type_meta = obj.types.as_ref().expect("Object is missing TypeMeta");
let gvk =
GroupVersionKind::try_from(type_meta).expect("Object has invalid GroupVersionKind");
let name = obj.name_any();
if let Some((ar, caps)) = discovery.resolve_gvk(&gvk) {
let api = get_dynamic_api(ar, caps, self.client.clone(), namespace, false);
trace!(
"Applying {}:\n{}",
gvk.kind,
serde_yaml::to_string(&obj).unwrap_or_default()
);
let data: Value = serde_json::to_value(&obj).expect("serialisation failed");
let _r = api.patch(&name, &patch_params, &Patch::Apply(data)).await?;
debug!("Applied {} '{name}'", gvk.kind);
} else {
warn!("Skipping document with unknown GVK: {gvk:?}");
}
}
Ok(())
}
/// Build a dynamic API client from a [`DynamicObject`]'s type metadata.
pub(crate) fn get_api_for_dynamic_object(
&self,
object: &DynamicObject,
ns: Option<&str>,
) -> Result<Api<DynamicObject>, Error> {
let ar = object
.types
.as_ref()
.and_then(|t| {
let parts: Vec<&str> = t.api_version.split('/').collect();
match parts.as_slice() {
[version] => Some(ApiResource::from_gvk(&GroupVersionKind::gvk(
"", version, &t.kind,
))),
[group, version] => Some(ApiResource::from_gvk(&GroupVersionKind::gvk(
group, version, &t.kind,
))),
_ => None,
}
})
.ok_or_else(|| {
Error::BuildRequest(kube::core::request::Error::Validation(format!(
"Invalid apiVersion in DynamicObject: {object:#?}"
)))
})?;
Ok(match ns {
Some(ns) => Api::namespaced_with(self.client.clone(), ns, &ar),
None => Api::default_namespaced_with(self.client.clone(), &ar),
})
}
}
// ── Free functions ───────────────────────────────────────────────────────────
pub(crate) fn get_dynamic_api(
resource: kube::api::ApiResource,
capabilities: kube::discovery::ApiCapabilities,
client: Client,
ns: Option<&str>,
all: bool,
) -> Api<DynamicObject> {
if capabilities.scope == Scope::Cluster || all {
Api::all_with(client, &resource)
} else if let Some(namespace) = ns {
Api::namespaced_with(client, namespace, &resource)
} else {
Api::default_namespaced_with(client, &resource)
}
}
pub(crate) fn multidoc_deserialize(
data: &str,
) -> Result<Vec<serde_yaml::Value>, serde_yaml::Error> {
use serde::Deserialize;
let mut docs = vec![];
for de in serde_yaml::Deserializer::from_str(data) {
docs.push(serde_yaml::Value::deserialize(de)?);
}
Ok(docs)
}
// ── Tests ────────────────────────────────────────────────────────────────────
#[cfg(test)]
mod apply_tests {
use std::collections::BTreeMap;
use std::time::{SystemTime, UNIX_EPOCH};
use k8s_openapi::api::core::v1::ConfigMap;
use k8s_openapi::apimachinery::pkg::apis::meta::v1::ObjectMeta;
use kube::api::{DeleteParams, TypeMeta};
use super::*;
#[tokio::test]
#[ignore = "requires kubernetes cluster"]
async fn apply_creates_new_configmap() {
let client = K8sClient::try_default().await.unwrap();
let ns = "default";
let name = format!(
"test-cm-{}",
SystemTime::now()
.duration_since(UNIX_EPOCH)
.unwrap()
.as_millis()
);
let cm = ConfigMap {
metadata: ObjectMeta {
name: Some(name.clone()),
namespace: Some(ns.to_string()),
..Default::default()
},
data: Some(BTreeMap::from([("key1".to_string(), "value1".to_string())])),
..Default::default()
};
assert!(client.apply(&cm, Some(ns)).await.is_ok());
let api: Api<ConfigMap> = Api::namespaced(client.client.clone(), ns);
let _ = api.delete(&name, &DeleteParams::default()).await;
}
#[tokio::test]
#[ignore = "requires kubernetes cluster"]
async fn apply_is_idempotent() {
let client = K8sClient::try_default().await.unwrap();
let ns = "default";
let name = format!(
"test-idem-{}",
SystemTime::now()
.duration_since(UNIX_EPOCH)
.unwrap()
.as_millis()
);
let cm = ConfigMap {
metadata: ObjectMeta {
name: Some(name.clone()),
namespace: Some(ns.to_string()),
..Default::default()
},
data: Some(BTreeMap::from([("key".to_string(), "value".to_string())])),
..Default::default()
};
assert!(
client.apply(&cm, Some(ns)).await.is_ok(),
"first apply failed"
);
assert!(
client.apply(&cm, Some(ns)).await.is_ok(),
"second apply failed (not idempotent)"
);
let api: Api<ConfigMap> = Api::namespaced(client.client.clone(), ns);
let _ = api.delete(&name, &DeleteParams::default()).await;
}
#[tokio::test]
#[ignore = "requires kubernetes cluster"]
async fn apply_dynamic_creates_new_resource() {
let client = K8sClient::try_default().await.unwrap();
let ns = "default";
let name = format!(
"test-dyn-{}",
SystemTime::now()
.duration_since(UNIX_EPOCH)
.unwrap()
.as_millis()
);
let obj = DynamicObject {
types: Some(TypeMeta {
api_version: "v1".to_string(),
kind: "ConfigMap".to_string(),
}),
metadata: ObjectMeta {
name: Some(name.clone()),
namespace: Some(ns.to_string()),
..Default::default()
},
data: serde_json::json!({}),
};
let result = client.apply_dynamic(&obj, Some(ns), false).await;
assert!(result.is_ok(), "apply_dynamic failed: {:?}", result.err());
let api: Api<ConfigMap> = Api::namespaced(client.client.clone(), ns);
let _ = api.delete(&name, &DeleteParams::default()).await;
}
}
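The three write strategies above map onto kubectl-like semantics; a short hedged usage sketch (the namespace and resource are illustrative, and the calls are alternatives rather than a sequence to run as-is):

```rust
use harmony_k8s::{K8sClient, WriteMode};
use k8s_openapi::api::core::v1::ConfigMap;

async fn demo(client: &K8sClient, cm: &ConfigMap) -> Result<(), kube::Error> {
    // `kubectl apply` semantics: create if absent, server-side apply if present.
    let _ = client.apply(cm, Some("default")).await?;
    // POST only: fails if the ConfigMap already exists.
    let _ = client.create(cm, Some("default")).await?;
    // Server-side apply only: fails with 404 if the ConfigMap is absent.
    let _ = client.update(cm, Some("default")).await?;
    // Or select the strategy explicitly.
    let _ = client
        .apply_with_strategy(cm, Some("default"), WriteMode::CreateOrUpdate)
        .await?;
    Ok(())
}
```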

View File

@@ -25,9 +25,9 @@
//!
//! ## Example
//!
//! ```rust,no_run
//! use harmony::topology::k8s::{K8sClient, helper};
//! use harmony::topology::KubernetesDistribution;
//! ```
//! use harmony_k8s::{K8sClient, helper};
//! use harmony_k8s::KubernetesDistribution;
//!
//! async fn write_network_config(client: &K8sClient, node: &str) {
//! // Create a bundle with platform-specific RBAC
@@ -56,7 +56,7 @@ use kube::{Error, Resource, ResourceExt, api::DynamicObject};
use serde::Serialize;
use serde_json;
use crate::domain::topology::k8s::K8sClient;
use crate::K8sClient;
/// A ResourceBundle represents a logical unit of work consisting of multiple
/// Kubernetes resources that should be applied or deleted together.

harmony-k8s/src/client.rs (new file, 99 lines)

@@ -0,0 +1,99 @@
use std::sync::Arc;
use kube::config::{KubeConfigOptions, Kubeconfig};
use kube::{Client, Config, Discovery, Error};
use log::error;
use serde::Serialize;
use tokio::sync::OnceCell;
use crate::types::KubernetesDistribution;
// TODO not cool, should use a proper configuration mechanism
// cli arg, env var, config file
fn read_dry_run_from_env() -> bool {
std::env::var("DRY_RUN")
.map(|v| v == "true" || v == "1")
.unwrap_or(false)
}
#[derive(Clone)]
pub struct K8sClient {
pub(crate) client: Client,
/// When `true` no mutation is sent to the API server; diffs are printed
/// to stdout instead. Initialised from the `DRY_RUN` environment variable.
pub(crate) dry_run: bool,
pub(crate) k8s_distribution: Arc<OnceCell<KubernetesDistribution>>,
pub(crate) discovery: Arc<OnceCell<Discovery>>,
}
impl Serialize for K8sClient {
fn serialize<S>(&self, _serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
todo!("K8sClient serialization is not meaningful; remove this impl if unused")
}
}
impl std::fmt::Debug for K8sClient {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.write_fmt(format_args!(
"K8sClient {{ namespace: {}, dry_run: {} }}",
self.client.default_namespace(),
self.dry_run,
))
}
}
impl K8sClient {
/// Create a client, reading `DRY_RUN` from the environment.
pub fn new(client: Client) -> Self {
Self {
dry_run: read_dry_run_from_env(),
client,
k8s_distribution: Arc::new(OnceCell::new()),
discovery: Arc::new(OnceCell::new()),
}
}
/// Create a client that always operates in dry-run mode, regardless of
/// the environment variable.
pub fn new_dry_run(client: Client) -> Self {
Self {
dry_run: true,
..Self::new(client)
}
}
/// Returns `true` if this client is operating in dry-run mode.
pub fn is_dry_run(&self) -> bool {
self.dry_run
}
pub async fn try_default() -> Result<Self, Error> {
Ok(Self::new(Client::try_default().await?))
}
pub async fn from_kubeconfig(path: &str) -> Option<Self> {
Self::from_kubeconfig_with_opts(path, &KubeConfigOptions::default()).await
}
pub async fn from_kubeconfig_with_context(path: &str, context: Option<String>) -> Option<Self> {
let mut opts = KubeConfigOptions::default();
opts.context = context;
Self::from_kubeconfig_with_opts(path, &opts).await
}
pub async fn from_kubeconfig_with_opts(path: &str, opts: &KubeConfigOptions) -> Option<Self> {
let k = match Kubeconfig::read_from(path) {
Ok(k) => k,
Err(e) => {
error!("Failed to load kubeconfig from {path}: {e}");
return None;
}
};
Some(Self::new(
Client::try_from(Config::from_custom_kubeconfig(k, opts).await.unwrap()).unwrap(),
))
}
}

View File

@@ -0,0 +1,83 @@
use std::time::Duration;
use kube::{Discovery, Error};
use log::{debug, error, info, trace, warn};
use tokio::sync::Mutex;
use tokio_retry::{Retry, strategy::ExponentialBackoff};
use crate::client::K8sClient;
use crate::types::KubernetesDistribution;
impl K8sClient {
pub async fn get_apiserver_version(
&self,
) -> Result<k8s_openapi::apimachinery::pkg::version::Info, Error> {
self.client.clone().apiserver_version().await
}
/// Runs (and caches) Kubernetes API discovery with exponential-backoff retries.
pub async fn discovery(&self) -> Result<&Discovery, Error> {
let retry_strategy = ExponentialBackoff::from_millis(1000)
.max_delay(Duration::from_secs(32))
.take(6);
let attempt = Mutex::new(0u32);
Retry::spawn(retry_strategy, || async {
let mut n = attempt.lock().await;
*n += 1;
match self
.discovery
.get_or_try_init(async || {
debug!("Running Kubernetes API discovery (attempt {})", *n);
let d = Discovery::new(self.client.clone()).run().await?;
debug!("Kubernetes API discovery completed");
Ok(d)
})
.await
{
Ok(d) => Ok(d),
Err(e) => {
warn!("Kubernetes API discovery failed (attempt {}): {}", *n, e);
Err(e)
}
}
})
.await
.map_err(|e| {
error!("Kubernetes API discovery failed after all retries: {}", e);
e
})
}
/// Detect which Kubernetes distribution is running. Result is cached for
/// the lifetime of the client.
pub async fn get_k8s_distribution(&self) -> Result<KubernetesDistribution, Error> {
self.k8s_distribution
.get_or_try_init(async || {
debug!("Detecting Kubernetes distribution");
let api_groups = self.client.list_api_groups().await?;
trace!("list_api_groups: {:?}", api_groups);
let version = self.get_apiserver_version().await?;
if api_groups
.groups
.iter()
.any(|g| g.name == "project.openshift.io")
{
info!("Detected distribution: OpenshiftFamily");
return Ok(KubernetesDistribution::OpenshiftFamily);
}
if version.git_version.contains("k3s") {
info!("Detected distribution: K3sFamily");
return Ok(KubernetesDistribution::K3sFamily);
}
info!("Distribution not identified, using Default");
Ok(KubernetesDistribution::Default)
})
.await
.cloned()
}
}

View File

@@ -1,7 +1,7 @@
use std::collections::BTreeMap;
use std::time::Duration;
use crate::topology::KubernetesDistribution;
use crate::KubernetesDistribution;
use super::bundle::ResourceBundle;
use super::config::PRIVILEGED_POD_IMAGE;
@@ -133,9 +133,9 @@ pub fn host_root_volume() -> (Volume, VolumeMount) {
///
/// # Example
///
/// ```rust,no_run
/// # use harmony::topology::k8s::helper::{build_privileged_bundle, PrivilegedPodConfig};
/// # use harmony::topology::KubernetesDistribution;
/// ```
/// use harmony_k8s::helper::{build_privileged_bundle, PrivilegedPodConfig};
/// use harmony_k8s::KubernetesDistribution;
/// let bundle = build_privileged_bundle(
/// PrivilegedPodConfig {
/// name: "network-setup".to_string(),

harmony-k8s/src/lib.rs (new file, 13 lines)

@@ -0,0 +1,13 @@
pub mod apply;
pub mod bundle;
pub mod client;
pub mod config;
pub mod discovery;
pub mod helper;
pub mod node;
pub mod pod;
pub mod resources;
pub mod types;
pub use client::K8sClient;
pub use types::{DrainOptions, KubernetesDistribution, NodeFile, ScopeResolver, WriteMode};

harmony-k8s/src/main.rs (new file, 3 lines)

@@ -0,0 +1,3 @@
fn main() {
println!("Hello, world!");
}

harmony-k8s/src/node.rs (new file, 722 lines)

@@ -0,0 +1,722 @@
use std::collections::BTreeMap;
use std::time::{Duration, SystemTime, UNIX_EPOCH};
use k8s_openapi::api::core::v1::{
ConfigMap, ConfigMapVolumeSource, Node, Pod, Volume, VolumeMount,
};
use k8s_openapi::apimachinery::pkg::apis::meta::v1::ObjectMeta;
use kube::{
Error,
api::{Api, DeleteParams, EvictParams, ListParams, PostParams},
core::ErrorResponse,
error::DiscoveryError,
};
use log::{debug, error, info, warn};
use tokio::time::sleep;
use crate::client::K8sClient;
use crate::helper::{self, PrivilegedPodConfig};
use crate::types::{DrainOptions, NodeFile};
impl K8sClient {
pub async fn cordon_node(&self, node_name: &str) -> Result<(), Error> {
Api::<Node>::all(self.client.clone())
.cordon(node_name)
.await?;
Ok(())
}
pub async fn uncordon_node(&self, node_name: &str) -> Result<(), Error> {
Api::<Node>::all(self.client.clone())
.uncordon(node_name)
.await?;
Ok(())
}
pub async fn wait_for_node_ready(&self, node_name: &str) -> Result<(), Error> {
self.wait_for_node_ready_with_timeout(node_name, Duration::from_secs(600))
.await
}
async fn wait_for_node_ready_with_timeout(
&self,
node_name: &str,
timeout: Duration,
) -> Result<(), Error> {
let api: Api<Node> = Api::all(self.client.clone());
let start = tokio::time::Instant::now();
let poll = Duration::from_secs(5);
loop {
if start.elapsed() > timeout {
return Err(Error::Discovery(DiscoveryError::MissingResource(format!(
"Node '{node_name}' did not become Ready within {timeout:?}"
))));
}
match api.get(node_name).await {
Ok(node) => {
if node
.status
.as_ref()
.and_then(|s| s.conditions.as_ref())
.map(|conds| {
conds
.iter()
.any(|c| c.type_ == "Ready" && c.status == "True")
})
.unwrap_or(false)
{
debug!("Node '{node_name}' is Ready");
return Ok(());
}
}
Err(e) => debug!("Error polling node '{node_name}': {e}"),
}
sleep(poll).await;
}
}
async fn wait_for_node_not_ready(
&self,
node_name: &str,
timeout: Duration,
) -> Result<(), Error> {
let api: Api<Node> = Api::all(self.client.clone());
let start = tokio::time::Instant::now();
let poll = Duration::from_secs(5);
loop {
if start.elapsed() > timeout {
return Err(Error::Discovery(DiscoveryError::MissingResource(format!(
"Node '{node_name}' did not become NotReady within {timeout:?}"
))));
}
match api.get(node_name).await {
Ok(node) => {
let is_ready = node
.status
.as_ref()
.and_then(|s| s.conditions.as_ref())
.map(|conds| {
conds
.iter()
.any(|c| c.type_ == "Ready" && c.status == "True")
})
.unwrap_or(false);
if !is_ready {
debug!("Node '{node_name}' is NotReady");
return Ok(());
}
}
Err(e) => debug!("Error polling node '{node_name}': {e}"),
}
sleep(poll).await;
}
}
async fn list_pods_on_node(&self, node_name: &str) -> Result<Vec<Pod>, Error> {
let api: Api<Pod> = Api::all(self.client.clone());
Ok(api
.list(&ListParams::default().fields(&format!("spec.nodeName={node_name}")))
.await?
.items)
}
fn is_mirror_pod(pod: &Pod) -> bool {
pod.metadata
.annotations
.as_ref()
.map(|a| a.contains_key("kubernetes.io/config.mirror"))
.unwrap_or(false)
}
fn is_daemonset_pod(pod: &Pod) -> bool {
pod.metadata
.owner_references
.as_ref()
.map(|refs| refs.iter().any(|r| r.kind == "DaemonSet"))
.unwrap_or(false)
}
fn has_emptydir_volume(pod: &Pod) -> bool {
pod.spec
.as_ref()
.and_then(|s| s.volumes.as_ref())
.map(|vols| vols.iter().any(|v| v.empty_dir.is_some()))
.unwrap_or(false)
}
fn is_completed_pod(pod: &Pod) -> bool {
pod.status
.as_ref()
.and_then(|s| s.phase.as_deref())
.map(|phase| phase == "Succeeded" || phase == "Failed")
.unwrap_or(false)
}
fn classify_pods_for_drain(
pods: &[Pod],
options: &DrainOptions,
) -> Result<(Vec<Pod>, Vec<String>), String> {
let mut evictable = Vec::new();
let mut skipped = Vec::new();
let mut blocking = Vec::new();
for pod in pods {
let name = pod.metadata.name.as_deref().unwrap_or("<unknown>");
let ns = pod.metadata.namespace.as_deref().unwrap_or("<unknown>");
let qualified = format!("{ns}/{name}");
if Self::is_mirror_pod(pod) {
skipped.push(format!("{qualified} (mirror pod)"));
continue;
}
if Self::is_completed_pod(pod) {
skipped.push(format!("{qualified} (completed)"));
continue;
}
if Self::is_daemonset_pod(pod) {
if options.ignore_daemonsets {
skipped.push(format!("{qualified} (DaemonSet-managed)"));
} else {
blocking.push(format!(
"{qualified} is managed by a DaemonSet (set ignore_daemonsets to skip)"
));
}
continue;
}
if Self::has_emptydir_volume(pod) && !options.delete_emptydir_data {
blocking.push(format!(
"{qualified} uses emptyDir volumes (set delete_emptydir_data to allow eviction)"
));
continue;
}
evictable.push(pod.clone());
}
if !blocking.is_empty() {
return Err(format!(
"Cannot drain node — the following pods block eviction:\n - {}",
blocking.join("\n - ")
));
}
Ok((evictable, skipped))
}
async fn evict_pod(&self, pod: &Pod) -> Result<(), Error> {
let name = pod.metadata.name.as_deref().unwrap_or_default();
let ns = pod.metadata.namespace.as_deref().unwrap_or_default();
debug!("Evicting pod {ns}/{name}");
Api::<Pod>::namespaced(self.client.clone(), ns)
.evict(name, &EvictParams::default())
.await
.map(|_| ())
}
/// Drains a node: cordon → classify → evict & wait.
pub async fn drain_node(&self, node_name: &str, options: &DrainOptions) -> Result<(), Error> {
debug!("Cordoning '{node_name}'");
self.cordon_node(node_name).await?;
let pods = self.list_pods_on_node(node_name).await?;
debug!("Found {} pod(s) on '{node_name}'", pods.len());
let (evictable, skipped) =
Self::classify_pods_for_drain(&pods, options).map_err(|msg| {
error!("{msg}");
Error::Discovery(DiscoveryError::MissingResource(msg))
})?;
for s in &skipped {
info!("Skipping pod: {s}");
}
if evictable.is_empty() {
info!("No pods to evict on '{node_name}'");
return Ok(());
}
info!("Evicting {} pod(s) from '{node_name}'", evictable.len());
let mut start = tokio::time::Instant::now();
let poll = Duration::from_secs(5);
let mut pending = evictable;
loop {
for pod in &pending {
match self.evict_pod(pod).await {
Ok(()) => {}
Err(Error::Api(ErrorResponse { code: 404, .. })) => {}
Err(Error::Api(ErrorResponse { code: 429, .. })) => {
warn!(
"PDB blocked eviction of {}/{}; will retry",
pod.metadata.namespace.as_deref().unwrap_or(""),
pod.metadata.name.as_deref().unwrap_or("")
);
}
Err(e) => {
error!(
"Failed to evict {}/{}: {e}",
pod.metadata.namespace.as_deref().unwrap_or(""),
pod.metadata.name.as_deref().unwrap_or("")
);
return Err(e);
}
}
}
sleep(poll).await;
let mut still_present = Vec::new();
for pod in pending {
let ns = pod.metadata.namespace.as_deref().unwrap_or_default();
let name = pod.metadata.name.as_deref().unwrap_or_default();
match self.get_pod(name, Some(ns)).await? {
Some(_) => still_present.push(pod),
None => debug!("Pod {ns}/{name} evicted"),
}
}
pending = still_present;
if pending.is_empty() {
break;
}
if start.elapsed() > options.timeout {
match helper::prompt_drain_timeout_action(
node_name,
pending.len(),
options.timeout,
)? {
helper::DrainTimeoutAction::Accept => break,
helper::DrainTimeoutAction::Retry => {
start = tokio::time::Instant::now();
continue;
}
helper::DrainTimeoutAction::Abort => {
return Err(Error::Discovery(DiscoveryError::MissingResource(format!(
"Drain aborted. {} pod(s) remaining on '{node_name}'",
pending.len()
))));
}
}
}
debug!("Waiting for {} pod(s) on '{node_name}'", pending.len());
}
debug!("'{node_name}' drained successfully");
Ok(())
}
/// Safely reboots a node: drain → reboot → wait for Ready → uncordon.
pub async fn reboot_node(
&self,
node_name: &str,
drain_options: &DrainOptions,
timeout: Duration,
) -> Result<(), Error> {
info!("Starting reboot for '{node_name}'");
let node_api: Api<Node> = Api::all(self.client.clone());
let boot_id_before = node_api
.get(node_name)
.await?
.status
.as_ref()
.and_then(|s| s.node_info.as_ref())
.map(|ni| ni.boot_id.clone())
.ok_or_else(|| {
Error::Discovery(DiscoveryError::MissingResource(format!(
"Node '{node_name}' has no boot_id in status"
)))
})?;
info!("Draining '{node_name}'");
self.drain_node(node_name, drain_options).await?;
let start = tokio::time::Instant::now();
info!("Scheduling reboot for '{node_name}'");
let reboot_cmd =
"echo rebooting ; nohup bash -c 'sleep 5 && nsenter -t 1 -m -- systemctl reboot'";
match self
.run_privileged_command_on_node(node_name, reboot_cmd)
.await
{
Ok(_) => debug!("Reboot command dispatched"),
Err(e) => debug!("Reboot command error (expected if node began shutdown): {e}"),
}
info!("Waiting for '{node_name}' to begin shutdown");
self.wait_for_node_not_ready(node_name, timeout.saturating_sub(start.elapsed()))
.await?;
if start.elapsed() > timeout {
return Err(Error::Discovery(DiscoveryError::MissingResource(format!(
"Timeout during reboot of '{node_name}' (shutdown phase)"
))));
}
info!("Waiting for '{node_name}' to come back online");
self.wait_for_node_ready_with_timeout(node_name, timeout.saturating_sub(start.elapsed()))
.await?;
if start.elapsed() > timeout {
return Err(Error::Discovery(DiscoveryError::MissingResource(format!(
"Timeout during reboot of '{node_name}' (ready phase)"
))));
}
let boot_id_after = node_api
.get(node_name)
.await?
.status
.as_ref()
.and_then(|s| s.node_info.as_ref())
.map(|ni| ni.boot_id.clone())
.ok_or_else(|| {
Error::Discovery(DiscoveryError::MissingResource(format!(
"Node '{node_name}' has no boot_id after reboot"
)))
})?;
if boot_id_before == boot_id_after {
return Err(Error::Discovery(DiscoveryError::MissingResource(format!(
"Node '{node_name}' did not actually reboot (boot_id unchanged: {boot_id_before})"
))));
}
info!("'{node_name}' rebooted ({boot_id_before} → {boot_id_after})");
self.uncordon_node(node_name).await?;
info!("'{node_name}' reboot complete ({:?})", start.elapsed());
Ok(())
}
/// Write a set of files to a node's filesystem via a privileged ephemeral pod.
pub async fn write_files_to_node(
&self,
node_name: &str,
files: &[NodeFile],
) -> Result<String, Error> {
let ns = self.client.default_namespace();
let suffix = SystemTime::now()
.duration_since(UNIX_EPOCH)
.unwrap()
.as_millis();
let name = format!("harmony-k8s-writer-{suffix}");
debug!("Writing {} file(s) to '{node_name}'", files.len());
let mut data = BTreeMap::new();
let mut script = String::from("set -e\n");
for (i, file) in files.iter().enumerate() {
let key = format!("f{i}");
data.insert(key.clone(), file.content.clone());
script.push_str(&format!("mkdir -p \"$(dirname \"/host{}\")\"\n", file.path));
script.push_str(&format!("cp \"/payload/{key}\" \"/host{}\"\n", file.path));
script.push_str(&format!("chmod {:o} \"/host{}\"\n", file.mode, file.path));
}
let cm = ConfigMap {
metadata: ObjectMeta {
name: Some(name.clone()),
namespace: Some(ns.to_string()),
..Default::default()
},
data: Some(data),
..Default::default()
};
let cm_api: Api<ConfigMap> = Api::namespaced(self.client.clone(), ns);
cm_api.create(&PostParams::default(), &cm).await?;
debug!("Created ConfigMap '{name}'");
let (host_vol, host_mount) = helper::host_root_volume();
let payload_vol = Volume {
name: "payload".to_string(),
config_map: Some(ConfigMapVolumeSource {
name: name.clone(),
..Default::default()
}),
..Default::default()
};
let payload_mount = VolumeMount {
name: "payload".to_string(),
mount_path: "/payload".to_string(),
..Default::default()
};
let bundle = helper::build_privileged_bundle(
PrivilegedPodConfig {
name: name.clone(),
namespace: ns.to_string(),
node_name: node_name.to_string(),
container_name: "writer".to_string(),
command: vec!["/bin/bash".to_string(), "-c".to_string(), script],
volumes: vec![payload_vol, host_vol],
volume_mounts: vec![payload_mount, host_mount],
host_pid: false,
host_network: false,
},
&self.get_k8s_distribution().await?,
);
bundle.apply(self).await?;
debug!("Created privileged pod bundle '{name}'");
let result = self.wait_for_pod_completion(&name, ns).await;
debug!("Cleaning up '{name}'");
let _ = bundle.delete(self).await;
let _ = cm_api.delete(&name, &DeleteParams::default()).await;
result
}
/// Run a privileged command on a node via an ephemeral pod.
pub async fn run_privileged_command_on_node(
&self,
node_name: &str,
command: &str,
) -> Result<String, Error> {
let namespace = self.client.default_namespace();
let suffix = SystemTime::now()
.duration_since(UNIX_EPOCH)
.unwrap()
.as_millis();
let name = format!("harmony-k8s-cmd-{suffix}");
debug!("Running privileged command on '{node_name}': {command}");
let (host_vol, host_mount) = helper::host_root_volume();
let bundle = helper::build_privileged_bundle(
PrivilegedPodConfig {
name: name.clone(),
namespace: namespace.to_string(),
node_name: node_name.to_string(),
container_name: "runner".to_string(),
command: vec![
"/bin/bash".to_string(),
"-c".to_string(),
command.to_string(),
],
volumes: vec![host_vol],
volume_mounts: vec![host_mount],
host_pid: true,
host_network: true,
},
&self.get_k8s_distribution().await?,
);
bundle.apply(self).await?;
debug!("Privileged pod '{name}' created");
let result = self.wait_for_pod_completion(&name, namespace).await;
debug!("Cleaning up '{name}'");
let _ = bundle.delete(self).await;
result
}
}
// ── Tests ────────────────────────────────────────────────────────────────────
#[cfg(test)]
mod tests {
use k8s_openapi::api::core::v1::{EmptyDirVolumeSource, PodSpec, PodStatus, Volume};
use k8s_openapi::apimachinery::pkg::apis::meta::v1::{ObjectMeta, OwnerReference};
use super::*;
fn base_pod(name: &str, ns: &str) -> Pod {
Pod {
metadata: ObjectMeta {
name: Some(name.to_string()),
namespace: Some(ns.to_string()),
..Default::default()
},
spec: Some(PodSpec::default()),
status: Some(PodStatus {
phase: Some("Running".to_string()),
..Default::default()
}),
}
}
fn mirror_pod(name: &str, ns: &str) -> Pod {
let mut pod = base_pod(name, ns);
pod.metadata.annotations = Some(std::collections::BTreeMap::from([(
"kubernetes.io/config.mirror".to_string(),
"abc123".to_string(),
)]));
pod
}
fn daemonset_pod(name: &str, ns: &str) -> Pod {
let mut pod = base_pod(name, ns);
pod.metadata.owner_references = Some(vec![OwnerReference {
api_version: "apps/v1".to_string(),
kind: "DaemonSet".to_string(),
name: "some-ds".to_string(),
uid: "uid-ds".to_string(),
..Default::default()
}]);
pod
}
fn emptydir_pod(name: &str, ns: &str) -> Pod {
let mut pod = base_pod(name, ns);
pod.spec = Some(PodSpec {
volumes: Some(vec![Volume {
name: "scratch".to_string(),
empty_dir: Some(EmptyDirVolumeSource::default()),
..Default::default()
}]),
..Default::default()
});
pod
}
fn completed_pod(name: &str, ns: &str, phase: &str) -> Pod {
let mut pod = base_pod(name, ns);
pod.status = Some(PodStatus {
phase: Some(phase.to_string()),
..Default::default()
});
pod
}
fn default_opts() -> DrainOptions {
DrainOptions::default()
}
// All test bodies are identical to the original — only the module path changed.
#[test]
fn empty_pod_list_returns_empty_vecs() {
let (e, s) = K8sClient::classify_pods_for_drain(&[], &default_opts()).unwrap();
assert!(e.is_empty());
assert!(s.is_empty());
}
#[test]
fn normal_pod_is_evictable() {
let pods = vec![base_pod("web", "default")];
let (e, s) = K8sClient::classify_pods_for_drain(&pods, &default_opts()).unwrap();
assert_eq!(e.len(), 1);
assert!(s.is_empty());
}
#[test]
fn mirror_pod_is_skipped() {
let pods = vec![mirror_pod("kube-apiserver", "kube-system")];
let (e, s) = K8sClient::classify_pods_for_drain(&pods, &default_opts()).unwrap();
assert!(e.is_empty());
assert!(s[0].contains("mirror pod"));
}
#[test]
fn completed_pods_are_skipped() {
for phase in ["Succeeded", "Failed"] {
let pods = vec![completed_pod("job", "batch", phase)];
let (e, s) = K8sClient::classify_pods_for_drain(&pods, &default_opts()).unwrap();
assert!(e.is_empty());
assert!(s[0].contains("completed"));
}
}
#[test]
fn daemonset_skipped_when_ignored() {
let pods = vec![daemonset_pod("fluentd", "logging")];
let opts = DrainOptions {
ignore_daemonsets: true,
..default_opts()
};
let (e, s) = K8sClient::classify_pods_for_drain(&pods, &opts).unwrap();
assert!(e.is_empty());
assert!(s[0].contains("DaemonSet-managed"));
}
#[test]
fn daemonset_blocks_when_not_ignored() {
let pods = vec![daemonset_pod("fluentd", "logging")];
let opts = DrainOptions {
ignore_daemonsets: false,
..default_opts()
};
let err = K8sClient::classify_pods_for_drain(&pods, &opts).unwrap_err();
assert!(err.contains("DaemonSet") && err.contains("logging/fluentd"));
}
#[test]
fn emptydir_blocks_without_flag() {
let pods = vec![emptydir_pod("cache", "default")];
let opts = DrainOptions {
delete_emptydir_data: false,
..default_opts()
};
let err = K8sClient::classify_pods_for_drain(&pods, &opts).unwrap_err();
assert!(err.contains("emptyDir") && err.contains("default/cache"));
}
#[test]
fn emptydir_evictable_with_flag() {
let pods = vec![emptydir_pod("cache", "default")];
let opts = DrainOptions {
delete_emptydir_data: true,
..default_opts()
};
let (e, s) = K8sClient::classify_pods_for_drain(&pods, &opts).unwrap();
assert_eq!(e.len(), 1);
assert!(s.is_empty());
}
#[test]
fn multiple_blocking_all_reported() {
let pods = vec![daemonset_pod("ds", "ns1"), emptydir_pod("ed", "ns2")];
let opts = DrainOptions {
ignore_daemonsets: false,
delete_emptydir_data: false,
..default_opts()
};
let err = K8sClient::classify_pods_for_drain(&pods, &opts).unwrap_err();
assert!(err.contains("ns1/ds") && err.contains("ns2/ed"));
}
#[test]
fn mixed_pods_classified_correctly() {
let pods = vec![
base_pod("web", "default"),
mirror_pod("kube-apiserver", "kube-system"),
daemonset_pod("fluentd", "logging"),
completed_pod("job", "batch", "Succeeded"),
base_pod("api", "default"),
];
let (e, s) = K8sClient::classify_pods_for_drain(&pods, &default_opts()).unwrap();
let names: Vec<&str> = e
.iter()
.map(|p| p.metadata.name.as_deref().unwrap())
.collect();
assert_eq!(names, vec!["web", "api"]);
assert_eq!(s.len(), 3);
}
#[test]
fn mirror_checked_before_completed() {
let mut pod = mirror_pod("static-etcd", "kube-system");
pod.status = Some(PodStatus {
phase: Some("Succeeded".to_string()),
..Default::default()
});
let (_, s) = K8sClient::classify_pods_for_drain(&[pod], &default_opts()).unwrap();
assert!(s[0].contains("mirror pod"), "got: {}", s[0]);
}
#[test]
fn completed_checked_before_daemonset() {
let mut pod = daemonset_pod("collector", "monitoring");
pod.status = Some(PodStatus {
phase: Some("Failed".to_string()),
..Default::default()
});
let (_, s) = K8sClient::classify_pods_for_drain(&[pod], &default_opts()).unwrap();
assert!(s[0].contains("completed"), "got: {}", s[0]);
}
}
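The drain and reboot APIs added in this file compose as below; a hedged usage sketch (node name and timeout are illustrative):

```rust
use std::time::Duration;
use harmony_k8s::{DrainOptions, K8sClient};

async fn reboot_worker(client: &K8sClient) -> Result<(), kube::Error> {
    // DrainOptions::default() is the configuration used by the unit tests
    // above; override fields such as ignore_daemonsets or
    // delete_emptydir_data as needed.
    let opts = DrainOptions::default();
    // Cordon, drain, reboot via a privileged pod, wait for Ready, uncordon.
    client
        .reboot_node("worker-1", &opts, Duration::from_secs(900))
        .await
}
```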

harmony-k8s/src/pod.rs (new file, 193 lines)

@@ -0,0 +1,193 @@
use std::time::Duration;
use k8s_openapi::api::core::v1::Pod;
use kube::{
Error,
api::{Api, AttachParams, ListParams},
error::DiscoveryError,
runtime::reflector::Lookup,
};
use log::debug;
use tokio::io::AsyncReadExt;
use tokio::time::sleep;
use crate::client::K8sClient;
impl K8sClient {
pub async fn get_pod(&self, name: &str, namespace: Option<&str>) -> Result<Option<Pod>, Error> {
let api: Api<Pod> = match namespace {
Some(ns) => Api::namespaced(self.client.clone(), ns),
None => Api::default_namespaced(self.client.clone()),
};
api.get_opt(name).await
}
pub async fn wait_for_pod_ready(
&self,
pod_name: &str,
namespace: Option<&str>,
) -> Result<(), Error> {
let mut elapsed = 0u64;
let interval = 5u64;
let timeout_secs = 120u64;
loop {
if let Some(p) = self.get_pod(pod_name, namespace).await? {
if let Some(phase) = p.status.and_then(|s| s.phase) {
if phase.to_lowercase() == "running" {
return Ok(());
}
}
}
if elapsed >= timeout_secs {
return Err(Error::Discovery(DiscoveryError::MissingResource(format!(
"Pod '{}' in '{}' did not become ready within {timeout_secs}s",
pod_name,
namespace.unwrap_or("<default>"),
))));
}
sleep(Duration::from_secs(interval)).await;
elapsed += interval;
}
}
/// Polls a pod until it reaches `Succeeded` or `Failed`, then returns its
/// logs. Used internally by node operations.
pub(crate) async fn wait_for_pod_completion(
&self,
name: &str,
namespace: &str,
) -> Result<String, Error> {
let api: Api<Pod> = Api::namespaced(self.client.clone(), namespace);
let poll_interval = Duration::from_secs(2);
for _ in 0..60 {
sleep(poll_interval).await;
let p = api.get(name).await?;
match p.status.and_then(|s| s.phase).as_deref() {
Some("Succeeded") => {
let logs = api
.logs(name, &Default::default())
.await
.unwrap_or_default();
debug!("Pod {namespace}/{name} succeeded. Logs: {logs}");
return Ok(logs);
}
Some("Failed") => {
let logs = api
.logs(name, &Default::default())
.await
.unwrap_or_default();
debug!("Pod {namespace}/{name} failed. Logs: {logs}");
return Err(Error::Discovery(DiscoveryError::MissingResource(format!(
"Pod '{name}' failed.\n{logs}"
))));
}
_ => {}
}
}
Err(Error::Discovery(DiscoveryError::MissingResource(format!(
"Timed out waiting for pod '{name}'"
))))
}
/// Execute a command in the first pod matching `{label}={name}`.
pub async fn exec_app_capture_output(
&self,
name: String,
label: String,
namespace: Option<&str>,
command: Vec<&str>,
) -> Result<String, String> {
let api: Api<Pod> = match namespace {
Some(ns) => Api::namespaced(self.client.clone(), ns),
None => Api::default_namespaced(self.client.clone()),
};
let pod_list = api
.list(&ListParams::default().labels(&format!("{label}={name}")))
.await
.expect("Failed to list pods");
let pod_name = pod_list
.items
.first()
.expect("No matching pod")
.name()
.expect("Pod has no name")
.into_owned();
match api
.exec(
&pod_name,
command,
&AttachParams::default().stdout(true).stderr(true),
)
.await
{
Err(e) => Err(e.to_string()),
Ok(mut process) => {
let status = process
.take_status()
.expect("No status handle")
.await
.expect("Status channel closed");
if let Some(s) = status.status {
let mut buf = String::new();
if let Some(mut stdout) = process.stdout() {
stdout
.read_to_string(&mut buf)
.await
.map_err(|e| format!("Failed to read stdout: {e}"))?;
}
debug!("exec status: {} - {:?}", s, status.details);
if s == "Success" { Ok(buf) } else { Err(s) }
} else {
Err("No inner status from pod exec".to_string())
}
}
}
}
/// Execute a command in the first pod matching
/// `app.kubernetes.io/name={name}`.
pub async fn exec_app(
&self,
name: String,
namespace: Option<&str>,
command: Vec<&str>,
) -> Result<(), String> {
let api: Api<Pod> = match namespace {
Some(ns) => Api::namespaced(self.client.clone(), ns),
None => Api::default_namespaced(self.client.clone()),
};
let pod_list = api
.list(&ListParams::default().labels(&format!("app.kubernetes.io/name={name}")))
.await
.expect("Failed to list pods");
let pod_name = pod_list
.items
.first()
.expect("No matching pod")
.name()
.expect("Pod has no name")
.into_owned();
match api.exec(&pod_name, command, &AttachParams::default()).await {
Err(e) => Err(e.to_string()),
Ok(mut process) => {
let status = process
.take_status()
.expect("No status handle")
.await
.expect("Status channel closed");
if let Some(s) = status.status {
debug!("exec status: {} - {:?}", s, status.details);
if s == "Success" { Ok(()) } else { Err(s) }
} else {
Err("No inner status from pod exec".to_string())
}
}
}
}
}

View File

@@ -0,0 +1,316 @@
use std::collections::HashMap;
use k8s_openapi::api::{
apps::v1::Deployment,
core::v1::{Node, ServiceAccount},
};
use k8s_openapi::apiextensions_apiserver::pkg::apis::apiextensions::v1::CustomResourceDefinition;
use kube::api::ApiResource;
use kube::{
Error, Resource,
api::{Api, DynamicObject, GroupVersionKind, ListParams, ObjectList},
runtime::conditions,
runtime::wait::await_condition,
};
use log::debug;
use serde::de::DeserializeOwned;
use serde_json::Value;
use std::time::Duration;
use crate::client::K8sClient;
use crate::types::ScopeResolver;
impl K8sClient {
pub async fn has_healthy_deployment_with_label(
&self,
namespace: &str,
label_selector: &str,
) -> Result<bool, Error> {
let api: Api<Deployment> = Api::namespaced(self.client.clone(), namespace);
let list = api
.list(&ListParams::default().labels(label_selector))
.await?;
for d in list.items {
let available = d
.status
.as_ref()
.and_then(|s| s.available_replicas)
.unwrap_or(0);
if available > 0 {
return Ok(true);
}
if let Some(conds) = d.status.as_ref().and_then(|s| s.conditions.as_ref()) {
if conds
.iter()
.any(|c| c.type_ == "Available" && c.status == "True")
{
return Ok(true);
}
}
}
Ok(false)
}
pub async fn list_namespaces_with_healthy_deployments(
&self,
label_selector: &str,
) -> Result<Vec<String>, Error> {
let api: Api<Deployment> = Api::all(self.client.clone());
let list = api
.list(&ListParams::default().labels(label_selector))
.await?;
let mut healthy_ns: HashMap<String, bool> = HashMap::new();
for d in list.items {
let ns = match d.metadata.namespace.clone() {
Some(n) => n,
None => continue,
};
let available = d
.status
.as_ref()
.and_then(|s| s.available_replicas)
.unwrap_or(0);
let is_healthy = if available > 0 {
true
} else {
d.status
.as_ref()
.and_then(|s| s.conditions.as_ref())
.map(|c| {
c.iter()
.any(|c| c.type_ == "Available" && c.status == "True")
})
.unwrap_or(false)
};
if is_healthy {
healthy_ns.insert(ns, true);
}
}
Ok(healthy_ns.into_keys().collect())
}
pub async fn get_controller_service_account_name(
&self,
ns: &str,
) -> Result<Option<String>, Error> {
let api: Api<Deployment> = Api::namespaced(self.client.clone(), ns);
let list = api
.list(&ListParams::default().labels("app.kubernetes.io/component=controller"))
.await?;
if let Some(dep) = list.items.first() {
if let Some(sa) = dep
.spec
.as_ref()
.and_then(|s| s.template.spec.as_ref())
.and_then(|s| s.service_account_name.clone())
{
return Ok(Some(sa));
}
}
Ok(None)
}
pub async fn list_clusterrolebindings_json(&self) -> Result<Vec<Value>, Error> {
let gvk = GroupVersionKind::gvk("rbac.authorization.k8s.io", "v1", "ClusterRoleBinding");
let ar = ApiResource::from_gvk(&gvk);
let api: Api<DynamicObject> = Api::all_with(self.client.clone(), &ar);
let list = api.list(&ListParams::default()).await?;
Ok(list
.items
.into_iter()
.map(|o| serde_json::to_value(&o).unwrap_or(Value::Null))
.collect())
}
pub async fn is_service_account_cluster_wide(&self, sa: &str, ns: &str) -> Result<bool, Error> {
let sa_user = format!("system:serviceaccount:{ns}:{sa}");
for crb in self.list_clusterrolebindings_json().await? {
if let Some(subjects) = crb.get("subjects").and_then(|s| s.as_array()) {
for subj in subjects {
let kind = subj.get("kind").and_then(|v| v.as_str()).unwrap_or("");
let name = subj.get("name").and_then(|v| v.as_str()).unwrap_or("");
let subj_ns = subj.get("namespace").and_then(|v| v.as_str()).unwrap_or("");
if (kind == "ServiceAccount" && name == sa && subj_ns == ns)
|| (kind == "User" && name == sa_user)
{
return Ok(true);
}
}
}
}
Ok(false)
}
pub async fn has_crd(&self, name: &str) -> Result<bool, Error> {
let api: Api<CustomResourceDefinition> = Api::all(self.client.clone());
let crds = api
.list(&ListParams::default().fields(&format!("metadata.name={name}")))
.await?;
Ok(!crds.items.is_empty())
}
pub async fn service_account_api(&self, namespace: &str) -> Api<ServiceAccount> {
Api::namespaced(self.client.clone(), namespace)
}
pub async fn get_resource_json_value(
&self,
name: &str,
namespace: Option<&str>,
gvk: &GroupVersionKind,
) -> Result<DynamicObject, Error> {
let ar = ApiResource::from_gvk(gvk);
let api: Api<DynamicObject> = match namespace {
Some(ns) => Api::namespaced_with(self.client.clone(), ns, &ar),
None => Api::default_namespaced_with(self.client.clone(), &ar),
};
api.get(name).await
}
pub async fn get_secret_json_value(
&self,
name: &str,
namespace: Option<&str>,
) -> Result<DynamicObject, Error> {
self.get_resource_json_value(
name,
namespace,
&GroupVersionKind {
group: String::new(),
version: "v1".to_string(),
kind: "Secret".to_string(),
},
)
.await
}
pub async fn get_deployment(
&self,
name: &str,
namespace: Option<&str>,
) -> Result<Option<Deployment>, Error> {
let api: Api<Deployment> = match namespace {
Some(ns) => {
debug!("Getting namespaced deployment '{name}' in '{ns}'");
Api::namespaced(self.client.clone(), ns)
}
None => {
debug!("Getting deployment '{name}' in default namespace");
Api::default_namespaced(self.client.clone())
}
};
api.get_opt(name).await
}
pub async fn scale_deployment(
&self,
name: &str,
namespace: Option<&str>,
replicas: u32,
) -> Result<(), Error> {
let api: Api<Deployment> = match namespace {
Some(ns) => Api::namespaced(self.client.clone(), ns),
None => Api::default_namespaced(self.client.clone()),
};
use kube::api::{Patch, PatchParams};
use serde_json::json;
let patch = json!({ "spec": { "replicas": replicas } });
api.patch_scale(name, &PatchParams::default(), &Patch::Merge(&patch))
.await?;
Ok(())
}
pub async fn delete_deployment(
&self,
name: &str,
namespace: Option<&str>,
) -> Result<(), Error> {
let api: Api<Deployment> = match namespace {
Some(ns) => Api::namespaced(self.client.clone(), ns),
None => Api::default_namespaced(self.client.clone()),
};
api.delete(name, &kube::api::DeleteParams::default())
.await?;
Ok(())
}
pub async fn wait_until_deployment_ready(
&self,
name: &str,
namespace: Option<&str>,
timeout: Option<Duration>,
) -> Result<(), String> {
let api: Api<Deployment> = match namespace {
Some(ns) => Api::namespaced(self.client.clone(), ns),
None => Api::default_namespaced(self.client.clone()),
};
let timeout = timeout.unwrap_or(Duration::from_secs(120));
let establish = await_condition(api, name, conditions::is_deployment_completed());
tokio::time::timeout(timeout, establish)
.await
.map(|_| ())
.map_err(|_| "Timed out waiting for deployment".to_string())
}
/// Gets a single named resource, using the correct API scope for `K`.
pub async fn get_resource<K>(
&self,
name: &str,
namespace: Option<&str>,
) -> Result<Option<K>, Error>
where
K: Resource + Clone + std::fmt::Debug + DeserializeOwned,
<K as Resource>::Scope: ScopeResolver<K>,
<K as Resource>::DynamicType: Default,
{
let api: Api<K> =
<<K as Resource>::Scope as ScopeResolver<K>>::get_api(&self.client, namespace);
api.get_opt(name).await
}
pub async fn list_resources<K>(
&self,
namespace: Option<&str>,
list_params: Option<ListParams>,
) -> Result<ObjectList<K>, Error>
where
K: Resource + Clone + std::fmt::Debug + DeserializeOwned,
<K as Resource>::Scope: ScopeResolver<K>,
<K as Resource>::DynamicType: Default,
{
let api: Api<K> =
<<K as Resource>::Scope as ScopeResolver<K>>::get_api(&self.client, namespace);
api.list(&list_params.unwrap_or_default()).await
}
pub async fn list_all_resources_with_labels<K>(&self, labels: &str) -> Result<Vec<K>, Error>
where
K: Resource + Clone + std::fmt::Debug + DeserializeOwned,
<K as Resource>::DynamicType: Default,
{
Api::<K>::all(self.client.clone())
.list(&ListParams::default().labels(labels))
.await
.map(|l| l.items)
}
pub async fn get_all_resource_in_all_namespace<K>(&self) -> Result<Vec<K>, Error>
where
K: Resource + Clone + std::fmt::Debug + DeserializeOwned,
<K as Resource>::Scope: ScopeResolver<K>,
<K as Resource>::DynamicType: Default,
{
Api::<K>::all(self.client.clone())
.list(&Default::default())
.await
.map(|l| l.items)
}
pub async fn get_nodes(
&self,
list_params: Option<ListParams>,
) -> Result<ObjectList<Node>, Error> {
self.list_resources(None, list_params).await
}
}
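The generic helpers above (`get_resource`, `list_resources`, `get_nodes`) delegate API construction to the `ScopeResolver` trait defined in `harmony-k8s/src/types.rs` below, so the same call shape works for cluster-scoped and namespace-scoped kinds. A short usage sketch under the same assumption about the `harmony_k8s::K8sClient` re-export; the resource names and the `demo` namespace are placeholders:
```rust
use harmony_k8s::K8sClient; // re-export shown in the other diffs of this change
use k8s_openapi::api::{apps::v1::Deployment, core::v1::Namespace};
use kube::api::ListParams;

async fn scope_examples(client: &K8sClient) -> Result<(), kube::Error> {
    // Deployment is namespace-scoped: Some(..) selects the namespace.
    let dep: Option<Deployment> = client.get_resource("my-app", Some("demo")).await?;

    // Namespace is cluster-scoped: the namespace argument is ignored by ScopeResolver.
    let ns: Option<Namespace> = client.get_resource("demo", None).await?;

    // Label-filtered node listing via the ListParams passthrough.
    let workers = client
        .get_nodes(Some(
            ListParams::default().labels("node-role.kubernetes.io/worker"),
        ))
        .await?;

    println!("{} {} {}", dep.is_some(), ns.is_some(), workers.items.len());
    Ok(())
}
```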

100
harmony-k8s/src/types.rs Normal file
View File

@@ -0,0 +1,100 @@
use std::time::Duration;
use k8s_openapi::{ClusterResourceScope, NamespaceResourceScope};
use kube::{Api, Client, Resource};
use serde::Serialize;
/// Which Kubernetes distribution is running. Detected once at runtime via
/// [`crate::discovery::K8sClient::get_k8s_distribution`].
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
pub enum KubernetesDistribution {
Default,
OpenshiftFamily,
K3sFamily,
}
/// A file to be written to a node's filesystem.
#[derive(Debug, Clone)]
pub struct NodeFile {
/// Absolute path on the host where the file should be written.
pub path: String,
/// Content of the file.
pub content: String,
/// UNIX permissions (e.g. `0o600`).
pub mode: u32,
}
/// Options controlling the behaviour of a [`crate::K8sClient::drain_node`] operation.
#[derive(Debug, Clone)]
pub struct DrainOptions {
/// Evict pods that use `emptyDir` volumes (ephemeral data is lost).
/// Equivalent to `kubectl drain --delete-emptydir-data`.
pub delete_emptydir_data: bool,
/// Silently skip DaemonSet-managed pods instead of blocking the drain.
/// Equivalent to `kubectl drain --ignore-daemonsets`.
pub ignore_daemonsets: bool,
/// Maximum wall-clock time to wait for all evictions to complete.
pub timeout: Duration,
}
impl Default for DrainOptions {
fn default() -> Self {
Self {
delete_emptydir_data: false,
ignore_daemonsets: true,
timeout: Duration::from_secs(1),
}
}
}
impl DrainOptions {
pub fn default_ignore_daemonset_delete_emptydir_data() -> Self {
Self {
delete_emptydir_data: true,
ignore_daemonsets: true,
..Self::default()
}
}
}
/// Controls how [`crate::K8sClient::apply_with_strategy`] behaves when the
/// resource already exists (or does not).
pub enum WriteMode {
/// Server-side apply; create if absent, update if present (default).
CreateOrUpdate,
/// POST only; return an error if the resource already exists.
Create,
/// Server-side apply only; return an error if the resource does not exist.
Update,
}
// ── Scope resolution trait ───────────────────────────────────────────────────
/// Resolves the correct [`kube::Api`] for a resource type based on its scope
/// (cluster-wide vs. namespace-scoped).
pub trait ScopeResolver<K: Resource> {
fn get_api(client: &Client, ns: Option<&str>) -> Api<K>;
}
impl<K> ScopeResolver<K> for ClusterResourceScope
where
K: Resource<Scope = ClusterResourceScope>,
<K as Resource>::DynamicType: Default,
{
fn get_api(client: &Client, _ns: Option<&str>) -> Api<K> {
Api::all(client.clone())
}
}
impl<K> ScopeResolver<K> for NamespaceResourceScope
where
K: Resource<Scope = NamespaceResourceScope>,
<K as Resource>::DynamicType: Default,
{
fn get_api(client: &Client, ns: Option<&str>) -> Api<K> {
match ns {
Some(ns) => Api::namespaced(client.clone(), ns),
None => Api::default_namespaced(client.clone()),
}
}
}
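As a quick illustration of the types above, here is a hedged sketch of building `DrainOptions` by hand versus using the provided preset; the 300-second timeout is an arbitrary example value, and note that the preset inherits the very short 1-second default timeout defined above:
```rust
use std::time::Duration;

use harmony_k8s::DrainOptions; // re-export shown in the other diffs of this change

fn drain_options_examples() -> (DrainOptions, DrainOptions) {
    // Explicit construction: evict emptyDir pods, skip DaemonSets, wait up to 5 minutes.
    let custom = DrainOptions {
        delete_emptydir_data: true,
        ignore_daemonsets: true,
        timeout: Duration::from_secs(300), // illustrative value, not a recommendation
    };
    // Preset: same flags, but keeps the 1-second default timeout via `..Self::default()`.
    let preset = DrainOptions::default_ignore_daemonset_delete_emptydir_data();
    (custom, preset)
}
```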

View File

@@ -21,6 +21,8 @@ semver = "1.0.23"
serde.workspace = true
serde_json.workspace = true
tokio.workspace = true
tokio-retry.workspace = true
tokio-util.workspace = true
derive-new.workspace = true
log.workspace = true
env_logger.workspace = true
@@ -31,6 +33,7 @@ opnsense-config-xml = { path = "../opnsense-config-xml" }
harmony_macros = { path = "../harmony_macros" }
harmony_types = { path = "../harmony_types" }
harmony_execution = { path = "../harmony_execution" }
harmony-k8s = { path = "../harmony-k8s" }
uuid.workspace = true
url.workspace = true
kube = { workspace = true, features = ["derive"] }
@@ -60,7 +63,6 @@ temp-dir = "0.1.14"
dyn-clone = "1.0.19"
similar.workspace = true
futures-util = "0.3.31"
tokio-util = "0.7.15"
strum = { version = "0.27.1", features = ["derive"] }
tempfile.workspace = true
serde_with = "3.14.0"
@@ -80,7 +82,7 @@ sqlx.workspace = true
inquire.workspace = true
brocade = { path = "../brocade" }
option-ext = "0.2.0"
tokio-retry = "0.3.0"
rand.workspace = true
[dev-dependencies]
pretty_assertions.workspace = true

View File

@@ -4,8 +4,6 @@ use std::error::Error;
use async_trait::async_trait;
use derive_new::new;
use crate::inventory::HostRole;
use super::{
data::Version, executors::ExecutorError, inventory::Inventory, topology::PreparationError,
};

View File

@@ -1,4 +1,5 @@
use async_trait::async_trait;
use harmony_k8s::K8sClient;
use harmony_macros::ip;
use harmony_types::{
id::Id,
@@ -16,7 +17,7 @@ use super::{
DHCPStaticEntry, DhcpServer, DnsRecord, DnsRecordType, DnsServer, Firewall, HostNetworkConfig,
HttpServer, IpAddress, K8sclient, LoadBalancer, LoadBalancerService, LogicalHost, NetworkError,
NetworkManager, PreparationError, PreparationOutcome, Router, Switch, SwitchClient,
SwitchError, TftpServer, Topology, k8s::K8sClient,
SwitchError, TftpServer, Topology,
};
use std::{
process::Command,

File diff suppressed because it is too large

View File

@@ -2,6 +2,7 @@ use std::{collections::BTreeMap, process::Command, sync::Arc, time::Duration};
use async_trait::async_trait;
use base64::{Engine, engine::general_purpose};
use harmony_k8s::{K8sClient, KubernetesDistribution};
use harmony_types::rfc1123::Rfc1123Name;
use k8s_openapi::api::{
core::v1::{Pod, Secret},
@@ -58,7 +59,6 @@ use crate::{
use super::super::{
DeploymentTarget, HelmCommand, K8sclient, MultiTargetTopology, PreparationError,
PreparationOutcome, Topology,
k8s::K8sClient,
oberservability::monitoring::AlertReceiver,
tenant::{
TenantConfig, TenantManager,
@@ -76,13 +76,6 @@ struct K8sState {
message: String,
}
#[derive(Debug, Clone, Serialize)]
pub enum KubernetesDistribution {
OpenshiftFamily,
K3sFamily,
Default,
}
#[derive(Debug, Clone)]
enum K8sSource {
LocalK3d,

View File

@@ -16,7 +16,6 @@ pub mod tenant;
use derive_new::new;
pub use k8s_anywhere::*;
pub use localhost::*;
pub mod k8s;
mod load_balancer;
pub mod router;
mod tftp;

View File

@@ -9,6 +9,7 @@ use std::{
use async_trait::async_trait;
use brocade::PortOperatingMode;
use derive_new::new;
use harmony_k8s::K8sClient;
use harmony_types::{
id::Id,
net::{IpAddress, MacAddress},
@@ -18,7 +19,7 @@ use serde::Serialize;
use crate::executors::ExecutorError;
use super::{LogicalHost, k8s::K8sClient};
use super::LogicalHost;
#[derive(Debug)]
pub struct DHCPStaticEntry {

View File

@@ -1,10 +1,8 @@
use std::sync::Arc;
use crate::{
executors::ExecutorError,
topology::k8s::{ApplyStrategy, K8sClient},
};
use crate::executors::ExecutorError;
use async_trait::async_trait;
use harmony_k8s::K8sClient;
use k8s_openapi::{
api::{
core::v1::{LimitRange, Namespace, ResourceQuota},
@@ -14,7 +12,7 @@ use k8s_openapi::{
},
apimachinery::pkg::util::intstr::IntOrString,
};
use kube::{Resource, api::DynamicObject};
use kube::Resource;
use log::debug;
use serde::de::DeserializeOwned;
use serde_json::json;
@@ -59,7 +57,6 @@ impl K8sTenantManager {
) -> Result<K, ExecutorError>
where
<K as kube::Resource>::DynamicType: Default,
<K as kube::Resource>::Scope: ApplyStrategy<K>,
{
self.apply_labels(&mut resource, config);
self.k8s_client

View File

@@ -5,6 +5,7 @@ use std::{
use askama::Template;
use async_trait::async_trait;
use harmony_k8s::{DrainOptions, K8sClient, NodeFile};
use harmony_types::id::Id;
use k8s_openapi::api::core::v1::Node;
use kube::{
@@ -15,10 +16,7 @@ use log::{debug, info, warn};
use crate::{
modules::okd::crd::nmstate,
topology::{
HostNetworkConfig, NetworkError, NetworkManager,
k8s::{DrainOptions, K8sClient, NodeFile},
},
topology::{HostNetworkConfig, NetworkError, NetworkManager},
};
/// NetworkManager bond configuration template

View File

@@ -1,5 +1,5 @@
use async_trait::async_trait;
use log::{debug, info, trace};
use log::{debug, info};
use serde::Serialize;
use std::path::PathBuf;

View File

@@ -1,4 +1,5 @@
use async_trait::async_trait;
use harmony_k8s::K8sClient;
use harmony_macros::hurl;
use log::{debug, info, trace, warn};
use non_blank_string_rs::NonBlankString;
@@ -14,7 +15,7 @@ use crate::{
helm::chart::{HelmChartScore, HelmRepository},
},
score::Score,
topology::{HelmCommand, K8sclient, Topology, ingress::Ingress, k8s::K8sClient},
topology::{HelmCommand, K8sclient, Topology, ingress::Ingress},
};
use harmony_types::id::Id;

View File

@@ -1,8 +1,9 @@
use std::sync::Arc;
use harmony_k8s::K8sClient;
use log::{debug, info};
use crate::{interpret::InterpretError, topology::k8s::K8sClient};
use crate::interpret::InterpretError;
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum ArgoScope {

View File

@@ -1,3 +1,4 @@
use harmony_k8s::K8sClient;
use std::sync::Arc;
use async_trait::async_trait;
@@ -11,7 +12,7 @@ use crate::{
interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome},
inventory::Inventory,
score::Score,
topology::{K8sclient, Topology, k8s::K8sClient},
topology::{K8sclient, Topology},
};
#[derive(Clone, Debug, Serialize)]

View File

@@ -3,7 +3,8 @@ use std::sync::Arc;
use async_trait::async_trait;
use log::warn;
use crate::topology::{FailoverTopology, K8sclient, k8s::K8sClient};
use crate::topology::{FailoverTopology, K8sclient};
use harmony_k8s::K8sClient;
#[async_trait]
impl<T: K8sclient> K8sclient for FailoverTopology<T> {

View File

@@ -109,7 +109,7 @@ where
topology
.k8s_client()
.await
.map_err(|e| InterpretError::new(format!("Failed to get k8s client : {e}")))
.map_err(|e| InterpretError::new(format!("Failed to get k8s client : {e}")))?
.apply_many(&self.score.resource, self.score.namespace.as_deref())
.await?;

View File

@@ -6,7 +6,7 @@ use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use crate::{
interpret::{InterpretError, Outcome},
interpret::InterpretError,
inventory::Inventory,
modules::{
monitoring::{
@@ -17,10 +17,10 @@ use crate::{
topology::{
K8sclient, Topology,
installable::Installable,
k8s::K8sClient,
oberservability::monitoring::{AlertReceiver, AlertSender, ScrapeTarget},
},
};
use harmony_k8s::K8sClient;
#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)]
#[kube(

View File

@@ -4,10 +4,8 @@ use kube::CustomResource;
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use crate::topology::{
k8s::K8sClient,
oberservability::monitoring::{AlertReceiver, AlertSender},
};
use crate::topology::oberservability::monitoring::{AlertReceiver, AlertSender};
use harmony_k8s::K8sClient;
#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)]
#[kube(

View File

@@ -11,8 +11,9 @@ use crate::{
inventory::Inventory,
modules::monitoring::ntfy::helm::ntfy_helm_chart::ntfy_helm_chart_score,
score::Score,
topology::{HelmCommand, K8sclient, MultiTargetTopology, Topology, k8s::K8sClient},
topology::{HelmCommand, K8sclient, MultiTargetTopology, Topology},
};
use harmony_k8s::K8sClient;
use harmony_types::id::Id;
#[derive(Debug, Clone, Serialize)]

View File

@@ -1,9 +1,7 @@
use std::{collections::BTreeMap, sync::Arc};
use crate::{
interpret::{InterpretError, Outcome},
topology::k8s::K8sClient,
};
use crate::interpret::{InterpretError, Outcome};
use harmony_k8s::K8sClient;
use k8s_openapi::api::core::v1::ConfigMap;
use kube::api::ObjectMeta;

View File

@@ -1,6 +1,7 @@
use std::{collections::BTreeMap, str::FromStr};
use async_trait::async_trait;
use harmony_k8s::KubernetesDistribution;
use harmony_macros::hurl;
use harmony_secret::{Secret, SecretManager};
use harmony_types::id::Id;
@@ -25,7 +26,7 @@ use crate::{
},
},
score::Score,
topology::{HelmCommand, K8sclient, KubernetesDistribution, TlsRouter, Topology},
topology::{HelmCommand, K8sclient, TlsRouter, Topology},
};
#[derive(Debug, Clone, Serialize)]

View File

@@ -15,6 +15,7 @@ use serde::{Deserialize, Serialize};
#[serde(rename_all = "camelCase")]
pub struct ClusterSpec {
pub instances: u32,
#[serde(skip_serializing_if = "Option::is_none")]
pub image_name: Option<String>,
pub storage: Storage,
pub bootstrap: Bootstrap,

View File

@@ -20,7 +20,7 @@ use crate::topology::{K8sclient, Topology};
/// # Usage
/// ```
/// use harmony::modules::postgresql::CloudNativePgOperatorScore;
/// let score = CloudNativePgOperatorScore::default();
/// let score = CloudNativePgOperatorScore::default_openshift();
/// ```
///
/// Or, you can take control of the most relevant fields this way:
@@ -53,7 +53,7 @@ pub struct CloudNativePgOperatorScore {
}
impl CloudNativePgOperatorScore {
fn default_openshift() -> Self {
pub fn default_openshift() -> Self {
Self {
namespace: "openshift-operators".to_string(),
channel: "stable-v1".to_string(),

View File

@@ -12,8 +12,7 @@ use crate::modules::monitoring::kube_prometheus::crd::crd_alertmanager_config::C
use crate::modules::monitoring::kube_prometheus::crd::crd_default_rules::build_default_application_rules;
use crate::modules::monitoring::kube_prometheus::crd::crd_grafana::{
Grafana, GrafanaDashboard, GrafanaDashboardSpec, GrafanaDatasource, GrafanaDatasourceConfig,
GrafanaDatasourceJsonData, GrafanaDatasourceSpec, GrafanaSecretKeyRef, GrafanaSpec,
GrafanaValueFrom, GrafanaValueSource,
GrafanaDatasourceJsonData, GrafanaDatasourceSpec, GrafanaSpec,
};
use crate::modules::monitoring::kube_prometheus::crd::crd_prometheus_rules::{
PrometheusRule, PrometheusRuleSpec, RuleGroup,
@@ -23,7 +22,7 @@ use crate::modules::monitoring::kube_prometheus::crd::service_monitor::{
ServiceMonitor, ServiceMonitorSpec,
};
use crate::topology::oberservability::monitoring::AlertReceiver;
use crate::topology::{K8sclient, Topology, k8s::K8sClient};
use crate::topology::{K8sclient, Topology};
use crate::{
data::Version,
interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome},
@@ -38,6 +37,7 @@ use crate::{
},
score::Score,
};
use harmony_k8s::K8sClient;
use harmony_types::id::Id;
use super::prometheus::PrometheusMonitoring;

View File

@@ -30,12 +30,13 @@ use crate::modules::monitoring::kube_prometheus::crd::rhob_service_monitor::{
use crate::score::Score;
use crate::topology::ingress::Ingress;
use crate::topology::oberservability::monitoring::AlertReceiver;
use crate::topology::{K8sclient, Topology, k8s::K8sClient};
use crate::topology::{K8sclient, Topology};
use crate::{
data::Version,
interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome},
inventory::Inventory,
};
use harmony_k8s::K8sClient;
use harmony_types::id::Id;
use super::prometheus::PrometheusMonitoring;

View File

@@ -4,6 +4,7 @@ use std::{
};
use async_trait::async_trait;
use harmony_k8s::K8sClient;
use log::{debug, warn};
use serde::{Deserialize, Serialize};
use tokio::time::sleep;
@@ -13,7 +14,7 @@ use crate::{
interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome},
inventory::Inventory,
score::Score,
topology::{K8sclient, Topology, k8s::K8sClient},
topology::{K8sclient, Topology},
};
use harmony_types::id::Id;

View File

@@ -9,8 +9,9 @@ use crate::{
interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome},
inventory::Inventory,
score::Score,
topology::{K8sclient, Topology, k8s::K8sClient},
topology::{K8sclient, Topology},
};
use harmony_k8s::K8sClient;
use harmony_types::id::Id;
#[derive(Clone, Debug, Serialize)]

View File

@@ -1,9 +1,9 @@
use base64::{Engine, prelude::BASE64_STANDARD};
use rand::{thread_rng, Rng};
use rand::distributions::Alphanumeric;
use k8s_openapi::api::core::v1::Namespace;
use k8s_openapi::apimachinery::pkg::apis::meta::v1::ObjectMeta;
use k8s_openapi::{ByteString, api::core::v1::Secret};
use kube::{Error as KubeError, core::ErrorResponse};
use rand::distr::Distribution;
use rand::{Rng, rng, seq::SliceRandom};
use std::collections::BTreeMap;
use std::str::FromStr;
@@ -38,12 +38,17 @@ const MASTERKEY_SECRET_NAME: &str = "zitadel-masterkey";
/// Photos, and others.
///
/// # Ingress annotations
/// No controller-specific ingress annotations are set. The Zitadel service
/// already carries the Traefik h2c annotation for k3s/k3d by default.
/// Add annotations via `values_overrides` depending on your distribution:
/// No controller-specific ingress annotations are set by default. On
/// OKD/OpenShift, the ingress should request TLS so the generated Route is
/// edge-terminated instead of HTTP-only. Optional cert-manager annotations are
/// included for clusters that have cert-manager installed; clusters without
/// cert-manager will ignore them.
/// Add or adjust annotations via `values_overrides` depending on your
/// distribution:
/// - NGINX: `nginx.ingress.kubernetes.io/backend-protocol: GRPC`
/// - OpenShift HAProxy: `haproxy.router.openshift.io/*` or use OpenShift Routes
/// - OpenShift HAProxy: `route.openshift.io/termination: edge`
/// - AWS ALB: set `ingress.controller: aws`
///
/// # Database credentials
/// CNPG creates a `<cluster>-superuser` secret with key `password`. Because
@@ -57,6 +62,7 @@ const MASTERKEY_SECRET_NAME: &str = "zitadel-masterkey";
pub struct ZitadelScore {
/// External domain (e.g. `"auth.example.com"`).
pub host: String,
pub zitadel_version: String,
}
impl<T: Topology + K8sclient + HelmCommand + PostgreSQL> Score<T> for ZitadelScore {
@@ -68,6 +74,7 @@ impl<T: Topology + K8sclient + HelmCommand + PostgreSQL> Score<T> for ZitadelSco
fn create_interpret(&self) -> Box<dyn Interpret<T>> {
Box::new(ZitadelInterpret {
host: self.host.clone(),
zitadel_version: self.zitadel_version.clone(),
})
}
}
@@ -77,6 +84,7 @@ impl<T: Topology + K8sclient + HelmCommand + PostgreSQL> Score<T> for ZitadelSco
#[derive(Debug, Clone)]
struct ZitadelInterpret {
host: String,
zitadel_version: String,
}
#[async_trait]
@@ -165,13 +173,55 @@ impl<T: Topology + K8sclient + HelmCommand + PostgreSQL> Interpret<T> for Zitade
let db_port = endpoint.port;
let host = &self.host;
debug!(
"[Zitadel] DB credentials source — secret: '{pg_user_secret}', key: 'password'"
);
debug!("[Zitadel] DB credentials source — secret: '{pg_user_secret}', key: 'password'");
debug!(
"[Zitadel] DB credentials source — superuser secret: '{pg_superuser_secret}', key: 'password'"
);
// Zitadel requires at least one uppercase letter, one lowercase letter, one digit and one symbol, so enforce that here.
fn generate_secure_password(length: usize) -> String {
const ALPHA_UPPER: &[u8] = b"ABCDEFGHIJKLMNOPQRSTUVWXYZ";
const ALPHA_LOWER: &[u8] = b"abcdefghijklmnopqrstuvwxyz";
const DIGITS: &[u8] = b"0123456789";
const SYMBOLS: &[u8] = b"!@#$%^&*()_+-=[]{}|;:',.<>?/";
let mut rng = rand::rng();
let uniform_alpha_upper = rand::distr::Uniform::new(0, ALPHA_UPPER.len())
.expect("Failed to create distribution");
let uniform_alpha_lower = rand::distr::Uniform::new(0, ALPHA_LOWER.len())
.expect("Failed to create distribution");
let uniform_digits =
rand::distr::Uniform::new(0, DIGITS.len()).expect("Failed to create distribution");
let uniform_symbols =
rand::distr::Uniform::new(0, SYMBOLS.len()).expect("Failed to create distribution");
let mut chars: Vec<char> = Vec::with_capacity(length);
// Ensure at least one of each: upper, lower, digit, symbol
chars.push(ALPHA_UPPER[uniform_alpha_upper.sample(&mut rng)] as char);
chars.push(ALPHA_LOWER[uniform_alpha_lower.sample(&mut rng)] as char);
chars.push(DIGITS[uniform_digits.sample(&mut rng)] as char);
chars.push(SYMBOLS[uniform_symbols.sample(&mut rng)] as char);
// Fill remaining with random from all categories
let all_chars: Vec<u8> = [ALPHA_UPPER, ALPHA_LOWER, DIGITS, SYMBOLS].concat();
let uniform_all = rand::distr::Uniform::new(0, all_chars.len())
.expect("Failed to create distribution");
for _ in 0..(length - 4) {
chars.push(all_chars[uniform_all.sample(&mut rng)] as char);
}
// Shuffle
let mut shuffled = chars;
shuffled.shuffle(&mut rng);
return shuffled.iter().collect();
}
let admin_password = generate_secure_password(16);
// --- Step 3: Create masterkey secret ------------------------------------
debug!(
@@ -179,17 +229,20 @@ impl<T: Topology + K8sclient + HelmCommand + PostgreSQL> Interpret<T> for Zitade
MASTERKEY_SECRET_NAME, NAMESPACE
);
// Masterkey for symmetric encryption — must be exactly 32 ASCII bytes.
let masterkey: String = thread_rng()
.sample_iter(&Alphanumeric)
// Masterkey for symmetric encryption — must be exactly 32 ASCII bytes (alphanumeric only).
let masterkey = rng()
.sample_iter(&rand::distr::Alphanumeric)
.take(32)
.map(char::from)
.collect();
let masterkey_bytes = BASE64_STANDARD.encode(&masterkey);
.collect::<String>();
debug!(
"[Zitadel] Created masterkey secret '{}' in namespace '{}'",
MASTERKEY_SECRET_NAME, NAMESPACE
);
let mut masterkey_data: BTreeMap<String, ByteString> = BTreeMap::new();
masterkey_data.insert("masterkey".to_string(), ByteString(masterkey_bytes.into()));
masterkey_data.insert("masterkey".to_string(), ByteString(masterkey.into()));
let masterkey_secret = Secret {
metadata: ObjectMeta {
@@ -201,43 +254,65 @@ impl<T: Topology + K8sclient + HelmCommand + PostgreSQL> Interpret<T> for Zitade
..Secret::default()
};
topology
match topology
.k8s_client()
.await
.map_err(|e| InterpretError::new(format!("Failed to get k8s client : {e}")))
.create(masterkey_secret)
.await?;
K8sResourceScore::single(masterkey_secret, Some(NAMESPACE.to_string()))
.interpret(inventory, topology)
.map_err(|e| InterpretError::new(format!("Failed to get k8s client : {e}")))?
.create(&masterkey_secret, Some(NAMESPACE))
.await
.map_err(|e| {
let msg = format!("[Zitadel] Failed to create masterkey secret: {e}");
{
Ok(_) => {
info!(
"[Zitadel] Masterkey secret '{}' created",
MASTERKEY_SECRET_NAME
);
}
Err(KubeError::Api(ErrorResponse { code: 409, .. })) => {
info!(
"[Zitadel] Masterkey secret '{}' already exists, leaving it untouched",
MASTERKEY_SECRET_NAME
);
}
Err(other) => {
let msg = format!(
"[Zitadel] Failed to create masterkey secret '{}': {other}",
MASTERKEY_SECRET_NAME
);
error!("{msg}");
InterpretError::new(msg)
})?;
return Err(InterpretError::new(msg));
}
};
info!(
"[Zitadel] Masterkey secret '{}' created",
debug!(
"[Zitadel] Masterkey secret '{}' created successfully",
MASTERKEY_SECRET_NAME
);
// --- Step 4: Build Helm values ------------------------------------
warn!(
"[Zitadel] No ingress controller annotations are set. \
Add controller-specific annotations for your distribution: \
NGINX → 'nginx.ingress.kubernetes.io/backend-protocol: GRPC'; \
OpenShift HAProxy → 'haproxy.router.openshift.io/*' or use Routes; \
AWS ALB → set ingress.controller=aws."
"[Zitadel] Applying TLS-enabled ingress defaults for OKD/OpenShift. \
cert-manager annotations are included as optional hints and are \
ignored on clusters without cert-manager."
);
let values_yaml = format!(
r#"zitadel:
r#"image:
tag: {zitadel_version}
zitadel:
masterkeySecretName: "{MASTERKEY_SECRET_NAME}"
configmapConfig:
ExternalDomain: "{host}"
ExternalSecure: true
FirstInstance:
Org:
Human:
UserName: "admin"
Password: "{admin_password}"
FirstName: "Zitadel"
LastName: "Admin"
Email: "admin@zitadel.example.com"
PasswordChangeRequired: true
TLS:
Enabled: false
Database:
@@ -333,12 +408,19 @@ setupJob:
type: RuntimeDefault
ingress:
enabled: true
annotations: {{}}
annotations:
cert-manager.io/cluster-issuer: letsencrypt-prod
route.openshift.io/termination: edge
hosts:
- host: "{host}"
paths:
- path: /
pathType: Prefix
tls:
- hosts:
- "{host}"
secretName: "{host}-tls"
login:
enabled: true
podSecurityContext:
@@ -359,12 +441,19 @@ login:
type: RuntimeDefault
ingress:
enabled: true
annotations: {{}}
annotations:
cert-manager.io/cluster-issuer: letsencrypt-prod
route.openshift.io/termination: edge
hosts:
- host: "{host}"
paths:
- path: /ui/v2/login
pathType: Prefix"#
pathType: Prefix
tls:
- hosts:
- "{host}"
secretName: "{host}-tls""#,
zitadel_version = self.zitadel_version
);
trace!("[Zitadel] Helm values YAML:\n{values_yaml}");
@@ -394,7 +483,17 @@ login:
.await;
match &result {
Ok(_) => info!("[Zitadel] Helm chart deployed successfully"),
Ok(_) => info!(
"[Zitadel] Helm chart deployed successfully\n\n\
===== ZITADEL DEPLOYMENT COMPLETE =====\n\
Login URL: https://{host}\n\
Username: admin@zitadel.{host}\n\
Password: {admin_password}\n\n\
IMPORTANT: The password is saved in ConfigMap 'zitadel-config-yaml'\n\
and must be changed on first login. Save the credentials in a\n\
secure location after changing them.\n\
========================================="
),
Err(e) => error!("[Zitadel] Helm chart deployment failed: {e}"),
}

View File

@@ -13,3 +13,5 @@ env_logger.workspace = true
log.workspace = true
tokio.workspace = true
reqwest.workspace = true
chrono.workspace = true
tower = "0.5.3"

View File

@@ -4,10 +4,11 @@
Designed for **bare-metal Kubernetes clusters** with external load balancers (HAProxy, OPNsense, F5, etc.).
It exposes a simple, reliable HTTP endpoint (`/health`) on each node that returns:
Exposes a simple HTTP endpoint (`/health`) on each node:
- **200 OK** — node is healthy and ready to receive traffic
- **503 Service Unavailable** — node should be removed from the load balancer pool
- **500 Internal Server Error** — misconfiguration (e.g. `NODE_NAME` not set)
This project is **not dependent on Harmony**, but is commonly used as part of Harmony bare-metal Kubernetes deployments.
@@ -16,199 +17,181 @@ This project is **not dependent on Harmony**, but is commonly used as part of Ha
In bare-metal environments, external load balancers often rely on pod-level or router-level checks that can lag behind the authoritative Kubernetes `Node.status.conditions[Ready]`.
This service provides the true source-of-truth with fast reaction time.
## Features & Roadmap
## Available checks
| Check | Description | Status | Check Name |
|------------------------------------|--------------------------------------------------|---------------------|--------------------|
| **Node readiness (API)** | Queries `Node.status.conditions[Ready]` via Kubernetes API | **Implemented** | `node_ready` |
| **OKD Router health** | Probes OpenShift router healthz on port 1936 | **Implemented** | `okd_router_1936` |
| Filesystem readonly | Detects read-only mounts via `/proc/mounts` | To be implemented | `filesystem_ro` |
| Kubelet running | Local probe to kubelet `/healthz` (port 10248) | To be implemented | `kubelet` |
| CRI-O / container runtime health | Socket check + runtime status | To be implemented | `container_runtime`|
| Disk / inode pressure | Threshold checks on key filesystems | To be implemented | `disk_pressure` |
| Network reachability | DNS resolution + gateway connectivity | To be implemented | `network` |
| Custom NodeConditions | Reacts to extra conditions (NPD, etc.) | To be implemented | `custom_conditions`|
| Check name | Description | Status |
|--------------------|-------------------------------------------------------------|-------------------|
| `node_ready` | Queries `Node.status.conditions[Ready]` via Kubernetes API | Implemented |
| `okd_router_1936` | Probes OpenShift router `/healthz/ready` on port 1936 | Implemented |
| `filesystem_ro` | Detects read-only mounts via `/proc/mounts` | To be implemented |
| `kubelet` | Local probe to kubelet `/healthz` (port 10248) | To be implemented |
| `container_runtime`| Socket check + runtime status | To be implemented |
| `disk_pressure` | Threshold checks on key filesystems | To be implemented |
| `network` | DNS resolution + gateway connectivity | To be implemented |
| `custom_conditions`| Reacts to extra conditions (NPD, etc.) | To be implemented |
All checks are combined with logical **AND** — any failure results in 503.
All checks are combined with logical **AND** — any single failure results in 503.
## Behavior
### `node_ready` check — fail-open design
The `node_ready` check queries the Kubernetes API server to read `Node.status.conditions[Ready]`.
Because this service runs on the node it is checking, there are scenarios where the API server is temporarily
unreachable (e.g. during a control-plane restart). To avoid incorrectly draining a healthy node in such cases,
the check is **fail-open**: it passes (reports ready) whenever the Kubernetes API is unavailable.
| Situation | Result | HTTP status |
|------------------------------------------------------|-------------------|-------------|
| `Node.conditions[Ready] == True` | Pass | 200 |
| `Node.conditions[Ready] == False` | Fail | 503 |
| `Ready` condition absent | Fail | 503 |
| API server unreachable or timed out (1 s timeout) | Pass (assumes ready) | 200 |
| Kubernetes client initialization failed | Pass (assumes ready) | 200 |
| `NODE_NAME` env var not set | Hard error | 500 |
A warning is logged whenever the API is unavailable and the check falls back to assuming ready.
### `okd_router_1936` check
Sends `GET http://127.0.0.1:1936/healthz/ready` with a 5-second timeout.
Returns pass on any 2xx response, fail otherwise.
### Unknown check names
Requesting an unknown check name (e.g. `check=bogus`) results in that check returning `passed: false`
with reason `"Unknown check: bogus"`, and the overall response is 503.
## How it works
### Node Name Discovery
The service automatically discovers its own node name using the **Kubernetes Downward API**:
### Node name discovery
The service reads the `NODE_NAME` environment variable, which must be injected via the Kubernetes Downward API:
```yaml
env:
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
fieldPath: spec.nodeName
```
### Kubernetes API Authentication
### Kubernetes API authentication
- Uses standard **in-cluster configuration** (no external credentials needed).
- The ServiceAccount token and CA certificate are automatically mounted by Kubernetes at `/var/run/secrets/kubernetes.io/serviceaccount/`.
- The application (via `kube-rs` or your Harmony higher-level client) calls the equivalent of `Config::incluster_config()`.
- Requires only minimal RBAC: `get` permission on the `nodes` resource (see `deploy/rbac.yaml`).
- Uses standard **in-cluster configuration**; no external credentials needed.
- The ServiceAccount token and CA certificate are automatically mounted at `/var/run/secrets/kubernetes.io/serviceaccount/`.
- Requires only minimal RBAC: `get` and `list` on the `nodes` resource (see `deploy/resources.yaml`).
- Connect and write timeouts are set to **1 second** to keep checks fast.
## Quick Start
## Deploy
All Kubernetes resources (Namespace, ServiceAccount, ClusterRole, ClusterRoleBinding, and an OpenShift SCC RoleBinding for `hostnetwork`) are in a single file.
### 1. Build and push
```bash
cargo build --release --bin harmony-node-readiness-endpoint
docker build -t your-registry/harmony-node-readiness-endpoint:v1.0.0 .
docker push your-registry/harmony-node-readiness-endpoint:v1.0.0
```
### 2. Deploy
```bash
kubectl apply -f deploy/namespace.yaml
kubectl apply -f deploy/rbac.yaml
kubectl apply -f deploy/resources.yaml
kubectl apply -f deploy/daemonset.yaml
```
(The DaemonSet uses `hostPort: 25001` by default so the endpoint is reachable directly on the node's IP.)
The DaemonSet uses `hostNetwork: true` and `hostPort: 25001`, so the endpoint is reachable directly on the node's IP at port 25001.
It tolerates all taints, ensuring it runs even on nodes marked unschedulable.
### 3. Configure your external load balancer
### Configure your external load balancer
**Example for HAProxy / OPNsense:**
- Check type: **HTTP**
- URI: `/health`
- Port: `25001` (configurable via `LISTEN_PORT`)
- Port: `25001` (configurable via `LISTEN_PORT` env var)
- Interval: 5–10 s
- Rise: 2
- Fall: 3
- Expect: `2xx`
## Health Endpoint Examples
## Endpoint usage
### Query Parameter
### Query parameter
Use the `check` query parameter to specify which checks to run. Multiple checks can be comma-separated.
Use the `check` query parameter to select which checks to run (comma-separated).
When omitted, only `node_ready` runs.
| Request | Behavior |
|--------------------------------------|---------------------------------------------|
| `GET /health` | Runs `node_ready` (default) |
| `GET /health?check=okd_router_1936` | Runs only OKD router check |
| `GET /health?check=node_ready,okd_router_1936` | Runs both checks |
| Request | Checks run |
|------------------------------------------------|-----------------------------------|
| `GET /health` | `node_ready` |
| `GET /health?check=okd_router_1936` | `okd_router_1936` only |
| `GET /health?check=node_ready,okd_router_1936` | `node_ready` and `okd_router_1936`|
**Note:** When the `check` parameter is provided, only the specified checks run. You must explicitly include `node_ready` if you want it along with other checks.
> **Note:** specifying `check=` replaces the default. Include `node_ready` explicitly if you need it alongside other checks.
### Response format
Each check result includes:
- `name`: The check identifier
- `passed`: Boolean indicating success or failure
- `reason`: (Optional) Failure reason if the check failed
- `duration_ms`: Time taken to execute the check in milliseconds
```json
{
  "status": "ready" | "not-ready",
  "checks": [
    {
      "name": "<check-name>",
      "passed": true | false,
      "reason": "<failure reason, omitted on success>",
      "duration_ms": 42
    }
  ],
  "total_duration_ms": 42
}
```
**Healthy node (default check)**
```http
HTTP/1.1 200 OK
Content-Type: application/json
{
  "status": "ready",
  "checks": [{ "name": "node_ready", "passed": true, "duration_ms": 42 }],
  "total_duration_ms": 42
}
```
**Unhealthy node**
```http
GET /health?check=node_ready,okd_router_1936
HTTP/1.1 503 Service Unavailable
Content-Type: application/json
{
  "status": "not-ready",
  "checks": [
    {
      "name": "okd_router_1936",
      "passed": false,
      "reason": "Failed to connect to OKD router: connection refused",
      "duration_ms": 5
    },
    { "name": "node_ready", "passed": false, "reason": "KubeletNotReady", "duration_ms": 35 }
  ],
  "total_duration_ms": 35
}
```
**API server unreachable (fail-open)**
```http
HTTP/1.1 200 OK
{
  "status": "ready",
  "checks": [{ "name": "node_ready", "passed": true, "duration_ms": 1001 }],
  "total_duration_ms": 1001
}
```
*(A warning is logged: `Kubernetes API appears to be down … Assuming node is ready.`)*
## Configuration (via DaemonSet env vars)
## Configuration
```yaml
env:
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: LISTEN_PORT
value: "25001"
```
Checks are selected via the `check` query parameter on the `/health` endpoint. See the usage examples above.
| Env var | Default | Description |
|---------------|----------|--------------------------------------|
| `NODE_NAME` | required | Node name, injected via Downward API |
| `LISTEN_PORT` | `25001` | TCP port the HTTP server binds to |
| `RUST_LOG` | — | Log level (e.g. `info`, `debug`) |
## Development
```bash
# Run locally (set NODE_NAME env var)
# Run locally
NODE_NAME=my-test-node cargo run
# Run tests
cargo test
```
---
*Minimal, auditable, and built for production bare-metal Kubernetes environments.*

0
harmony_node_readiness/build-docker.sh Normal file → Executable file
View File

View File

@@ -27,8 +27,8 @@ spec:
fieldRef:
fieldPath: spec.nodeName
ports:
- containerPort: 8080
hostPort: 8080
- containerPort: 25001
hostPort: 25001
name: health-port
resources:
requests:

View File

@@ -1,13 +1,16 @@
use actix_web::{App, HttpResponse, HttpServer, Responder, get, web};
use k8s_openapi::api::core::v1::Node;
use kube::{Api, Client};
use kube::{Api, Client, Config};
use log::{debug, error, info, warn};
use reqwest;
use serde::{Deserialize, Serialize};
use std::env;
use std::time::Instant;
use std::time::{Duration, Instant};
use tokio::task::JoinSet;
const K8S_CLIENT_TIMEOUT: Duration = Duration::from_secs(1);
#[derive(Serialize, Deserialize)]
struct HealthStatus {
status: String,
@@ -40,10 +43,16 @@ struct HealthQuery {
async fn check_node_ready(client: Client, node_name: &str) -> Result<(), String> {
let nodes: Api<Node> = Api::all(client);
let node = nodes
.get(node_name)
.await
.map_err(|e| format!("Failed to get node '{}': {}", node_name, e))?;
let node = match nodes.get(node_name).await {
Ok(n) => n,
Err(e) => {
warn!(
"Kubernetes API appears to be down, unreachable, or timed out for node '{}': {}. Assuming node is ready.",
node_name, e
);
return Ok(());
}
};
let conditions = node.status.and_then(|s| s.conditions).unwrap_or_default();
@@ -104,7 +113,13 @@ async fn run_check(check_name: &str, client: Option<Client>, node_name: &str) ->
let result = match check_name {
"node_ready" => match client {
Some(c) => check_node_ready(c, node_name).await,
None => Err("Kubernetes client not available".to_string()),
None => {
warn!(
"Kubernetes client not available for node '{}'. Assuming node is ready.",
node_name
);
Ok(())
}
},
"okd_router_1936" => check_okd_router_1936().await,
_ => Err(format!("Unknown check: {}", check_name)),
@@ -149,16 +164,30 @@ async fn health(query: web::Query<HealthQuery>) -> impl Responder {
// Initialize Kubernetes client only if needed
let k8s_client = if needs_k8s_client {
match Client::try_default().await {
Ok(c) => Some(c),
match Config::infer().await {
Ok(mut config) => {
config.write_timeout = Some(K8S_CLIENT_TIMEOUT);
config.connect_timeout = Some(K8S_CLIENT_TIMEOUT);
Some(Client::try_from(config).map_err(|e| e.to_string()))
}
Err(e) => {
error!("Failed to create Kubernetes client: {}", e);
return HttpResponse::InternalServerError().json(HealthError {
status: "error".to_string(),
error: format!("Failed to create Kubernetes client: {}", e),
});
warn!(
"Failed to infer Kubernetes config for node '{}': {}. Assuming node_ready is healthy.",
node_name, e
);
None
}
}
.and_then(|result| match result {
Ok(client) => Some(client),
Err(e) => {
warn!(
"Failed to create Kubernetes client for node '{}': {}. Assuming node_ready is healthy.",
node_name, e
);
None
}
})
} else {
None
};
@@ -226,7 +255,28 @@ async fn main() -> std::io::Result<()> {
info!("Starting harmony-node-readiness-endpoint on {}", bind_addr);
HttpServer::new(|| App::new().service(health))
.workers(3)
.bind(&bind_addr)?
.run()
.await
}
#[cfg(test)]
mod tests {
use super::*;
use kube::error::ErrorResponse;
#[test]
fn parse_checks_defaults_to_node_ready() {
assert_eq!(parse_checks(None), vec!["node_ready"]);
assert_eq!(parse_checks(Some("")), vec!["node_ready"]);
}
#[test]
fn parse_checks_splits_and_trims_values() {
assert_eq!(
parse_checks(Some("node_ready, okd_router_1936 ")),
vec!["node_ready", "okd_router_1936"]
);
}
}

View File

@@ -1540,7 +1540,7 @@ pub struct Dyndns {
pub struct Vlans {
#[yaserde(attribute = true)]
pub version: String,
pub vlan: MaybeString,
pub vlan: RawXml,
}
#[derive(Default, PartialEq, Debug, YaSerialize, YaDeserialize)]