From 58b62689899f4fd450bbae9f41499a0c6ffaa9b6 Mon Sep 17 00:00:00 2001 From: Willem Date: Mon, 29 Sep 2025 10:46:29 -0400 Subject: [PATCH 01/51] wip: moving the install steps for grafana and prometheus into the trait installable --- harmony/src/domain/topology/k8s_anywhere.rs | 116 ++++++++++++++++-- .../application/features/monitoring.rs | 5 +- .../application/features/rhob_monitoring.rs | 4 +- .../application_monitoring_score.rs | 8 +- .../rhobs_application_monitoring_score.rs | 6 +- .../src/modules/monitoring/grafana/grafana.rs | 15 +++ .../monitoring/grafana/helm/helm_grafana.rs | 25 ++-- harmony/src/modules/monitoring/grafana/mod.rs | 1 + .../crd/crd_alertmanager_config.rs | 41 ++++++- .../monitoring/prometheus/prometheus.rs | 2 +- .../k8s_prometheus_alerting_score.rs | 6 +- harmony/src/modules/prometheus/prometheus.rs | 8 +- .../modules/prometheus/rhob_alerting_score.rs | 10 +- 13 files changed, 195 insertions(+), 52 deletions(-) create mode 100644 harmony/src/modules/monitoring/grafana/grafana.rs diff --git a/harmony/src/domain/topology/k8s_anywhere.rs b/harmony/src/domain/topology/k8s_anywhere.rs index e6c37ea..6dfb1a8 100644 --- a/harmony/src/domain/topology/k8s_anywhere.rs +++ b/harmony/src/domain/topology/k8s_anywhere.rs @@ -12,14 +12,17 @@ use crate::{ inventory::Inventory, modules::{ k3d::K3DInstallationScore, - monitoring::kube_prometheus::crd::{ - crd_alertmanager_config::CRDPrometheus, - prometheus_operator::prometheus_operator_helm_chart_score, - rhob_alertmanager_config::RHOBObservability, + monitoring::{ + grafana::{grafana::Grafana, helm::helm_grafana::grafana_helm_chart_score}, + kube_prometheus::crd::{ + crd_alertmanager_config::CRDPrometheus, + prometheus_operator::prometheus_operator_helm_chart_score, + rhob_alertmanager_config::RHOBObservability, service_monitor::ServiceMonitor, + }, }, prometheus::{ k8s_prometheus_alerting_score::K8sPrometheusCRDAlertingScore, - prometheus::PrometheusApplicationMonitoring, 
rhob_alerting_score::RHOBAlertingScore, + prometheus::PrometheusMonitoring, rhob_alerting_score::RHOBAlertingScore, }, }, score::Score, @@ -86,7 +89,43 @@ impl K8sclient for K8sAnywhereTopology { } #[async_trait] -impl PrometheusApplicationMonitoring for K8sAnywhereTopology { +impl Grafana for K8sAnywhereTopology { + async fn ensure_grafana_operator_ready( + &self, + inventory: &Inventory, + ) -> Result { + let client = self.k8s_client().await.unwrap(); + let grafana_gvk = GroupVersionKind { + group: "grafana.integreatly.org".to_string(), + version: "v1beta1".to_string(), + kind: "Grafana".to_string(), + }; + let name = "grafanas.grafana.integreatly.org"; + let ns = "grafana"; + + let grafana_crd = client + .get_resource_json_value(name, Some(ns), &grafana_gvk) + .await; + match grafana_crd { + Ok(_) => { + return Ok(PreparationOutcome::Success { + details: "Found grafana CRDs in cluster".to_string(), + }); + } + Err(_) => { + return self + .install_grafana_operator(inventory, Some("grafana")) + .await; + } + }; + } + async fn install_grafana(&self) -> Result { + todo!() + } +} + +#[async_trait] +impl PrometheusMonitoring for K8sAnywhereTopology { async fn install_prometheus( &self, sender: &CRDPrometheus, @@ -101,7 +140,11 @@ impl PrometheusApplicationMonitoring for K8sAnywhereTopology { } let result = self - .get_k8s_prometheus_application_score(sender.clone(), receivers) + .get_k8s_prometheus_application_score( + sender.clone(), + receivers, + Some(sender.service_monitor.clone()), + ) .await .interpret(inventory, self) .await; @@ -117,10 +160,24 @@ impl PrometheusApplicationMonitoring for K8sAnywhereTopology { Err(err) => Err(PreparationError::new(err.to_string())), } } + async fn ensure_prometheus_operator( + &self, + sender: &CRDPrometheus, + inventory: &Inventory, + ) -> Result { + let po_result = self.ensure_prometheus_operator(sender).await?; + + if po_result == PreparationOutcome::Noop { + debug!("Skipping Prometheus CR installation due to missing 
operator."); + return Ok(po_result); + } else { + todo!() + } + } } #[async_trait] -impl PrometheusApplicationMonitoring for K8sAnywhereTopology { +impl PrometheusMonitoring for K8sAnywhereTopology { async fn install_prometheus( &self, sender: &RHOBObservability, @@ -154,6 +211,13 @@ impl PrometheusApplicationMonitoring for K8sAnywhereTopology Err(err) => Err(PreparationError::new(err.to_string())), } } + async fn ensure_prometheus_operator( + &self, + sender: &RHOBObservability, + inventory: &Inventory, + ) -> Result { + todo!() + } } impl Serialize for K8sAnywhereTopology { @@ -253,12 +317,22 @@ impl K8sAnywhereTopology { &self, sender: CRDPrometheus, receivers: Option>>>, + service_monitors: Option>, ) -> K8sPrometheusCRDAlertingScore { - K8sPrometheusCRDAlertingScore { - sender, - receivers: receivers.unwrap_or_default(), - service_monitors: vec![], - prometheus_rules: vec![], + if let Some(sm) = service_monitors { + return K8sPrometheusCRDAlertingScore { + sender, + receivers: receivers.unwrap_or_default(), + service_monitors: sm, + prometheus_rules: vec![], + }; + } else { + return K8sPrometheusCRDAlertingScore { + sender, + receivers: receivers.unwrap_or_default(), + service_monitors: vec![], + prometheus_rules: vec![], + }; } } @@ -527,6 +601,22 @@ impl K8sAnywhereTopology { details: "prometheus operator present in cluster".into(), }) } + + async fn install_grafana_operator( + &self, + inventory: &Inventory, + ns: Option<&str>, + ) -> Result { + let _grafana_operator_score = grafana_helm_chart_score(ns.unwrap(), true) + .interpret(inventory, self) + .await; + Ok(PreparationOutcome::Success { + details: format!( + "Successfully installed grafana operator in ns {}", + ns.unwrap() + ), + }) + } } #[derive(Clone, Debug)] diff --git a/harmony/src/modules/application/features/monitoring.rs b/harmony/src/modules/application/features/monitoring.rs index 1a60d00..0fd155d 100644 --- a/harmony/src/modules/application/features/monitoring.rs +++ 
b/harmony/src/modules/application/features/monitoring.rs @@ -14,7 +14,7 @@ use crate::{ topology::{HelmCommand, K8sclient, Topology, tenant::TenantManager}, }; use crate::{ - modules::prometheus::prometheus::PrometheusApplicationMonitoring, + modules::prometheus::prometheus::PrometheusMonitoring, topology::oberservability::monitoring::AlertReceiver, }; use async_trait::async_trait; @@ -40,7 +40,7 @@ impl< + TenantManager + K8sclient + MultiTargetTopology - + PrometheusApplicationMonitoring + + PrometheusMonitoring + Ingress + std::fmt::Debug, > ApplicationFeature for Monitoring @@ -61,6 +61,7 @@ impl< sender: CRDPrometheus { namespace: namespace.clone(), client: topology.k8s_client().await.unwrap(), + service_monitor: vec![], }, application: self.application.clone(), receivers: self.alert_receiver.clone(), diff --git a/harmony/src/modules/application/features/rhob_monitoring.rs b/harmony/src/modules/application/features/rhob_monitoring.rs index d87ef61..876dba9 100644 --- a/harmony/src/modules/application/features/rhob_monitoring.rs +++ b/harmony/src/modules/application/features/rhob_monitoring.rs @@ -18,7 +18,7 @@ use crate::{ topology::{HelmCommand, K8sclient, Topology, tenant::TenantManager}, }; use crate::{ - modules::prometheus::prometheus::PrometheusApplicationMonitoring, + modules::prometheus::prometheus::PrometheusMonitoring, topology::oberservability::monitoring::AlertReceiver, }; use async_trait::async_trait; @@ -42,7 +42,7 @@ impl< + MultiTargetTopology + Ingress + std::fmt::Debug - + PrometheusApplicationMonitoring, + + PrometheusMonitoring, > ApplicationFeature for Monitoring { async fn ensure_installed( diff --git a/harmony/src/modules/monitoring/application_monitoring/application_monitoring_score.rs b/harmony/src/modules/monitoring/application_monitoring/application_monitoring_score.rs index 8246d15..2780edd 100644 --- a/harmony/src/modules/monitoring/application_monitoring/application_monitoring_score.rs +++ 
b/harmony/src/modules/monitoring/application_monitoring/application_monitoring_score.rs @@ -10,7 +10,7 @@ use crate::{ modules::{ application::Application, monitoring::kube_prometheus::crd::crd_alertmanager_config::CRDPrometheus, - prometheus::prometheus::PrometheusApplicationMonitoring, + prometheus::prometheus::PrometheusMonitoring, }, score::Score, topology::{PreparationOutcome, Topology, oberservability::monitoring::AlertReceiver}, @@ -24,9 +24,7 @@ pub struct ApplicationMonitoringScore { pub receivers: Vec>>, } -impl> Score - for ApplicationMonitoringScore -{ +impl> Score for ApplicationMonitoringScore { fn create_interpret(&self) -> Box> { Box::new(ApplicationMonitoringInterpret { score: self.clone(), @@ -47,7 +45,7 @@ pub struct ApplicationMonitoringInterpret { } #[async_trait] -impl> Interpret +impl> Interpret for ApplicationMonitoringInterpret { async fn execute( diff --git a/harmony/src/modules/monitoring/application_monitoring/rhobs_application_monitoring_score.rs b/harmony/src/modules/monitoring/application_monitoring/rhobs_application_monitoring_score.rs index 5f5127f..6f45c88 100644 --- a/harmony/src/modules/monitoring/application_monitoring/rhobs_application_monitoring_score.rs +++ b/harmony/src/modules/monitoring/application_monitoring/rhobs_application_monitoring_score.rs @@ -12,7 +12,7 @@ use crate::{ monitoring::kube_prometheus::crd::{ crd_alertmanager_config::CRDPrometheus, rhob_alertmanager_config::RHOBObservability, }, - prometheus::prometheus::PrometheusApplicationMonitoring, + prometheus::prometheus::PrometheusMonitoring, }, score::Score, topology::{PreparationOutcome, Topology, oberservability::monitoring::AlertReceiver}, @@ -26,7 +26,7 @@ pub struct ApplicationRHOBMonitoringScore { pub receivers: Vec>>, } -impl> Score +impl> Score for ApplicationRHOBMonitoringScore { fn create_interpret(&self) -> Box> { @@ -49,7 +49,7 @@ pub struct ApplicationRHOBMonitoringInterpret { } #[async_trait] -impl> Interpret +impl> Interpret for 
ApplicationRHOBMonitoringInterpret { async fn execute( diff --git a/harmony/src/modules/monitoring/grafana/grafana.rs b/harmony/src/modules/monitoring/grafana/grafana.rs new file mode 100644 index 0000000..411d7a6 --- /dev/null +++ b/harmony/src/modules/monitoring/grafana/grafana.rs @@ -0,0 +1,15 @@ +use async_trait::async_trait; + +use crate::{ + inventory::Inventory, + topology::{PreparationError, PreparationOutcome}, +}; + +#[async_trait] +pub trait Grafana { + async fn ensure_grafana_operator_ready( + &self, + inventory: &Inventory, + ) -> Result; + async fn install_grafana(&self) -> Result; +} diff --git a/harmony/src/modules/monitoring/grafana/helm/helm_grafana.rs b/harmony/src/modules/monitoring/grafana/helm/helm_grafana.rs index 3af6550..094beca 100644 --- a/harmony/src/modules/monitoring/grafana/helm/helm_grafana.rs +++ b/harmony/src/modules/monitoring/grafana/helm/helm_grafana.rs @@ -1,25 +1,22 @@ use non_blank_string_rs::NonBlankString; -use std::str::FromStr; +use std::{collections::HashMap, str::FromStr}; use crate::modules::helm::chart::HelmChartScore; -pub fn grafana_helm_chart_score(ns: &str) -> HelmChartScore { - let values = r#" -rbac: - namespaced: true -sidecar: - dashboards: - enabled: true - "# - .to_string(); - +pub fn grafana_helm_chart_score(ns: &str, scope: bool) -> HelmChartScore { + let mut values_overrides = HashMap::new(); + values_overrides.insert( + NonBlankString::from_str("namespaceScope").unwrap(), + scope.to_string(), + ); HelmChartScore { namespace: Some(NonBlankString::from_str(ns).unwrap()), release_name: NonBlankString::from_str("grafana").unwrap(), - chart_name: NonBlankString::from_str("oci://ghcr.io/grafana/helm-charts/grafana").unwrap(), + chart_name: NonBlankString::from_str("oci://ghcr.io/grafana/helm-charts/grafana-operator") + .unwrap(), chart_version: None, - values_overrides: None, - values_yaml: Some(values.to_string()), + values_overrides: Some(values_overrides), + values_yaml: None, create_namespace: true, 
install_only: true, repository: None, diff --git a/harmony/src/modules/monitoring/grafana/mod.rs b/harmony/src/modules/monitoring/grafana/mod.rs index c821bcb..8dccab1 100644 --- a/harmony/src/modules/monitoring/grafana/mod.rs +++ b/harmony/src/modules/monitoring/grafana/mod.rs @@ -1 +1,2 @@ +pub mod grafana; pub mod helm; diff --git a/harmony/src/modules/monitoring/kube_prometheus/crd/crd_alertmanager_config.rs b/harmony/src/modules/monitoring/kube_prometheus/crd/crd_alertmanager_config.rs index 2165a4a..0ac8fc7 100644 --- a/harmony/src/modules/monitoring/kube_prometheus/crd/crd_alertmanager_config.rs +++ b/harmony/src/modules/monitoring/kube_prometheus/crd/crd_alertmanager_config.rs @@ -1,12 +1,25 @@ use std::sync::Arc; +use async_trait::async_trait; use kube::CustomResource; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; -use crate::topology::{ - k8s::K8sClient, - oberservability::monitoring::{AlertReceiver, AlertSender}, +use crate::{ + interpret::{InterpretError, Outcome}, + inventory::Inventory, + modules::{ + monitoring::{ + grafana::grafana::Grafana, kube_prometheus::crd::service_monitor::ServiceMonitor, + }, + prometheus::prometheus::PrometheusMonitoring, + }, + topology::{ + K8sclient, Topology, + installable::Installable, + k8s::K8sClient, + oberservability::monitoring::{AlertReceiver, AlertSender}, + }, }; #[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)] @@ -26,6 +39,7 @@ pub struct AlertmanagerConfigSpec { pub struct CRDPrometheus { pub namespace: String, pub client: Arc, + pub service_monitor: Vec, } impl AlertSender for CRDPrometheus { @@ -48,3 +62,24 @@ impl Serialize for Box> { todo!() } } + +#[async_trait] +impl + Grafana> Installable + for CRDPrometheus +{ + async fn configure(&self, inventory: &Inventory, topology: &T) -> Result<(), InterpretError> { + topology.ensure_grafana_operator_ready(inventory).await?; + topology.ensure_prometheus_operator(self, inventory).await?; + Ok(()) + } + + async fn 
ensure_installed( + &self, + inventory: &Inventory, + topology: &T, + ) -> Result<(), InterpretError> { + topology.install_grafana().await?; + topology.install_prometheus(&self, inventory, None).await?; + Ok(()) + } +} diff --git a/harmony/src/modules/monitoring/prometheus/prometheus.rs b/harmony/src/modules/monitoring/prometheus/prometheus.rs index a207d5a..2fe0d06 100644 --- a/harmony/src/modules/monitoring/prometheus/prometheus.rs +++ b/harmony/src/modules/monitoring/prometheus/prometheus.rs @@ -114,7 +114,7 @@ impl Prometheus { }; if let Some(ns) = namespace.as_deref() { - grafana_helm_chart_score(ns) + grafana_helm_chart_score(ns, false) .interpret(inventory, topology) .await } else { diff --git a/harmony/src/modules/prometheus/k8s_prometheus_alerting_score.rs b/harmony/src/modules/prometheus/k8s_prometheus_alerting_score.rs index 24ca918..2cb4ffb 100644 --- a/harmony/src/modules/prometheus/k8s_prometheus_alerting_score.rs +++ b/harmony/src/modules/prometheus/k8s_prometheus_alerting_score.rs @@ -39,7 +39,7 @@ use crate::{ }; use harmony_types::id::Id; -use super::prometheus::PrometheusApplicationMonitoring; +use super::prometheus::PrometheusMonitoring; #[derive(Clone, Debug, Serialize)] pub struct K8sPrometheusCRDAlertingScore { @@ -49,7 +49,7 @@ pub struct K8sPrometheusCRDAlertingScore { pub prometheus_rules: Vec, } -impl> Score +impl> Score for K8sPrometheusCRDAlertingScore { fn create_interpret(&self) -> Box> { @@ -75,7 +75,7 @@ pub struct K8sPrometheusCRDAlertingInterpret { } #[async_trait] -impl> Interpret +impl> Interpret for K8sPrometheusCRDAlertingInterpret { async fn execute( diff --git a/harmony/src/modules/prometheus/prometheus.rs b/harmony/src/modules/prometheus/prometheus.rs index d3940c7..efb89da 100644 --- a/harmony/src/modules/prometheus/prometheus.rs +++ b/harmony/src/modules/prometheus/prometheus.rs @@ -9,11 +9,17 @@ use crate::{ }; #[async_trait] -pub trait PrometheusApplicationMonitoring { +pub trait PrometheusMonitoring { async fn 
install_prometheus( &self, sender: &S, inventory: &Inventory, receivers: Option>>>, ) -> Result; + + async fn ensure_prometheus_operator( + &self, + sender: &S, + inventory: &Inventory, + ) -> Result; } diff --git a/harmony/src/modules/prometheus/rhob_alerting_score.rs b/harmony/src/modules/prometheus/rhob_alerting_score.rs index 95908d5..644e6f9 100644 --- a/harmony/src/modules/prometheus/rhob_alerting_score.rs +++ b/harmony/src/modules/prometheus/rhob_alerting_score.rs @@ -38,7 +38,7 @@ use crate::{ }; use harmony_types::id::Id; -use super::prometheus::PrometheusApplicationMonitoring; +use super::prometheus::PrometheusMonitoring; #[derive(Clone, Debug, Serialize)] pub struct RHOBAlertingScore { @@ -48,8 +48,8 @@ pub struct RHOBAlertingScore { pub prometheus_rules: Vec, } -impl> - Score for RHOBAlertingScore +impl> Score + for RHOBAlertingScore { fn create_interpret(&self) -> Box> { Box::new(RHOBAlertingInterpret { @@ -74,8 +74,8 @@ pub struct RHOBAlertingInterpret { } #[async_trait] -impl> - Interpret for RHOBAlertingInterpret +impl> Interpret + for RHOBAlertingInterpret { async fn execute( &self, From 1f3796f50301b38746366c4d5e4909332db203dd Mon Sep 17 00:00:00 2001 From: Willem Date: Thu, 9 Oct 2025 12:26:05 -0400 Subject: [PATCH 02/51] refactor(prometheus): modified crd prometheus to impl the installable trait --- examples/try_rust_webapp/src/main.rs | 2 +- harmony/src/domain/topology/k8s_anywhere.rs | 208 ++++++++++++++---- .../topology/oberservability/monitoring.rs | 2 + .../application/features/monitoring.rs | 17 +- .../application_monitoring_score.rs | 79 ++----- .../src/modules/monitoring/grafana/grafana.rs | 4 +- .../monitoring/grafana/helm/helm_grafana.rs | 17 +- .../crd/crd_alertmanager_config.rs | 2 +- 8 files changed, 219 insertions(+), 112 deletions(-) diff --git a/examples/try_rust_webapp/src/main.rs b/examples/try_rust_webapp/src/main.rs index 56a058d..7bfdf57 100644 --- a/examples/try_rust_webapp/src/main.rs +++ 
b/examples/try_rust_webapp/src/main.rs @@ -3,7 +3,7 @@ use harmony::{ modules::{ application::{ ApplicationScore, RustWebFramework, RustWebapp, - features::{PackagingDeployment, rhob_monitoring::Monitoring}, + features::{Monitoring, PackagingDeployment}, }, monitoring::alert_channel::discord_alert_channel::DiscordWebhook, }, diff --git a/harmony/src/domain/topology/k8s_anywhere.rs b/harmony/src/domain/topology/k8s_anywhere.rs index 6dfb1a8..895f7da 100644 --- a/harmony/src/domain/topology/k8s_anywhere.rs +++ b/harmony/src/domain/topology/k8s_anywhere.rs @@ -1,7 +1,7 @@ -use std::{process::Command, sync::Arc}; +use std::{collections::BTreeMap, process::Command, sync::Arc}; use async_trait::async_trait; -use kube::api::GroupVersionKind; +use kube::api::{GroupVersionKind, ObjectMeta}; use log::{debug, info, warn}; use serde::Serialize; use tokio::sync::OnceCell; @@ -12,12 +12,20 @@ use crate::{ inventory::Inventory, modules::{ k3d::K3DInstallationScore, + k8s::ingress::{K8sIngressScore, PathType}, monitoring::{ grafana::{grafana::Grafana, helm::helm_grafana::grafana_helm_chart_score}, kube_prometheus::crd::{ crd_alertmanager_config::CRDPrometheus, + crd_grafana::{ + Grafana as GrafanaCRD, GrafanaDashboard, GrafanaDashboardSpec, + GrafanaDatasource, GrafanaDatasourceConfig, GrafanaDatasourceSpec, GrafanaSpec, + }, + crd_prometheuses::LabelSelector, + grafana_default_dashboard::build_default_dashboard, prometheus_operator::prometheus_operator_helm_chart_score, - rhob_alertmanager_config::RHOBObservability, service_monitor::ServiceMonitor, + rhob_alertmanager_config::RHOBObservability, + service_monitor::ServiceMonitor, }, }, prometheus::{ @@ -90,10 +98,11 @@ impl K8sclient for K8sAnywhereTopology { #[async_trait] impl Grafana for K8sAnywhereTopology { - async fn ensure_grafana_operator_ready( + async fn ensure_grafana_operator( &self, inventory: &Inventory, ) -> Result { + debug!("ensure grafana operator"); let client = self.k8s_client().await.unwrap(); let grafana_gvk 
= GroupVersionKind { group: "grafana.integreatly.org".to_string(), @@ -112,6 +121,7 @@ impl Grafana for K8sAnywhereTopology { details: "Found grafana CRDs in cluster".to_string(), }); } + Err(_) => { return self .install_grafana_operator(inventory, Some("grafana")) @@ -120,7 +130,41 @@ impl Grafana for K8sAnywhereTopology { }; } async fn install_grafana(&self) -> Result { - todo!() + debug!("install grafana"); + let ns = "grafana"; + + let mut label = BTreeMap::new(); + + label.insert("dashboards".to_string(), "grafana".to_string()); + let label_selector = LabelSelector { + match_labels: label.clone(), + match_expressions: vec![], + }; + + let client = self.k8s_client().await?; + + let datasource = self.build_grafana_datasource(ns, &label_selector); + + client.apply(&datasource, Some(ns)).await?; + + let dashboard = self.build_grafana_dashboard(ns, &label_selector); + + client.apply(&dashboard, Some(ns)).await?; + + let grafana = self.build_grafana(ns, &label); + + client.apply(&grafana, Some(ns)).await?; + + let grafana_ingress = self.build_grafana_ingress(ns).await; + + grafana_ingress + .interpret(&Inventory::empty(), self) + .await + .map_err(|e| PreparationError::new(e.to_string()))?; + + Ok(PreparationOutcome::Success { + details: "Installed grafana composants".to_string(), + }) } } @@ -129,49 +173,38 @@ impl PrometheusMonitoring for K8sAnywhereTopology { async fn install_prometheus( &self, sender: &CRDPrometheus, - inventory: &Inventory, - receivers: Option>>>, + _inventory: &Inventory, + _receivers: Option>>>, ) -> Result { - let po_result = self.ensure_prometheus_operator(sender).await?; + let client = self.k8s_client().await?; - if po_result == PreparationOutcome::Noop { - debug!("Skipping Prometheus CR installation due to missing operator."); - return Ok(po_result); - } - - let result = self - .get_k8s_prometheus_application_score( - sender.clone(), - receivers, - Some(sender.service_monitor.clone()), - ) - .await - .interpret(inventory, self) - .await; 
- - match result { - Ok(outcome) => match outcome.status { - InterpretStatus::SUCCESS => Ok(PreparationOutcome::Success { - details: outcome.message, - }), - InterpretStatus::NOOP => Ok(PreparationOutcome::Noop), - _ => Err(PreparationError::new(outcome.message)), - }, - Err(err) => Err(PreparationError::new(err.to_string())), + for monitor in sender.service_monitor.iter() { + client + .apply(monitor, Some(&sender.namespace)) + .await + .map_err(|e| PreparationError::new(e.to_string()))?; } + Ok(PreparationOutcome::Success { + details: "successfuly installed prometheus components".to_string(), + }) } + async fn ensure_prometheus_operator( &self, sender: &CRDPrometheus, - inventory: &Inventory, + _inventory: &Inventory, ) -> Result { let po_result = self.ensure_prometheus_operator(sender).await?; - if po_result == PreparationOutcome::Noop { - debug!("Skipping Prometheus CR installation due to missing operator."); - return Ok(po_result); - } else { - todo!() + match po_result { + PreparationOutcome::Success { details: _ } => { + debug!("Detected prometheus crds operator present in cluster."); + return Ok(po_result); + } + PreparationOutcome::Noop => { + debug!("Skipping Prometheus CR installation due to missing operator."); + return Ok(po_result); + } } } } @@ -211,6 +244,7 @@ impl PrometheusMonitoring for K8sAnywhereTopology { Err(err) => Err(PreparationError::new(err.to_string())), } } + async fn ensure_prometheus_operator( &self, sender: &RHOBObservability, @@ -300,6 +334,95 @@ impl K8sAnywhereTopology { .clone() } + fn build_grafana_datasource( + &self, + ns: &str, + label_selector: &LabelSelector, + ) -> GrafanaDatasource { + let mut json_data = BTreeMap::new(); + json_data.insert("timeInterval".to_string(), "5s".to_string()); + + let graf_data_source = GrafanaDatasource { + metadata: ObjectMeta { + name: Some(format!("grafana-datasource-{}", ns)), + namespace: Some(ns.to_string()), + ..Default::default() + }, + spec: GrafanaDatasourceSpec { + instance_selector: 
label_selector.clone(), + allow_cross_namespace_import: Some(false), + datasource: GrafanaDatasourceConfig { + access: "proxy".to_string(), + database: Some("prometheus".to_string()), + json_data: Some(json_data), + //this is fragile + name: format!("prometheus-{}-0", ns), + r#type: "prometheus".to_string(), + url: format!("http://prometheus-operated.{}.svc.cluster.local:9090", ns), + }, + }, + }; + graf_data_source + } + + fn build_grafana_dashboard( + &self, + ns: &str, + label_selector: &LabelSelector, + ) -> GrafanaDashboard { + let json = build_default_dashboard(ns); + let graf_dashboard = GrafanaDashboard { + metadata: ObjectMeta { + name: Some(format!("grafana-dashboard-{}", ns)), + namespace: Some(ns.to_string()), + ..Default::default() + }, + spec: GrafanaDashboardSpec { + resync_period: Some("30s".to_string()), + instance_selector: label_selector.clone(), + json, + }, + }; + graf_dashboard + } + + fn build_grafana(&self, ns: &str, labels: &BTreeMap) -> GrafanaCRD { + let grafana = GrafanaCRD { + metadata: ObjectMeta { + name: Some(format!("grafana-{}", ns)), + namespace: Some(ns.to_string()), + labels: Some(labels.clone()), + ..Default::default() + }, + spec: GrafanaSpec { + config: None, + admin_user: None, + admin_password: None, + ingress: None, + persistence: None, + resources: None, + }, + }; + grafana + } + + async fn build_grafana_ingress(&self, ns: &str) -> K8sIngressScore { + let domain = self.get_domain(&format!("grafana-{}", ns)).await.unwrap(); + let name = format!("{}-grafana", ns); + let backend_service = format!("grafana-{}-service", ns); + + K8sIngressScore { + name: fqdn::fqdn!(&name), + host: fqdn::fqdn!(&domain), + backend_service: fqdn::fqdn!(&backend_service), + port: 3000, + path: Some("/".to_string()), + path_type: Some(PathType::Prefix), + namespace: Some(fqdn::fqdn!(&ns)), + ingress_class_name: Some("openshift-default".to_string()), + } + } + async fn get_cluster_observability_operator_prometheus_application_score( &self, sender: 
RHOBObservability, @@ -607,7 +730,14 @@ impl K8sAnywhereTopology { inventory: &Inventory, ns: Option<&str>, ) -> Result { - let _grafana_operator_score = grafana_helm_chart_score(ns.unwrap(), true) + let namespace = ns.unwrap_or("grafana"); + info!("installing grafana operator in ns {namespace}"); + let tenant = self.get_k8s_tenant_manager()?.get_tenant_config().await; + let mut namespace_scope = false; + if tenant.is_some() { + namespace_scope = true; + } + let _grafana_operator_score = grafana_helm_chart_score(namespace, namespace_scope) .interpret(inventory, self) .await; Ok(PreparationOutcome::Success { diff --git a/harmony/src/domain/topology/oberservability/monitoring.rs b/harmony/src/domain/topology/oberservability/monitoring.rs index 1489e83..0c57ea4 100644 --- a/harmony/src/domain/topology/oberservability/monitoring.rs +++ b/harmony/src/domain/topology/oberservability/monitoring.rs @@ -30,6 +30,7 @@ impl, T: Topology> Interpret for AlertingInte inventory: &Inventory, topology: &T, ) -> Result { + debug!("hit sender configure for AlertingInterpret"); self.sender.configure(inventory, topology).await?; for receiver in self.receivers.iter() { receiver.install(&self.sender).await?; @@ -38,6 +39,7 @@ impl, T: Topology> Interpret for AlertingInte debug!("installing rule: {:#?}", rule); rule.install(&self.sender).await?; } + debug!("hit sender ensure installed for AlertingInterpret"); self.sender.ensure_installed(inventory, topology).await?; Ok(Outcome::success(format!( "successfully installed alert sender {}", diff --git a/harmony/src/modules/application/features/monitoring.rs b/harmony/src/modules/application/features/monitoring.rs index 0fd155d..fd6ae2a 100644 --- a/harmony/src/modules/application/features/monitoring.rs +++ b/harmony/src/modules/application/features/monitoring.rs @@ -2,7 +2,11 @@ use crate::modules::application::{ Application, ApplicationFeature, InstallationError, InstallationOutcome, }; use 
crate::modules::monitoring::application_monitoring::application_monitoring_score::ApplicationMonitoringScore; +use crate::modules::monitoring::grafana::grafana::Grafana; use crate::modules::monitoring::kube_prometheus::crd::crd_alertmanager_config::CRDPrometheus; +use crate::modules::monitoring::kube_prometheus::crd::service_monitor::{ + ServiceMonitor, ServiceMonitorSpec, +}; use crate::topology::MultiTargetTopology; use crate::topology::ingress::Ingress; use crate::{ @@ -22,6 +26,7 @@ use base64::{Engine as _, engine::general_purpose}; use harmony_secret::SecretManager; use harmony_secret_derive::Secret; use harmony_types::net::Url; +use kube::api::ObjectMeta; use log::{debug, info}; use serde::{Deserialize, Serialize}; use std::sync::Arc; @@ -41,6 +46,7 @@ impl< + K8sclient + MultiTargetTopology + PrometheusMonitoring + + Grafana + Ingress + std::fmt::Debug, > ApplicationFeature for Monitoring @@ -57,11 +63,20 @@ impl< .unwrap_or_else(|| self.application.name()); let domain = topology.get_domain("ntfy").await.unwrap(); + let app_service_monitor = ServiceMonitor { + metadata: ObjectMeta { + name: Some(self.application.name()), + namespace: Some(namespace.clone()), + ..Default::default() + }, + spec: ServiceMonitorSpec::default(), + }; + let mut alerting_score = ApplicationMonitoringScore { sender: CRDPrometheus { namespace: namespace.clone(), client: topology.k8s_client().await.unwrap(), - service_monitor: vec![], + service_monitor: vec![app_service_monitor], }, application: self.application.clone(), receivers: self.alert_receiver.clone(), diff --git a/harmony/src/modules/monitoring/application_monitoring/application_monitoring_score.rs b/harmony/src/modules/monitoring/application_monitoring/application_monitoring_score.rs index 2780edd..0f6e0ec 100644 --- a/harmony/src/modules/monitoring/application_monitoring/application_monitoring_score.rs +++ b/harmony/src/modules/monitoring/application_monitoring/application_monitoring_score.rs @@ -1,21 +1,23 @@ use 
std::sync::Arc; -use async_trait::async_trait; +use log::debug; use serde::Serialize; use crate::{ - data::Version, - interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome}, - inventory::Inventory, + interpret::Interpret, modules::{ application::Application, - monitoring::kube_prometheus::crd::crd_alertmanager_config::CRDPrometheus, + monitoring::{ + grafana::grafana::Grafana, kube_prometheus::crd::crd_alertmanager_config::CRDPrometheus, + }, prometheus::prometheus::PrometheusMonitoring, }, score::Score, - topology::{PreparationOutcome, Topology, oberservability::monitoring::AlertReceiver}, + topology::{ + K8sclient, Topology, + oberservability::monitoring::{AlertReceiver, AlertingInterpret}, + }, }; -use harmony_types::id::Id; #[derive(Debug, Clone, Serialize)] pub struct ApplicationMonitoringScore { @@ -24,10 +26,15 @@ pub struct ApplicationMonitoringScore { pub receivers: Vec>>, } -impl> Score for ApplicationMonitoringScore { +impl + K8sclient + Grafana> Score + for ApplicationMonitoringScore +{ fn create_interpret(&self) -> Box> { - Box::new(ApplicationMonitoringInterpret { - score: self.clone(), + debug!("creating alerting interpret"); + Box::new(AlertingInterpret { + sender: self.sender.clone(), + receivers: self.receivers.clone(), + rules: vec![], }) } @@ -38,55 +45,3 @@ impl> Score for Application ) } } - -#[derive(Debug)] -pub struct ApplicationMonitoringInterpret { - score: ApplicationMonitoringScore, -} - -#[async_trait] -impl> Interpret - for ApplicationMonitoringInterpret -{ - async fn execute( - &self, - inventory: &Inventory, - topology: &T, - ) -> Result { - let result = topology - .install_prometheus( - &self.score.sender, - inventory, - Some(self.score.receivers.clone()), - ) - .await; - - match result { - Ok(outcome) => match outcome { - PreparationOutcome::Success { details: _ } => { - Ok(Outcome::success("Prometheus installed".into())) - } - PreparationOutcome::Noop => { - Ok(Outcome::noop("Prometheus installation 
skipped".into())) - } - }, - Err(err) => Err(InterpretError::from(err)), - } - } - - fn get_name(&self) -> InterpretName { - InterpretName::ApplicationMonitoring - } - - fn get_version(&self) -> Version { - todo!() - } - - fn get_status(&self) -> InterpretStatus { - todo!() - } - - fn get_children(&self) -> Vec { - todo!() - } -} diff --git a/harmony/src/modules/monitoring/grafana/grafana.rs b/harmony/src/modules/monitoring/grafana/grafana.rs index 411d7a6..5ab57c2 100644 --- a/harmony/src/modules/monitoring/grafana/grafana.rs +++ b/harmony/src/modules/monitoring/grafana/grafana.rs @@ -1,4 +1,5 @@ use async_trait::async_trait; +use k8s_openapi::Resource; use crate::{ inventory::Inventory, @@ -7,9 +8,10 @@ use crate::{ #[async_trait] pub trait Grafana { - async fn ensure_grafana_operator_ready( + async fn ensure_grafana_operator( &self, inventory: &Inventory, ) -> Result; + async fn install_grafana(&self) -> Result; } diff --git a/harmony/src/modules/monitoring/grafana/helm/helm_grafana.rs b/harmony/src/modules/monitoring/grafana/helm/helm_grafana.rs index 094beca..2965ada 100644 --- a/harmony/src/modules/monitoring/grafana/helm/helm_grafana.rs +++ b/harmony/src/modules/monitoring/grafana/helm/helm_grafana.rs @@ -1,24 +1,27 @@ use non_blank_string_rs::NonBlankString; use std::{collections::HashMap, str::FromStr}; -use crate::modules::helm::chart::HelmChartScore; +use crate::modules::helm::chart::{HelmChartScore, HelmRepository}; -pub fn grafana_helm_chart_score(ns: &str, scope: bool) -> HelmChartScore { +pub fn grafana_helm_chart_score(ns: &str, namespace_scope: bool) -> HelmChartScore { let mut values_overrides = HashMap::new(); values_overrides.insert( NonBlankString::from_str("namespaceScope").unwrap(), - scope.to_string(), + namespace_scope.to_string(), ); HelmChartScore { namespace: Some(NonBlankString::from_str(ns).unwrap()), - release_name: NonBlankString::from_str("grafana").unwrap(), - chart_name: 
NonBlankString::from_str("oci://ghcr.io/grafana/helm-charts/grafana-operator") - .unwrap(), + release_name: NonBlankString::from_str("grafana-operator").unwrap(), + chart_name: NonBlankString::from_str("grafana/grafana-operator").unwrap(), chart_version: None, values_overrides: Some(values_overrides), values_yaml: None, create_namespace: true, install_only: true, - repository: None, + repository: Some(HelmRepository::new( + "grafana".to_string(), + url::Url::parse("https://grafana.github.io/helm-charts").unwrap(), + true, + )), } } diff --git a/harmony/src/modules/monitoring/kube_prometheus/crd/crd_alertmanager_config.rs b/harmony/src/modules/monitoring/kube_prometheus/crd/crd_alertmanager_config.rs index 0ac8fc7..ceeca41 100644 --- a/harmony/src/modules/monitoring/kube_prometheus/crd/crd_alertmanager_config.rs +++ b/harmony/src/modules/monitoring/kube_prometheus/crd/crd_alertmanager_config.rs @@ -68,7 +68,7 @@ impl + Grafana> In for CRDPrometheus { async fn configure(&self, inventory: &Inventory, topology: &T) -> Result<(), InterpretError> { - topology.ensure_grafana_operator_ready(inventory).await?; + topology.ensure_grafana_operator(inventory).await?; topology.ensure_prometheus_operator(self, inventory).await?; Ok(()) } From dd3f07e5b73ab51be1b01543086b22acd6d35795 Mon Sep 17 00:00:00 2001 From: Willem Date: Thu, 9 Oct 2025 15:28:42 -0400 Subject: [PATCH 03/51] doc for removing worker flag from cp on UPI --- docs/doc-remove-worker-flag.md | 58 ++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 docs/doc-remove-worker-flag.md diff --git a/docs/doc-remove-worker-flag.md b/docs/doc-remove-worker-flag.md new file mode 100644 index 0000000..bdb45a7 --- /dev/null +++ b/docs/doc-remove-worker-flag.md @@ -0,0 +1,58 @@ +1. ### **Titre : Retrait du flag *worker* sur les control planes (UPI)** + +1. 
### **Contexte** + Dans certaines installations OpenShift UPI, les nodes de control plane (masters) héritent par erreur du label worker (node-role.kubernetes.io/worker).\ + Cela provoque la planification de workloads non critiques (par ex. routers, Ceph pods, etc.) sur les control planes, ce qui compromet la stabilité et la séparation des rôles. + +1. ### **Symptômes observés** +- Après avoir ajouté des serveurs dans HAProxy, tous les serveurs backend (wk0, wk1, wk2) apparaissent en état DOWN.\ + Le trafic HTTP/HTTPS est redirigé vers les control planes au lieu des workers. +- Les pods router-default sont déployés sur cp1 et cp2 plutôt que sur les workers. +- Sur les masters, la commande suivante montre une écoute sur le port 80 : + + ss -tlnp | grep 80 + + -> processus haproxy en écoute sur 0.0.0.0:80 + + -> même chose pour le port 443 + +- Dans le namespace rook-ceph, certains pods (mon, mgr, operator) ne se planifient pas correctement : ils sont eux aussi déployés sur les control planes (cp) au lieu des worker nodes. + +1. ### **Cause** + En installation UPI, les rôles (master, worker) ne sont pas gérés par le Machine Config Operator (MCO).\ + Les control planes sont planifiables (schedulable) par défaut, ce qui leur attribue les trois rôles : worker, master et control-plane. + +1. ### **Diagnostic** +1. Vérifier les labels du node : + + oc get nodes --show-labels | grep control-plane + +1. Inspecter la configuration du kubelet : + + cat /etc/systemd/system/kubelet.service + + Rechercher la ligne : + + --node-labels=node-role.kubernetes.io/control-plane,node-role.kubernetes.io/master,node-role.kubernetes.io/worker + + → la présence du label worker confirme le problème. + +1. 
Vérifier que ce flag ne provient pas du MCO : + + oc get machineconfig | grep rendered-master + +**Solution:**\ +Pour rendre les **control planes non planifiables** (c’est-à-dire empêcher tout déploiement de workloads dessus), il faut appliquer le patch suivant sur la ressource scheduler du cluster :\ +\``` +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oc patch scheduler cluster --type merge -p '{"spec":{"mastersSchedulable":false}}'\ +\```\ +Cette commande **désactive la planification sur les masters** et **supprime efficacement le rôle worker** de leurs fonctions. + +Une fois le patch appliqué, il faut **déplacer les workloads** encore présents sur les control planes vers les **workers** à l’aide des commandes (remplacer NODE par le nom du control plane concerné) : + +\```\ +oc adm cordon NODE\ +oc adm drain NODE --ignore-daemonsets --delete-emptydir-data\ +\``` + From e5eb7fde9fb938d2b32e1af6a6675857909b232d Mon Sep 17 00:00:00 2001 From: Willem Date: Thu, 9 Oct 2025 15:29:09 -0400 Subject: [PATCH 04/51] doc to clone and transfer a coreos disk --- docs/doc-clone-et-restaure-disque-coreos.md | 117 ++++++++++++++++++++ 1 file changed, 117 insertions(+) create mode 100644 docs/doc-clone-et-restaure-disque-coreos.md diff --git a/docs/doc-clone-et-restaure-disque-coreos.md b/docs/doc-clone-et-restaure-disque-coreos.md new file mode 100644 index 0000000..b1e9108 --- /dev/null +++ b/docs/doc-clone-et-restaure-disque-coreos.md @@ -0,0 +1,117 @@ +1. ### **Procédure de clonage et de restauration d’un disque CoreOS / Fedora OKD** + Ce processus décrit les étapes pour copier un disque système défectueux sur un nouveau disque d’entreprise, en conservant les **GUID**, **labels**, et **UUID** d’origine pour assurer la compatibilité avec le système CoreOS/OKD. + +1. 
### **Étape 1 — Sauvegarde initiale** + Avant toute manipulation, **sauvegardez vos données**.\ + Ensuite, clonez le disque d’origine vers le nouveau : + + sudo dd if=/dev/old of=/dev/new bs=64K status=progress count=1000Mib + +1. ### **Étape 2 — Vérification et modification des partitions** + Afficher la table des partitions du nouveau disque : + + sgdisk -p /dev/new + + Modifier les partitions (si nécessaire) : + + gdisk /dev/new + + Dans gdisk, utiliser : + +- v → vérifier la table +- p → afficher la table +- d → supprimer une partition +- n → recréer la partition (même numéro et type) + - Pour le **secteur de fin**, appuyer sur **Entrée** pour utiliser l’espace maximal. +- w → écrire les changements + +Créer le système de fichiers XFS sur la nouvelle partition (ex. partition 4) : + +sudo mkfs.xfs -f /dev/new4 + +1. ### **Étape 3 — Récupération des identifiants de l’ancien disque** + Obtenir le **GUID de partition** d’origine : + + sgdisk -i /dev/old\_disk + + Lister les labels et les PARTUUIDs : + + sgdisk -p /dev/old\_disk + + blkid /dev/old\_disk\* + +1. ### **Étape 4 — Appliquer les anciens identifiants sur le nouveau disque** + Définir le même **PARTUUID** : + + sgdisk -u : /dev/new + + Définir le même **nom de partition** : + + sgdisk -c :"" /dev/new + + Vérifier : + + lsblk -o NAME,SIZE,PARTUUID,PARTLABEL /dev/old\_disk + + lsblk -o NAME,SIZE,PARTUUID,PARTLABEL /dev/new + +1. ### **Étape 5 — Copier les données** + Monter les partitions avant la copie : + + mkdir -p /mnt/old /mnt/new + + mount /dev/old4 /mnt/old + + mount /dev/new4 /mnt/new + + Copier les données : + + rsync -aAXHv --numeric-ids /mnt/old/ /mnt/new/ + +1. ### **Étape 6 — Restaurer UUID et labels** + Obtenir l’ancien UUID : + + blkid /dev/old4 + + Le définir sur la nouvelle partition : + + sudo xfs\_admin -U /dev/new4 + + Vérifier et copier le **label** : + + sgdisk -i 4 /dev/old\_disk | grep "Partition name" + + sudo xfs\_admin -L /dev/new4 + +1. 
### **Étape 7 — Validation** + Comparer les deux disques : + + sgdisk -p /dev/old\_disk + + sgdisk -p /dev/new + + lsblk -o NAME,SIZE,PARTUUID,PARTLABEL /dev/old\_disk + + lsblk -o NAME,SIZE,PARTUUID,PARTLABEL /dev/new + + blkid /dev/old\_disk\* | grep UUID= + + blkid /dev/new\* | grep UUID= + +1. ### **Étape 8 — Finalisation** + Démonter les partitions : + + umount /mnt/new + + umount /mnt/old + + Éteindre, **échanger les disques**, et vérifier le démarrage : + +1. Éteindre la machine. +1. Retirer le disque défectueux. +1. Définir le nouveau disque comme disque de démarrage principal dans le BIOS. +1. Redémarrer et confirmer que le système démarre correctement. + +**Résultat attendu :**\ +Le nouveau disque est une copie fonctionnelle de l’ancien, avec partitions, labels, et UUID identiques. Aucun réajustement GRUB ni réinstallation n’est nécessaire pour Fedora CoreOS/OKD. + From 85bec66e5878f0eb9b496b37be614ab7d2f904e3 Mon Sep 17 00:00:00 2001 From: Willem Date: Fri, 10 Oct 2025 12:09:26 -0400 Subject: [PATCH 05/51] wip: fixing grafana datasource for openshift which requires creating a token, sa, secret and inserting them into the grafanadatasource --- harmony/src/domain/topology/k8s_anywhere.rs | 140 ++++++++++++++++-- .../kube_prometheus/crd/crd_grafana.rs | 34 ++++- 2 files changed, 159 insertions(+), 15 deletions(-) diff --git a/harmony/src/domain/topology/k8s_anywhere.rs b/harmony/src/domain/topology/k8s_anywhere.rs index 895f7da..efbe33f 100644 --- a/harmony/src/domain/topology/k8s_anywhere.rs +++ b/harmony/src/domain/topology/k8s_anywhere.rs @@ -1,7 +1,15 @@ use std::{collections::BTreeMap, process::Command, sync::Arc}; use async_trait::async_trait; -use kube::api::{GroupVersionKind, ObjectMeta}; +use k8s_openapi::api::{ + authentication::v1::{TokenRequest, TokenRequestSpec}, + core::v1::{Secret, ServiceAccount}, + rbac::v1::{ClusterRoleBinding, RoleRef, Subject}, +}; +use kube::{ + Api, + api::{GroupVersionKind, ObjectMeta, PostParams}, +}; use 
log::{debug, info, warn}; use serde::Serialize; use tokio::sync::OnceCell; @@ -19,12 +27,14 @@ use crate::{ crd_alertmanager_config::CRDPrometheus, crd_grafana::{ Grafana as GrafanaCRD, GrafanaDashboard, GrafanaDashboardSpec, - GrafanaDatasource, GrafanaDatasourceConfig, GrafanaDatasourceSpec, GrafanaSpec, + GrafanaDatasource, GrafanaDatasourceConfig, GrafanaDatasourceJsonData, + GrafanaDatasourceSecureJsonData, GrafanaDatasourceSpec, GrafanaSpec, }, crd_prometheuses::LabelSelector, grafana_default_dashboard::build_default_dashboard, prometheus_operator::prometheus_operator_helm_chart_score, rhob_alertmanager_config::RHOBObservability, + role::build_prom_service_account, service_monitor::ServiceMonitor, }, }, @@ -142,8 +152,26 @@ impl Grafana for K8sAnywhereTopology { }; let client = self.k8s_client().await?; + let url = format!("{}:9091", self.get_domain("thanos-querier").await.unwrap()); + + let sa = self.build_service_account(); + //TODO finish this section + //needs apply Api or something + client.apply(&sa, Some(ns)).await?; - let datasource = self.build_grafana_datasource(ns, &label_selector); + let token_request =self.get_token_request(); + //this wont work needs a new function for apply secret + client.apply(&token_request, Some(ns)).await?; + + let clusterrolebinding = self.build_cluster_rolebinding(); + + client.apply(&clusterrolebinding, Some(ns)).await?; + + let secret = self.build_token_secret(); + + client.apply(&secret, Some(ns)).await?; + + let datasource = self.build_grafana_datasource(ns, &label_selector, &url); client.apply(&datasource, Some(ns)).await?; @@ -334,35 +362,121 @@ impl K8sAnywhereTopology { .clone() } + pub fn build_service_account(&self, name: &str, namespace: &str) -> ServiceAccount { + build_prom_service_account(name.to_string(), namespace.to_string()) + } + + pub fn build_cluster_rolebinding( + &self, + ns: &str, + account_name: &str, + role: &str, + ) -> ClusterRoleBinding { + ClusterRoleBinding { + metadata: ObjectMeta { + 
name: Some(format!("{}-view-binding", account_name)), + ..Default::default() + }, + role_ref: RoleRef { + api_group: "rbac.authorization.k8s.io".into(), + kind: "ClusterRole".into(), + name: role.into(), + }, + subjects: Some(vec![Subject { + kind: "ServiceAccount".into(), + name: account_name.into(), + namespace: Some(ns.into()), + ..Default::default() + }]), + } + } + + pub fn get_token_request(&self) -> TokenRequest { + TokenRequest { + spec: TokenRequestSpec { + audiences: vec!["https://kubernetes.default.svc".to_string()], + expiration_seconds: Some(3600), + ..Default::default() + }, + ..Default::default() + } + } + + pub fn build_token_secret(&self, token: &str, ns: &str) -> Secret { + Secret { + metadata: ObjectMeta { + name: Some("grafana-credentials".into()), + namespace: Some(ns.into()), + ..Default::default() + }, + string_data: Some(std::collections::BTreeMap::from([( + "PROMETHEUS_TOKEN".into(), + format!("Bearer {}", token), + )])), + ..Default::default() + } + } + fn build_grafana_datasource( &self, ns: &str, label_selector: &LabelSelector, + url: &str, ) -> GrafanaDatasource { let mut json_data = BTreeMap::new(); json_data.insert("timeInterval".to_string(), "5s".to_string()); + // + // let graf_data_source = GrafanaDatasource { + // metadata: ObjectMeta { + // name: Some(format!("grafana-datasource-{}", ns)), + // namespace: Some(ns.to_string()), + // ..Default::default() + // }, + // spec: GrafanaDatasourceSpec { + // instance_selector: label_selector.clone(), + // allow_cross_namespace_import: Some(false), + // datasource: GrafanaDatasourceConfig { + // access: "proxy".to_string(), + // database: Some("prometheus".to_string()), + // json_data: Some(json_data), + // //this is fragile + // name: format!("prometheus-{}-0", ns), + // r#type: "prometheus".to_string(), + // url: url.to_string(), + // //url: format!("http://prometheus-operated.{}.svc.cluster.local:9090", ns), + // }, + // }, + // }; + // graf_data_source - let graf_data_source = 
GrafanaDatasource { + GrafanaDatasource { metadata: ObjectMeta { - name: Some(format!("grafana-datasource-{}", ns)), + name: Some("thanos-prometheus".to_string()), namespace: Some(ns.to_string()), ..Default::default() }, spec: GrafanaDatasourceSpec { instance_selector: label_selector.clone(), - allow_cross_namespace_import: Some(false), + allow_cross_namespace_import: Some(true), datasource: GrafanaDatasourceConfig { access: "proxy".to_string(), - database: Some("prometheus".to_string()), - json_data: Some(json_data), - //this is fragile - name: format!("prometheus-{}-0", ns), + name: "OpenShift-Thanos".to_string(), r#type: "prometheus".to_string(), - url: format!("http://prometheus-operated.{}.svc.cluster.local:9090", ns), + url: url.to_string(), + database: None, + json_data: Some(GrafanaDatasourceJsonData { + time_interval: Some("60s".to_string()), + http_header_name1: Some("Authorization".to_string()), + }), + secure_json_data: Some(GrafanaDatasourceSecureJsonData { + http_header_value1: Some("Bearer eyJhbGc...".to_string()), + }), + is_default: Some(false), + editable: Some(true), + version: Some(1), }, }, - }; - graf_data_source + } } fn build_grafana_dashboard( diff --git a/harmony/src/modules/monitoring/kube_prometheus/crd/crd_grafana.rs b/harmony/src/modules/monitoring/kube_prometheus/crd/crd_grafana.rs index 793f639..4134670 100644 --- a/harmony/src/modules/monitoring/kube_prometheus/crd/crd_grafana.rs +++ b/harmony/src/modules/monitoring/kube_prometheus/crd/crd_grafana.rs @@ -133,13 +133,43 @@ pub struct GrafanaDatasourceSpec { pub struct GrafanaDatasourceConfig { pub access: String, pub database: Option, - #[serde(default, skip_serializing_if = "Option::is_none")] - pub json_data: Option>, pub name: String, pub r#type: String, pub url: String, + /// Represents jsonData in the GrafanaDatasource spec + #[serde(default, skip_serializing_if = "Option::is_none")] + pub json_data: Option, + + /// Represents secureJsonData (secrets) + #[serde(default, 
skip_serializing_if = "Option::is_none")] + pub secure_json_data: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub is_default: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub editable: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub version: Option, } +#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct GrafanaDatasourceJsonData { + #[serde(default, skip_serializing_if = "Option::is_none")] + pub time_interval: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub http_header_name1: Option, +} + +#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct GrafanaDatasourceSecureJsonData { + #[serde(default, skip_serializing_if = "Option::is_none")] + pub http_header_value1: Option, +} // ------------------------------------------------------------------------------------------------ #[derive(Serialize, Deserialize, Debug, Clone, JsonSchema, Default)] From 06a0c44c3cfa982f0ce585ad815caec4270208f7 Mon Sep 17 00:00:00 2001 From: Willem Date: Tue, 14 Oct 2025 15:53:42 -0400 Subject: [PATCH 06/51] wip: connected the thanos-datasource to grafana, need to complete connecting the openshift-userworkload-monitoring as well --- harmony/src/domain/topology/k8s.rs | 17 +- harmony/src/domain/topology/k8s_anywhere.rs | 194 ++++++++++++------ .../kube_prometheus/crd/crd_grafana.rs | 12 +- .../k8s_prometheus_alerting_score.rs | 16 +- 4 files changed, 165 insertions(+), 74 deletions(-) diff --git a/harmony/src/domain/topology/k8s.rs b/harmony/src/domain/topology/k8s.rs index 144533c..f1a783f 100644 --- a/harmony/src/domain/topology/k8s.rs +++ b/harmony/src/domain/topology/k8s.rs @@ -1,12 +1,20 @@ use derive_new::new; +use http::StatusCode; use k8s_openapi::{ ClusterResourceScope, NamespaceResourceScope, - api::{apps::v1::Deployment, core::v1::Pod}, + api::{ + 
apps::v1::Deployment, + authentication::v1::{TokenRequest, TokenRequestSpec, TokenRequestStatus}, + core::v1::{Pod, ServiceAccount}, + }, apimachinery::pkg::version::Info, }; use kube::{ Client, Config, Discovery, Error, Resource, - api::{Api, AttachParams, DeleteParams, ListParams, Patch, PatchParams, ResourceExt}, + api::{ + Api, AttachParams, DeleteParams, ListParams, ObjectMeta, Patch, PatchParams, PostParams, + ResourceExt, + }, config::{KubeConfigOptions, Kubeconfig}, core::ErrorResponse, runtime::reflector::Lookup, @@ -54,6 +62,11 @@ impl K8sClient { }) } + pub async fn service_account_api(&self, namespace: &str) -> Api { + let api: Api = Api::namespaced(self.client.clone(), namespace); + api + } + pub async fn get_apiserver_version(&self) -> Result { let client: Client = self.client.clone(); let version_info: Info = client.apiserver_version().await?; diff --git a/harmony/src/domain/topology/k8s_anywhere.rs b/harmony/src/domain/topology/k8s_anywhere.rs index efbe33f..cb37ece 100644 --- a/harmony/src/domain/topology/k8s_anywhere.rs +++ b/harmony/src/domain/topology/k8s_anywhere.rs @@ -1,8 +1,12 @@ -use std::{collections::BTreeMap, process::Command, sync::Arc}; +use std::{ + collections::{BTreeMap, HashMap}, + process::Command, + sync::Arc, +}; use async_trait::async_trait; use k8s_openapi::api::{ - authentication::v1::{TokenRequest, TokenRequestSpec}, + authentication::v1::{TokenRequest, TokenRequestSpec, TokenRequestStatus}, core::v1::{Secret, ServiceAccount}, rbac::v1::{ClusterRoleBinding, RoleRef, Subject}, }; @@ -150,39 +154,90 @@ impl Grafana for K8sAnywhereTopology { match_labels: label.clone(), match_expressions: vec![], }; - + debug!("getting client"); let client = self.k8s_client().await?; - let url = format!("{}:9091", self.get_domain("thanos-querier").await.unwrap()); - - let sa = self.build_service_account(); - //TODO finish this section - //needs apply Api or something - client.apply(&sa, Some(ns)).await?; - - let token_request 
=self.get_token_request(); - //this wont work needs a new function for apply secret - client.apply(&token_request, Some(ns)).await?; - - let clusterrolebinding = self.build_cluster_rolebinding(); - - client.apply(&clusterrolebinding, Some(ns)).await?; - - let secret = self.build_token_secret(); - - client.apply(&secret, Some(ns)).await?; - - let datasource = self.build_grafana_datasource(ns, &label_selector, &url); - - client.apply(&datasource, Some(ns)).await?; - - let dashboard = self.build_grafana_dashboard(ns, &label_selector); - - client.apply(&dashboard, Some(ns)).await?; + info!("creating grafanas crd"); let grafana = self.build_grafana(ns, &label); client.apply(&grafana, Some(ns)).await?; + client + .wait_until_deployment_ready( + "grafana-grafana-deployment".to_string(), + Some("grafana"), + Some(15), + ) + .await?; + + let sa_name = "grafana-grafana-sa"; + + debug!("creating token for sevice account {sa_name}"); + let token = self.create_service_account_token(sa_name, ns).await?; + + debug!("creating secret"); + let secret_name = "grafana-sa-secret"; + let secret = self.build_token_secret(secret_name, &token.token, ns).await; + + client.apply(&secret, Some(ns)).await?; + + debug!("creating grafana clusterrole binding"); + let clusterrolebinding = + self.build_cluster_rolebinding(sa_name, "cluster-monitoring-view", ns); + + client.apply(&clusterrolebinding, Some(ns)).await?; + + debug!("creating grafana datasource crd"); + + let token_str = format!("Bearer {}", token.token); + + let thanos_url = format!( + "https://{}", + self.get_domain("thanos-querier-openshift-monitoring") + .await + .unwrap() + ); + + let thanos_openshift_datasource = self.build_grafana_datasource( + "thanos-openshift-monitoring", + ns, + &label_selector, + &thanos_url, + token_str.clone(), + ); + + client.apply(&thanos_openshift_datasource, Some(ns)).await?; + + //TODO user workload datasource returns 503 -> need to figure out how to correctly add the + //userworkload thanos-ruler or 
prometheus-federate to the grafana datasource + //it may alrady be included in the overall monitoring stack + + let user_thanos_url = format!( + "https://{}", + self.get_domain( + "thanos-ruler-openshift-user-workload-monitoring.apps.ncd0.harmony.mcd" + ) + .await + .unwrap() + ); + + let thanos_openshift_userworkload_datasource = self.build_grafana_datasource( + "thanos-openshift-userworkload-monitoring", + ns, + &label_selector, + &user_thanos_url, + token_str.clone(), + ); + + client + .apply(&thanos_openshift_userworkload_datasource, Some(ns)) + .await?; + + debug!("creating grafana dashboard crd"); + let dashboard = self.build_grafana_dashboard(ns, &label_selector); + + client.apply(&dashboard, Some(ns)).await?; + debug!("creating grafana ingress"); let grafana_ingress = self.build_grafana_ingress(ns).await; grafana_ingress @@ -368,31 +423,36 @@ impl K8sAnywhereTopology { pub fn build_cluster_rolebinding( &self, + service_account_name: &str, + clusterrole_name: &str, ns: &str, - account_name: &str, - role: &str, ) -> ClusterRoleBinding { ClusterRoleBinding { metadata: ObjectMeta { - name: Some(format!("{}-view-binding", account_name)), + name: Some(format!("{}-view-binding", service_account_name)), ..Default::default() }, role_ref: RoleRef { api_group: "rbac.authorization.k8s.io".into(), kind: "ClusterRole".into(), - name: role.into(), + name: clusterrole_name.into(), }, subjects: Some(vec![Subject { kind: "ServiceAccount".into(), - name: account_name.into(), + name: service_account_name.into(), namespace: Some(ns.into()), ..Default::default() }]), } } - pub fn get_token_request(&self) -> TokenRequest { + pub fn get_token_request(&self, ns: &str) -> TokenRequest { + debug!("building token request"); TokenRequest { + metadata: ObjectMeta { + namespace: Some(ns.to_string()), + ..Default::default() + }, spec: TokenRequestSpec { audiences: vec!["https://kubernetes.default.svc".to_string()], expiration_seconds: Some(3600), @@ -402,15 +462,39 @@ impl 
K8sAnywhereTopology { } } - pub fn build_token_secret(&self, token: &str, ns: &str) -> Secret { + pub async fn create_service_account_token( + &self, + service_account_name: &str, + ns: &str, + ) -> Result { + debug!("creating service account token"); + let token_request = self.get_token_request(ns); + let client = self.k8s_client().await?; + let pp = PostParams::default(); + let token_requests_api = client.service_account_api(ns).await; + + let data = serde_json::to_vec(&token_request).unwrap(); + + let created_token_request = token_requests_api + .create_subresource::("token", service_account_name, &pp, data) + .await?; + + let status = created_token_request + .status + .ok_or_else(|| PreparationError::new("missing token request status".to_string()))?; + + Ok(status) + } + + pub async fn build_token_secret(&self, secret_name: &str, token: &str, ns: &str) -> Secret { Secret { metadata: ObjectMeta { - name: Some("grafana-credentials".into()), + name: Some(secret_name.into()), namespace: Some(ns.into()), ..Default::default() }, string_data: Some(std::collections::BTreeMap::from([( - "PROMETHEUS_TOKEN".into(), + secret_name.into(), format!("Bearer {}", token), )])), ..Default::default() @@ -419,39 +503,18 @@ impl K8sAnywhereTopology { fn build_grafana_datasource( &self, + name: &str, ns: &str, label_selector: &LabelSelector, url: &str, + token: String, ) -> GrafanaDatasource { let mut json_data = BTreeMap::new(); json_data.insert("timeInterval".to_string(), "5s".to_string()); - // - // let graf_data_source = GrafanaDatasource { - // metadata: ObjectMeta { - // name: Some(format!("grafana-datasource-{}", ns)), - // namespace: Some(ns.to_string()), - // ..Default::default() - // }, - // spec: GrafanaDatasourceSpec { - // instance_selector: label_selector.clone(), - // allow_cross_namespace_import: Some(false), - // datasource: GrafanaDatasourceConfig { - // access: "proxy".to_string(), - // database: Some("prometheus".to_string()), - // json_data: Some(json_data), - // 
//this is fragile - // name: format!("prometheus-{}-0", ns), - // r#type: "prometheus".to_string(), - // url: url.to_string(), - // //url: format!("http://prometheus-operated.{}.svc.cluster.local:9090", ns), - // }, - // }, - // }; - // graf_data_source GrafanaDatasource { metadata: ObjectMeta { - name: Some("thanos-prometheus".to_string()), + name: Some(name.to_string()), namespace: Some(ns.to_string()), ..Default::default() }, @@ -460,20 +523,21 @@ impl K8sAnywhereTopology { allow_cross_namespace_import: Some(true), datasource: GrafanaDatasourceConfig { access: "proxy".to_string(), - name: "OpenShift-Thanos".to_string(), + name: name.to_string(), r#type: "prometheus".to_string(), url: url.to_string(), database: None, json_data: Some(GrafanaDatasourceJsonData { time_interval: Some("60s".to_string()), http_header_name1: Some("Authorization".to_string()), + tls_skip_verify: Some(true), + oauth_pass_thru: Some(true), }), secure_json_data: Some(GrafanaDatasourceSecureJsonData { - http_header_value1: Some("Bearer eyJhbGc...".to_string()), + http_header_value1: Some(token), }), is_default: Some(false), editable: Some(true), - version: Some(1), }, }, } diff --git a/harmony/src/modules/monitoring/kube_prometheus/crd/crd_grafana.rs b/harmony/src/modules/monitoring/kube_prometheus/crd/crd_grafana.rs index 4134670..e58f4ca 100644 --- a/harmony/src/modules/monitoring/kube_prometheus/crd/crd_grafana.rs +++ b/harmony/src/modules/monitoring/kube_prometheus/crd/crd_grafana.rs @@ -132,6 +132,7 @@ pub struct GrafanaDatasourceSpec { #[serde(rename_all = "camelCase")] pub struct GrafanaDatasourceConfig { pub access: String, + #[serde(default, skip_serializing_if = "Option::is_none")] pub database: Option, pub name: String, pub r#type: String, @@ -149,9 +150,6 @@ pub struct GrafanaDatasourceConfig { #[serde(default, skip_serializing_if = "Option::is_none")] pub editable: Option, - - #[serde(default, skip_serializing_if = "Option::is_none")] - pub version: Option, } #[derive(Serialize, 
Deserialize, Debug, Clone, JsonSchema)] @@ -162,6 +160,14 @@ pub struct GrafanaDatasourceJsonData { #[serde(default, skip_serializing_if = "Option::is_none")] pub http_header_name1: Option, + + /// Disable TLS skip verification (false = verify) + #[serde(default, skip_serializing_if = "Option::is_none")] + pub tls_skip_verify: Option, + + /// Auth type - set to "forward" for OpenShift OAuth identity + #[serde(default, skip_serializing_if = "Option::is_none")] + pub oauth_pass_thru: Option, } #[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)] diff --git a/harmony/src/modules/prometheus/k8s_prometheus_alerting_score.rs b/harmony/src/modules/prometheus/k8s_prometheus_alerting_score.rs index 2cb4ffb..f9e8531 100644 --- a/harmony/src/modules/prometheus/k8s_prometheus_alerting_score.rs +++ b/harmony/src/modules/prometheus/k8s_prometheus_alerting_score.rs @@ -12,7 +12,7 @@ use crate::modules::monitoring::kube_prometheus::crd::crd_alertmanager_config::C use crate::modules::monitoring::kube_prometheus::crd::crd_default_rules::build_default_application_rules; use crate::modules::monitoring::kube_prometheus::crd::crd_grafana::{ Grafana, GrafanaDashboard, GrafanaDashboardSpec, GrafanaDatasource, GrafanaDatasourceConfig, - GrafanaDatasourceSpec, GrafanaSpec, + GrafanaDatasourceJsonData, GrafanaDatasourceSpec, GrafanaSpec, }; use crate::modules::monitoring::kube_prometheus::crd::crd_prometheus_rules::{ PrometheusRule, PrometheusRuleSpec, RuleGroup, @@ -466,10 +466,15 @@ impl K8sPrometheusCRDAlertingInterpret { match_labels: label.clone(), match_expressions: vec![], }; - let mut json_data = BTreeMap::new(); - json_data.insert("timeInterval".to_string(), "5s".to_string()); + // let mut json_data = BTreeMap::new(); + // json_data.insert("timeInterval".to_string(), "5s".to_string()); let namespace = self.sender.namespace.clone(); - + let json_data = GrafanaDatasourceJsonData { + time_interval: Some("5s".to_string()), + http_header_name1: None, + tls_skip_verify: Some(true), 
+ oauth_pass_thru: Some(true), + }; let json = build_default_dashboard(&namespace); let graf_data_source = GrafanaDatasource { @@ -495,6 +500,9 @@ impl K8sPrometheusCRDAlertingInterpret { "http://prometheus-operated.{}.svc.cluster.local:9090", self.sender.namespace.clone() ), + secure_json_data: None, + is_default: None, + editable: None, }, }, }; From 7dff70edcf459751b9656184bccc157ebc88ce2a Mon Sep 17 00:00:00 2001 From: Willem Date: Wed, 15 Oct 2025 15:26:36 -0400 Subject: [PATCH 07/51] wip: fixed token expiration and configured grafana dashboard --- harmony/src/domain/topology/k8s_anywhere.rs | 108 ++++++++++-------- .../kube_prometheus/crd/crd_grafana.rs | 51 ++++++++- .../k8s_prometheus_alerting_score.rs | 8 +- 3 files changed, 116 insertions(+), 51 deletions(-) diff --git a/harmony/src/domain/topology/k8s_anywhere.rs b/harmony/src/domain/topology/k8s_anywhere.rs index cb37ece..cb4ab2d 100644 --- a/harmony/src/domain/topology/k8s_anywhere.rs +++ b/harmony/src/domain/topology/k8s_anywhere.rs @@ -6,7 +6,9 @@ use std::{ use async_trait::async_trait; use k8s_openapi::api::{ - authentication::v1::{TokenRequest, TokenRequestSpec, TokenRequestStatus}, + authentication::v1::{ + BoundObjectReference, TokenRequest, TokenRequestSpec, TokenRequestStatus, + }, core::v1::{Secret, ServiceAccount}, rbac::v1::{ClusterRoleBinding, RoleRef, Subject}, }; @@ -30,9 +32,11 @@ use crate::{ kube_prometheus::crd::{ crd_alertmanager_config::CRDPrometheus, crd_grafana::{ - Grafana as GrafanaCRD, GrafanaDashboard, GrafanaDashboardSpec, - GrafanaDatasource, GrafanaDatasourceConfig, GrafanaDatasourceJsonData, - GrafanaDatasourceSecureJsonData, GrafanaDatasourceSpec, GrafanaSpec, + Grafana as GrafanaCRD, GrafanaCom, GrafanaDashboard, + GrafanaDashboardDatasource, GrafanaDashboardSpec, GrafanaDatasource, + GrafanaDatasourceConfig, GrafanaDatasourceJsonData, + GrafanaDatasourceSecureJsonData, GrafanaDatasourceSpec, GrafanaSecretKeyRef, + GrafanaSpec, GrafanaValueFrom, GrafanaValueSource, }, 
crd_prometheuses::LabelSelector, grafana_default_dashboard::build_default_dashboard, @@ -166,22 +170,24 @@ impl Grafana for K8sAnywhereTopology { .wait_until_deployment_ready( "grafana-grafana-deployment".to_string(), Some("grafana"), - Some(15), + Some(30), ) .await?; let sa_name = "grafana-grafana-sa"; - debug!("creating token for sevice account {sa_name}"); - let token = self.create_service_account_token(sa_name, ns).await?; + let token_secret_name = "grafana-sa-token-secret"; - debug!("creating secret"); - let secret_name = "grafana-sa-secret"; - let secret = self.build_token_secret(secret_name, &token.token, ns).await; + // let sa_token_secret = self.build_sa_token_secret(token_secret_name, sa_name, ns); + // + // client.apply(&sa_token_secret, Some(ns)).await?; + let secret = self.build_token_secret(token_secret_name, ns).await; client.apply(&secret, Some(ns)).await?; + let token_request_status = self.create_service_account_token(sa_name, ns).await?; debug!("creating grafana clusterrole binding"); + let clusterrolebinding = self.build_cluster_rolebinding(sa_name, "cluster-monitoring-view", ns); @@ -189,7 +195,7 @@ impl Grafana for K8sAnywhereTopology { debug!("creating grafana datasource crd"); - let token_str = format!("Bearer {}", token.token); + // let token_str = format!("Bearer {}", token.token); let thanos_url = format!( "https://{}", @@ -203,36 +209,11 @@ impl Grafana for K8sAnywhereTopology { ns, &label_selector, &thanos_url, - token_str.clone(), + &token_request_status.token, // Pass the secret name here ); client.apply(&thanos_openshift_datasource, Some(ns)).await?; - //TODO user workload datasource returns 503 -> need to figure out how to correctly add the - //userworkload thanos-ruler or prometheus-federate to the grafana datasource - //it may alrady be included in the overall monitoring stack - - let user_thanos_url = format!( - "https://{}", - self.get_domain( - "thanos-ruler-openshift-user-workload-monitoring.apps.ncd0.harmony.mcd" - ) - .await 
- .unwrap() - ); - - let thanos_openshift_userworkload_datasource = self.build_grafana_datasource( - "thanos-openshift-userworkload-monitoring", - ns, - &label_selector, - &user_thanos_url, - token_str.clone(), - ); - - client - .apply(&thanos_openshift_userworkload_datasource, Some(ns)) - .await?; - debug!("creating grafana dashboard crd"); let dashboard = self.build_grafana_dashboard(ns, &label_selector); @@ -446,6 +427,30 @@ impl K8sAnywhereTopology { } } + pub fn build_sa_token_secret( + &self, + secret_name: &str, + service_account_name: &str, + ns: &str, + ) -> Secret { + let mut annotations = BTreeMap::new(); + annotations.insert( + "kubernetes.io/service-account.name".to_string(), + service_account_name.to_string(), + ); + + Secret { + metadata: ObjectMeta { + name: Some(secret_name.into()), + namespace: Some(ns.into()), + annotations: Some(annotations), + ..Default::default() + }, + type_: Some("kubernetes.io/service-account-token".to_string()), + ..Default::default() + } + } + pub fn get_token_request(&self, ns: &str) -> TokenRequest { debug!("building token request"); TokenRequest { @@ -456,7 +461,11 @@ impl K8sAnywhereTopology { spec: TokenRequestSpec { audiences: vec!["https://kubernetes.default.svc".to_string()], expiration_seconds: Some(3600), - ..Default::default() + bound_object_ref: Some(BoundObjectReference { + kind: Some("Secret".to_string()), + name: Some("grafana-sa-token-secret".to_string()), + ..Default::default() + }), }, ..Default::default() } @@ -486,17 +495,14 @@ impl K8sAnywhereTopology { Ok(status) } - pub async fn build_token_secret(&self, secret_name: &str, token: &str, ns: &str) -> Secret { + pub async fn build_token_secret(&self, secret_name: &str, ns: &str) -> Secret { Secret { metadata: ObjectMeta { name: Some(secret_name.into()), namespace: Some(ns.into()), ..Default::default() }, - string_data: Some(std::collections::BTreeMap::from([( - secret_name.into(), - format!("Bearer {}", token), - )])), + string_data: None, 
..Default::default() } } @@ -507,7 +513,7 @@ impl K8sAnywhereTopology { ns: &str, label_selector: &LabelSelector, url: &str, - token: String, + token: &str, // Pass in the secret name ) -> GrafanaDatasource { let mut json_data = BTreeMap::new(); json_data.insert("timeInterval".to_string(), "5s".to_string()); @@ -521,6 +527,7 @@ impl K8sAnywhereTopology { spec: GrafanaDatasourceSpec { instance_selector: label_selector.clone(), allow_cross_namespace_import: Some(true), + values_from: None, datasource: GrafanaDatasourceConfig { access: "proxy".to_string(), name: name.to_string(), @@ -534,7 +541,7 @@ impl K8sAnywhereTopology { oauth_pass_thru: Some(true), }), secure_json_data: Some(GrafanaDatasourceSecureJsonData { - http_header_value1: Some(token), + http_header_value1: Some(format!("Bearer {token}")), }), is_default: Some(false), editable: Some(true), @@ -548,7 +555,6 @@ impl K8sAnywhereTopology { ns: &str, label_selector: &LabelSelector, ) -> GrafanaDashboard { - let json = build_default_dashboard(ns); let graf_dashboard = GrafanaDashboard { metadata: ObjectMeta { name: Some(format!("grafana-dashboard-{}", ns)), @@ -558,7 +564,15 @@ impl K8sAnywhereTopology { spec: GrafanaDashboardSpec { resync_period: Some("30s".to_string()), instance_selector: label_selector.clone(), - json, + datasources: Some(vec![GrafanaDashboardDatasource { + input_name: "DS_PROMETHEUS".to_string(), + datasource_name: "thanos-openshift-monitoring".to_string(), + }]), + json: None, + grafana_com: Some(GrafanaCom { + id: 17406, + revision: None, + }), }, }; graf_dashboard diff --git a/harmony/src/modules/monitoring/kube_prometheus/crd/crd_grafana.rs b/harmony/src/modules/monitoring/kube_prometheus/crd/crd_grafana.rs index e58f4ca..c99adc1 100644 --- a/harmony/src/modules/monitoring/kube_prometheus/crd/crd_grafana.rs +++ b/harmony/src/modules/monitoring/kube_prometheus/crd/crd_grafana.rs @@ -103,9 +103,34 @@ pub struct GrafanaDashboardSpec { #[serde(default, skip_serializing_if = 
"Option::is_none")] pub resync_period: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub datasources: Option>, + pub instance_selector: LabelSelector, - pub json: String, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub json: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub grafana_com: Option, +} + +#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct GrafanaDashboardDatasource { + pub input_name: String, + pub datasource_name: String, +} + +// ------------------------------------------------------------------------------------------------ + +#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct GrafanaCom { + pub id: u32, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub revision: Option, } // ------------------------------------------------------------------------------------------------ @@ -126,6 +151,30 @@ pub struct GrafanaDatasourceSpec { pub allow_cross_namespace_import: Option, pub datasource: GrafanaDatasourceConfig, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub values_from: Option>, +} + + +#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct GrafanaValueFrom { + pub target_path: String, + pub value_from: GrafanaValueSource, +} + +#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct GrafanaValueSource { + pub secret_key_ref: GrafanaSecretKeyRef, +} + +#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct GrafanaSecretKeyRef { + pub name: String, + pub key: String, } #[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)] diff --git a/harmony/src/modules/prometheus/k8s_prometheus_alerting_score.rs b/harmony/src/modules/prometheus/k8s_prometheus_alerting_score.rs index 
f9e8531..7873235 100644 --- a/harmony/src/modules/prometheus/k8s_prometheus_alerting_score.rs +++ b/harmony/src/modules/prometheus/k8s_prometheus_alerting_score.rs @@ -11,8 +11,7 @@ use std::process::Command; use crate::modules::monitoring::kube_prometheus::crd::crd_alertmanager_config::CRDPrometheus; use crate::modules::monitoring::kube_prometheus::crd::crd_default_rules::build_default_application_rules; use crate::modules::monitoring::kube_prometheus::crd::crd_grafana::{ - Grafana, GrafanaDashboard, GrafanaDashboardSpec, GrafanaDatasource, GrafanaDatasourceConfig, - GrafanaDatasourceJsonData, GrafanaDatasourceSpec, GrafanaSpec, + Grafana, GrafanaDashboard, GrafanaDashboardSpec, GrafanaDatasource, GrafanaDatasourceConfig, GrafanaDatasourceJsonData, GrafanaDatasourceSpec, GrafanaSecretKeyRef, GrafanaSpec, GrafanaValueFrom, GrafanaValueSource }; use crate::modules::monitoring::kube_prometheus::crd::crd_prometheus_rules::{ PrometheusRule, PrometheusRuleSpec, RuleGroup, @@ -504,6 +503,7 @@ impl K8sPrometheusCRDAlertingInterpret { is_default: None, editable: None, }, + values_from: None, }, }; @@ -524,7 +524,9 @@ impl K8sPrometheusCRDAlertingInterpret { spec: GrafanaDashboardSpec { resync_period: Some("30s".to_string()), instance_selector: labels.clone(), - json, + json: Some(json), + grafana_com: None, + datasources: None, }, }; From fc384599a1a31d2678cc11c1f9b067b146afae46 Mon Sep 17 00:00:00 2001 From: Willem Date: Thu, 16 Oct 2025 14:07:23 -0400 Subject: [PATCH 08/51] feat: implementation of Installable for CRDPrometheusIntroduction of Grafana trait and its impl for k8sanywhereallows for CRDPrometheus to be installed via AlertingInterpret which standardizes the installation of alert receivers, alerting rules, and alert senders --- harmony/src/domain/topology/k8s_anywhere.rs | 131 ++++++------------ .../kube_prometheus/crd/crd_grafana.rs | 3 +- .../k8s_prometheus_alerting_score.rs | 4 +- 3 files changed, 45 insertions(+), 93 deletions(-) diff --git 
a/harmony/src/domain/topology/k8s_anywhere.rs b/harmony/src/domain/topology/k8s_anywhere.rs index cb4ab2d..e45b65f 100644 --- a/harmony/src/domain/topology/k8s_anywhere.rs +++ b/harmony/src/domain/topology/k8s_anywhere.rs @@ -1,21 +1,12 @@ -use std::{ - collections::{BTreeMap, HashMap}, - process::Command, - sync::Arc, -}; +use std::{collections::BTreeMap, process::Command, sync::Arc}; use async_trait::async_trait; +use base64::{Engine, engine::general_purpose}; use k8s_openapi::api::{ - authentication::v1::{ - BoundObjectReference, TokenRequest, TokenRequestSpec, TokenRequestStatus, - }, - core::v1::{Secret, ServiceAccount}, + core::v1::Secret, rbac::v1::{ClusterRoleBinding, RoleRef, Subject}, }; -use kube::{ - Api, - api::{GroupVersionKind, ObjectMeta, PostParams}, -}; +use kube::api::{DynamicObject, GroupVersionKind, ObjectMeta}; use log::{debug, info, warn}; use serde::Serialize; use tokio::sync::OnceCell; @@ -35,14 +26,11 @@ use crate::{ Grafana as GrafanaCRD, GrafanaCom, GrafanaDashboard, GrafanaDashboardDatasource, GrafanaDashboardSpec, GrafanaDatasource, GrafanaDatasourceConfig, GrafanaDatasourceJsonData, - GrafanaDatasourceSecureJsonData, GrafanaDatasourceSpec, GrafanaSecretKeyRef, - GrafanaSpec, GrafanaValueFrom, GrafanaValueSource, + GrafanaDatasourceSecureJsonData, GrafanaDatasourceSpec, GrafanaSpec, }, crd_prometheuses::LabelSelector, - grafana_default_dashboard::build_default_dashboard, prometheus_operator::prometheus_operator_helm_chart_score, rhob_alertmanager_config::RHOBObservability, - role::build_prom_service_account, service_monitor::ServiceMonitor, }, }, @@ -148,24 +136,23 @@ impl Grafana for K8sAnywhereTopology { }; } async fn install_grafana(&self) -> Result { - debug!("install grafana"); let ns = "grafana"; let mut label = BTreeMap::new(); label.insert("dashboards".to_string(), "grafana".to_string()); + let label_selector = LabelSelector { match_labels: label.clone(), match_expressions: vec![], }; - debug!("getting client"); + let client = 
self.k8s_client().await?; - info!("creating grafanas crd"); let grafana = self.build_grafana(ns, &label); client.apply(&grafana, Some(ns)).await?; - + //TODO change this to a ensure ready or something better than just a timeout client .wait_until_deployment_ready( "grafana-grafana-deployment".to_string(), @@ -175,16 +162,25 @@ impl Grafana for K8sAnywhereTopology { .await?; let sa_name = "grafana-grafana-sa"; - let token_secret_name = "grafana-sa-token-secret"; - // let sa_token_secret = self.build_sa_token_secret(token_secret_name, sa_name, ns); - // - // client.apply(&sa_token_secret, Some(ns)).await?; + let sa_token_secret = self.build_sa_token_secret(token_secret_name, sa_name, ns); - let secret = self.build_token_secret(token_secret_name, ns).await; - client.apply(&secret, Some(ns)).await?; - let token_request_status = self.create_service_account_token(sa_name, ns).await?; + client.apply(&sa_token_secret, Some(ns)).await?; + let secret_gvk = GroupVersionKind { + group: "".to_string(), + version: "v1".to_string(), + kind: "Secret".to_string(), + }; + + let secret = client + .get_resource_json_value(token_secret_name, Some(ns), &secret_gvk) + .await?; + + let token = format!( + "Bearer {}", + self.extract_and_normalize_token(&secret).unwrap() + ); debug!("creating grafana clusterrole binding"); @@ -195,8 +191,6 @@ impl Grafana for K8sAnywhereTopology { debug!("creating grafana datasource crd"); - // let token_str = format!("Bearer {}", token.token); - let thanos_url = format!( "https://{}", self.get_domain("thanos-querier-openshift-monitoring") @@ -209,7 +203,7 @@ impl Grafana for K8sAnywhereTopology { ns, &label_selector, &thanos_url, - &token_request_status.token, // Pass the secret name here + &token, ); client.apply(&thanos_openshift_datasource, Some(ns)).await?; @@ -398,8 +392,21 @@ impl K8sAnywhereTopology { .clone() } - pub fn build_service_account(&self, name: &str, namespace: &str) -> ServiceAccount { - build_prom_service_account(name.to_string(), 
namespace.to_string()) + fn extract_and_normalize_token(&self, secret: &DynamicObject) -> Option { + let token_b64 = secret + .data + .get("token") + .or_else(|| secret.data.get("data").and_then(|d| d.get("token"))) + .and_then(|v| v.as_str())?; + + let bytes = general_purpose::STANDARD.decode(token_b64).ok()?; + + let s = String::from_utf8(bytes).ok()?; + + let cleaned = s + .trim_matches(|c: char| c.is_whitespace() || c == '\0') + .to_string(); + Some(cleaned) } pub fn build_cluster_rolebinding( @@ -451,69 +458,13 @@ impl K8sAnywhereTopology { } } - pub fn get_token_request(&self, ns: &str) -> TokenRequest { - debug!("building token request"); - TokenRequest { - metadata: ObjectMeta { - namespace: Some(ns.to_string()), - ..Default::default() - }, - spec: TokenRequestSpec { - audiences: vec!["https://kubernetes.default.svc".to_string()], - expiration_seconds: Some(3600), - bound_object_ref: Some(BoundObjectReference { - kind: Some("Secret".to_string()), - name: Some("grafana-sa-token-secret".to_string()), - ..Default::default() - }), - }, - ..Default::default() - } - } - - pub async fn create_service_account_token( - &self, - service_account_name: &str, - ns: &str, - ) -> Result { - debug!("creating service account token"); - let token_request = self.get_token_request(ns); - let client = self.k8s_client().await?; - let pp = PostParams::default(); - let token_requests_api = client.service_account_api(ns).await; - - let data = serde_json::to_vec(&token_request).unwrap(); - - let created_token_request = token_requests_api - .create_subresource::("token", service_account_name, &pp, data) - .await?; - - let status = created_token_request - .status - .ok_or_else(|| PreparationError::new("missing token request status".to_string()))?; - - Ok(status) - } - - pub async fn build_token_secret(&self, secret_name: &str, ns: &str) -> Secret { - Secret { - metadata: ObjectMeta { - name: Some(secret_name.into()), - namespace: Some(ns.into()), - ..Default::default() - }, - 
string_data: None, - ..Default::default() - } - } - fn build_grafana_datasource( &self, name: &str, ns: &str, label_selector: &LabelSelector, url: &str, - token: &str, // Pass in the secret name + token: &str, ) -> GrafanaDatasource { let mut json_data = BTreeMap::new(); json_data.insert("timeInterval".to_string(), "5s".to_string()); diff --git a/harmony/src/modules/monitoring/kube_prometheus/crd/crd_grafana.rs b/harmony/src/modules/monitoring/kube_prometheus/crd/crd_grafana.rs index c99adc1..386890e 100644 --- a/harmony/src/modules/monitoring/kube_prometheus/crd/crd_grafana.rs +++ b/harmony/src/modules/monitoring/kube_prometheus/crd/crd_grafana.rs @@ -105,7 +105,7 @@ pub struct GrafanaDashboardSpec { #[serde(default, skip_serializing_if = "Option::is_none")] pub datasources: Option>, - + pub instance_selector: LabelSelector, #[serde(default, skip_serializing_if = "Option::is_none")] @@ -156,7 +156,6 @@ pub struct GrafanaDatasourceSpec { pub values_from: Option>, } - #[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)] #[serde(rename_all = "camelCase")] pub struct GrafanaValueFrom { diff --git a/harmony/src/modules/prometheus/k8s_prometheus_alerting_score.rs b/harmony/src/modules/prometheus/k8s_prometheus_alerting_score.rs index 7873235..d7dca5e 100644 --- a/harmony/src/modules/prometheus/k8s_prometheus_alerting_score.rs +++ b/harmony/src/modules/prometheus/k8s_prometheus_alerting_score.rs @@ -11,7 +11,9 @@ use std::process::Command; use crate::modules::monitoring::kube_prometheus::crd::crd_alertmanager_config::CRDPrometheus; use crate::modules::monitoring::kube_prometheus::crd::crd_default_rules::build_default_application_rules; use crate::modules::monitoring::kube_prometheus::crd::crd_grafana::{ - Grafana, GrafanaDashboard, GrafanaDashboardSpec, GrafanaDatasource, GrafanaDatasourceConfig, GrafanaDatasourceJsonData, GrafanaDatasourceSpec, GrafanaSecretKeyRef, GrafanaSpec, GrafanaValueFrom, GrafanaValueSource + Grafana, GrafanaDashboard, 
GrafanaDashboardSpec, GrafanaDatasource, GrafanaDatasourceConfig, + GrafanaDatasourceJsonData, GrafanaDatasourceSpec, GrafanaSecretKeyRef, GrafanaSpec, + GrafanaValueFrom, GrafanaValueSource, }; use crate::modules::monitoring::kube_prometheus::crd::crd_prometheus_rules::{ PrometheusRule, PrometheusRuleSpec, RuleGroup, From ce91ee01685c48b81e46440e171440c92723577b Mon Sep 17 00:00:00 2001 From: Willem Date: Mon, 20 Oct 2025 15:31:06 -0400 Subject: [PATCH 09/51] fix: removed dead code, mapped error from grafana operator to preparation error rather than ignoring it, modified k8sprometheus score to unwrap_or_default() service monitors --- harmony/src/domain/topology/k8s_anywhere.rs | 24 +++++++------------ .../monitoring/grafana/helm/helm_grafana.rs | 3 ++- .../k8s_prometheus_alerting_score.rs | 2 -- 3 files changed, 10 insertions(+), 19 deletions(-) diff --git a/harmony/src/domain/topology/k8s_anywhere.rs b/harmony/src/domain/topology/k8s_anywhere.rs index e45b65f..cf56333 100644 --- a/harmony/src/domain/topology/k8s_anywhere.rs +++ b/harmony/src/domain/topology/k8s_anywhere.rs @@ -585,21 +585,12 @@ impl K8sAnywhereTopology { receivers: Option>>>, service_monitors: Option>, ) -> K8sPrometheusCRDAlertingScore { - if let Some(sm) = service_monitors { - return K8sPrometheusCRDAlertingScore { - sender, - receivers: receivers.unwrap_or_default(), - service_monitors: sm, - prometheus_rules: vec![], - }; - } else { - return K8sPrometheusCRDAlertingScore { - sender, - receivers: receivers.unwrap_or_default(), - service_monitors: vec![], - prometheus_rules: vec![], - }; - } + return K8sPrometheusCRDAlertingScore { + sender, + receivers: receivers.unwrap_or_default(), + service_monitors: service_monitors.unwrap_or_default(), + prometheus_rules: vec![], + }; } async fn openshift_ingress_operator_available(&self) -> Result<(), PreparationError> { @@ -882,7 +873,8 @@ impl K8sAnywhereTopology { } let _grafana_operator_score = grafana_helm_chart_score(namespace, namespace_scope) 
.interpret(inventory, self) - .await; + .await + .map_err(|e| PreparationError::new(e.to_string())); Ok(PreparationOutcome::Success { details: format!( "Successfully installed grafana operator in ns {}", diff --git a/harmony/src/modules/monitoring/grafana/helm/helm_grafana.rs b/harmony/src/modules/monitoring/grafana/helm/helm_grafana.rs index 2965ada..c9ccacb 100644 --- a/harmony/src/modules/monitoring/grafana/helm/helm_grafana.rs +++ b/harmony/src/modules/monitoring/grafana/helm/helm_grafana.rs @@ -1,3 +1,4 @@ +use harmony_macros::hurl; use non_blank_string_rs::NonBlankString; use std::{collections::HashMap, str::FromStr}; @@ -20,7 +21,7 @@ pub fn grafana_helm_chart_score(ns: &str, namespace_scope: bool) -> HelmChartSco install_only: true, repository: Some(HelmRepository::new( "grafana".to_string(), - url::Url::parse("https://grafana.github.io/helm-charts").unwrap(), + hurl!("https://grafana.github.io/helm-charts"), true, )), } diff --git a/harmony/src/modules/prometheus/k8s_prometheus_alerting_score.rs b/harmony/src/modules/prometheus/k8s_prometheus_alerting_score.rs index d7dca5e..7093ee8 100644 --- a/harmony/src/modules/prometheus/k8s_prometheus_alerting_score.rs +++ b/harmony/src/modules/prometheus/k8s_prometheus_alerting_score.rs @@ -467,8 +467,6 @@ impl K8sPrometheusCRDAlertingInterpret { match_labels: label.clone(), match_expressions: vec![], }; - // let mut json_data = BTreeMap::new(); - // json_data.insert("timeInterval".to_string(), "5s".to_string()); let namespace = self.sender.namespace.clone(); let json_data = GrafanaDatasourceJsonData { time_interval: Some("5s".to_string()), From 8126b233d8eed3fb6260c4d463585e9308de9422 Mon Sep 17 00:00:00 2001 From: Willem Date: Wed, 22 Oct 2025 11:27:28 -0400 Subject: [PATCH 10/51] feat: implementation for opnsense os-node_exporter --- harmony/src/domain/topology/mod.rs | 1 + harmony/src/infra/opnsense/mod.rs | 1 + harmony/src/infra/opnsense/node_exporter.rs | 44 ++++++++++++ harmony/src/modules/opnsense/mod.rs | 1 
+ harmony/src/modules/opnsense/node_exporter.rs | 70 +++++++++++++++++++ opnsense-config-xml/src/data/opnsense.rs | 20 +++++- opnsense-config/src/config/config.rs | 7 +- opnsense-config/src/modules/mod.rs | 1 + opnsense-config/src/modules/node_exporter.rs | 55 +++++++++++++++ 9 files changed, 198 insertions(+), 2 deletions(-) create mode 100644 harmony/src/infra/opnsense/node_exporter.rs create mode 100644 harmony/src/modules/opnsense/node_exporter.rs create mode 100644 opnsense-config/src/modules/node_exporter.rs diff --git a/harmony/src/domain/topology/mod.rs b/harmony/src/domain/topology/mod.rs index 85e57d7..08c1c15 100644 --- a/harmony/src/domain/topology/mod.rs +++ b/harmony/src/domain/topology/mod.rs @@ -1,5 +1,6 @@ mod ha_cluster; pub mod ingress; +pub mod node_exporter; use harmony_types::net::IpAddress; mod host_binding; mod http; diff --git a/harmony/src/infra/opnsense/mod.rs b/harmony/src/infra/opnsense/mod.rs index 3878cfc..102d2b6 100644 --- a/harmony/src/infra/opnsense/mod.rs +++ b/harmony/src/infra/opnsense/mod.rs @@ -4,6 +4,7 @@ mod firewall; mod http; mod load_balancer; mod management; +pub mod node_exporter; mod tftp; use std::sync::Arc; diff --git a/harmony/src/infra/opnsense/node_exporter.rs b/harmony/src/infra/opnsense/node_exporter.rs new file mode 100644 index 0000000..2c27b26 --- /dev/null +++ b/harmony/src/infra/opnsense/node_exporter.rs @@ -0,0 +1,44 @@ +use async_trait::async_trait; +use log::debug; + +use crate::{ + executors::ExecutorError, infra::opnsense::OPNSenseFirewall, + topology::node_exporter::NodeExporter, +}; + +#[async_trait] +impl NodeExporter for OPNSenseFirewall { + async fn ensure_initialized(&self) -> Result<(), ExecutorError> { + let mut config = self.opnsense_config.write().await; + let node_exporter = config.node_exporter(); + if let Some(config) = node_exporter.get_full_config() { + debug!( + "Node exporter available in opnsense config, assuming it is already installed. 
{config:?}" + ); + } else { + config + .install_package("os-node_exporter") + .await + .map_err(|e| { + ExecutorError::UnexpectedError(format!("Executor failed when trying to install os-node_exporter package with error {e:?}" + )) + })?; + } + + config.node_exporter().enable(true); + Ok(()) + } + async fn commit_config(&self) -> Result<(), ExecutorError> { + OPNSenseFirewall::commit_config(self).await + } + + async fn reload_restart(&self) -> Result<(), ExecutorError> { + self.opnsense_config + .write() + .await + .node_exporter() + .reload_restart() + .await + .map_err(|e| ExecutorError::UnexpectedError(e.to_string())) + } +} diff --git a/harmony/src/modules/opnsense/mod.rs b/harmony/src/modules/opnsense/mod.rs index 28b52cf..8988205 100644 --- a/harmony/src/modules/opnsense/mod.rs +++ b/harmony/src/modules/opnsense/mod.rs @@ -1,3 +1,4 @@ +pub mod node_exporter; mod shell; mod upgrade; pub use shell::*; diff --git a/harmony/src/modules/opnsense/node_exporter.rs b/harmony/src/modules/opnsense/node_exporter.rs new file mode 100644 index 0000000..d17f67a --- /dev/null +++ b/harmony/src/modules/opnsense/node_exporter.rs @@ -0,0 +1,70 @@ +use async_trait::async_trait; +use harmony_types::id::Id; +use log::info; +use serde::Serialize; + +use crate::{ + data::Version, + interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome}, + inventory::Inventory, + score::Score, + topology::{Topology, node_exporter::NodeExporter}, +}; + +#[derive(Debug, Clone, Serialize)] +pub struct NodeExporterScore {} + +impl Score for NodeExporterScore { + fn name(&self) -> String { + "NodeExporterScore".to_string() + } + + fn create_interpret(&self) -> Box> { + Box::new(NodeExporterInterpret {}) + } +} + +#[derive(Debug)] +pub struct NodeExporterInterpret {} + +#[async_trait] +impl Interpret for NodeExporterInterpret { + async fn execute( + &self, + _inventory: &Inventory, + node_exporter: &T, + ) -> Result { + info!( + "Making sure node exporter is initiailized: {:?}", + 
node_exporter.ensure_initialized().await? + ); + + info!("Applying Node Exporter configuration"); + + node_exporter.commit_config().await?; + + info!("Reloading and restarting Node Exporter"); + + node_exporter.reload_restart().await?; + + Ok(Outcome::success(format!( + "NodeExporter successfully configured" + ))) + } + + fn get_name(&self) -> InterpretName { + InterpretName::Custom("NodeExporter") + } + + fn get_version(&self) -> Version { + todo!() + } + + fn get_status(&self) -> InterpretStatus { + todo!() + } + + fn get_children(&self) -> Vec { + todo!() + } +} diff --git a/opnsense-config-xml/src/data/opnsense.rs b/opnsense-config-xml/src/data/opnsense.rs index fa5f985..4b384d4 100644 --- a/opnsense-config-xml/src/data/opnsense.rs +++ b/opnsense-config-xml/src/data/opnsense.rs @@ -433,7 +433,7 @@ pub struct OPNsenseXmlSection { #[yaserde(rename = "Interfaces")] pub interfaces: Option, #[yaserde(rename = "NodeExporter")] - pub node_exporter: Option, + pub node_exporter: Option, #[yaserde(rename = "Kea")] pub kea: Option, pub monit: Option, @@ -1595,3 +1595,21 @@ pub struct Ifgroups { #[yaserde(attribute = true)] pub version: String, } + +#[derive(Default, PartialEq, Debug, YaSerialize, YaDeserialize)] +pub struct NodeExporter { + pub enabled: u8, + pub listenaddress: Option, + pub listenport: u16, + pub cpu: u8, + pub exec: u8, + pub filesystem: u8, + pub loadavg: u8, + pub meminfo: u8, + pub netdev: u8, + pub time: u8, + pub devstat: u8, + pub interrupts: u8, + pub ntp: u8, + pub zfs: u8, +} diff --git a/opnsense-config/src/config/config.rs b/opnsense-config/src/config/config.rs index c2d0f60..30240e4 100644 --- a/opnsense-config/src/config/config.rs +++ b/opnsense-config/src/config/config.rs @@ -5,7 +5,8 @@ use crate::{ error::Error, modules::{ caddy::CaddyConfig, dhcp_legacy::DhcpConfigLegacyISC, dns::UnboundDnsConfig, - dnsmasq::DhcpConfigDnsMasq, load_balancer::LoadBalancerConfig, tftp::TftpConfig, + dnsmasq::DhcpConfigDnsMasq, 
load_balancer::LoadBalancerConfig, + node_exporter::NodeExporterConfig, tftp::TftpConfig, }, }; use log::{debug, info, trace, warn}; @@ -71,6 +72,10 @@ impl Config { LoadBalancerConfig::new(&mut self.opnsense, self.shell.clone()) } + pub fn node_exporter(&mut self) -> NodeExporterConfig<'_> { + NodeExporterConfig::new(&mut self.opnsense, self.shell.clone()) + } + pub async fn upload_files(&self, source: &str, destination: &str) -> Result { self.shell.upload_folder(source, destination).await } diff --git a/opnsense-config/src/modules/mod.rs b/opnsense-config/src/modules/mod.rs index 3448075..eec16a2 100644 --- a/opnsense-config/src/modules/mod.rs +++ b/opnsense-config/src/modules/mod.rs @@ -4,4 +4,5 @@ pub mod dhcp_legacy; pub mod dns; pub mod dnsmasq; pub mod load_balancer; +pub mod node_exporter; pub mod tftp; diff --git a/opnsense-config/src/modules/node_exporter.rs b/opnsense-config/src/modules/node_exporter.rs new file mode 100644 index 0000000..9a44876 --- /dev/null +++ b/opnsense-config/src/modules/node_exporter.rs @@ -0,0 +1,55 @@ +use std::sync::Arc; + +use opnsense_config_xml::{NodeExporter, OPNsense}; + +use crate::{config::OPNsenseShell, Error}; + +pub struct NodeExporterConfig<'a> { + opnsense: &'a mut OPNsense, + opnsense_shell: Arc, +} + +impl<'a> NodeExporterConfig<'a> { + pub fn new(opnsense: &'a mut OPNsense, opnsense_shell: Arc) -> Self { + Self { + opnsense, + opnsense_shell, + } + } + + pub fn get_full_config(&self) -> &Option { + &self.opnsense.opnsense.node_exporter + } + fn with_node_exporter(&mut self, f: F) -> R + where + F: FnOnce(&mut NodeExporter) -> R, + { + match &mut self.opnsense.opnsense.node_exporter.as_mut() { + Some(node_exporter) => f(node_exporter), + None => unimplemented!( + " + node exporter is not yet installed" + ), + } + } + + pub fn enable(&mut self, enabled: bool) { + self.with_node_exporter(|node_exporter| node_exporter.enabled = enabled as u8) + } + + pub async fn reload_restart(&self) -> Result<(), Error> { + 
self.opnsense_shell + .exec("configctl node_exporter stop") + .await?; + self.opnsense_shell + .exec("configctl template reload OPNsense/NodeExporter") + .await?; + self.opnsense_shell + .exec("configctl node_exporter configtest") + .await?; + self.opnsense_shell + .exec("configctl node_exporter start") + .await?; + Ok(()) + } +} From 5af13800b7774311fd73efd9682371dcb4373aa1 Mon Sep 17 00:00:00 2001 From: Willem Date: Wed, 22 Oct 2025 11:51:22 -0400 Subject: [PATCH 11/51] fix: removed unimplemented macro and returned Err instead, some formatting error --- opnsense-config/src/modules/node_exporter.rs | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/opnsense-config/src/modules/node_exporter.rs b/opnsense-config/src/modules/node_exporter.rs index 9a44876..fd7ee5c 100644 --- a/opnsense-config/src/modules/node_exporter.rs +++ b/opnsense-config/src/modules/node_exporter.rs @@ -20,21 +20,20 @@ impl<'a> NodeExporterConfig<'a> { pub fn get_full_config(&self) -> &Option { &self.opnsense.opnsense.node_exporter } - fn with_node_exporter(&mut self, f: F) -> R + + fn with_node_exporter(&mut self, f: F) -> Result where F: FnOnce(&mut NodeExporter) -> R, { match &mut self.opnsense.opnsense.node_exporter.as_mut() { - Some(node_exporter) => f(node_exporter), - None => unimplemented!( - " - node exporter is not yet installed" - ), + Some(node_exporter) => Ok(f(node_exporter)), + None => Err("node exporter is not yet installed"), } } - pub fn enable(&mut self, enabled: bool) { + pub fn enable(&mut self, enabled: bool) -> Result<(), &'static str> { self.with_node_exporter(|node_exporter| node_exporter.enabled = enabled as u8) + .map(|_| ()) } pub async fn reload_restart(&self) -> Result<(), Error> { From 5ab58f025330488e60f89d4a40976815059ce24a Mon Sep 17 00:00:00 2001 From: Willem Date: Wed, 22 Oct 2025 14:39:12 -0400 Subject: [PATCH 12/51] fix: added impl node exporter for hacluster topology and dummy infra --- examples/nanodc/src/main.rs | 4 +-
examples/okd_installation/src/topology.rs | 1 + examples/okd_pxe/src/topology.rs | 1 + examples/opnsense/src/main.rs | 1 + examples/opnsense_node_exporter/Cargo.toml | 20 ++++ examples/opnsense_node_exporter/src/main.rs | 110 +++++++++++++++++++ harmony/src/domain/topology/ha_cluster.rs | 36 +++++- harmony/src/domain/topology/node_exporter.rs | 17 +++ 8 files changed, 187 insertions(+), 3 deletions(-) create mode 100644 examples/opnsense_node_exporter/Cargo.toml create mode 100644 examples/opnsense_node_exporter/src/main.rs create mode 100644 harmony/src/domain/topology/node_exporter.rs diff --git a/examples/nanodc/src/main.rs b/examples/nanodc/src/main.rs index 57574d2..95b16a6 100644 --- a/examples/nanodc/src/main.rs +++ b/examples/nanodc/src/main.rs @@ -39,8 +39,7 @@ async fn main() { let gateway_ipv4 = Ipv4Addr::new(192, 168, 33, 1); let gateway_ip = IpAddr::V4(gateway_ipv4); let topology = harmony::topology::HAClusterTopology { - domain_name: "ncd0.harmony.mcd".to_string(), // TODO this must be set manually correctly - // when setting up the opnsense firewall + domain_name: "ncd0.harmony.mcd".to_string(), router: Arc::new(UnmanagedRouter::new( gateway_ip, Ipv4Cidr::new(lan_subnet, 24).unwrap(), @@ -84,6 +83,7 @@ async fn main() { }, ], switch: vec![], + node_exporter: opnsense.clone(), }; let inventory = Inventory { diff --git a/examples/okd_installation/src/topology.rs b/examples/okd_installation/src/topology.rs index 31062f5..4df6ab5 100644 --- a/examples/okd_installation/src/topology.rs +++ b/examples/okd_installation/src/topology.rs @@ -59,6 +59,7 @@ pub async fn get_topology() -> HAClusterTopology { }, workers: vec![], switch: vec![], + node_exporter: opnsense.clone(), } } diff --git a/examples/okd_pxe/src/topology.rs b/examples/okd_pxe/src/topology.rs index 707969a..63e3613 100644 --- a/examples/okd_pxe/src/topology.rs +++ b/examples/okd_pxe/src/topology.rs @@ -53,6 +53,7 @@ pub async fn get_topology() -> HAClusterTopology { }, workers: vec![], switch: 
vec![], + node_exporter: opnsense.clone(), } } diff --git a/examples/opnsense/src/main.rs b/examples/opnsense/src/main.rs index fcfaf09..8f4039d 100644 --- a/examples/opnsense/src/main.rs +++ b/examples/opnsense/src/main.rs @@ -55,6 +55,7 @@ async fn main() { }, workers: vec![], switch: vec![], + node_exporter: opnsense.clone(), }; let inventory = Inventory { diff --git a/examples/opnsense_node_exporter/Cargo.toml b/examples/opnsense_node_exporter/Cargo.toml new file mode 100644 index 0000000..957bdd9 --- /dev/null +++ b/examples/opnsense_node_exporter/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "example-opnsense-node-exporter" +edition = "2024" +version.workspace = true +readme.workspace = true +license.workspace = true + +[dependencies] +harmony = { path = "../../harmony" } +harmony_cli = { path = "../../harmony_cli" } +harmony_types = { path = "../../harmony_types" } +harmony_secret = { path = "../../harmony_secret" } +harmony_secret_derive = { path = "../../harmony_secret_derive" } +cidr = { workspace = true } +tokio = { workspace = true } +harmony_macros = { path = "../../harmony_macros" } +log = { workspace = true } +env_logger = { workspace = true } +url = { workspace = true } +serde.workspace = true diff --git a/examples/opnsense_node_exporter/src/main.rs b/examples/opnsense_node_exporter/src/main.rs new file mode 100644 index 0000000..4f1219d --- /dev/null +++ b/examples/opnsense_node_exporter/src/main.rs @@ -0,0 +1,110 @@ +use std::{ + net::{IpAddr, Ipv4Addr}, + sync::Arc, +}; + +use cidr::Ipv4Cidr; +use harmony::{ + hardware::{HostCategory, Location, PhysicalHost, SwitchGroup}, + infra::opnsense::OPNSenseManagementInterface, + inventory::Inventory, + modules::opnsense::node_exporter::NodeExporterScore, + topology::{HAClusterTopology, LogicalHost, UnmanagedRouter}, +}; +use harmony_macros::{ip, ipv4, mac_address}; + +#[tokio::main] +async fn main() { + let firewall = harmony::topology::LogicalHost { + ip: ip!("192.168.33.1"), + name: 
String::from("fw0"), + }; + + let opnsense = Arc::new( + harmony::infra::opnsense::OPNSenseFirewall::new(firewall, None, "root", "opnsense").await, + ); + let lan_subnet = Ipv4Addr::new(192, 168, 33, 0); + let gateway_ipv4 = Ipv4Addr::new(192, 168, 33, 1); + let gateway_ip = IpAddr::V4(gateway_ipv4); + let topology = harmony::topology::HAClusterTopology { + domain_name: "ncd0.harmony.mcd".to_string(), + router: Arc::new(UnmanagedRouter::new( + gateway_ip, + Ipv4Cidr::new(lan_subnet, 24).unwrap(), + )), + load_balancer: opnsense.clone(), + firewall: opnsense.clone(), + tftp_server: opnsense.clone(), + http_server: opnsense.clone(), + dhcp_server: opnsense.clone(), + dns_server: opnsense.clone(), + control_plane: vec![ + LogicalHost { + ip: ip!("192.168.33.20"), + name: "cp0".to_string(), + }, + LogicalHost { + ip: ip!("192.168.33.21"), + name: "cp1".to_string(), + }, + LogicalHost { + ip: ip!("192.168.33.22"), + name: "cp2".to_string(), + }, + ], + bootstrap_host: LogicalHost { + ip: ip!("192.168.33.66"), + name: "bootstrap".to_string(), + }, + workers: vec![ + LogicalHost { + ip: ip!("192.168.33.30"), + name: "wk0".to_string(), + }, + LogicalHost { + ip: ip!("192.168.33.31"), + name: "wk1".to_string(), + }, + LogicalHost { + ip: ip!("192.168.33.32"), + name: "wk2".to_string(), + }, + ], + switch: vec![], + node_exporter: opnsense.clone(), + }; + + let inventory = Inventory { + location: Location::new("I am mobile".to_string(), "earth".to_string()), + switch: SwitchGroup::from([]), + firewall_mgmt: Box::new(OPNSenseManagementInterface::new()), + storage_host: vec![], + worker_host: vec![ + PhysicalHost::empty(HostCategory::Server) + .mac_address(mac_address!("C4:62:37:02:61:0F")), + PhysicalHost::empty(HostCategory::Server) + .mac_address(mac_address!("C4:62:37:02:61:26")), + PhysicalHost::empty(HostCategory::Server) + .mac_address(mac_address!("C4:62:37:02:61:70")), + ], + control_plane_host: vec![ + PhysicalHost::empty(HostCategory::Server) + 
.mac_address(mac_address!("C4:62:37:02:60:FA")), + PhysicalHost::empty(HostCategory::Server) + .mac_address(mac_address!("C4:62:37:02:61:1A")), + PhysicalHost::empty(HostCategory::Server) + .mac_address(mac_address!("C4:62:37:01:BC:68")), + ], + }; + + let node_exporter_score = NodeExporterScore {}; + + harmony_cli::run( + inventory, + topology, + vec![Box::new(node_exporter_score)], + None, + ) + .await + .unwrap(); +} diff --git a/harmony/src/domain/topology/ha_cluster.rs b/harmony/src/domain/topology/ha_cluster.rs index 7be2725..a3e650d 100644 --- a/harmony/src/domain/topology/ha_cluster.rs +++ b/harmony/src/domain/topology/ha_cluster.rs @@ -11,7 +11,6 @@ use kube::api::ObjectMeta; use log::debug; use log::info; -use crate::data::FileContent; use crate::executors::ExecutorError; use crate::hardware::PhysicalHost; use crate::infra::brocade::BrocadeSwitchAuth; @@ -21,6 +20,7 @@ use crate::modules::okd::crd::{ nmstate::{self, NMState, NodeNetworkConfigurationPolicy, NodeNetworkConfigurationPolicySpec}, }; use crate::topology::PxeOptions; +use crate::{data::FileContent, topology::node_exporter::NodeExporter}; use super::{ DHCPStaticEntry, DhcpServer, DnsRecord, DnsRecordType, DnsServer, Firewall, HostNetworkConfig, @@ -43,6 +43,7 @@ pub struct HAClusterTopology { pub tftp_server: Arc, pub http_server: Arc, pub dns_server: Arc, + pub node_exporter: Arc, pub bootstrap_host: LogicalHost, pub control_plane: Vec, pub workers: Vec, @@ -333,6 +334,7 @@ impl HAClusterTopology { tftp_server: dummy_infra.clone(), http_server: dummy_infra.clone(), dns_server: dummy_infra.clone(), + node_exporter: dummy_infra.clone(), bootstrap_host: dummy_host, control_plane: vec![], workers: vec![], @@ -516,6 +518,23 @@ impl Switch for HAClusterTopology { self.configure_bond(host, &config).await?; self.configure_port_channel(host, &config).await } + + //TODO add snmp here +} + +#[async_trait] +impl NodeExporter for HAClusterTopology { + async fn ensure_initialized(&self) -> Result<(), 
ExecutorError> { + self.node_exporter.ensure_initialized().await + } + + async fn commit_config(&self) -> Result<(), ExecutorError> { + self.node_exporter.commit_config().await + } + + async fn reload_restart(&self) -> Result<(), ExecutorError> { + self.node_exporter.reload_restart().await + } } #[derive(Debug)] @@ -704,3 +723,18 @@ impl DnsServer for DummyInfra { unimplemented!("{}", UNIMPLEMENTED_DUMMY_INFRA) } } + +#[async_trait] +impl NodeExporter for DummyInfra { + async fn ensure_initialized(&self) -> Result<(), ExecutorError> { + unimplemented!("{}", UNIMPLEMENTED_DUMMY_INFRA) + } + + async fn commit_config(&self) -> Result<(), ExecutorError> { + unimplemented!("{}", UNIMPLEMENTED_DUMMY_INFRA) + } + + async fn reload_restart(&self) -> Result<(), ExecutorError> { + unimplemented!("{}", UNIMPLEMENTED_DUMMY_INFRA) + } +} diff --git a/harmony/src/domain/topology/node_exporter.rs b/harmony/src/domain/topology/node_exporter.rs new file mode 100644 index 0000000..88e3cc9 --- /dev/null +++ b/harmony/src/domain/topology/node_exporter.rs @@ -0,0 +1,17 @@ +use async_trait::async_trait; + +use crate::executors::ExecutorError; + +#[async_trait] +pub trait NodeExporter: Send + Sync { + async fn ensure_initialized(&self) -> Result<(), ExecutorError>; + async fn commit_config(&self) -> Result<(), ExecutorError>; + async fn reload_restart(&self) -> Result<(), ExecutorError>; +} + +//TODO complete this impl +impl std::fmt::Debug for dyn NodeExporter { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_fmt(format_args!("NodeExporter ",)) + } +} From 008b03f979561da8652ac32154aea89e9d642852 Mon Sep 17 00:00:00 2001 From: Willem Date: Thu, 23 Oct 2025 14:56:07 -0400 Subject: [PATCH 13/51] fix: changed documentation language to english --- docs/doc-clone-and-restore-coreos.md | 127 ++++++++++++++++++++ docs/doc-clone-et-restaure-disque-coreos.md | 117 ------------------ 2 files changed, 127 insertions(+), 117 deletions(-) create mode 100644 
docs/doc-clone-and-restore-coreos.md delete mode 100644 docs/doc-clone-et-restaure-disque-coreos.md diff --git a/docs/doc-clone-and-restore-coreos.md b/docs/doc-clone-and-restore-coreos.md new file mode 100644 index 0000000..9b392ed --- /dev/null +++ b/docs/doc-clone-and-restore-coreos.md @@ -0,0 +1,127 @@ +## Working procedure to clone and restore CoreOS disk from OKD Cluster + +### **Step 1 - take a backup** +``` +sudo dd if=/dev/old of=/dev/backup status=progress +``` + +### **Step 2 - clone beginning of old disk to new** +``` +sudo dd if=/dev/old of=/dev/new status=progress count=1000MiB +``` + +### **Step 3 - verify and modify disk partitions** +list disk partitions +``` +sgdisk -p /dev/new +``` +if new disk is smaller than old disk and there is space on the xfs partition of the old disk, modify partitions of new disk +``` +gdisk /dev/new +``` +commands inside gdisk +``` +-v -> verify table +-p -> print table +-d -> select the partition to delete +-n -> recreate partition with same partition number as deleted partition +``` +For end sector, either specify the new end or just press Enter for maximum available +When asked about partition type, enter the same type code (it will show the old one) +``` +p -> to verify +w -> to write +``` +make xfs file system for new partition +``` +sudo mkfs.xfs -f /dev/new4 +``` + +### **Step 4 - copy old PARTUUID** + +**careful here** +get old PARTUUID: +``` +sgdisk -i <partition_number> /dev/old_disk # Note the "Partition unique GUID" +``` +get labels +``` +sgdisk -p /dev/old_disk # Shows partition names in the table + +blkid /dev/old_disk* # Shows PARTUUIDs and labels for all partitions +``` +set it on new disk +``` +sgdisk -u <partition_number>:<old_partuuid> /dev/sdc +``` +partition name: +``` +sgdisk -c <partition_number>:"<old_partition_name>" /dev/sdc +``` +verify all: +``` +lsblk -o NAME,SIZE,PARTUUID,PARTLABEL /dev/old_disk +``` + +### **Step 5 - Mount disks and copy files from old to new disk** + +mount the partitions before copying: + +``` +mkdir -p /mnt/new +mkdir -p /mnt/old +mount /dev/old4 /mnt/old
+mount /dev/new4 /mnt/new +``` +copy: +``` +rsync -aAXHv --numeric-ids /source/ /destination/ +``` + +### **Step 6 - Set correct UUID for new partition 4** +to set the correct UUID for partition 4 +``` +blkid /dev/old4 +``` +``` +xfs_admin -U <old_uuid> /dev/new_partition +``` +to set labels +get it +``` +sgdisk -i 4 /dev/sda | grep "Partition name" +``` +set it +``` +sgdisk -c 4:"<partition_name>" /dev/sdc + +or + +(check existing with xfs_admin -l /dev/old_partition) +Use xfs_admin -L