From d3d9bd614f81f953ef89c30e13ca69971abf18f3 Mon Sep 17 00:00:00 2001
From: Willem
Date: Fri, 5 Sep 2025 14:38:08 -0400
Subject: [PATCH] wip: add a monitoring stack that works with OpenShift/OKD.
 OKD needs the Cluster Observability Operator in order to deploy namespaced
 Prometheus and Alertmanager instances

---
 Cargo.lock                                    |  19 +
 .../rhob_application_monitoring/Cargo.toml    |  17 +
 .../rhob_application_monitoring/src/main.rs   |  51 ++
 harmony/src/domain/interpret/mod.rs           |   2 +
 harmony/src/domain/topology/k8s_anywhere.rs   | 107 +++-
 .../src/modules/application/features/mod.rs   |   1 +
 .../application/features/rhob_monitoring.rs   | 109 ++++
 .../alert_channel/discord_alert_channel.rs    |  63 ++
 .../alert_channel/webhook_receiver.rs         |  74 ++-
 .../monitoring/application_monitoring/mod.rs  |   1 +
 .../rhobs_application_monitoring_score.rs     |  94 +++
 .../monitoring/kube_prometheus/crd/mod.rs     |   9 +
 .../crd/rhob_alertmanager_config.rs           |  50 ++
 .../kube_prometheus/crd/rhob_alertmanagers.rs |  52 ++
 .../rhob_cluster_observability_operator.rs    |  22 +
 .../kube_prometheus/crd/rhob_default_rules.rs |  26 +
 .../kube_prometheus/crd/rhob_grafana.rs       | 153 +++++
 .../crd/rhob_monitoring_stack.rs              |  41 ++
 .../crd/rhob_prometheus_rules.rs              |  57 ++
 .../kube_prometheus/crd/rhob_prometheuses.rs  | 118 ++++
 .../kube_prometheus/crd/rhob_role.rs          |  62 ++
 harmony/src/modules/prometheus/mod.rs         |   1 +
 .../modules/prometheus/rhob_alerting_score.rs | 544 ++++++++++++++++++
 23 files changed, 1666 insertions(+), 7 deletions(-)
 create mode 100644 examples/rhob_application_monitoring/Cargo.toml
 create mode 100644 examples/rhob_application_monitoring/src/main.rs
 create mode 100644 harmony/src/modules/application/features/rhob_monitoring.rs
 create mode 100644 harmony/src/modules/monitoring/application_monitoring/rhobs_application_monitoring_score.rs
 create mode 100644 harmony/src/modules/monitoring/kube_prometheus/crd/rhob_alertmanager_config.rs
 create mode 100644 harmony/src/modules/monitoring/kube_prometheus/crd/rhob_alertmanagers.rs
 create mode 100644 harmony/src/modules/monitoring/kube_prometheus/crd/rhob_cluster_observability_operator.rs
 create mode 100644 harmony/src/modules/monitoring/kube_prometheus/crd/rhob_default_rules.rs
 create mode 100644 harmony/src/modules/monitoring/kube_prometheus/crd/rhob_grafana.rs
 create mode 100644 harmony/src/modules/monitoring/kube_prometheus/crd/rhob_monitoring_stack.rs
 create mode 100644 harmony/src/modules/monitoring/kube_prometheus/crd/rhob_prometheus_rules.rs
 create mode 100644 harmony/src/modules/monitoring/kube_prometheus/crd/rhob_prometheuses.rs
 create mode 100644 harmony/src/modules/monitoring/kube_prometheus/crd/rhob_role.rs
 create mode 100644 harmony/src/modules/prometheus/rhob_alerting_score.rs

diff --git a/Cargo.lock b/Cargo.lock
index 62d8aee..46cfa64 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4615,6 +4615,21 @@ dependencies = [
  "subtle",
 ]
 
+[[package]]
+name = "rhob-application-monitoring"
+version = "0.1.0"
+dependencies = [
+ "base64 0.22.1",
+ "env_logger",
+ "harmony",
+ "harmony_cli",
+ "harmony_macros",
+ "harmony_types",
+ "log",
+ "tokio",
+ "url",
+]
+
 [[package]]
 name = "ring"
 version = "0.17.14"
@@ -6273,6 +6288,10 @@ version = "0.2.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b"
 
+[[package]]
+name = "try_rust_webapp"
+version = "0.1.0"
+
 [[package]]
 name = "tui-logger"
 version = "0.14.5"
diff --git a/examples/rhob_application_monitoring/Cargo.toml b/examples/rhob_application_monitoring/Cargo.toml
new file mode 100644
index 0000000..9ee4eee
--- /dev/null
+++ b/examples/rhob_application_monitoring/Cargo.toml
@@ -0,0 +1,17 @@
+[package]
+name = "rhob-application-monitoring"
+edition = "2024"
+version.workspace = true
+readme.workspace = true
+license.workspace = true
+
+[dependencies]
+harmony = { path = "../../harmony" }
+harmony_cli = { path = "../../harmony_cli" }
+harmony_types = { path = "../../harmony_types" }
+harmony_macros = { path = "../../harmony_macros" }
+tokio = { workspace = true }
+log = { workspace = true }
+env_logger = { workspace = true }
+url = { workspace = true }
+base64.workspace = true
diff --git a/examples/rhob_application_monitoring/src/main.rs b/examples/rhob_application_monitoring/src/main.rs
new file mode 100644
index 0000000..ebd675f
--- /dev/null
+++ b/examples/rhob_application_monitoring/src/main.rs
@@ -0,0 +1,51 @@
+use std::{path::PathBuf, sync::Arc};
+
+use harmony::{
+    inventory::Inventory,
+    modules::{
+        application::{
+            ApplicationScore, RustWebFramework, RustWebapp,
+            features::rhob_monitoring::RHOBMonitoring,
+        },
+        monitoring::alert_channel::{
+            discord_alert_channel::DiscordWebhook, webhook_receiver::WebhookReceiver,
+        },
+    },
+    topology::K8sAnywhereTopology,
+};
+use harmony_types::net::Url;
+
+#[tokio::main]
+async fn main() {
+    let application = Arc::new(RustWebapp {
+        name: "test-rhob-monitoring".to_string(),
+        domain: Url::Url(url::Url::parse("https://some-fake-url").unwrap()),
+        project_root: PathBuf::from("./webapp"), // Relative from 'harmony-path' param
+        framework: Some(RustWebFramework::Leptos),
+    });
+
+    let discord_receiver = DiscordWebhook {
+        name: "test-discord".to_string(),
+        url: Url::Url(url::Url::parse("https://discord.doesnt.exist.com").unwrap()),
+    };
+
+    let app = ApplicationScore {
+        features: vec![
+            Box::new(RHOBMonitoring {
+                application: application.clone(),
+                alert_receiver: vec![Box::new(discord_receiver)],
+            }),
+            // TODO add backups, multisite ha, etc
+        ],
+        application,
+    };
+
+    harmony_cli::run(
+        Inventory::autoload(),
+        K8sAnywhereTopology::from_env(),
+        vec![Box::new(app)],
+        None,
+    )
+    .await
+    .unwrap();
+}
diff --git a/harmony/src/domain/interpret/mod.rs b/harmony/src/domain/interpret/mod.rs
index 71d2f61..fac18df 100644
--- a/harmony/src/domain/interpret/mod.rs
+++ b/harmony/src/domain/interpret/mod.rs
@@ -32,6 +32,7 @@ pub enum InterpretName {
     K8sPrometheusCrdAlerting,
     DiscoverInventoryAgent,
     CephClusterHealth,
+    RHOBAlerting,
 }
 
 impl std::fmt::Display for InterpretName {
@@ -60,6 +61,7 @@ impl std::fmt::Display for InterpretName {
             InterpretName::K8sPrometheusCrdAlerting => f.write_str("K8sPrometheusCrdAlerting"),
             InterpretName::DiscoverInventoryAgent => f.write_str("DiscoverInventoryAgent"),
             InterpretName::CephClusterHealth => f.write_str("CephClusterHealth"),
+            InterpretName::RHOBAlerting => f.write_str("RHOBAlerting"),
         }
     }
 }
diff --git a/harmony/src/domain/topology/k8s_anywhere.rs b/harmony/src/domain/topology/k8s_anywhere.rs
index f81bef4..119ad13 100644
--- a/harmony/src/domain/topology/k8s_anywhere.rs
+++ b/harmony/src/domain/topology/k8s_anywhere.rs
@@ -14,10 +14,11 @@ use crate::{
         monitoring::kube_prometheus::crd::{
             crd_alertmanager_config::CRDPrometheus,
             prometheus_operator::prometheus_operator_helm_chart_score,
+            rhob_alertmanager_config::RHOBObservability,
         },
         prometheus::{
             k8s_prometheus_alerting_score::K8sPrometheusCRDAlertingScore,
-            prometheus::PrometheusApplicationMonitoring,
+            prometheus::PrometheusApplicationMonitoring, rhob_alerting_score::RHOBAlertingScore,
         },
     },
     score::Score,
@@ -108,6 +109,43 @@ impl PrometheusApplicationMonitoring<CRDPrometheus> for K8sAnywhereTopology {
     }
 }
 
+#[async_trait]
+impl PrometheusApplicationMonitoring<RHOBObservability> for K8sAnywhereTopology {
+    async fn install_prometheus(
+        &self,
+        sender: &RHOBObservability,
+        inventory: &Inventory,
+        receivers: Option<Vec<Box<dyn AlertReceiver<RHOBObservability>>>>,
+    ) -> Result<PreparationOutcome, PreparationError> {
+        let po_result = self.ensure_cluster_observability_operator(sender).await?;
+
+        if po_result == PreparationOutcome::Noop {
+            debug!("Skipping Prometheus CR installation due to missing operator.");
+            return Ok(po_result);
+        }
+
+        let result = self
+            .get_cluster_observability_operator_prometheus_application_score(
+                sender.clone(),
+                receivers,
+            )
+            .await
+            .interpret(inventory, self)
+            .await;
+
+        match result {
+            Ok(outcome) => match outcome.status {
+                InterpretStatus::SUCCESS => Ok(PreparationOutcome::Success {
+                    details: outcome.message,
+                }),
+                InterpretStatus::NOOP => Ok(PreparationOutcome::Noop),
+                _ => Err(PreparationError::new(outcome.message)),
+            },
+            Err(err) => Err(PreparationError::new(err.to_string())),
+        }
+    }
+}
+
 impl Serialize for K8sAnywhereTopology {
     fn serialize<S>(&self, _serializer: S) -> Result<S::Ok, S::Error>
     where
@@ -134,6 +172,19 @@ impl K8sAnywhereTopology {
         }
     }
 
+    async fn get_cluster_observability_operator_prometheus_application_score(
+        &self,
+        sender: RHOBObservability,
+        receivers: Option<Vec<Box<dyn AlertReceiver<RHOBObservability>>>>,
+    ) -> RHOBAlertingScore {
+        RHOBAlertingScore {
+            sender,
+            receivers: receivers.unwrap_or_default(),
+            service_monitors: vec![],
+            prometheus_rules: vec![],
+        }
+    }
+
     async fn get_k8s_prometheus_application_score(
         &self,
         sender: CRDPrometheus,
@@ -286,6 +337,60 @@ impl K8sAnywhereTopology {
         }
     }
 
+    async fn ensure_cluster_observability_operator(
+        &self,
+        sender: &RHOBObservability,
+    ) -> Result<PreparationOutcome, PreparationError> {
+        let status = Command::new("sh")
+            .args(["-c", "kubectl get crd -A | grep -i rhobs"])
+            .status()
+            .map_err(|e| PreparationError::new(format!("could not connect to cluster: {}", e)))?;
+
+        if !status.success() {
+            if let Some(Some(k8s_state)) = self.k8s_state.get() {
+                match k8s_state.source {
+                    K8sSource::LocalK3d => {
+                        debug!("installing cluster observability operator");
+                        // TODO: unfinished; the call below reuses the prometheus-operator
+                        // chart and is unreachable until a COO chart is packaged
+                        // (see rhob_cluster_observability_operator.rs).
+                        todo!();
+                        let op_score =
+                            prometheus_operator_helm_chart_score(sender.namespace.clone());
+                        let result = op_score.interpret(&Inventory::empty(), self).await;
+
+                        return match result {
+                            Ok(outcome) => match outcome.status {
+                                InterpretStatus::SUCCESS => Ok(PreparationOutcome::Success {
+                                    details: "installed cluster observability operator".into(),
+                                }),
+                                InterpretStatus::NOOP => Ok(PreparationOutcome::Noop),
+                                _ => Err(PreparationError::new(
+                                    "failed to install cluster observability operator (unknown error)".into(),
+                                )),
+                            },
+                            Err(err) => Err(PreparationError::new(err.to_string())),
+                        };
+                    }
+                    K8sSource::Kubeconfig => {
+                        debug!(
+                            "unable to install cluster observability operator, contact cluster admin"
+                        );
+                        return Ok(PreparationOutcome::Noop);
+                    }
+                }
+            } else {
+                warn!(
+                    "Unable to detect k8s_state. Skipping Cluster Observability Operator install."
+                );
+                return Ok(PreparationOutcome::Noop);
+            }
+        }
+
+        debug!("Cluster Observability Operator is already present, skipping install");
+
+        Ok(PreparationOutcome::Success {
+            details: "cluster observability operator present in cluster".into(),
+        })
+    }
+
     async fn ensure_prometheus_operator(
         &self,
         sender: &CRDPrometheus,
diff --git a/harmony/src/modules/application/features/mod.rs b/harmony/src/modules/application/features/mod.rs
index ea979bd..93f6412 100644
--- a/harmony/src/modules/application/features/mod.rs
+++ b/harmony/src/modules/application/features/mod.rs
@@ -1,4 +1,5 @@
 mod endpoint;
+pub mod rhob_monitoring;
 
 pub use endpoint::*;
 mod monitoring;
diff --git a/harmony/src/modules/application/features/rhob_monitoring.rs b/harmony/src/modules/application/features/rhob_monitoring.rs
new file mode 100644
index 0000000..62a5323
--- /dev/null
+++ b/harmony/src/modules/application/features/rhob_monitoring.rs
@@ -0,0 +1,109 @@
+use std::sync::Arc;
+
+use crate::modules::application::{Application, ApplicationFeature};
+use crate::modules::monitoring::application_monitoring::application_monitoring_score::ApplicationMonitoringScore;
+use crate::modules::monitoring::application_monitoring::rhobs_application_monitoring_score::ApplicationRHOBMonitoringScore;
+
+use crate::modules::monitoring::kube_prometheus::crd::rhob_alertmanager_config::RHOBObservability;
+use crate::topology::MultiTargetTopology;
+use crate::{
+    inventory::Inventory,
+    modules::monitoring::{
+        alert_channel::webhook_receiver::WebhookReceiver, ntfy::ntfy::NtfyScore,
+    },
+    score::Score,
+    topology::{HelmCommand, K8sclient, Topology, tenant::TenantManager},
+};
+use crate::{
+    modules::prometheus::prometheus::PrometheusApplicationMonitoring,
+    topology::oberservability::monitoring::AlertReceiver,
+};
+use async_trait::async_trait;
+use base64::{Engine as _, engine::general_purpose};
+use harmony_types::net::Url;
+use log::{debug, info};
+
+#[derive(Debug, Clone)]
+pub struct RHOBMonitoring {
+    pub application: Arc<dyn Application>,
+    pub alert_receiver: Vec<Box<dyn AlertReceiver<RHOBObservability>>>,
+}
+
+#[async_trait]
+impl<
+    T: Topology
+        + HelmCommand
+        + 'static
+        + TenantManager
+        + K8sclient
+        + MultiTargetTopology
+        + std::fmt::Debug
+        + PrometheusApplicationMonitoring<RHOBObservability>,
+> ApplicationFeature<T> for RHOBMonitoring
+{
+    async fn ensure_installed(&self, topology: &T) -> Result<(), String> {
+        info!("Ensuring monitoring is available for application");
+        let namespace = topology
+            .get_tenant_config()
+            .await
+            .map(|ns| ns.name.clone())
+            .unwrap_or_else(|| self.application.name());
+
+        let mut alerting_score = ApplicationRHOBMonitoringScore {
+            sender: RHOBObservability {
+                namespace: namespace.clone(),
+                client: topology.k8s_client().await.unwrap(),
+            },
+            application: self.application.clone(),
+            receivers: self.alert_receiver.clone(),
+        };
+        let ntfy = NtfyScore {
+            namespace: namespace.clone(),
+            host: "ntfy.harmonydemo.apps.ncd0.harmony.mcd".to_string(),
+        };
+        ntfy.interpret(&Inventory::empty(), topology)
+            .await
+            .map_err(|e| e.to_string())?;
+
+        let ntfy_default_auth_username = "harmony";
+        let ntfy_default_auth_password = "harmony";
+        let ntfy_default_auth_header = format!(
+            "Basic {}",
+            general_purpose::STANDARD.encode(format!(
+                "{ntfy_default_auth_username}:{ntfy_default_auth_password}"
+            ))
+        );
+
+        debug!("ntfy_default_auth_header: {ntfy_default_auth_header}");
+
+        let ntfy_default_auth_param = general_purpose::STANDARD
+            .encode(ntfy_default_auth_header)
+            .replace("=", "");
+
+        debug!("ntfy_default_auth_param: {ntfy_default_auth_param}");
+
+        let ntfy_receiver = WebhookReceiver {
+            name: "ntfy-webhook".to_string(),
+            url: Url::Url(
+                url::Url::parse(
+                    format!(
+                        "http://ntfy.{}.svc.cluster.local/rust-web-app?auth={ntfy_default_auth_param}",
+                        namespace.clone()
+                    )
+                    .as_str(),
+                )
+                .unwrap(),
+            ),
+        };
+
+        alerting_score.receivers.push(Box::new(ntfy_receiver));
+        alerting_score
+            .interpret(&Inventory::empty(), topology)
+            .await
+            .map_err(|e| e.to_string())?;
+        Ok(())
+    }
+    fn name(&self) -> String {
+        "Monitoring".to_string()
+    }
+}
diff --git a/harmony/src/modules/monitoring/alert_channel/discord_alert_channel.rs b/harmony/src/modules/monitoring/alert_channel/discord_alert_channel.rs
index caab4d1..2cdc7ec 100644
--- a/harmony/src/modules/monitoring/alert_channel/discord_alert_channel.rs
+++ b/harmony/src/modules/monitoring/alert_channel/discord_alert_channel.rs
@@ -4,6 +4,7 @@ use std::collections::BTreeMap;
 use async_trait::async_trait;
 use k8s_openapi::api::core::v1::Secret;
 use kube::api::ObjectMeta;
+use log::debug;
 use serde::Serialize;
 use serde_json::json;
 use serde_yaml::{Mapping, Value};
@@ -11,6 +12,7 @@ use serde_yaml::{Mapping, Value};
 use crate::modules::monitoring::kube_prometheus::crd::crd_alertmanager_config::{
     AlertmanagerConfig, AlertmanagerConfigSpec, CRDPrometheus,
 };
+use crate::modules::monitoring::kube_prometheus::crd::rhob_alertmanager_config::RHOBObservability;
 use crate::{
     interpret::{InterpretError, Outcome},
     modules::monitoring::{
@@ -30,6 +32,67 @@ pub struct DiscordWebhook {
     pub url: Url,
 }
 
+#[async_trait]
+impl AlertReceiver<RHOBObservability> for DiscordWebhook {
+    async fn install(&self, sender: &RHOBObservability) -> Result<Outcome, InterpretError> {
+        let spec = crate::modules::monitoring::kube_prometheus::crd::rhob_alertmanager_config::AlertmanagerConfigSpec {
+            data: json!({
+                "route": {
+                    "receiver": self.name,
+                },
+                "receivers": [
+                    {
+                        "name": self.name,
+                        "webhookConfigs": [
+                            {
+                                "url": self.url,
+                            }
+                        ]
+                    }
+                ]
+            }),
+        };
+
+        let alertmanager_configs = crate::modules::monitoring::kube_prometheus::crd::rhob_alertmanager_config::AlertmanagerConfig {
+            metadata: ObjectMeta {
+                name: Some(self.name.clone()),
+                labels: Some(std::collections::BTreeMap::from([(
+                    "alertmanagerConfig".to_string(),
+                    "enabled".to_string(),
+                )])),
+                namespace: Some(sender.namespace.clone()),
+                ..Default::default()
+            },
+            spec,
+        };
+        debug!(
+            "alert manager configs: \n{:#?}",
+            alertmanager_configs.clone()
+        );
+
+        sender
+            .client
+            .apply(&alertmanager_configs, Some(&sender.namespace))
+            .await?;
+        Ok(Outcome::success(format!(
+            "installed rhob-alertmanagerconfigs for {}",
+            self.name
+        )))
+    }
+
+    fn name(&self) -> String {
+        "discord-webhook".to_string()
+    }
+
+    fn clone_box(&self) -> Box<dyn AlertReceiver<RHOBObservability>> {
+        Box::new(self.clone())
+    }
+
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+}
+
 #[async_trait]
 impl AlertReceiver<CRDPrometheus> for DiscordWebhook {
     async fn install(&self, sender: &CRDPrometheus) -> Result<Outcome, InterpretError> {
diff --git a/harmony/src/modules/monitoring/alert_channel/webhook_receiver.rs b/harmony/src/modules/monitoring/alert_channel/webhook_receiver.rs
index 51e63b6..52124ff 100644
--- a/harmony/src/modules/monitoring/alert_channel/webhook_receiver.rs
+++ b/harmony/src/modules/monitoring/alert_channel/webhook_receiver.rs
@@ -11,8 +11,8 @@ use crate::{
     interpret::{InterpretError, Outcome},
     modules::monitoring::{
         kube_prometheus::{
-            crd::crd_alertmanager_config::{
-                AlertmanagerConfig, AlertmanagerConfigSpec, CRDPrometheus,
+            crd::{
+                crd_alertmanager_config::CRDPrometheus, rhob_alertmanager_config::RHOBObservability,
             },
             prometheus::{KubePrometheus, KubePrometheusReceiver},
             types::{AlertChannelConfig, AlertManagerChannelConfig},
@@ -30,9 +30,9 @@ pub struct WebhookReceiver {
 }
 
 #[async_trait]
-impl AlertReceiver<CRDPrometheus> for WebhookReceiver {
-    async fn install(&self, sender: &CRDPrometheus) -> Result<Outcome, InterpretError> {
-        let spec = AlertmanagerConfigSpec {
+impl AlertReceiver<RHOBObservability> for WebhookReceiver {
+    async fn install(&self, sender: &RHOBObservability) -> Result<Outcome, InterpretError> {
+        let spec = crate::modules::monitoring::kube_prometheus::crd::rhob_alertmanager_config::AlertmanagerConfigSpec {
             data: json!({
                 "route": {
                     "receiver": self.name,
@@ -50,7 +50,68 @@ impl AlertReceiver<CRDPrometheus> for WebhookReceiver {
             }),
         };
 
-        let alertmanager_configs = AlertmanagerConfig {
+        let alertmanager_configs = crate::modules::monitoring::kube_prometheus::crd::rhob_alertmanager_config::AlertmanagerConfig {
+            metadata: ObjectMeta {
+                name: Some(self.name.clone()),
+                labels: Some(std::collections::BTreeMap::from([(
+                    "alertmanagerConfig".to_string(),
+                    "enabled".to_string(),
+                )])),
+                namespace: Some(sender.namespace.clone()),
+                ..Default::default()
+            },
+            spec,
+        };
+        debug!(
+            "alert manager configs: \n{:#?}",
+            alertmanager_configs.clone()
+        );
+
+        sender
+            .client
+            .apply(&alertmanager_configs, Some(&sender.namespace))
+            .await?;
+        Ok(Outcome::success(format!(
+            "installed rhob-alertmanagerconfigs for {}",
+            self.name
+        )))
+    }
+
+    fn name(&self) -> String {
+        "webhook-receiver".to_string()
+    }
+
+    fn clone_box(&self) -> Box<dyn AlertReceiver<RHOBObservability>> {
+        Box::new(self.clone())
+    }
+
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+}
+
+#[async_trait]
+impl AlertReceiver<CRDPrometheus> for WebhookReceiver {
+    async fn install(&self, sender: &CRDPrometheus) -> Result<Outcome, InterpretError> {
+        let spec = crate::modules::monitoring::kube_prometheus::crd::crd_alertmanager_config::AlertmanagerConfigSpec {
+            data: json!({
+                "route": {
+                    "receiver": self.name,
+                },
+                "receivers": [
+                    {
+                        "name": self.name,
+                        "webhookConfigs": [
+                            {
+                                "url": self.url,
+                            }
+                        ]
+                    }
+                ]
+            }),
+        };
+
+        let alertmanager_configs = crate::modules::monitoring::kube_prometheus::crd::crd_alertmanager_config::AlertmanagerConfig {
             metadata: ObjectMeta {
                 name: Some(self.name.clone()),
                 labels: Some(std::collections::BTreeMap::from([(
@@ -115,6 +176,7 @@ impl PrometheusReceiver for WebhookReceiver {
         self.get_config().await
     }
 }
+
 #[async_trait]
 impl AlertReceiver<KubePrometheus> for WebhookReceiver {
     async fn install(&self, sender: &KubePrometheus) -> Result<Outcome, InterpretError> {
diff --git a/harmony/src/modules/monitoring/application_monitoring/mod.rs b/harmony/src/modules/monitoring/application_monitoring/mod.rs
index c243cd7..5d12f78 100644
--- a/harmony/src/modules/monitoring/application_monitoring/mod.rs
+++ b/harmony/src/modules/monitoring/application_monitoring/mod.rs
@@ -1 +1,2 @@
 pub mod application_monitoring_score;
+pub mod rhobs_application_monitoring_score;
diff --git a/harmony/src/modules/monitoring/application_monitoring/rhobs_application_monitoring_score.rs b/harmony/src/modules/monitoring/application_monitoring/rhobs_application_monitoring_score.rs
new file mode 100644
index 0000000..17e42c3
--- /dev/null
+++ b/harmony/src/modules/monitoring/application_monitoring/rhobs_application_monitoring_score.rs
@@ -0,0 +1,94 @@
+use std::sync::Arc;
+
+use async_trait::async_trait;
+use serde::Serialize;
+
+use crate::{
+    data::Version,
+    interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome},
+    inventory::Inventory,
+    modules::{
+        application::Application,
+        monitoring::kube_prometheus::crd::{
+            crd_alertmanager_config::CRDPrometheus, rhob_alertmanager_config::RHOBObservability,
+        },
+        prometheus::prometheus::PrometheusApplicationMonitoring,
+    },
+    score::Score,
+    topology::{PreparationOutcome, Topology, oberservability::monitoring::AlertReceiver},
+};
+use harmony_types::id::Id;
+
+#[derive(Debug, Clone, Serialize)]
+pub struct ApplicationRHOBMonitoringScore {
+    pub sender: RHOBObservability,
+    pub application: Arc<dyn Application>,
+    pub receivers: Vec<Box<dyn AlertReceiver<RHOBObservability>>>,
+}
+
+impl<T: Topology + PrometheusApplicationMonitoring<RHOBObservability>> Score<T>
+    for ApplicationRHOBMonitoringScore
+{
+    fn create_interpret(&self) -> Box<dyn Interpret<T>> {
+        Box::new(ApplicationRHOBMonitoringInterpret {
+            score: self.clone(),
+        })
+    }
+
+    fn name(&self) -> String {
+        format!(
+            "{} monitoring [ApplicationRHOBMonitoringScore]",
+            self.application.name()
+        )
+    }
+}
+
+#[derive(Debug)]
+pub struct ApplicationRHOBMonitoringInterpret {
+    score: ApplicationRHOBMonitoringScore,
+}
+
+#[async_trait]
+impl<T: Topology + PrometheusApplicationMonitoring<RHOBObservability>> Interpret<T>
+    for ApplicationRHOBMonitoringInterpret
+{
+    async fn execute(
+        &self,
+        inventory: &Inventory,
+        topology: &T,
+    ) -> Result<Outcome, InterpretError> {
+        let result = topology
+            .install_prometheus(
+                &self.score.sender,
+                inventory,
+                Some(self.score.receivers.clone()),
+            )
+            .await;
+
+        match result {
+            Ok(outcome) => match outcome {
+                PreparationOutcome::Success { details: _ } => {
+                    Ok(Outcome::success("Prometheus installed".into()))
+                }
+                PreparationOutcome::Noop => Ok(Outcome::noop()),
+            },
+            Err(err) => Err(InterpretError::from(err)),
+        }
+    }
+
+    fn get_name(&self) -> InterpretName {
+        InterpretName::ApplicationMonitoring
+    }
+
+    fn get_version(&self) -> Version {
+        todo!()
+    }
+
+    fn get_status(&self) -> InterpretStatus {
+        todo!()
+    }
+
+    fn get_children(&self) -> Vec<Id> {
+        todo!()
+    }
+}
diff --git a/harmony/src/modules/monitoring/kube_prometheus/crd/mod.rs b/harmony/src/modules/monitoring/kube_prometheus/crd/mod.rs
index 236a2de..81476f4 100644
--- a/harmony/src/modules/monitoring/kube_prometheus/crd/mod.rs
+++ b/harmony/src/modules/monitoring/kube_prometheus/crd/mod.rs
@@ -7,5 +7,14 @@ pub mod crd_prometheuses;
 pub mod grafana_default_dashboard;
 pub mod grafana_operator;
 pub mod prometheus_operator;
+pub mod rhob_alertmanager_config;
+pub mod rhob_alertmanagers;
+pub mod rhob_cluster_observability_operator;
+pub mod rhob_default_rules;
+pub mod rhob_grafana;
+pub mod rhob_monitoring_stack;
+pub mod rhob_prometheus_rules;
+pub mod rhob_prometheuses;
+pub mod rhob_role;
 pub mod role;
 pub mod service_monitor;
diff --git a/harmony/src/modules/monitoring/kube_prometheus/crd/rhob_alertmanager_config.rs b/harmony/src/modules/monitoring/kube_prometheus/crd/rhob_alertmanager_config.rs
new file mode 100644
index 0000000..c37eb80
--- /dev/null
+++ b/harmony/src/modules/monitoring/kube_prometheus/crd/rhob_alertmanager_config.rs
@@ -0,0 +1,50 @@
+use std::sync::Arc;
+
+use kube::CustomResource;
+use schemars::JsonSchema;
+use serde::{Deserialize, Serialize};
+
+use crate::topology::{
+    k8s::K8sClient,
+    oberservability::monitoring::{AlertReceiver, AlertSender},
+};
+
+#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)]
+#[kube(
+    group = "monitoring.rhobs",
+    version = "v1alpha1",
+    kind = "AlertmanagerConfig",
+    plural = "alertmanagerconfigs",
+    namespaced
+)]
+pub struct AlertmanagerConfigSpec {
+    #[serde(flatten)]
+    pub data: serde_json::Value,
+}
+
+#[derive(Debug, Clone, Serialize)]
+pub struct RHOBObservability {
+    pub namespace: String,
+    pub client: Arc<K8sClient>,
+}
+
+impl AlertSender for RHOBObservability {
+    fn name(&self) -> String {
+        "RHOBAlertManager".to_string()
+    }
+}
+
+impl Clone for Box<dyn AlertReceiver<RHOBObservability>> {
+    fn clone(&self) -> Self {
+        self.clone_box()
+    }
+}
+
+impl Serialize for Box<dyn AlertReceiver<RHOBObservability>> {
+    fn serialize<S>(&self, _serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        todo!()
+    }
+}
diff --git a/harmony/src/modules/monitoring/kube_prometheus/crd/rhob_alertmanagers.rs b/harmony/src/modules/monitoring/kube_prometheus/crd/rhob_alertmanagers.rs
new file mode 100644
index 0000000..4435467
--- /dev/null
+++ b/harmony/src/modules/monitoring/kube_prometheus/crd/rhob_alertmanagers.rs
@@ -0,0 +1,52 @@
+use kube::CustomResource;
+use schemars::JsonSchema;
+use serde::{Deserialize, Serialize};
+
+use super::crd_prometheuses::LabelSelector;
+
+/// Rust CRD for `Alertmanager` from Prometheus Operator
+#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)]
+#[kube(
+    group = "monitoring.rhobs",
+    version = "v1",
+    kind = "Alertmanager",
+    plural = "alertmanagers",
+    namespaced
+)]
+#[serde(rename_all = "camelCase")]
+pub struct AlertmanagerSpec {
+    /// Number of replicas for HA
+    pub replicas: i32,
+
+    /// Selectors for AlertmanagerConfig CRDs
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub alertmanager_config_selector: Option<LabelSelector>,
+
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub alertmanager_config_namespace_selector: Option<LabelSelector>,
+
+    /// Optional pod template metadata (annotations, labels)
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub pod_metadata: Option<serde_json::Value>,
+
+    /// Optional Alertmanager version override
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub version: Option<String>,
+}
+
+impl Default for AlertmanagerSpec {
+    fn default() -> Self {
+        AlertmanagerSpec {
+            replicas: 1,
+
+            // Match all AlertmanagerConfigs in the same namespace
+            alertmanager_config_namespace_selector: None,
+
+            // Empty selector matches all AlertmanagerConfigs in that namespace
+            alertmanager_config_selector: Some(LabelSelector::default()),
+
+            pod_metadata: None,
+            version: None,
+        }
+    }
+}
diff --git a/harmony/src/modules/monitoring/kube_prometheus/crd/rhob_cluster_observability_operator.rs b/harmony/src/modules/monitoring/kube_prometheus/crd/rhob_cluster_observability_operator.rs
new file mode 100644
index 0000000..bc7ad9f
--- /dev/null
+++ b/harmony/src/modules/monitoring/kube_prometheus/crd/rhob_cluster_observability_operator.rs
@@ -0,0 +1,22 @@
+use std::str::FromStr;
+
+use non_blank_string_rs::NonBlankString;
+
+use crate::modules::helm::chart::HelmChartScore;
+//TODO package chart or something for COO okd
+pub fn rhob_cluster_observability_operator() -> HelmChartScore {
+    HelmChartScore {
+        namespace: None,
+        // TODO: placeholder release name; NonBlankString::from_str("") panics on blank input
+        release_name: NonBlankString::from_str("cluster-observability-operator").unwrap(),
+        chart_name: NonBlankString::from_str(
+            "oci://hub.nationtech.io/harmony/nt-prometheus-operator",
+        )
+        .unwrap(),
+        chart_version: None,
+        values_overrides: None,
+        values_yaml: None,
+        create_namespace: true,
+        install_only: true,
+        repository: None,
+    }
+}
diff --git a/harmony/src/modules/monitoring/kube_prometheus/crd/rhob_default_rules.rs b/harmony/src/modules/monitoring/kube_prometheus/crd/rhob_default_rules.rs
new file mode 100644
index 0000000..459bd3f
--- /dev/null
+++ b/harmony/src/modules/monitoring/kube_prometheus/crd/rhob_default_rules.rs
@@ -0,0 +1,26 @@
+use crate::modules::{
+    monitoring::kube_prometheus::crd::rhob_prometheus_rules::Rule,
+    prometheus::alerts::k8s::{
+        deployment::alert_deployment_unavailable,
+        pod::{alert_container_restarting, alert_pod_not_ready, pod_failed},
+        pvc::high_pvc_fill_rate_over_two_days,
+        service::alert_service_down,
+    },
+};
+
+pub fn build_default_application_rules() -> Vec<Rule> {
+    let pod_failed: Rule = pod_failed().into();
+    let container_restarting: Rule = alert_container_restarting().into();
+    let pod_not_ready: Rule = alert_pod_not_ready().into();
+    let service_down: Rule = alert_service_down().into();
+    let deployment_unavailable: Rule = alert_deployment_unavailable().into();
+    let high_pvc_fill_rate: Rule = high_pvc_fill_rate_over_two_days().into();
+    vec![
+        pod_failed,
+        container_restarting,
+        pod_not_ready,
+        service_down,
+        deployment_unavailable,
+        high_pvc_fill_rate,
+    ]
+}
diff --git a/harmony/src/modules/monitoring/kube_prometheus/crd/rhob_grafana.rs b/harmony/src/modules/monitoring/kube_prometheus/crd/rhob_grafana.rs
new file mode 100644
index 0000000..65efab9
--- /dev/null
+++ b/harmony/src/modules/monitoring/kube_prometheus/crd/rhob_grafana.rs
@@ -0,0 +1,153 @@
+use std::collections::BTreeMap;
+
+use kube::CustomResource;
+use schemars::JsonSchema;
+use serde::{Deserialize, Serialize};
+
+use crate::modules::monitoring::kube_prometheus::crd::rhob_prometheuses::LabelSelector;
+
+#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)]
+#[kube(
+    group = "grafana.integreatly.org",
+    version = "v1beta1",
+    kind = "Grafana",
+    plural = "grafanas",
+    namespaced
+)]
+#[serde(rename_all = "camelCase")]
+pub struct GrafanaSpec {
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub config: Option<GrafanaConfig>,
+
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub admin_user: Option<String>,
+
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub admin_password: Option<String>,
+
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub ingress: Option<GrafanaIngress>,
+
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub persistence: Option<GrafanaPersistence>,
+
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub resources: Option<ResourceRequirements>,
+}
+
+#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)]
+#[serde(rename_all = "camelCase")]
+pub struct GrafanaConfig {
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub log: Option<GrafanaLogConfig>,
+
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub security: Option<GrafanaSecurityConfig>,
+}
+
+#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)]
+#[serde(rename_all = "camelCase")]
+pub struct GrafanaLogConfig {
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub mode: Option<String>,
+
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub level: Option<String>,
+}
+
+#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)]
+#[serde(rename_all = "camelCase")]
+pub struct GrafanaSecurityConfig {
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub admin_user: Option<String>,
+
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub admin_password: Option<String>,
+}
+
+#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)]
+#[serde(rename_all = "camelCase")]
+pub struct GrafanaIngress {
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub enabled: Option<bool>,
+
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub hosts: Option<Vec<String>>,
+}
+
+#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)]
+#[serde(rename_all = "camelCase")]
+pub struct GrafanaPersistence {
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub enabled: Option<bool>,
+
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub storage_class_name: Option<String>,
+
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub size: Option<String>,
+}
+
+// ------------------------------------------------------------------------------------------------
+
+#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)]
+#[kube(
+    group = "grafana.integreatly.org",
+    version = "v1beta1",
+    kind = "GrafanaDashboard",
+    plural = "grafanadashboards",
+    namespaced
+)]
+#[serde(rename_all = "camelCase")]
+pub struct GrafanaDashboardSpec {
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub resync_period: Option<String>,
+
+    pub instance_selector: LabelSelector,
+
+    pub json: String,
+}
+
+// ------------------------------------------------------------------------------------------------
+
+#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)]
+#[kube(
+    group = "grafana.integreatly.org",
+    version = "v1beta1",
+    kind = "GrafanaDatasource",
+    plural = "grafanadatasources",
+    namespaced
+)]
+#[serde(rename_all = "camelCase")]
+pub struct GrafanaDatasourceSpec {
+    pub instance_selector: LabelSelector,
+
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub allow_cross_namespace_import: Option<bool>,
+
+    pub datasource: GrafanaDatasourceConfig,
+}
+
+#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)]
+#[serde(rename_all = "camelCase")]
+pub struct GrafanaDatasourceConfig {
+    pub access: String,
+    pub database: Option<String>,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub json_data: Option<BTreeMap<String, String>>,
+    pub name: String,
+    pub r#type: String,
+    pub url: String,
+}
+
+// ------------------------------------------------------------------------------------------------
+
+#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema, Default)]
+#[serde(rename_all = "camelCase")]
+pub struct ResourceRequirements {
+    #[serde(default, skip_serializing_if = "BTreeMap::is_empty")]
+    pub limits: BTreeMap<String, String>,
+
+    #[serde(default, skip_serializing_if = "BTreeMap::is_empty")]
+    pub requests: BTreeMap<String, String>,
+}
diff --git a/harmony/src/modules/monitoring/kube_prometheus/crd/rhob_monitoring_stack.rs b/harmony/src/modules/monitoring/kube_prometheus/crd/rhob_monitoring_stack.rs
new file mode 100644
index 0000000..bd542e9
--- /dev/null
+++ b/harmony/src/modules/monitoring/kube_prometheus/crd/rhob_monitoring_stack.rs
@@ -0,0 +1,41 @@
+use std::collections::BTreeMap;
+
+use kube::CustomResource;
+use schemars::JsonSchema;
+use serde::{Deserialize, Serialize};
+
+use crate::modules::monitoring::kube_prometheus::crd::rhob_prometheuses::LabelSelector;
+
+/// MonitoringStack CRD for monitoring.rhobs/v1alpha1
+#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)]
+#[kube(
+    group = "monitoring.rhobs",
+    version = "v1alpha1",
+    kind = "MonitoringStack",
+    plural = "monitoringstacks",
+    namespaced
+)]
+#[serde(rename_all = "camelCase")]
+pub struct MonitoringStackSpec {
+    /// Verbosity of logs (e.g. "debug", "info", "warn", "error").
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub log_level: Option<String>,
+
+    /// Retention period for Prometheus TSDB data (e.g. "1d").
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub retention: Option<String>,
+
+    /// Resource selector for workloads monitored by this stack.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub resource_selector: Option<LabelSelector>,
+}
+
+impl Default for MonitoringStackSpec {
+    fn default() -> Self {
+        MonitoringStackSpec {
+            log_level: Some("info".into()),
+            retention: Some("7d".into()),
+            resource_selector: Some(LabelSelector::default()),
+        }
+    }
+}
diff --git a/harmony/src/modules/monitoring/kube_prometheus/crd/rhob_prometheus_rules.rs b/harmony/src/modules/monitoring/kube_prometheus/crd/rhob_prometheus_rules.rs
new file mode 100644
index 0000000..1792104
--- /dev/null
+++ b/harmony/src/modules/monitoring/kube_prometheus/crd/rhob_prometheus_rules.rs
@@ -0,0 +1,57 @@
+use std::collections::BTreeMap;
+
+use kube::CustomResource;
+use schemars::JsonSchema;
+use serde::{Deserialize, Serialize};
+
+use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;
+
+#[derive(CustomResource, Debug, Serialize, Deserialize, Clone, JsonSchema)]
+#[kube(
+    group = "monitoring.rhobs",
+    version = "v1",
+    kind = "PrometheusRule",
+    plural = "prometheusrules",
+    namespaced
+)]
+#[serde(rename_all = "camelCase")]
+pub struct PrometheusRuleSpec {
+    pub groups: Vec<RuleGroup>,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
+pub struct RuleGroup {
+    pub name: String,
+    pub rules: Vec<Rule>,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
+#[serde(rename_all = "camelCase")]
+pub struct Rule {
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub alert: Option<String>,
+
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub expr: Option<String>,
+
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub for_: Option<String>,
+
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub labels: Option<BTreeMap<String, String>>,
+
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub annotations: Option<BTreeMap<String, String>>,
+}
+
+impl From<PrometheusAlertRule> for Rule {
+    fn from(value: PrometheusAlertRule) -> Self {
+        Rule {
+            alert: Some(value.alert),
+            expr: Some(value.expr),
+            for_: value.r#for,
+            labels: Some(value.labels.into_iter().collect::<BTreeMap<String, String>>()),
+            annotations: Some(value.annotations.into_iter().collect::<BTreeMap<String, String>>()),
+        }
+    }
+}
diff --git a/harmony/src/modules/monitoring/kube_prometheus/crd/rhob_prometheuses.rs b/harmony/src/modules/monitoring/kube_prometheus/crd/rhob_prometheuses.rs
new file mode 100644
index 0000000..18d3f57
--- /dev/null
+++ b/harmony/src/modules/monitoring/kube_prometheus/crd/rhob_prometheuses.rs
@@ -0,0 +1,118 @@
+use std::collections::BTreeMap;
+
+use kube::CustomResource;
+use schemars::JsonSchema;
+use serde::{Deserialize, Serialize};
+
+use crate::modules::monitoring::kube_prometheus::types::Operator;
+
+#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)]
+#[kube(
+    group = "monitoring.rhobs",
+    version = "v1",
+    kind = "Prometheus",
+    plural = "prometheuses",
+    namespaced
+)]
+#[serde(rename_all = "camelCase")]
+pub struct PrometheusSpec {
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub alerting: Option<PrometheusSpecAlerting>,
+
+    pub service_account_name: String,
+
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub service_monitor_namespace_selector: Option<LabelSelector>,
+
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub service_monitor_selector: Option<LabelSelector>,
+
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub service_discovery_role: Option<String>,
+
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub pod_monitor_selector: Option<LabelSelector>,
+
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub rule_selector: Option<LabelSelector>,
+
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub rule_namespace_selector: Option<LabelSelector>,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, Default)]
+#[serde(rename_all = "camelCase")]
+pub struct NamespaceSelector {
+    #[serde(default, skip_serializing_if = "Vec::is_empty")]
+    pub match_names: Vec<String>,
+}
+
+/// Contains alerting configuration, specifically Alertmanager endpoints.
+#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, Default)]
+pub struct PrometheusSpecAlerting {
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub alertmanagers: Option<Vec<AlertmanagerEndpoints>>,
+}
+
+/// Represents an Alertmanager endpoint configuration used by Prometheus.
+#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, Default)]
+pub struct AlertmanagerEndpoints {
+    /// Name of the Alertmanager Service.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub name: Option<String>,
+
+    /// Namespace of the Alertmanager Service.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub namespace: Option<String>,
+
+    /// Port to access on the Alertmanager Service (e.g. "web").
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub port: Option<String>,
+
+    /// Scheme to use for connecting (e.g. "http").
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub scheme: Option<String>,
+    // Other fields like `tls_config`, `path_prefix`, etc., can be added if needed.
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, Default)]
+#[serde(rename_all = "camelCase")]
+pub struct LabelSelector {
+    #[serde(default, skip_serializing_if = "BTreeMap::is_empty")]
+    pub match_labels: BTreeMap<String, String>,
+
+    #[serde(default, skip_serializing_if = "Vec::is_empty")]
+    pub match_expressions: Vec<LabelSelectorRequirement>,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
+#[serde(rename_all = "camelCase")]
+pub struct LabelSelectorRequirement {
+    pub key: String,
+    pub operator: Operator,
+    #[serde(default, skip_serializing_if = "Vec::is_empty")]
+    pub values: Vec<String>,
+}
+
+impl Default for PrometheusSpec {
+    fn default() -> Self {
+        PrometheusSpec {
+            alerting: None,
+
+            service_account_name: "prometheus".into(),
+
+            // null means "only my namespace"
+            service_monitor_namespace_selector: None,
+
+            // empty selector matches all ServiceMonitors in that namespace
+            service_monitor_selector: Some(LabelSelector::default()),
+
+            service_discovery_role: Some("Endpoints".into()),
+
+            pod_monitor_selector: None,
+
+            rule_selector: None,
+
+            rule_namespace_selector: Some(LabelSelector::default()),
+        }
+    }
+}
diff --git a/harmony/src/modules/monitoring/kube_prometheus/crd/rhob_role.rs b/harmony/src/modules/monitoring/kube_prometheus/crd/rhob_role.rs
new file mode 100644
index 0000000..9add9a9
--- /dev/null
+++ b/harmony/src/modules/monitoring/kube_prometheus/crd/rhob_role.rs
@@ -0,0 +1,62 @@
+use k8s_openapi::api::{
+    core::v1::ServiceAccount,
+    rbac::v1::{PolicyRule, Role, RoleBinding, RoleRef, Subject},
+};
+use kube::api::ObjectMeta;
+
+pub fn build_prom_role(role_name: String, namespace: String) -> Role {
+    Role {
+        metadata: ObjectMeta {
+            name: Some(role_name),
+            namespace: Some(namespace),
+            ..Default::default()
+        },
+        rules: Some(vec![PolicyRule {
+            api_groups: Some(vec!["".into()]), // core API group
+            resources: Some(vec!["services".into(), "endpoints".into(), "pods".into()]),
+            verbs: vec!["get".into(), "list".into(), "watch".into()],
+            ..Default::default()
+        }]),
+    }
+}
+
+pub fn build_prom_rolebinding(
+    role_name: String,
+    namespace: String,
+    service_account_name: String,
+) -> RoleBinding {
+    RoleBinding {
+        metadata: ObjectMeta {
Some(format!("{}-rolebinding", role_name)), + namespace: Some(namespace.clone()), + ..Default::default() + }, + role_ref: RoleRef { + api_group: "rbac.authorization.k8s.io".into(), + kind: "Role".into(), + name: role_name, + }, + subjects: Some(vec![Subject { + kind: "ServiceAccount".into(), + name: service_account_name, + namespace: Some(namespace.clone()), + ..Default::default() + }]), + } +} + +pub fn build_prom_service_account( + service_account_name: String, + namespace: String, +) -> ServiceAccount { + ServiceAccount { + automount_service_account_token: None, + image_pull_secrets: None, + metadata: ObjectMeta { + name: Some(service_account_name), + namespace: Some(namespace), + ..Default::default() + }, + secrets: None, + } +} diff --git a/harmony/src/modules/prometheus/mod.rs b/harmony/src/modules/prometheus/mod.rs index b77f199..c4f25ba 100644 --- a/harmony/src/modules/prometheus/mod.rs +++ b/harmony/src/modules/prometheus/mod.rs @@ -2,3 +2,4 @@ pub mod alerts; pub mod k8s_prometheus_alerting_score; #[allow(clippy::module_inception)] pub mod prometheus; +pub mod rhob_alerting_score; diff --git a/harmony/src/modules/prometheus/rhob_alerting_score.rs b/harmony/src/modules/prometheus/rhob_alerting_score.rs new file mode 100644 index 0000000..ee3e5cc --- /dev/null +++ b/harmony/src/modules/prometheus/rhob_alerting_score.rs @@ -0,0 +1,544 @@ +use std::fs; +use std::{collections::BTreeMap, sync::Arc}; +use tempfile::tempdir; + +use async_trait::async_trait; +use kube::api::ObjectMeta; +use log::{debug, info}; +use serde::Serialize; +use std::process::Command; + +use crate::modules::monitoring::kube_prometheus::crd::grafana_default_dashboard::build_default_dashboard; +use crate::modules::monitoring::kube_prometheus::crd::rhob_alertmanager_config::RHOBObservability; +use crate::modules::monitoring::kube_prometheus::crd::rhob_alertmanagers::{ + Alertmanager, AlertmanagerSpec, +}; +use crate::modules::monitoring::kube_prometheus::crd::rhob_grafana::{ + Grafana, GrafanaDashboard, GrafanaDashboardSpec, GrafanaDatasource, GrafanaDatasourceConfig, + GrafanaDatasourceSpec, GrafanaSpec, +}; +use crate::modules::monitoring::kube_prometheus::crd::rhob_monitoring_stack::{ + MonitoringStack, MonitoringStackSpec, +}; +use crate::modules::monitoring::kube_prometheus::crd::rhob_prometheus_rules::{ + PrometheusRule, PrometheusRuleSpec, RuleGroup, +}; +use crate::modules::monitoring::kube_prometheus::crd::rhob_prometheuses::{ + AlertmanagerEndpoints, LabelSelector, Prometheus, PrometheusSpec, PrometheusSpecAlerting, +}; +use crate::modules::monitoring::kube_prometheus::crd::role::{ + build_prom_role, build_prom_rolebinding, build_prom_service_account, +}; +use crate::modules::monitoring::kube_prometheus::crd::service_monitor::{ + ServiceMonitor, ServiceMonitorSpec, +}; +use crate::score::Score; +use crate::topology::oberservability::monitoring::AlertReceiver; +use crate::topology::{K8sclient, Topology, k8s::K8sClient}; +use crate::{ + data::Version, + interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome}, + inventory::Inventory, +}; +use harmony_types::id::Id; + +use super::prometheus::PrometheusApplicationMonitoring; + +#[derive(Clone, Debug, Serialize)] +pub struct RHOBAlertingScore { + pub sender: RHOBObservability, + pub receivers: Vec>>, + pub service_monitors: Vec, + pub prometheus_rules: Vec, +} + +impl> Score + for RHOBAlertingScore +{ + fn create_interpret(&self) -> Box> { + Box::new(RHOBAlertingInterpret { + sender: self.sender.clone(), + receivers: 
self.receivers.clone(), + service_monitors: self.service_monitors.clone(), + prometheus_rules: self.prometheus_rules.clone(), + }) + } + + fn name(&self) -> String { + "RHOB alerting [RHOBAlertingScore]".into() + } +} + +#[derive(Clone, Debug)] +pub struct RHOBAlertingInterpret { + pub sender: RHOBObservability, + pub receivers: Vec>>, + pub service_monitors: Vec, + pub prometheus_rules: Vec, +} + +#[async_trait] +impl> Interpret + for RHOBAlertingInterpret +{ + async fn execute( + &self, + _inventory: &Inventory, + topology: &T, + ) -> Result { + let client = topology.k8s_client().await.unwrap(); + self.ensure_grafana_operator().await?; + self.install_prometheus(&client).await?; + self.install_client_kube_metrics().await?; + // self.install_grafana(&client).await?; + // self.install_receivers(&self.sender, &self.receivers) + // .await?; + // self.install_rules(&self.prometheus_rules, &client).await?; + // self.install_monitors(self.service_monitors.clone(), &client) + // .await?; + Ok(Outcome::success( + "K8s monitoring components installed".to_string(), + )) + } + + fn get_name(&self) -> InterpretName { + InterpretName::RHOBAlerting + } + + fn get_version(&self) -> Version { + todo!() + } + + fn get_status(&self) -> InterpretStatus { + todo!() + } + + fn get_children(&self) -> Vec { + todo!() + } +} + +impl RHOBAlertingInterpret { + async fn crd_exists(&self, crd: &str) -> bool { + let status = Command::new("sh") + .args(["-c", &format!("kubectl get crd -A | grep -i {crd}")]) + .status() + .map_err(|e| InterpretError::new(format!("could not connect to cluster: {}", e))) + .unwrap(); + + status.success() + } + + async fn install_chart( + &self, + chart_path: String, + chart_name: String, + ) -> Result<(), InterpretError> { + let temp_dir = + tempdir().map_err(|e| InterpretError::new(format!("Tempdir error: {}", e)))?; + let temp_path = temp_dir.path().to_path_buf(); + debug!("Using temp directory: {}", temp_path.display()); + let chart = format!("{}/{}", chart_path, chart_name); + let pull_output = Command::new("helm") + .args(["pull", &chart, "--destination", temp_path.to_str().unwrap()]) + .output() + .map_err(|e| InterpretError::new(format!("Helm pull error: {}", e)))?; + + if !pull_output.status.success() { + return Err(InterpretError::new(format!( + "Helm pull failed: {}", + String::from_utf8_lossy(&pull_output.stderr) + ))); + } + + let tgz_path = fs::read_dir(&temp_path) + .unwrap() + .filter_map(|entry| { + let entry = entry.ok()?; + let path = entry.path(); + if path.extension()? 
== "tgz" { + Some(path) + } else { + None + } + }) + .next() + .ok_or_else(|| InterpretError::new("Could not find pulled Helm chart".into()))?; + + debug!("Installing chart from: {}", tgz_path.display()); + + let install_output = Command::new("helm") + .args([ + "upgrade", + "--install", + &chart_name, + tgz_path.to_str().unwrap(), + "--namespace", + &self.sender.namespace.clone(), + "--create-namespace", + "--wait", + "--atomic", + ]) + .output() + .map_err(|e| InterpretError::new(format!("Helm install error: {}", e)))?; + + if !install_output.status.success() { + return Err(InterpretError::new(format!( + "Helm install failed: {}", + String::from_utf8_lossy(&install_output.stderr) + ))); + } + + debug!( + "Installed chart {}/{} in namespace: {}", + &chart_path, + &chart_name, + self.sender.namespace.clone() + ); + Ok(()) + } + + async fn ensure_grafana_operator(&self) -> Result { + // if self.crd_exists("grafanas.grafana.integreatly.org").await { + // debug!("grafana CRDs already exist — skipping install."); + // return Ok(Outcome::success("Grafana CRDs already exist".to_string())); + // } + + let _ = Command::new("helm") + .args([ + "repo", + "add", + "grafana-operator", + "https://grafana.github.io/helm-charts", + ]) + .output() + .unwrap(); + + let _ = Command::new("helm") + .args(["repo", "update"]) + .output() + .unwrap(); + + let output = Command::new("helm") + .args([ + "install", + "grafana-operator", + "grafana-operator/grafana-operator", + "--namespace", + &self.sender.namespace.clone(), + "--create-namespace", + "--set", + "namespaceScope=true", + ]) + .output() + .unwrap(); + + if !output.status.success() { + return Err(InterpretError::new(format!( + "helm install failed:\nstdout: {}\nstderr: {}", + String::from_utf8_lossy(&output.stdout), + String::from_utf8_lossy(&output.stderr) + ))); + } + + Ok(Outcome::success(format!( + "installed grafana operator in ns {}", + self.sender.namespace.clone() + ))) + } + + async fn install_prometheus(&self, client: &Arc) -> Result { + debug!( + "installing crd-prometheuses in namespace {}", + self.sender.namespace.clone() + ); + + let stack = MonitoringStack { + metadata: ObjectMeta { + name: Some(format!("{}-monitoring", self.sender.namespace.clone()).into()), + namespace: Some(self.sender.namespace.clone()), + labels: Some([("coo".into(), "example".into())].into()), + ..Default::default() + }, + spec: MonitoringStackSpec { + log_level: Some("debug".into()), + retention: Some("1d".into()), + resource_selector: Some(LabelSelector { + match_labels: [("app".into(), "demo".into())].into(), + ..Default::default() + }), + }, + }; + + client + .apply(&stack, Some(&self.sender.namespace.clone())) + .await + .map_err(|e| InterpretError::new(e.to_string()))?; + info!("installed rhob monitoring stack",); + // let prom = Prometheus { + // metadata: ObjectMeta { + // name: Some(self.sender.namespace.clone()), + // labels: Some(std::collections::BTreeMap::from([ + // ("alertmanagerConfig".to_string(), "enabled".to_string()), + // ("client".to_string(), "prometheus".to_string()), + // ])), + // namespace: Some(self.sender.namespace.clone()), + // ..Default::default() + // }, + // spec: prom_spec, + // }; + // client + // .apply(&role, Some(&self.sender.namespace.clone())) + // .await + // .map_err(|e| InterpretError::new(e.to_string()))?; + // info!( + // "installed prometheus role: {:#?} in ns {:#?}", + // role.metadata.name.unwrap(), + // role.metadata.namespace.unwrap() + // ); + // client + // .apply(&rolebinding, 
Some(&self.sender.namespace.clone())) + // .await + // .map_err(|e| InterpretError::new(e.to_string()))?; + // info!( + // "installed prometheus rolebinding: {:#?} in ns {:#?}", + // rolebinding.metadata.name.unwrap(), + // rolebinding.metadata.namespace.unwrap() + // ); + // client + // .apply(&sa, Some(&self.sender.namespace.clone())) + // .await + // .map_err(|e| InterpretError::new(e.to_string()))?; + // info!( + // "installed prometheus service account: {:#?} in ns {:#?}", + // sa.metadata.name.unwrap(), + // sa.metadata.namespace.unwrap() + // ); + // client + // .apply(&prom, Some(&self.sender.namespace.clone())) + // .await + // .map_err(|e| InterpretError::new(e.to_string()))?; + // info!( + // "installed prometheus: {:#?} in ns {:#?}", + // &prom.metadata.name.clone().unwrap(), + // &prom.metadata.namespace.clone().unwrap() + // ); + // + Ok(Outcome::success(format!( + "successfully deployed rhob-prometheus {:#?}", + stack + ))) + } + + async fn install_alert_manager( + &self, + client: &Arc, + ) -> Result { + let am = Alertmanager { + metadata: ObjectMeta { + name: Some(self.sender.namespace.clone()), + labels: Some(std::collections::BTreeMap::from([( + "alertmanagerConfig".to_string(), + "enabled".to_string(), + )])), + namespace: Some(self.sender.namespace.clone()), + ..Default::default() + }, + spec: AlertmanagerSpec::default(), + }; + client + .apply(&am, Some(&self.sender.namespace.clone())) + .await + .map_err(|e| InterpretError::new(e.to_string()))?; + Ok(Outcome::success(format!( + "successfully deployed service monitor {:#?}", + am.metadata.name + ))) + } + async fn install_monitors( + &self, + mut monitors: Vec, + client: &Arc, + ) -> Result { + let default_service_monitor = ServiceMonitor { + metadata: ObjectMeta { + name: Some(self.sender.namespace.clone()), + labels: Some(std::collections::BTreeMap::from([ + ("alertmanagerConfig".to_string(), "enabled".to_string()), + ("client".to_string(), "prometheus".to_string()), + ( + "app.kubernetes.io/name".to_string(), + "kube-state-metrics".to_string(), + ), + ])), + namespace: Some(self.sender.namespace.clone()), + ..Default::default() + }, + spec: ServiceMonitorSpec::default(), + }; + monitors.push(default_service_monitor); + for monitor in monitors.iter() { + client + .apply(monitor, Some(&self.sender.namespace.clone())) + .await + .map_err(|e| InterpretError::new(e.to_string()))?; + } + Ok(Outcome::success( + "succesfully deployed service monitors".to_string(), + )) + } + + async fn install_rules( + &self, + #[allow(clippy::ptr_arg)] rules: &Vec, + client: &Arc, + ) -> Result { + let mut prom_rule_spec = PrometheusRuleSpec { + groups: rules.clone(), + }; + + let default_rules_group = RuleGroup { + name: "default-rules".to_string(), + rules: crate::modules::monitoring::kube_prometheus::crd::rhob_default_rules::build_default_application_rules(), + }; + + prom_rule_spec.groups.push(default_rules_group); + let prom_rules = PrometheusRule { + metadata: ObjectMeta { + name: Some(self.sender.namespace.clone()), + labels: Some(std::collections::BTreeMap::from([ + ("alertmanagerConfig".to_string(), "enabled".to_string()), + ("role".to_string(), "prometheus-rule".to_string()), + ])), + namespace: Some(self.sender.namespace.clone()), + ..Default::default() + }, + spec: prom_rule_spec, + }; + client + .apply(&prom_rules, Some(&self.sender.namespace.clone())) + .await + .map_err(|e| InterpretError::new(e.to_string()))?; + Ok(Outcome::success(format!( + "successfully deployed rules {:#?}", + prom_rules.metadata.name + ))) + } + + 
+    async fn install_client_kube_metrics(&self) -> Result<Outcome, InterpretError> {
+        self.install_chart(
+            "oci://hub.nationtech.io/harmony".to_string(),
+            "nt-kube-metrics".to_string(),
+        )
+        .await?;
+        Ok(Outcome::success(format!(
+            "Installed client kube metrics in ns {}",
+            &self.sender.namespace.clone()
+        )))
+    }
+
+    async fn install_grafana(&self, client: &Arc<K8sClient>) -> Result<Outcome, InterpretError> {
+        let mut label = BTreeMap::new();
+        label.insert("dashboards".to_string(), "grafana".to_string());
+        let labels = LabelSelector {
+            match_labels: label.clone(),
+            match_expressions: vec![],
+        };
+        let mut json_data = BTreeMap::new();
+        json_data.insert("timeInterval".to_string(), "5s".to_string());
+        let namespace = self.sender.namespace.clone();
+
+        let json = build_default_dashboard(&namespace);
+
+        let graf_data_source = GrafanaDatasource {
+            metadata: ObjectMeta {
+                name: Some(format!(
+                    "grafana-datasource-{}",
+                    self.sender.namespace.clone()
+                )),
+                namespace: Some(self.sender.namespace.clone()),
+                ..Default::default()
+            },
+            spec: GrafanaDatasourceSpec {
+                instance_selector: labels.clone(),
+                allow_cross_namespace_import: Some(false),
+                datasource: GrafanaDatasourceConfig {
+                    access: "proxy".to_string(),
+                    database: Some("prometheus".to_string()),
+                    json_data: Some(json_data),
+                    // this is fragile
+                    name: format!("prometheus-{}-0", self.sender.namespace.clone()),
+                    r#type: "prometheus".to_string(),
+                    url: format!(
+                        "http://prometheus-operated.{}.svc.cluster.local:9090",
+                        self.sender.namespace.clone()
+                    ),
+                },
+            },
+        };
+
+        client
+            .apply(&graf_data_source, Some(&self.sender.namespace.clone()))
+            .await
+            .map_err(|e| InterpretError::new(e.to_string()))?;
+
+        let graf_dashboard = GrafanaDashboard {
+            metadata: ObjectMeta {
+                name: Some(format!(
+                    "grafana-dashboard-{}",
+                    self.sender.namespace.clone()
+                )),
+                namespace: Some(self.sender.namespace.clone()),
+                ..Default::default()
+            },
+            spec: GrafanaDashboardSpec {
+                resync_period: Some("30s".to_string()),
+                instance_selector: labels.clone(),
+                json,
+            },
+        };
+
+        client
+            .apply(&graf_dashboard, Some(&self.sender.namespace.clone()))
+            .await
+            .map_err(|e| InterpretError::new(e.to_string()))?;
+
+        let grafana = Grafana {
+            metadata: ObjectMeta {
+                name: Some(format!("grafana-{}", self.sender.namespace.clone())),
+                namespace: Some(self.sender.namespace.clone()),
+                labels: Some(label.clone()),
+                ..Default::default()
+            },
+            spec: GrafanaSpec {
+                config: None,
+                admin_user: None,
+                admin_password: None,
+                ingress: None,
+                persistence: None,
+                resources: None,
+            },
+        };
+        client
+            .apply(&grafana, Some(&self.sender.namespace.clone()))
+            .await
+            .map_err(|e| InterpretError::new(e.to_string()))?;
+        Ok(Outcome::success(format!(
+            "successfully deployed grafana instance {:#?}",
+            grafana.metadata.name
+        )))
+    }
+
+    async fn install_receivers(
+        &self,
+        sender: &RHOBObservability,
+        receivers: &Vec<Box<dyn AlertReceiver<RHOBObservability>>>,
+    ) -> Result<Outcome, InterpretError> {
+        for receiver in receivers.iter() {
+            receiver.install(sender).await.map_err(|err| {
+                InterpretError::new(format!("failed to install receiver: {}", err))
+            })?;
+        }
+        Ok(Outcome::success("successfully deployed receivers".into()))
+    }
+}
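
Usage note: the MonitoringStack applied by install_prometheus only scrapes workloads
whose labels match its resource_selector. A minimal sketch of the equivalent CR built
by hand (the "demo-ns" namespace and the app=demo label are illustrative; the Cluster
Observability Operator reconciles this CR into a namespaced Prometheus and Alertmanager
pair):

    use kube::api::ObjectMeta;

    use harmony::modules::monitoring::kube_prometheus::crd::rhob_monitoring_stack::{
        MonitoringStack, MonitoringStackSpec,
    };
    use harmony::modules::monitoring::kube_prometheus::crd::rhob_prometheuses::LabelSelector;

    fn demo_stack() -> MonitoringStack {
        MonitoringStack {
            metadata: ObjectMeta {
                name: Some("demo-ns-monitoring".into()),
                namespace: Some("demo-ns".into()),
                ..Default::default()
            },
            spec: MonitoringStackSpec {
                log_level: Some("info".into()),
                retention: Some("1d".into()),
                // Only workloads labelled app=demo are scraped by this stack.
                resource_selector: Some(LabelSelector {
                    match_labels: [("app".into(), "demo".into())].into(),
                    ..Default::default()
                }),
            },
        }
    }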