diff --git a/Cargo.lock b/Cargo.lock index 529714b..71a1a70 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -96,6 +96,12 @@ dependencies = [ "libc", ] +[[package]] +name = "ansi_term" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b3568b48b7cefa6b8ce125f9bb4989e52fbcc29ebea88df04cc7c5f12f70455" + [[package]] name = "anstream" version = "0.6.19" @@ -1259,6 +1265,18 @@ dependencies = [ name = "example" version = "0.0.0" +[[package]] +name = "example-application-monitoring-with-tenant" +version = "0.1.0" +dependencies = [ + "env_logger", + "harmony", + "harmony_cli", + "logging", + "tokio", + "url", +] + [[package]] name = "example-cli" version = "0.1.0" @@ -1779,6 +1797,7 @@ dependencies = [ "k3d-rs", "k8s-openapi", "kube", + "kube-derive", "lazy_static", "libredfish", "log", @@ -1791,6 +1810,7 @@ dependencies = [ "reqwest 0.11.27", "russh", "rust-ipmi", + "schemars 0.8.22", "semver", "serde", "serde-value", @@ -2669,6 +2689,7 @@ dependencies = [ "k8s-openapi", "kube-client", "kube-core", + "kube-derive", "kube-runtime", ] @@ -2722,12 +2743,27 @@ dependencies = [ "http 1.3.1", "json-patch", "k8s-openapi", + "schemars 0.8.22", "serde", "serde-value", "serde_json", "thiserror 2.0.12", ] +[[package]] +name = "kube-derive" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "079fc8c1c397538628309cfdee20696ebdcc26745f9fb17f89b78782205bd995" +dependencies = [ + "darling", + "proc-macro2", + "quote", + "serde", + "serde_json", + "syn", +] + [[package]] name = "kube-runtime" version = "1.1.0" @@ -2843,6 +2879,15 @@ dependencies = [ "log", ] +[[package]] +name = "logging" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "461a8beca676e8ab1bd468c92e9b4436d6368e11e96ae038209e520cfe665e46" +dependencies = [ + "ansi_term", +] + [[package]] name = "lru" version = "0.12.5" @@ -4140,6 +4185,18 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "schemars" +version = "0.8.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fbf2ae1b8bc8e02df939598064d22402220cd5bbcca1c76f7d6a310974d5615" +dependencies = [ + "dyn-clone", + "schemars_derive", + "serde", + "serde_json", +] + [[package]] name = "schemars" version = "0.9.0" @@ -4154,9 +4211,9 @@ dependencies = [ [[package]] name = "schemars" -version = "1.0.3" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1375ba8ef45a6f15d83fa8748f1079428295d403d6ea991d09ab100155fbc06d" +checksum = "82d20c4491bc164fa2f6c5d44565947a52ad80b9505d8e36f8d54c27c739fcd0" dependencies = [ "dyn-clone", "ref-cast", @@ -4164,6 +4221,18 @@ dependencies = [ "serde_json", ] +[[package]] +name = "schemars_derive" +version = "0.8.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32e265784ad618884abaea0600a9adf15393368d840e0222d101a072f3f7534d" +dependencies = [ + "proc-macro2", + "quote", + "serde_derive_internals", + "syn", +] + [[package]] name = "scopeguard" version = "1.2.0" @@ -4296,6 +4365,17 @@ dependencies = [ "syn", ] +[[package]] +name = "serde_derive_internals" +version = "0.29.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "serde_json" version = "1.0.140" @@ -4374,7 +4454,7 @@ dependencies = [ "indexmap 1.9.3", "indexmap 2.10.0", "schemars 
0.9.0", - "schemars 1.0.3", + "schemars 1.0.4", "serde", "serde_derive", "serde_json", diff --git a/examples/application_monitoring_with_tenant/Cargo.toml b/examples/application_monitoring_with_tenant/Cargo.toml new file mode 100644 index 0000000..b9b63de --- /dev/null +++ b/examples/application_monitoring_with_tenant/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "example-application-monitoring-with-tenant" +edition = "2024" +version.workspace = true +readme.workspace = true +license.workspace = true + +[dependencies] +env_logger.workspace = true +harmony = { version = "0.1.0", path = "../../harmony" } +harmony_cli = { version = "0.1.0", path = "../../harmony_cli" } +logging = "0.1.0" +tokio.workspace = true +url.workspace = true diff --git a/examples/application_monitoring_with_tenant/src/main.rs b/examples/application_monitoring_with_tenant/src/main.rs new file mode 100644 index 0000000..2f10b36 --- /dev/null +++ b/examples/application_monitoring_with_tenant/src/main.rs @@ -0,0 +1,61 @@ +use std::{path::PathBuf, sync::Arc}; + +use harmony::{ + data::Id, + inventory::Inventory, + maestro::Maestro, + modules::{ + application::{ + ApplicationScore, RustWebFramework, RustWebapp, + features::{ContinuousDelivery, Monitoring}, + }, + monitoring::alert_channel::{ + discord_alert_channel::DiscordWebhook, webhook_receiver::WebhookReceiver, + }, + tenant::TenantScore, + }, + topology::{K8sAnywhereTopology, Url, tenant::TenantConfig}, +}; + +#[tokio::main] +async fn main() { + //TODO there is a bug where the application is deployed into the namespace matching the + //application name and the tenant is created in the namesapce matching the tenant name + //in order for the application to be deployed in the tenant namespace the application.name and + //the TenantConfig.name must match + let tenant = TenantScore { + config: TenantConfig { + id: Id::from_str("test-tenant-id"), + name: "example-monitoring".to_string(), + ..Default::default() + }, + }; + let application = Arc::new(RustWebapp { + name: "example-monitoring".to_string(), + domain: Url::Url(url::Url::parse("https://rustapp.harmony.example.com").unwrap()), + project_root: PathBuf::from("./examples/rust/webapp"), + framework: Some(RustWebFramework::Leptos), + }); + + let webhook_receiver = WebhookReceiver { + name: "sample-webhook-receiver".to_string(), + url: Url::Url(url::Url::parse("https://webhook-doesnt-exist.com").unwrap()), + }; + + let app = ApplicationScore { + features: vec![Box::new(Monitoring { + alert_receiver: vec![Box::new(webhook_receiver)], + application: application.clone(), + })], + application, + }; + + harmony_cli::run( + Inventory::autoload(), + K8sAnywhereTopology::from_env(), + vec![Box::new(tenant), Box::new(app)], + None, + ) + .await + .unwrap(); +} diff --git a/examples/monitoring/src/main.rs b/examples/monitoring/src/main.rs index d59a0ef..b0a3939 100644 --- a/examples/monitoring/src/main.rs +++ b/examples/monitoring/src/main.rs @@ -50,8 +50,8 @@ async fn main() { let service_monitor_endpoint = ServiceMonitorEndpoint { port: Some("80".to_string()), - path: "/metrics".to_string(), - scheme: HTTPScheme::HTTP, + path: Some("/metrics".to_string()), + scheme: Some(HTTPScheme::HTTP), ..Default::default() }; diff --git a/examples/monitoring_with_tenant/src/main.rs b/examples/monitoring_with_tenant/src/main.rs index d234682..baa8cd5 100644 --- a/examples/monitoring_with_tenant/src/main.rs +++ b/examples/monitoring_with_tenant/src/main.rs @@ -53,8 +53,8 @@ async fn main() { let service_monitor_endpoint = 
ServiceMonitorEndpoint { port: Some("80".to_string()), - path: "/metrics".to_string(), - scheme: HTTPScheme::HTTP, + path: Some("/metrics".to_string()), + scheme: Some(HTTPScheme::HTTP), ..Default::default() }; diff --git a/examples/rust/src/main.rs b/examples/rust/src/main.rs index 2747a3d..feb92ef 100644 --- a/examples/rust/src/main.rs +++ b/examples/rust/src/main.rs @@ -2,9 +2,14 @@ use std::{path::PathBuf, sync::Arc}; use harmony::{ inventory::Inventory, - modules::application::{ - ApplicationScore, RustWebFramework, RustWebapp, - features::{ContinuousDelivery, Monitoring}, + modules::{ + application::{ + ApplicationScore, RustWebFramework, RustWebapp, + features::{ContinuousDelivery, Monitoring}, + }, + monitoring::alert_channel::{ + discord_alert_channel::DiscordWebhook, webhook_receiver::WebhookReceiver, + }, }, topology::{K8sAnywhereTopology, Url}, }; @@ -18,6 +23,16 @@ async fn main() { framework: Some(RustWebFramework::Leptos), }); + let discord_receiver = DiscordWebhook { + name: "test-discord".to_string(), + url: Url::Url(url::Url::parse("https://discord.doesnt.exist.com").unwrap()), + }; + + let webhook_receiver = WebhookReceiver { + name: "sample-webhook-receiver".to_string(), + url: Url::Url(url::Url::parse("https://webhook-doesnt-exist.com").unwrap()), + }; + let app = ApplicationScore { features: vec![ Box::new(ContinuousDelivery { @@ -25,7 +40,9 @@ async fn main() { }), Box::new(Monitoring { application: application.clone(), - }), // TODO: add backups, multisite ha, etc. + alert_receiver: vec![Box::new(discord_receiver), Box::new(webhook_receiver)], + }), + // TODO add backups, multisite ha, etc ], application, }; diff --git a/harmony/Cargo.toml b/harmony/Cargo.toml index e4686ef..3d53bba 100644 --- a/harmony/Cargo.toml +++ b/harmony/Cargo.toml @@ -27,7 +27,7 @@ harmony_macros = { path = "../harmony_macros" } harmony_types = { path = "../harmony_types" } uuid.workspace = true url.workspace = true -kube.workspace = true +kube = { workspace = true, features = ["derive"] } k8s-openapi.workspace = true serde_yaml.workspace = true http.workspace = true @@ -58,6 +58,8 @@ tokio-util = "0.7.15" strum = { version = "0.27.1", features = ["derive"] } tempfile = "3.20.0" serde_with = "3.14.0" +schemars = "0.8.22" +kube-derive = "1.1.0" bollard.workspace = true tar.workspace = true base64.workspace = true diff --git a/harmony/src/domain/topology/k8s.rs b/harmony/src/domain/topology/k8s.rs index 6ab249a..0bf4ead 100644 --- a/harmony/src/domain/topology/k8s.rs +++ b/harmony/src/domain/topology/k8s.rs @@ -17,7 +17,7 @@ use kube::{ runtime::wait::await_condition, }; use log::{debug, error, trace}; -use serde::de::DeserializeOwned; +use serde::{Serialize, de::DeserializeOwned}; use similar::{DiffableStr, TextDiff}; #[derive(new, Clone)] @@ -25,6 +25,15 @@ pub struct K8sClient { client: Client, } +impl Serialize for K8sClient { + fn serialize(&self, _serializer: S) -> Result + where + S: serde::Serializer, + { + todo!() + } +} + impl std::fmt::Debug for K8sClient { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { // This is a poor man's debug implementation for now as kube::Client does not provide much diff --git a/harmony/src/domain/topology/k8s_anywhere.rs b/harmony/src/domain/topology/k8s_anywhere.rs index 6719eac..1f95204 100644 --- a/harmony/src/domain/topology/k8s_anywhere.rs +++ b/harmony/src/domain/topology/k8s_anywhere.rs @@ -7,22 +7,33 @@ use tokio::sync::OnceCell; use crate::{ executors::ExecutorError, - interpret::{InterpretError, Outcome}, + 
interpret::{InterpretError, InterpretStatus, Outcome}, inventory::Inventory, - modules::k3d::K3DInstallationScore, + modules::{ + k3d::K3DInstallationScore, + monitoring::kube_prometheus::crd::{ + crd_alertmanager_config::CRDPrometheus, + prometheus_operator::prometheus_operator_helm_chart_score, + }, + prometheus::{ + k8s_prometheus_alerting_score::K8sPrometheusCRDAlertingScore, + prometheus::PrometheusApplicationMonitoring, + }, + }, score::Score, }; use super::{ DeploymentTarget, HelmCommand, K8sclient, MultiTargetTopology, Topology, k8s::K8sClient, + oberservability::monitoring::AlertReceiver, tenant::{TenantConfig, TenantManager, k8s::K8sTenantManager}, }; #[derive(Clone, Debug)] struct K8sState { client: Arc, - _source: K8sSource, + source: K8sSource, message: String, } @@ -56,8 +67,32 @@ impl K8sclient for K8sAnywhereTopology { } } +#[async_trait] +impl PrometheusApplicationMonitoring for K8sAnywhereTopology { + async fn install_prometheus( + &self, + sender: &CRDPrometheus, + inventory: &Inventory, + receivers: Option>>>, + ) -> Result { + let po_result = self.ensure_prometheus_operator(sender).await?; + + if po_result.status == InterpretStatus::NOOP { + debug!("Skipping Prometheus CR installation due to missing operator."); + return Ok(Outcome::noop()); + } + self.get_k8s_prometheus_application_score(sender.clone(), receivers) + .await + .create_interpret() + .execute(inventory, self) + .await?; + + Ok(Outcome::success(format!("No action, working on cluster "))) + } +} + impl Serialize for K8sAnywhereTopology { - fn serialize(&self, serializer: S) -> Result + fn serialize(&self, _serializer: S) -> Result where S: serde::Serializer, { @@ -82,6 +117,19 @@ impl K8sAnywhereTopology { } } + async fn get_k8s_prometheus_application_score( + &self, + sender: CRDPrometheus, + receivers: Option>>>, + ) -> K8sPrometheusCRDAlertingScore { + K8sPrometheusCRDAlertingScore { + sender, + receivers: receivers.unwrap_or_else(Vec::new), + service_monitors: vec![], + prometheus_rules: vec![], + } + } + fn is_helm_available(&self) -> Result<(), String> { let version_result = Command::new("helm") .arg("version") @@ -132,7 +180,7 @@ impl K8sAnywhereTopology { Some(client) => { return Ok(Some(K8sState { client: Arc::new(client), - _source: K8sSource::Kubeconfig, + source: K8sSource::Kubeconfig, message: format!("Loaded k8s client from kubeconfig {kubeconfig}"), })); } @@ -174,7 +222,7 @@ impl K8sAnywhereTopology { let state = match k3d.get_client().await { Ok(client) => K8sState { client: Arc::new(K8sClient::new(client)), - _source: K8sSource::LocalK3d, + source: K8sSource::LocalK3d, message: "K8s client ready".to_string(), }, Err(_) => todo!(), @@ -190,6 +238,7 @@ impl K8sAnywhereTopology { self.tenant_manager .get_or_try_init(async || -> Result { + // TOOD: checker si K8s ou K3d/s tenant manager (ref. 
issue https://git.nationtech.io/NationTech/harmony/issues/94) let k8s_client = self.k8s_client().await?; Ok(K8sTenantManager::new(k8s_client)) }) @@ -206,6 +255,48 @@ impl K8sAnywhereTopology { )), } } + + async fn ensure_prometheus_operator( + &self, + sender: &CRDPrometheus, + ) -> Result { + let status = Command::new("sh") + .args(["-c", "kubectl get crd -A | grep -i prometheuses"]) + .status() + .map_err(|e| InterpretError::new(format!("could not connect to cluster: {}", e)))?; + + if !status.success() { + if let Some(Some(k8s_state)) = self.k8s_state.get() { + match k8s_state.source { + K8sSource::LocalK3d => { + debug!("installing prometheus operator"); + let op_score = + prometheus_operator_helm_chart_score(sender.namespace.clone()); + op_score + .create_interpret() + .execute(&Inventory::empty(), self) + .await?; + return Ok(Outcome::success( + "installed prometheus operator".to_string(), + )); + } + K8sSource::Kubeconfig => { + debug!("unable to install prometheus operator, contact cluster admin"); + return Ok(Outcome::noop()); + } + } + } else { + warn!("Unable to detect k8s_state. Skipping Prometheus Operator install."); + return Ok(Outcome::noop()); + } + } + + debug!("Prometheus operator is already present, skipping install"); + + Ok(Outcome::success( + "prometheus operator present in cluster".to_string(), + )) + } } #[derive(Clone, Debug)] diff --git a/harmony/src/domain/topology/oberservability/monitoring.rs b/harmony/src/domain/topology/oberservability/monitoring.rs index 6d60c7a..7fa6eb4 100644 --- a/harmony/src/domain/topology/oberservability/monitoring.rs +++ b/harmony/src/domain/topology/oberservability/monitoring.rs @@ -1,3 +1,5 @@ +use std::any::Any; + use async_trait::async_trait; use log::debug; @@ -62,7 +64,9 @@ impl, T: Topology> Interpret for AlertingInte #[async_trait] pub trait AlertReceiver: std::fmt::Debug + Send + Sync { async fn install(&self, sender: &S) -> Result; + fn name(&self) -> String; fn clone_box(&self) -> Box>; + fn as_any(&self) -> &dyn Any; } #[async_trait] @@ -72,6 +76,6 @@ pub trait AlertRule: std::fmt::Debug + Send + Sync { } #[async_trait] -pub trait ScrapeTarger { +pub trait ScrapeTarget { async fn install(&self, sender: &S) -> Result<(), InterpretError>; } diff --git a/harmony/src/domain/topology/tenant/k8s.rs b/harmony/src/domain/topology/tenant/k8s.rs index 723c0d9..417e29c 100644 --- a/harmony/src/domain/topology/tenant/k8s.rs +++ b/harmony/src/domain/topology/tenant/k8s.rs @@ -231,8 +231,13 @@ impl K8sTenantManager { { "to": [ { + //TODO this ip is from the docker network that k3d is running on + //since k3d does not deploy kube-api-server as a pod it needs to ahve the ip + //address opened up + //need to find a way to automatically detect the ip address from the docker + //network "ipBlock": { - "cidr": "172.23.0.0/16", + "cidr": "172.24.0.0/16", } } ] diff --git a/harmony/src/modules/application/features/monitoring.rs b/harmony/src/modules/application/features/monitoring.rs index 126718d..4c7632c 100644 --- a/harmony/src/modules/application/features/monitoring.rs +++ b/harmony/src/modules/application/features/monitoring.rs @@ -1,45 +1,60 @@ use std::sync::Arc; -use async_trait::async_trait; -use base64::{Engine as _, engine::general_purpose}; -use log::{debug, info}; +use crate::modules::application::{Application, ApplicationFeature}; +use crate::modules::monitoring::application_monitoring::application_monitoring_score::ApplicationMonitoringScore; +use 
crate::modules::monitoring::kube_prometheus::crd::crd_alertmanager_config::CRDPrometheus; use crate::{ inventory::Inventory, - modules::{ - application::{ApplicationFeature, OCICompliant}, - monitoring::{ - alert_channel::webhook_receiver::WebhookReceiver, - kube_prometheus::{ - helm_prometheus_alert_score::HelmPrometheusAlertingScore, - types::{NamespaceSelector, ServiceMonitor}, - }, - ntfy::ntfy::NtfyScore, - }, + modules::monitoring::{ + alert_channel::webhook_receiver::WebhookReceiver, ntfy::ntfy::NtfyScore, }, score::Score, topology::{HelmCommand, K8sclient, Topology, Url, tenant::TenantManager}, }; +use crate::{ + modules::prometheus::prometheus::PrometheusApplicationMonitoring, + topology::oberservability::monitoring::AlertReceiver, +}; +use async_trait::async_trait; +use base64::{Engine as _, engine::general_purpose}; +use log::{debug, info}; #[derive(Debug, Clone)] pub struct Monitoring { - pub application: Arc, + pub application: Arc, + pub alert_receiver: Vec>>, } #[async_trait] -impl ApplicationFeature - for Monitoring +impl< + T: Topology + + HelmCommand + + 'static + + TenantManager + + K8sclient + + std::fmt::Debug + + PrometheusApplicationMonitoring, +> ApplicationFeature for Monitoring { async fn ensure_installed(&self, topology: &T) -> Result<(), String> { info!("Ensuring monitoring is available for application"); + let namespace = topology + .get_tenant_config() + .await + .map(|ns| ns.name.clone()) + .unwrap_or_else(|| self.application.name()); + let mut alerting_score = ApplicationMonitoringScore { + sender: CRDPrometheus { + namespace: namespace.clone(), + client: topology.k8s_client().await.unwrap(), + }, + application: self.application.clone(), + receivers: self.alert_receiver.clone(), + }; let ntfy = NtfyScore { - // namespace: topology - // .get_tenant_config() - // .await - // .expect("couldn't get tenant config") - // .name, - namespace: self.application.name(), + namespace: namespace.clone(), host: "localhost".to_string(), }; ntfy.create_interpret() @@ -70,7 +85,7 @@ impl Applicatio url::Url::parse( format!( "http://ntfy.{}.svc.cluster.local/rust-web-app?auth={ntfy_default_auth_param}", - self.application.name() + namespace.clone() ) .as_str(), ) @@ -78,26 +93,7 @@ impl Applicatio ), }; - let mut service_monitor = ServiceMonitor::default(); - service_monitor.namespace_selector = Some(NamespaceSelector { - any: true, - match_names: vec![], - }); - - service_monitor.name = "rust-webapp".to_string(); - - // let alerting_score = ApplicationPrometheusMonitoringScore { - // receivers: vec![Box::new(ntfy_receiver)], - // rules: vec![], - // service_monitors: vec![service_monitor], - // }; - - let alerting_score = HelmPrometheusAlertingScore { - receivers: vec![Box::new(ntfy_receiver)], - rules: vec![], - service_monitors: vec![service_monitor], - }; - + alerting_score.receivers.push(Box::new(ntfy_receiver)); alerting_score .create_interpret() .execute(&Inventory::empty(), topology) diff --git a/harmony/src/modules/application/mod.rs b/harmony/src/modules/application/mod.rs index fe98d46..ec00834 100644 --- a/harmony/src/modules/application/mod.rs +++ b/harmony/src/modules/application/mod.rs @@ -10,6 +10,7 @@ pub use oci::*; pub use rust::*; use async_trait::async_trait; +use serde::Serialize; use crate::{ data::{Id, Version}, @@ -78,3 +79,12 @@ impl Interpret for Application todo!() } } + +impl Serialize for dyn Application { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + todo!() + } +} diff --git 
a/harmony/src/modules/monitoring/alert_channel/discord_alert_channel.rs b/harmony/src/modules/monitoring/alert_channel/discord_alert_channel.rs index be8f0e3..1d704a4 100644 --- a/harmony/src/modules/monitoring/alert_channel/discord_alert_channel.rs +++ b/harmony/src/modules/monitoring/alert_channel/discord_alert_channel.rs @@ -1,7 +1,16 @@ +use std::any::Any; +use std::collections::BTreeMap; + use async_trait::async_trait; +use k8s_openapi::api::core::v1::Secret; +use kube::api::ObjectMeta; use serde::Serialize; +use serde_json::json; use serde_yaml::{Mapping, Value}; +use crate::modules::monitoring::kube_prometheus::crd::crd_alertmanager_config::{ + AlertmanagerConfig, AlertmanagerConfigSpec, CRDPrometheus, +}; use crate::{ interpret::{InterpretError, Outcome}, modules::monitoring::{ @@ -20,14 +29,98 @@ pub struct DiscordWebhook { pub url: Url, } +#[async_trait] +impl AlertReceiver for DiscordWebhook { + async fn install(&self, sender: &CRDPrometheus) -> Result { + let ns = sender.namespace.clone(); + let secret_name = format!("{}-secret", self.name.clone()); + let webhook_key = format!("{}", self.url.clone()); + + let mut string_data = BTreeMap::new(); + string_data.insert("webhook-url".to_string(), webhook_key.clone()); + + let secret = Secret { + metadata: kube::core::ObjectMeta { + name: Some(secret_name.clone()), + ..Default::default() + }, + string_data: Some(string_data), + type_: Some("Opaque".to_string()), + ..Default::default() + }; + + let _ = sender.client.apply(&secret, Some(&ns)).await; + + let spec = AlertmanagerConfigSpec { + data: json!({ + "route": { + "receiver": self.name, + }, + "receivers": [ + { + "name": self.name, + "discordConfigs": [ + { + "apiURL": { + "name": secret_name, + "key": "webhook-url", + }, + "title": "{{ template \"discord.default.title\" . }}", + "message": "{{ template \"discord.default.message\" . 
}}" + } + ] + } + ] + }), + }; + + let alertmanager_configs = AlertmanagerConfig { + metadata: ObjectMeta { + name: Some(self.name.clone()), + labels: Some(std::collections::BTreeMap::from([( + "alertmanagerConfig".to_string(), + "enabled".to_string(), + )])), + namespace: Some(ns), + ..Default::default() + }, + spec, + }; + + sender + .client + .apply(&alertmanager_configs, Some(&sender.namespace)) + .await?; + Ok(Outcome::success(format!( + "installed crd-alertmanagerconfigs for {}", + self.name + ))) + } + fn name(&self) -> String { + "discord-webhook".to_string() + } + fn clone_box(&self) -> Box> { + Box::new(self.clone()) + } + fn as_any(&self) -> &dyn Any { + self + } +} + #[async_trait] impl AlertReceiver for DiscordWebhook { async fn install(&self, sender: &Prometheus) -> Result { sender.install_receiver(self).await } + fn name(&self) -> String { + "discord-webhook".to_string() + } fn clone_box(&self) -> Box> { Box::new(self.clone()) } + fn as_any(&self) -> &dyn Any { + self + } } #[async_trait] @@ -48,6 +141,12 @@ impl AlertReceiver for DiscordWebhook { fn clone_box(&self) -> Box> { Box::new(self.clone()) } + fn name(&self) -> String { + "discord-webhook".to_string() + } + fn as_any(&self) -> &dyn Any { + self + } } #[async_trait] diff --git a/harmony/src/modules/monitoring/alert_channel/webhook_receiver.rs b/harmony/src/modules/monitoring/alert_channel/webhook_receiver.rs index f844431..9a9d5d2 100644 --- a/harmony/src/modules/monitoring/alert_channel/webhook_receiver.rs +++ b/harmony/src/modules/monitoring/alert_channel/webhook_receiver.rs @@ -1,11 +1,19 @@ +use std::any::Any; + use async_trait::async_trait; +use kube::api::ObjectMeta; +use log::debug; use serde::Serialize; +use serde_json::json; use serde_yaml::{Mapping, Value}; use crate::{ interpret::{InterpretError, Outcome}, modules::monitoring::{ kube_prometheus::{ + crd::crd_alertmanager_config::{ + AlertmanagerConfig, AlertmanagerConfigSpec, CRDPrometheus, + }, prometheus::{KubePrometheus, KubePrometheusReceiver}, types::{AlertChannelConfig, AlertManagerChannelConfig}, }, @@ -20,14 +28,81 @@ pub struct WebhookReceiver { pub url: Url, } +#[async_trait] +impl AlertReceiver for WebhookReceiver { + async fn install(&self, sender: &CRDPrometheus) -> Result { + let spec = AlertmanagerConfigSpec { + data: json!({ + "route": { + "receiver": self.name, + }, + "receivers": [ + { + "name": self.name, + "webhookConfigs": [ + { + "url": self.url, + } + ] + } + ] + }), + }; + + let alertmanager_configs = AlertmanagerConfig { + metadata: ObjectMeta { + name: Some(self.name.clone()), + labels: Some(std::collections::BTreeMap::from([( + "alertmanagerConfig".to_string(), + "enabled".to_string(), + )])), + namespace: Some(sender.namespace.clone()), + ..Default::default() + }, + spec, + }; + debug!( + "alert manager configs: \n{:#?}", + alertmanager_configs.clone() + ); + + sender + .client + .apply(&alertmanager_configs, Some(&sender.namespace)) + .await?; + Ok(Outcome::success(format!( + "installed crd-alertmanagerconfigs for {}", + self.name + ))) + } + + fn name(&self) -> String { + "webhook-receiver".to_string() + } + + fn clone_box(&self) -> Box> { + Box::new(self.clone()) + } + + fn as_any(&self) -> &dyn Any { + self + } +} + #[async_trait] impl AlertReceiver for WebhookReceiver { async fn install(&self, sender: &Prometheus) -> Result { sender.install_receiver(self).await } + fn name(&self) -> String { + "webhook-receiver".to_string() + } fn clone_box(&self) -> Box> { Box::new(self.clone()) } + fn as_any(&self) -> &dyn Any { + self + 
} } #[async_trait] @@ -44,9 +119,15 @@ impl AlertReceiver for WebhookReceiver { async fn install(&self, sender: &KubePrometheus) -> Result { sender.install_receiver(self).await } + fn name(&self) -> String { + "webhook-receiver".to_string() + } fn clone_box(&self) -> Box> { Box::new(self.clone()) } + fn as_any(&self) -> &dyn Any { + self + } } #[async_trait] diff --git a/harmony/src/modules/monitoring/application_monitoring/application_monitoring_score.rs b/harmony/src/modules/monitoring/application_monitoring/application_monitoring_score.rs new file mode 100644 index 0000000..f888b19 --- /dev/null +++ b/harmony/src/modules/monitoring/application_monitoring/application_monitoring_score.rs @@ -0,0 +1,78 @@ +use std::sync::Arc; + +use async_trait::async_trait; +use serde::Serialize; + +use crate::{ + data::{Id, Version}, + interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome}, + inventory::Inventory, + modules::{ + application::Application, + monitoring::kube_prometheus::crd::crd_alertmanager_config::CRDPrometheus, + prometheus::prometheus::PrometheusApplicationMonitoring, + }, + score::Score, + topology::{Topology, oberservability::monitoring::AlertReceiver}, +}; + +#[derive(Debug, Clone, Serialize)] +pub struct ApplicationMonitoringScore { + pub sender: CRDPrometheus, + pub application: Arc, + pub receivers: Vec>>, +} + +impl> Score + for ApplicationMonitoringScore +{ + fn create_interpret(&self) -> Box> { + Box::new(ApplicationMonitoringInterpret { + score: self.clone(), + }) + } + + fn name(&self) -> String { + "ApplicationMonitoringScore".to_string() + } +} + +#[derive(Debug)] +pub struct ApplicationMonitoringInterpret { + score: ApplicationMonitoringScore, +} + +#[async_trait] +impl> Interpret + for ApplicationMonitoringInterpret +{ + async fn execute( + &self, + inventory: &Inventory, + topology: &T, + ) -> Result { + topology + .install_prometheus( + &self.score.sender, + inventory, + Some(self.score.receivers.clone()), + ) + .await + } + + fn get_name(&self) -> InterpretName { + todo!() + } + + fn get_version(&self) -> Version { + todo!() + } + + fn get_status(&self) -> InterpretStatus { + todo!() + } + + fn get_children(&self) -> Vec { + todo!() + } +} diff --git a/harmony/src/modules/monitoring/application_monitoring/k8s_application_monitoring_score.rs b/harmony/src/modules/monitoring/application_monitoring/k8s_application_monitoring_score.rs deleted file mode 100644 index f4a6c1b..0000000 --- a/harmony/src/modules/monitoring/application_monitoring/k8s_application_monitoring_score.rs +++ /dev/null @@ -1,44 +0,0 @@ -use std::sync::{Arc, Mutex}; - -use serde::Serialize; - -use crate::{ - modules::monitoring::{ - kube_prometheus::types::ServiceMonitor, - prometheus::{prometheus::Prometheus, prometheus_config::PrometheusConfig}, - }, - score::Score, - topology::{ - HelmCommand, Topology, - oberservability::monitoring::{AlertReceiver, AlertRule, AlertingInterpret}, - tenant::TenantManager, - }, -}; - -#[derive(Clone, Debug, Serialize)] -pub struct ApplicationPrometheusMonitoringScore { - pub receivers: Vec>>, - pub rules: Vec>>, - pub service_monitors: Vec, -} - -impl Score for ApplicationPrometheusMonitoringScore { - fn create_interpret(&self) -> Box> { - let mut prom_config = PrometheusConfig::new(); - prom_config.alert_manager = true; - - let config = Arc::new(Mutex::new(prom_config)); - config - .try_lock() - .expect("couldn't lock config") - .additional_service_monitors = self.service_monitors.clone(); - Box::new(AlertingInterpret { - sender: 
Prometheus::new(), - receivers: self.receivers.clone(), - rules: self.rules.clone(), - }) - } - fn name(&self) -> String { - "ApplicationPrometheusMonitoringScore".to_string() - } -} diff --git a/harmony/src/modules/monitoring/application_monitoring/mod.rs b/harmony/src/modules/monitoring/application_monitoring/mod.rs index d9a313b..c243cd7 100644 --- a/harmony/src/modules/monitoring/application_monitoring/mod.rs +++ b/harmony/src/modules/monitoring/application_monitoring/mod.rs @@ -1 +1 @@ -pub mod k8s_application_monitoring_score; +pub mod application_monitoring_score; diff --git a/harmony/src/modules/monitoring/kube_prometheus/crd/crd_alertmanager_config.rs b/harmony/src/modules/monitoring/kube_prometheus/crd/crd_alertmanager_config.rs new file mode 100644 index 0000000..2165a4a --- /dev/null +++ b/harmony/src/modules/monitoring/kube_prometheus/crd/crd_alertmanager_config.rs @@ -0,0 +1,50 @@ +use std::sync::Arc; + +use kube::CustomResource; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; + +use crate::topology::{ + k8s::K8sClient, + oberservability::monitoring::{AlertReceiver, AlertSender}, +}; + +#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)] +#[kube( + group = "monitoring.coreos.com", + version = "v1alpha1", + kind = "AlertmanagerConfig", + plural = "alertmanagerconfigs", + namespaced +)] +pub struct AlertmanagerConfigSpec { + #[serde(flatten)] + pub data: serde_json::Value, +} + +#[derive(Debug, Clone, Serialize)] +pub struct CRDPrometheus { + pub namespace: String, + pub client: Arc, +} + +impl AlertSender for CRDPrometheus { + fn name(&self) -> String { + "CRDAlertManager".to_string() + } +} + +impl Clone for Box> { + fn clone(&self) -> Self { + self.clone_box() + } +} + +impl Serialize for Box> { + fn serialize(&self, _serializer: S) -> Result + where + S: serde::Serializer, + { + todo!() + } +} diff --git a/harmony/src/modules/monitoring/kube_prometheus/crd/crd_alertmanagers.rs b/harmony/src/modules/monitoring/kube_prometheus/crd/crd_alertmanagers.rs new file mode 100644 index 0000000..637490d --- /dev/null +++ b/harmony/src/modules/monitoring/kube_prometheus/crd/crd_alertmanagers.rs @@ -0,0 +1,53 @@ +use kube::CustomResource; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; +use std::collections::BTreeMap; + +use super::crd_prometheuses::LabelSelector; + +/// Rust CRD for `Alertmanager` from Prometheus Operator +#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)] +#[kube( + group = "monitoring.coreos.com", + version = "v1", + kind = "Alertmanager", + plural = "alertmanagers", + namespaced +)] +#[serde(rename_all = "camelCase")] +pub struct AlertmanagerSpec { + /// Number of replicas for HA + pub replicas: i32, + + /// Selectors for AlertmanagerConfig CRDs + #[serde(default, skip_serializing_if = "Option::is_none")] + pub alertmanager_config_selector: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub alertmanager_config_namespace_selector: Option, + + /// Optional pod template metadata (annotations, labels) + #[serde(default, skip_serializing_if = "Option::is_none")] + pub pod_metadata: Option, + + /// Optional topology spread settings + #[serde(default, skip_serializing_if = "Option::is_none")] + pub version: Option, +} + +impl Default for AlertmanagerSpec { + fn default() -> Self { + AlertmanagerSpec { + replicas: 1, + + // Match all AlertmanagerConfigs in the same namespace + alertmanager_config_namespace_selector: None, + + // Empty selector matches all 
AlertmanagerConfigs in that namespace + alertmanager_config_selector: Some(LabelSelector::default()), + + pod_metadata: None, + version: None, + } + } +} diff --git a/harmony/src/modules/monitoring/kube_prometheus/crd/crd_default_rules.rs b/harmony/src/modules/monitoring/kube_prometheus/crd/crd_default_rules.rs new file mode 100644 index 0000000..a245a86 --- /dev/null +++ b/harmony/src/modules/monitoring/kube_prometheus/crd/crd_default_rules.rs @@ -0,0 +1,30 @@ +use std::collections::BTreeMap; + +use crate::modules::{ + monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule, + prometheus::alerts::k8s::{ + deployment::alert_deployment_unavailable, + pod::{alert_container_restarting, alert_pod_not_ready, pod_failed}, + pvc::high_pvc_fill_rate_over_two_days, + service::alert_service_down, + }, +}; + +use super::crd_prometheus_rules::Rule; + +pub fn build_default_application_rules() -> Vec { + let pod_failed: Rule = pod_failed().into(); + let container_restarting: Rule = alert_container_restarting().into(); + let pod_not_ready: Rule = alert_pod_not_ready().into(); + let service_down: Rule = alert_service_down().into(); + let deployment_unavailable: Rule = alert_deployment_unavailable().into(); + let high_pvc_fill_rate: Rule = high_pvc_fill_rate_over_two_days().into(); + vec![ + pod_failed, + container_restarting, + pod_not_ready, + service_down, + deployment_unavailable, + high_pvc_fill_rate, + ] +} diff --git a/harmony/src/modules/monitoring/kube_prometheus/crd/crd_grafana.rs b/harmony/src/modules/monitoring/kube_prometheus/crd/crd_grafana.rs new file mode 100644 index 0000000..793f639 --- /dev/null +++ b/harmony/src/modules/monitoring/kube_prometheus/crd/crd_grafana.rs @@ -0,0 +1,153 @@ +use std::collections::BTreeMap; + +use kube::CustomResource; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; + +use super::crd_prometheuses::LabelSelector; + +#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)] +#[kube( + group = "grafana.integreatly.org", + version = "v1beta1", + kind = "Grafana", + plural = "grafanas", + namespaced +)] +#[serde(rename_all = "camelCase")] +pub struct GrafanaSpec { + #[serde(default, skip_serializing_if = "Option::is_none")] + pub config: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub admin_user: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub admin_password: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub ingress: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub persistence: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub resources: Option, +} + +#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct GrafanaConfig { + #[serde(default, skip_serializing_if = "Option::is_none")] + pub log: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub security: Option, +} + +#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct GrafanaLogConfig { + #[serde(default, skip_serializing_if = "Option::is_none")] + pub mode: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub level: Option, +} + +#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct GrafanaSecurityConfig { + #[serde(default, skip_serializing_if = "Option::is_none")] + pub admin_user: Option, + + #[serde(default, 
skip_serializing_if = "Option::is_none")] + pub admin_password: Option, +} + +#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct GrafanaIngress { + #[serde(default, skip_serializing_if = "Option::is_none")] + pub enabled: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub hosts: Option>, +} + +#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct GrafanaPersistence { + #[serde(default, skip_serializing_if = "Option::is_none")] + pub enabled: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub storage_class_name: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub size: Option, +} + +// ------------------------------------------------------------------------------------------------ + +#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)] +#[kube( + group = "grafana.integreatly.org", + version = "v1beta1", + kind = "GrafanaDashboard", + plural = "grafanadashboards", + namespaced +)] +#[serde(rename_all = "camelCase")] +pub struct GrafanaDashboardSpec { + #[serde(default, skip_serializing_if = "Option::is_none")] + pub resync_period: Option, + + pub instance_selector: LabelSelector, + + pub json: String, +} + +// ------------------------------------------------------------------------------------------------ + +#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)] +#[kube( + group = "grafana.integreatly.org", + version = "v1beta1", + kind = "GrafanaDatasource", + plural = "grafanadatasources", + namespaced +)] +#[serde(rename_all = "camelCase")] +pub struct GrafanaDatasourceSpec { + pub instance_selector: LabelSelector, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub allow_cross_namespace_import: Option, + + pub datasource: GrafanaDatasourceConfig, +} + +#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct GrafanaDatasourceConfig { + pub access: String, + pub database: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub json_data: Option>, + pub name: String, + pub r#type: String, + pub url: String, +} + +// ------------------------------------------------------------------------------------------------ + +#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema, Default)] +#[serde(rename_all = "camelCase")] +pub struct ResourceRequirements { + #[serde(default, skip_serializing_if = "BTreeMap::is_empty")] + pub limits: BTreeMap, + + #[serde(default, skip_serializing_if = "BTreeMap::is_empty")] + pub requests: BTreeMap, +} diff --git a/harmony/src/modules/monitoring/kube_prometheus/crd/crd_prometheus_rules.rs b/harmony/src/modules/monitoring/kube_prometheus/crd/crd_prometheus_rules.rs new file mode 100644 index 0000000..c0ee69e --- /dev/null +++ b/harmony/src/modules/monitoring/kube_prometheus/crd/crd_prometheus_rules.rs @@ -0,0 +1,59 @@ +use std::collections::BTreeMap; + +use kube::CustomResource; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; + +use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule; + +use super::crd_default_rules::build_default_application_rules; + +#[derive(CustomResource, Debug, Serialize, Deserialize, Clone, JsonSchema)] +#[kube( + group = "monitoring.coreos.com", + version = "v1", + kind = "PrometheusRule", + plural = "prometheusrules", + namespaced +)] +#[serde(rename_all = "camelCase")] +pub 
struct PrometheusRuleSpec { + pub groups: Vec, +} + +#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] +pub struct RuleGroup { + pub name: String, + pub rules: Vec, +} + +#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct Rule { + #[serde(default, skip_serializing_if = "Option::is_none")] + pub alert: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub expr: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub for_: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub labels: Option>, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub annotations: Option>, +} + +impl From for Rule { + fn from(value: PrometheusAlertRule) -> Self { + Rule { + alert: Some(value.alert), + expr: Some(value.expr), + for_: value.r#for, + labels: Some(value.labels.into_iter().collect::>()), + annotations: Some(value.annotations.into_iter().collect::>()), + } + } +} diff --git a/harmony/src/modules/monitoring/kube_prometheus/crd/crd_prometheuses.rs b/harmony/src/modules/monitoring/kube_prometheus/crd/crd_prometheuses.rs new file mode 100644 index 0000000..90b2e8c --- /dev/null +++ b/harmony/src/modules/monitoring/kube_prometheus/crd/crd_prometheuses.rs @@ -0,0 +1,118 @@ +use std::collections::BTreeMap; + +use kube::CustomResource; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; + +use crate::modules::monitoring::kube_prometheus::types::Operator; + +#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)] +#[kube( + group = "monitoring.coreos.com", + version = "v1", + kind = "Prometheus", + plural = "prometheuses", + namespaced +)] +#[serde(rename_all = "camelCase")] +pub struct PrometheusSpec { + #[serde(default, skip_serializing_if = "Option::is_none")] + pub alerting: Option, + + pub service_account_name: String, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub service_monitor_namespace_selector: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub service_monitor_selector: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub service_discovery_role: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub pod_monitor_selector: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub rule_selector: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub rule_namespace_selector: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, Default)] +#[serde(rename_all = "camelCase")] +pub struct NamespaceSelector { + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub match_names: Vec, +} + +/// Contains alerting configuration, specifically Alertmanager endpoints. +#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, Default)] +pub struct PrometheusSpecAlerting { + #[serde(skip_serializing_if = "Option::is_none")] + pub alertmanagers: Option>, +} + +/// Represents an Alertmanager endpoint configuration used by Prometheus. +#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, Default)] +pub struct AlertmanagerEndpoints { + /// Name of the Alertmanager Service. + #[serde(skip_serializing_if = "Option::is_none")] + pub name: Option, + + /// Namespace of the Alertmanager Service. + #[serde(skip_serializing_if = "Option::is_none")] + pub namespace: Option, + + /// Port to access on the Alertmanager Service (e.g. "web"). 
+ #[serde(skip_serializing_if = "Option::is_none")] + pub port: Option, + + /// Scheme to use for connecting (e.g. "http"). + #[serde(skip_serializing_if = "Option::is_none")] + pub scheme: Option, + // Other fields like `tls_config`, `path_prefix`, etc., can be added if needed. +} + +#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, Default)] +#[serde(rename_all = "camelCase")] +pub struct LabelSelector { + #[serde(default, skip_serializing_if = "BTreeMap::is_empty")] + pub match_labels: BTreeMap, + + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub match_expressions: Vec, +} + +#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct LabelSelectorRequirement { + pub key: String, + pub operator: Operator, + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub values: Vec, +} + +impl Default for PrometheusSpec { + fn default() -> Self { + PrometheusSpec { + alerting: None, + + service_account_name: "prometheus".into(), + + // null means "only my namespace" + service_monitor_namespace_selector: None, + + // empty selector means match all ServiceMonitors in that namespace + service_monitor_selector: Some(LabelSelector::default()), + + service_discovery_role: Some("Endpoints".into()), + + pod_monitor_selector: None, + + rule_selector: None, + + rule_namespace_selector: Some(LabelSelector::default()), + } + } +} diff --git a/harmony/src/modules/monitoring/kube_prometheus/crd/grafana_default_dashboard.rs b/harmony/src/modules/monitoring/kube_prometheus/crd/grafana_default_dashboard.rs new file mode 100644 index 0000000..63fffa9 --- /dev/null +++ b/harmony/src/modules/monitoring/kube_prometheus/crd/grafana_default_dashboard.rs @@ -0,0 +1,203 @@ +pub fn build_default_dashboard(namespace: &str) -> String { + let dashboard = format!( + r#"{{ + "annotations": {{ + "list": [] + }}, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": null, + "iteration": 171105, + "panels": [ + {{ + "datasource": "$datasource", + "fieldConfig": {{ + "defaults": {{ + "unit": "short" + }}, + "overrides": [] + }}, + "gridPos": {{ + "h": 6, + "w": 6, + "x": 0, + "y": 0 + }}, + "id": 1, + "options": {{ + "reduceOptions": {{ + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }} + }}, + "pluginVersion": "9.0.0", + "targets": [ + {{ + "expr": "sum(kube_pod_status_phase{{namespace=\"{namespace}\", phase=\"Running\"}})", + "legendFormat": "", + "refId": "A" + }} + ], + "title": "Pods in Namespace", + "type": "stat" + }}, + {{ + "datasource": "$datasource", + "fieldConfig": {{ + "defaults": {{ + "unit": "short" + }}, + "overrides": [] + }}, + "gridPos": {{ + "h": 6, + "w": 6, + "x": 6, + "y": 0 + }}, + "id": 2, + "options": {{ + "reduceOptions": {{ + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }} + }}, + "pluginVersion": "9.0.0", + "targets": [ + {{ + "expr": "sum(kube_pod_status_phase{{phase=\"Failed\", namespace=\"{namespace}\"}})", + "legendFormat": "", + "refId": "A" + }} + ], + "title": "Pods in Failed State", + "type": "stat" + }}, + {{ + "datasource": "$datasource", + "fieldConfig": {{ + "defaults": {{ + "unit": "percentunit" + }}, + "overrides": [] + }}, + "gridPos": {{ + "h": 6, + "w": 12, + "x": 0, + "y": 6 + }}, + "id": 3, + "options": {{ + "reduceOptions": {{ + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }} + }}, + "pluginVersion": "9.0.0", + "targets": [ + {{ + "expr": "sum(kube_deployment_status_replicas_available{{namespace=\"{namespace}\"}}) / 
sum(kube_deployment_spec_replicas{{namespace=\"{namespace}\"}})", + "legendFormat": "", + "refId": "A" + }} + ], + "title": "Deployment Health (Available / Desired)", + "type": "stat" + }}, + {{ + "datasource": "$datasource", + "fieldConfig": {{ + "defaults": {{ + "unit": "short" + }}, + "overrides": [] + }}, + "gridPos": {{ + "h": 6, + "w": 12, + "x": 0, + "y": 12 + }}, + "id": 4, + "options": {{ + "reduceOptions": {{ + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }} + }}, + "pluginVersion": "9.0.0", + "targets": [ + {{ + "expr": "sum by(pod) (rate(kube_pod_container_status_restarts_total{{namespace=\"{namespace}\"}}[5m]))", + "legendFormat": "{{{{pod}}}}", + "refId": "A" + }} + ], + "title": "Container Restarts (per pod)", + "type": "timeseries" + }}, + {{ + "datasource": "$datasource", + "fieldConfig": {{ + "defaults": {{ + "unit": "short" + }}, + "overrides": [] + }}, + "gridPos": {{ + "h": 6, + "w": 12, + "x": 0, + "y": 18 + }}, + "id": 5, + "options": {{ + "reduceOptions": {{ + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }} + }}, + "pluginVersion": "9.0.0", + "targets": [ + {{ + "expr": "sum(ALERTS{{alertstate=\"firing\", namespace=\"{namespace}\"}}) or vector(0)", + "legendFormat": "", + "refId": "A" + }} + ], + "title": "Firing Alerts in Namespace", + "type": "stat" + }} + ], + "schemaVersion": 36, + "templating": {{ + "list": [ + {{ + "name": "datasource", + "type": "datasource", + "pluginId": "prometheus", + "label": "Prometheus", + "query": "prometheus", + "refresh": 1, + "hide": 0, + "current": {{ + "selected": true, + "text": "Prometheus", + "value": "Prometheus" + }} + }} + ] + }}, + "title": "Tenant Namespace Overview", + "version": 1 +}}"# + ); + dashboard +} diff --git a/harmony/src/modules/monitoring/kube_prometheus/crd/grafana_operator.rs b/harmony/src/modules/monitoring/kube_prometheus/crd/grafana_operator.rs new file mode 100644 index 0000000..ac7c9f5 --- /dev/null +++ b/harmony/src/modules/monitoring/kube_prometheus/crd/grafana_operator.rs @@ -0,0 +1,20 @@ +use std::str::FromStr; + +use non_blank_string_rs::NonBlankString; + +use crate::modules::helm::chart::HelmChartScore; + +pub fn grafana_operator_helm_chart_score(ns: String) -> HelmChartScore { + HelmChartScore { + namespace: Some(NonBlankString::from_str(&ns).unwrap()), + release_name: NonBlankString::from_str("grafana_operator").unwrap(), + chart_name: NonBlankString::from_str("oci://ghcr.io/grafana/helm-charts/grafana-operator") + .unwrap(), + chart_version: None, + values_overrides: None, + values_yaml: None, + create_namespace: true, + install_only: true, + repository: None, + } +} diff --git a/harmony/src/modules/monitoring/kube_prometheus/crd/mod.rs b/harmony/src/modules/monitoring/kube_prometheus/crd/mod.rs new file mode 100644 index 0000000..236a2de --- /dev/null +++ b/harmony/src/modules/monitoring/kube_prometheus/crd/mod.rs @@ -0,0 +1,11 @@ +pub mod crd_alertmanager_config; +pub mod crd_alertmanagers; +pub mod crd_default_rules; +pub mod crd_grafana; +pub mod crd_prometheus_rules; +pub mod crd_prometheuses; +pub mod grafana_default_dashboard; +pub mod grafana_operator; +pub mod prometheus_operator; +pub mod role; +pub mod service_monitor; diff --git a/harmony/src/modules/monitoring/kube_prometheus/crd/prometheus_operator.rs b/harmony/src/modules/monitoring/kube_prometheus/crd/prometheus_operator.rs new file mode 100644 index 0000000..413c254 --- /dev/null +++ b/harmony/src/modules/monitoring/kube_prometheus/crd/prometheus_operator.rs @@ -0,0 +1,22 @@ +use 
std::str::FromStr; + +use non_blank_string_rs::NonBlankString; + +use crate::modules::helm::chart::HelmChartScore; + +pub fn prometheus_operator_helm_chart_score(ns: String) -> HelmChartScore { + HelmChartScore { + namespace: Some(NonBlankString::from_str(&ns).unwrap()), + release_name: NonBlankString::from_str("prometheus-operator").unwrap(), + chart_name: NonBlankString::from_str( + "oci://hub.nationtech.io/harmony/nt-prometheus-operator", + ) + .unwrap(), + chart_version: None, + values_overrides: None, + values_yaml: None, + create_namespace: true, + install_only: true, + repository: None, + } +} diff --git a/harmony/src/modules/monitoring/kube_prometheus/crd/role.rs b/harmony/src/modules/monitoring/kube_prometheus/crd/role.rs new file mode 100644 index 0000000..9add9a9 --- /dev/null +++ b/harmony/src/modules/monitoring/kube_prometheus/crd/role.rs @@ -0,0 +1,62 @@ +use k8s_openapi::api::{ + core::v1::ServiceAccount, + rbac::v1::{PolicyRule, Role, RoleBinding, RoleRef, Subject}, +}; +use kube::api::ObjectMeta; + +pub fn build_prom_role(role_name: String, namespace: String) -> Role { + Role { + metadata: ObjectMeta { + name: Some(role_name), + namespace: Some(namespace), + ..Default::default() + }, + rules: Some(vec![PolicyRule { + api_groups: Some(vec!["".into()]), // core API group + resources: Some(vec!["services".into(), "endpoints".into(), "pods".into()]), + verbs: vec!["get".into(), "list".into(), "watch".into()], + ..Default::default() + }]), + } +} + +pub fn build_prom_rolebinding( + role_name: String, + namespace: String, + service_account_name: String, +) -> RoleBinding { + RoleBinding { + metadata: ObjectMeta { + name: Some(format!("{}-rolebinding", role_name)), + namespace: Some(namespace.clone()), + ..Default::default() + }, + role_ref: RoleRef { + api_group: "rbac.authorization.k8s.io".into(), + kind: "Role".into(), + name: role_name, + }, + subjects: Some(vec![Subject { + kind: "ServiceAccount".into(), + name: service_account_name, + namespace: Some(namespace.clone()), + ..Default::default() + }]), + } +} + +pub fn build_prom_service_account( + service_account_name: String, + namespace: String, +) -> ServiceAccount { + ServiceAccount { + automount_service_account_token: None, + image_pull_secrets: None, + metadata: ObjectMeta { + name: Some(service_account_name), + namespace: Some(namespace), + ..Default::default() + }, + secrets: None, + } +} diff --git a/harmony/src/modules/monitoring/kube_prometheus/crd/service_monitor.rs b/harmony/src/modules/monitoring/kube_prometheus/crd/service_monitor.rs new file mode 100644 index 0000000..7c613e7 --- /dev/null +++ b/harmony/src/modules/monitoring/kube_prometheus/crd/service_monitor.rs @@ -0,0 +1,89 @@ +use std::collections::{BTreeMap, HashMap}; + +use kube::{CustomResource, Resource, api::ObjectMeta}; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; + +use crate::interpret::InterpretError; + +use crate::modules::monitoring::kube_prometheus::types::{ + HTTPScheme, MatchExpression, NamespaceSelector, Operator, Selector, + ServiceMonitor as KubeServiceMonitor, ServiceMonitorEndpoint, +}; + +/// This is the top-level struct for the ServiceMonitor Custom Resource. +/// The `#[derive(CustomResource)]` macro handles all the boilerplate for you, +/// including the `impl Resource`. 
+#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)] +#[kube( + group = "monitoring.coreos.com", + version = "v1", + kind = "ServiceMonitor", + plural = "servicemonitors", + namespaced +)] +#[serde(rename_all = "camelCase")] +pub struct ServiceMonitorSpec { + /// A label selector to select services to monitor. + pub selector: Selector, + + /// A list of endpoints on the selected services to be monitored. + pub endpoints: Vec, + + /// Selector to select which namespaces the Kubernetes Endpoints objects + /// are discovered from. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub namespace_selector: Option, + + /// The label to use to retrieve the job name from. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub job_label: Option, + + /// Pod-based target labels to transfer from the Kubernetes Pod onto the target. + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub pod_target_labels: Vec, + + /// TargetLabels transfers labels on the Kubernetes Service object to the target. + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub target_labels: Vec, +} + +impl Default for ServiceMonitorSpec { + fn default() -> Self { + let mut labels = HashMap::new(); + Self { + selector: Selector { + match_labels: { labels }, + match_expressions: vec![MatchExpression { + key: "app.kubernetes.io/name".into(), + operator: Operator::Exists, + values: vec![], + }], + }, + endpoints: vec![ServiceMonitorEndpoint { + port: Some("http".to_string()), + path: Some("/metrics".into()), + interval: Some("30s".into()), + scheme: Some(HTTPScheme::HTTP), + ..Default::default() + }], + namespace_selector: None, // only the same namespace + job_label: Some("app".into()), + pod_target_labels: vec![], + target_labels: vec![], + } + } +} + +impl From for ServiceMonitorSpec { + fn from(value: KubeServiceMonitor) -> Self { + Self { + selector: value.selector, + endpoints: value.endpoints, + namespace_selector: value.namespace_selector, + job_label: value.job_label, + pod_target_labels: value.pod_target_labels, + target_labels: value.target_labels, + } + } +} diff --git a/harmony/src/modules/monitoring/kube_prometheus/helm/config.rs b/harmony/src/modules/monitoring/kube_prometheus/helm/config.rs index 041e5f0..3c4fa37 100644 --- a/harmony/src/modules/monitoring/kube_prometheus/helm/config.rs +++ b/harmony/src/modules/monitoring/kube_prometheus/helm/config.rs @@ -35,7 +35,7 @@ impl KubePrometheusConfig { windows_monitoring: false, alert_manager: true, grafana: true, - node_exporter: false, + node_exporter: true, prometheus: true, kubernetes_service_monitors: true, kubernetes_api_server: true, diff --git a/harmony/src/modules/monitoring/kube_prometheus/helm/kube_prometheus_helm_chart.rs b/harmony/src/modules/monitoring/kube_prometheus/helm/kube_prometheus_helm_chart.rs index 22b2f7a..51e918c 100644 --- a/harmony/src/modules/monitoring/kube_prometheus/helm/kube_prometheus_helm_chart.rs +++ b/harmony/src/modules/monitoring/kube_prometheus/helm/kube_prometheus_helm_chart.rs @@ -12,8 +12,8 @@ use crate::modules::{ helm::chart::HelmChartScore, monitoring::kube_prometheus::types::{ AlertGroup, AlertManager, AlertManagerAdditionalPromRules, AlertManagerConfig, - AlertManagerRoute, AlertManagerSpec, AlertManagerValues, ConfigReloader, Limits, - PrometheusConfig, Requests, Resources, + AlertManagerConfigSelector, AlertManagerRoute, AlertManagerSpec, AlertManagerValues, + ConfigReloader, Limits, PrometheusConfig, Requests, Resources, }, }; @@ -332,6 +332,11 @@ 
prometheusOperator: .push(receiver.channel_receiver.clone()); } + let mut labels = BTreeMap::new(); + labels.insert("alertmanagerConfig".to_string(), "enabled".to_string()); + let alert_manager_config_selector = AlertManagerConfigSelector { + match_labels: labels, + }; let alert_manager_values = AlertManagerValues { alertmanager: AlertManager { enabled: config.alert_manager, @@ -347,6 +352,8 @@ prometheusOperator: cpu: "100m".to_string(), }, }, + alert_manager_config_selector, + replicas: 2, }, init_config_reloader: ConfigReloader { resources: Resources { diff --git a/harmony/src/modules/monitoring/kube_prometheus/mod.rs b/harmony/src/modules/monitoring/kube_prometheus/mod.rs index 7c8233a..122e939 100644 --- a/harmony/src/modules/monitoring/kube_prometheus/mod.rs +++ b/harmony/src/modules/monitoring/kube_prometheus/mod.rs @@ -1,3 +1,4 @@ +pub mod crd; pub mod helm; pub mod helm_prometheus_alert_score; pub mod prometheus; diff --git a/harmony/src/modules/monitoring/kube_prometheus/types.rs b/harmony/src/modules/monitoring/kube_prometheus/types.rs index 33bfcc3..abe5896 100644 --- a/harmony/src/modules/monitoring/kube_prometheus/types.rs +++ b/harmony/src/modules/monitoring/kube_prometheus/types.rs @@ -1,7 +1,8 @@ use std::collections::{BTreeMap, HashMap}; use async_trait::async_trait; -use serde::Serialize; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; use serde_yaml::{Mapping, Sequence, Value}; use crate::modules::monitoring::alert_rule::prometheus_alert_rule::AlertManagerRuleGroup; @@ -55,6 +56,14 @@ pub struct AlertManagerChannelConfig { #[serde(rename_all = "camelCase")] pub struct AlertManagerSpec { pub(crate) resources: Resources, + pub replicas: u32, + pub alert_manager_config_selector: AlertManagerConfigSelector, +} + +#[derive(Debug, Clone, Serialize)] +#[serde(rename_all = "camelCase")] +pub struct AlertManagerConfigSelector { + pub match_labels: BTreeMap, } #[derive(Debug, Clone, Serialize)] @@ -86,7 +95,7 @@ pub struct AlertGroup { pub groups: Vec, } -#[derive(Debug, Clone, Serialize)] +#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] pub enum HTTPScheme { #[serde(rename = "http")] HTTP, @@ -94,7 +103,7 @@ pub enum HTTPScheme { HTTPS, } -#[derive(Debug, Clone, Serialize)] +#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] pub enum Operator { In, NotIn, @@ -139,74 +148,83 @@ pub struct ServiceMonitorTLSConfig { pub server_name: Option, } -#[derive(Debug, Clone, Serialize)] +#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema, Default)] #[serde(rename_all = "camelCase")] pub struct ServiceMonitorEndpoint { - // ## Name of the endpoint's service port - // ## Mutually exclusive with targetPort + /// Name of the service port this endpoint refers to. pub port: Option, - // ## Name or number of the endpoint's target port - // ## Mutually exclusive with port - pub target_port: Option, - - // ## File containing bearer token to be used when scraping targets - // ## - pub bearer_token_file: Option, - - // ## Interval at which metrics should be scraped - // ## + /// Interval at which metrics should be scraped. + #[serde(default, skip_serializing_if = "Option::is_none")] pub interval: Option, - // ## HTTP path to scrape for metrics - // ## - pub path: String, + /// The HTTP path to scrape for metrics. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub path: Option, - // ## HTTP scheme to use for scraping - // ## - pub scheme: HTTPScheme, + /// HTTP scheme to use for scraping. 
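// Editor's note: switching these endpoint fields to Option with skip_serializing_if
// means a rendered ServiceMonitor endpoint only carries what the caller actually set,
// so the generated manifest stays close to hand-written prometheus-operator examples.
// A typical construction looks like this (values are illustrative):
//
//     ServiceMonitorEndpoint {
//         port: Some("http".to_string()),
//         path: Some("/metrics".to_string()),
//         scheme: Some(HTTPScheme::HTTP),
//         ..Default::default()
//     };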
+ #[serde(default, skip_serializing_if = "Option::is_none")] + pub scheme: Option, - // ## TLS configuration to use when scraping the endpoint - // ## - pub tls_config: Option, + /// Relabelings to apply to samples before scraping. + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub relabelings: Vec, - // ## MetricRelabelConfigs to apply to samples after scraping, but before ingestion. - // ## ref: https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api-reference/api.md#relabelconfig - // ## - // # - action: keep - // # regex: 'kube_(daemonset|deployment|pod|namespace|node|statefulset).+' - // # sourceLabels: [__name__] - pub metric_relabelings: Vec, - - // ## RelabelConfigs to apply to samples before scraping - // ## ref: https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api-reference/api.md#relabelconfig - // ## - // # - sourceLabels: [__meta_kubernetes_pod_node_name] - // # separator: ; - // # regex: ^(.*)$ - // # targetLabel: nodename - // # replacement: $1 - // # action: replace - pub relabelings: Vec, + /// MetricRelabelings to apply to samples after scraping, but before ingestion. + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub metric_relabelings: Vec, } -#[derive(Debug, Clone, Serialize)] +#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct RelabelConfig { + /// The action to perform based on the regex matching. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub action: Option, + + /// A list of labels from which to extract values. + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub source_labels: Vec, + + /// Separator to be used when concatenating source_labels. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub separator: Option, + + /// The label to which the resulting value is written. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub target_label: Option, + + /// A regular expression to match against the concatenated source label values. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub regex: Option, + + /// The replacement value to use. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub replacement: Option, +} + +#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)] #[serde(rename_all = "camelCase")] pub struct MatchExpression { pub key: String, - pub operator: Operator, + pub operator: Operator, // "In", "NotIn", "Exists", "DoesNotExist" + #[serde(default, skip_serializing_if = "Vec::is_empty")] pub values: Vec, } -#[derive(Debug, Clone, Serialize)] +#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema, Default)] #[serde(rename_all = "camelCase")] pub struct Selector { - // # label selector for services + /// A map of key-value pairs to match. + #[serde(default, skip_serializing_if = "HashMap::is_empty")] pub match_labels: HashMap, + + /// A list of label selector requirements. + #[serde(default, skip_serializing_if = "Vec::is_empty")] pub match_expressions: Vec, } -#[derive(Debug, Clone, Serialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] #[serde(rename_all = "camelCase")] pub struct ServiceMonitor { pub name: String, @@ -250,10 +268,15 @@ pub struct ServiceMonitor { pub fallback_scrape_protocol: Option, } -#[derive(Debug, Serialize, Clone)] +#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema, Default)] #[serde(rename_all = "camelCase")] pub struct NamespaceSelector { + /// Select all namespaces. 
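// Editor's note: `any: true` selects every namespace; otherwise discovery is limited to
// the namespaces listed in `match_names`, and a fully default selector keeps discovery
// in the ServiceMonitor's own namespace. A short sketch with assumed values:
//
//     let ns_selector = NamespaceSelector {
//         any: false,
//         match_names: vec!["example-monitoring".to_string()],
//     };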
+ #[serde(default, skip_serializing_if = "std::ops::Not::not")] pub any: bool, + + /// List of namespace names to select from. + #[serde(default, skip_serializing_if = "Vec::is_empty")] pub match_names: Vec, } @@ -275,19 +298,3 @@ impl Default for ServiceMonitor { } } } - -impl Default for ServiceMonitorEndpoint { - fn default() -> Self { - Self { - port: Some("80".to_string()), - target_port: Default::default(), - bearer_token_file: Default::default(), - interval: Default::default(), - path: "/metrics".to_string(), - scheme: HTTPScheme::HTTP, - tls_config: Default::default(), - metric_relabelings: Default::default(), - relabelings: Default::default(), - } - } -} diff --git a/harmony/src/modules/monitoring/ntfy/helm/ntfy_helm_chart.rs b/harmony/src/modules/monitoring/ntfy/helm/ntfy_helm_chart.rs index 076a8a3..d94a78d 100644 --- a/harmony/src/modules/monitoring/ntfy/helm/ntfy_helm_chart.rs +++ b/harmony/src/modules/monitoring/ntfy/helm/ntfy_helm_chart.rs @@ -58,6 +58,7 @@ config: # web-root: "disable" enable-signup: false enable-login: "true" + enable-metrics: "true" persistence: enabled: true diff --git a/harmony/src/modules/prometheus/alerts/k8s/deployment.rs b/harmony/src/modules/prometheus/alerts/k8s/deployment.rs new file mode 100644 index 0000000..6e30f5f --- /dev/null +++ b/harmony/src/modules/prometheus/alerts/k8s/deployment.rs @@ -0,0 +1,23 @@ +use std::collections::HashMap; + +use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule; + +pub fn alert_deployment_unavailable() -> PrometheusAlertRule { + PrometheusAlertRule { + alert: "DeploymentUnavailable".into(), + expr: "kube_deployment_status_replicas_unavailable > 0".into(), + r#for: Some("2m".into()), + labels: HashMap::from([("severity".into(), "warning".into())]), + annotations: HashMap::from([ + ( + "summary".into(), + "Deployment has unavailable replicas".into(), + ), + ( + "description".into(), + "A deployment in this namespace has unavailable replicas for over 2 minutes." 
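// Editor's note: `kube_deployment_status_replicas_unavailable` is a kube-state-metrics
// series (presumably provided here by the nt-kube-metrics chart installed later in this
// change), so this rule only fires once that exporter is scraped. Scoping it to a single
// namespace would be a small tweak, e.g. (hypothetical):
//
//     expr: "kube_deployment_status_replicas_unavailable{namespace=\"example-monitoring\"} > 0".into(),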
+ .into(), + ), + ]), + } +} diff --git a/harmony/src/modules/prometheus/alerts/k8s/memory_usage.rs b/harmony/src/modules/prometheus/alerts/k8s/memory_usage.rs new file mode 100644 index 0000000..11d65c9 --- /dev/null +++ b/harmony/src/modules/prometheus/alerts/k8s/memory_usage.rs @@ -0,0 +1,37 @@ +use std::collections::HashMap; + +use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule; + +pub fn alert_high_memory_usage() -> PrometheusAlertRule { + PrometheusAlertRule { + alert: "HighMemoryUsage".into(), + expr: "container_memory_working_set_bytes{container!=\"\",namespace!=\"\"} > 500000000" + .into(), + r#for: Some("2m".into()), + labels: HashMap::from([("severity".into(), "warning".into())]), + annotations: HashMap::from([ + ("summary".into(), "Pod is using high memory".into()), + ( + "description".into(), + "A pod is consuming more than 500Mi of memory.".into(), + ), + ]), + } +} + +pub fn alert_high_cpu_usage() -> PrometheusAlertRule { + PrometheusAlertRule { + alert: "HighCPUUsage".into(), + expr: "rate(container_cpu_usage_seconds_total{container!=\"\",namespace!=\"\"}[1m]) > 0.9" + .into(), + r#for: Some("1m".into()), + labels: HashMap::from([("severity".into(), "warning".into())]), + annotations: HashMap::from([ + ("summary".into(), "Pod is using high CPU".into()), + ( + "description".into(), + "A pod is using more than 90% of a core over 1 minute.".into(), + ), + ]), + } +} diff --git a/harmony/src/modules/prometheus/alerts/k8s/mod.rs b/harmony/src/modules/prometheus/alerts/k8s/mod.rs index f01a9c8..0e3314b 100644 --- a/harmony/src/modules/prometheus/alerts/k8s/mod.rs +++ b/harmony/src/modules/prometheus/alerts/k8s/mod.rs @@ -1 +1,5 @@ +pub mod deployment; +pub mod memory_usage; +pub mod pod; pub mod pvc; +pub mod service; diff --git a/harmony/src/modules/prometheus/alerts/k8s/pod.rs b/harmony/src/modules/prometheus/alerts/k8s/pod.rs new file mode 100644 index 0000000..152ec2f --- /dev/null +++ b/harmony/src/modules/prometheus/alerts/k8s/pod.rs @@ -0,0 +1,55 @@ +use std::collections::HashMap; + +use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule; + +pub fn pod_failed() -> PrometheusAlertRule { + PrometheusAlertRule { + alert: "PodFailed".into(), + expr: "kube_pod_status_phase{phase=\"Failed\"} > 2".into(), + r#for: Some("2m".into()), + labels: HashMap::from([("severity".into(), "critical".into())]), + annotations: HashMap::from([ + ("summary".into(), "A pod has failed".into()), + ( + "description".into(), + "One or more pods are in Failed phase.".into(), + ), + ]), + } +} + +pub fn alert_container_restarting() -> PrometheusAlertRule { + PrometheusAlertRule { + alert: "ContainerRestarting".into(), + expr: "increase(kube_pod_container_status_restarts_total[5m]) > 3".into(), + r#for: Some("5m".into()), + labels: HashMap::from([("severity".into(), "warning".into())]), + annotations: HashMap::from([ + ( + "summary".into(), + "Container is restarting frequently".into(), + ), + ( + "description".into(), + "A container in this namespace has restarted more than 3 times in 5 minutes." 
+ .into(), + ), + ]), + } +} + +pub fn alert_pod_not_ready() -> PrometheusAlertRule { + PrometheusAlertRule { + alert: "PodNotReady".into(), + expr: "kube_pod_status_ready{condition=\"true\"} == 0".into(), + r#for: Some("2m".into()), + labels: HashMap::from([("severity".into(), "warning".into())]), + annotations: HashMap::from([ + ("summary".into(), "Pod is not ready".into()), + ( + "description".into(), + "A pod in the namespace is not reporting Ready status.".into(), + ), + ]), + } +} diff --git a/harmony/src/modules/prometheus/alerts/k8s/service.rs b/harmony/src/modules/prometheus/alerts/k8s/service.rs new file mode 100644 index 0000000..5a56761 --- /dev/null +++ b/harmony/src/modules/prometheus/alerts/k8s/service.rs @@ -0,0 +1,19 @@ +use std::collections::HashMap; + +use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule; + +pub fn alert_service_down() -> PrometheusAlertRule { + PrometheusAlertRule { + alert: "ServiceDown".into(), + expr: "up == 0".into(), + r#for: Some("1m".into()), + labels: HashMap::from([("severity".into(), "critical".into())]), + annotations: HashMap::from([ + ("summary".into(), "Service is down".into()), + ( + "description".into(), + "A target service in the namespace is not responding to Prometheus scrapes.".into(), + ), + ]), + } +} diff --git a/harmony/src/modules/prometheus/k8s_prometheus_alerting_score.rs b/harmony/src/modules/prometheus/k8s_prometheus_alerting_score.rs new file mode 100644 index 0000000..2aace1f --- /dev/null +++ b/harmony/src/modules/prometheus/k8s_prometheus_alerting_score.rs @@ -0,0 +1,569 @@ +use std::fs; +use std::{collections::BTreeMap, sync::Arc}; +use tempfile::tempdir; + +use async_trait::async_trait; +use kube::api::ObjectMeta; +use log::{debug, info}; +use serde::Serialize; +use std::process::Command; + +use crate::modules::monitoring::kube_prometheus::crd::crd_alertmanager_config::CRDPrometheus; +use crate::modules::monitoring::kube_prometheus::crd::crd_default_rules::build_default_application_rules; +use crate::modules::monitoring::kube_prometheus::crd::crd_grafana::{ + Grafana, GrafanaDashboard, GrafanaDashboardSpec, GrafanaDatasource, GrafanaDatasourceConfig, + GrafanaDatasourceSpec, GrafanaSpec, +}; +use crate::modules::monitoring::kube_prometheus::crd::crd_prometheus_rules::{ + PrometheusRule, PrometheusRuleSpec, RuleGroup, +}; +use crate::modules::monitoring::kube_prometheus::crd::grafana_default_dashboard::build_default_dashboard; +use crate::modules::monitoring::kube_prometheus::crd::service_monitor::{ + ServiceMonitor, ServiceMonitorSpec, +}; +use crate::topology::oberservability::monitoring::AlertReceiver; +use crate::topology::{K8sclient, Topology, k8s::K8sClient}; +use crate::{ + data::{Id, Version}, + interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome}, + inventory::Inventory, + modules::monitoring::kube_prometheus::crd::{ + crd_alertmanagers::{Alertmanager, AlertmanagerSpec}, + crd_prometheuses::{ + AlertmanagerEndpoints, LabelSelector, Prometheus, PrometheusSpec, + PrometheusSpecAlerting, + }, + role::{build_prom_role, build_prom_rolebinding, build_prom_service_account}, + }, + score::Score, +}; + +use super::prometheus::PrometheusApplicationMonitoring; + +#[derive(Clone, Debug, Serialize)] +pub struct K8sPrometheusCRDAlertingScore { + pub sender: CRDPrometheus, + pub receivers: Vec>>, + pub service_monitors: Vec, + pub prometheus_rules: Vec, +} + +impl> Score + for K8sPrometheusCRDAlertingScore +{ + fn create_interpret(&self) -> Box> { + 
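// Editor's note: scores stay plain data; `create_interpret` below clones every field into
// the interpret that does the cluster-side work. In other words (illustrative):
//
//     let interpret = score.create_interpret();
//     // interpret.execute(&inventory, &topology).await? applies all of the CRs below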
Box::new(K8sPrometheusCRDAlertingInterpret { + sender: self.sender.clone(), + receivers: self.receivers.clone(), + service_monitors: self.service_monitors.clone(), + prometheus_rules: self.prometheus_rules.clone(), + }) + } + + fn name(&self) -> String { + "CRDApplicationAlertingScore".into() + } +} + +#[derive(Clone, Debug)] +pub struct K8sPrometheusCRDAlertingInterpret { + pub sender: CRDPrometheus, + pub receivers: Vec>>, + pub service_monitors: Vec, + pub prometheus_rules: Vec, +} + +#[async_trait] +impl> Interpret + for K8sPrometheusCRDAlertingInterpret +{ + async fn execute( + &self, + _inventory: &Inventory, + topology: &T, + ) -> Result { + let client = topology.k8s_client().await.unwrap(); + self.ensure_grafana_operator().await?; + self.install_prometheus(&client).await?; + self.install_alert_manager(&client).await?; + self.install_client_kube_metrics().await?; + self.install_grafana(&client).await?; + self.install_receivers(&self.sender, &self.receivers) + .await?; + self.install_rules(&self.prometheus_rules, &client).await?; + self.install_monitors(self.service_monitors.clone(), &client) + .await?; + Ok(Outcome::success(format!( + "deployed application monitoring composants" + ))) + } + + fn get_name(&self) -> InterpretName { + todo!() + } + + fn get_version(&self) -> Version { + todo!() + } + + fn get_status(&self) -> InterpretStatus { + todo!() + } + + fn get_children(&self) -> Vec { + todo!() + } +} + +impl K8sPrometheusCRDAlertingInterpret { + async fn crd_exists(&self, crd: &str) -> bool { + let status = Command::new("sh") + .args(["-c", "kubectl get crd -A | grep -i", crd]) + .status() + .map_err(|e| InterpretError::new(format!("could not connect to cluster: {}", e))) + .unwrap(); + + status.success() + } + + async fn install_chart( + &self, + chart_path: String, + chart_name: String, + ) -> Result<(), InterpretError> { + let temp_dir = + tempdir().map_err(|e| InterpretError::new(format!("Tempdir error: {}", e)))?; + let temp_path = temp_dir.path().to_path_buf(); + debug!("Using temp directory: {}", temp_path.display()); + let chart = format!("{}/{}", chart_path, chart_name); + let pull_output = Command::new("helm") + .args(["pull", &chart, "--destination", temp_path.to_str().unwrap()]) + .output() + .map_err(|e| InterpretError::new(format!("Helm pull error: {}", e)))?; + + if !pull_output.status.success() { + return Err(InterpretError::new(format!( + "Helm pull failed: {}", + String::from_utf8_lossy(&pull_output.stderr) + ))); + } + + let tgz_path = fs::read_dir(&temp_path) + .unwrap() + .filter_map(|entry| { + let entry = entry.ok()?; + let path = entry.path(); + if path.extension()? 
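// Editor's note: this helper shells out to helm, so the sequence is roughly equivalent to
// running the following by hand (destination directory and tgz file name are illustrative):
//
//     helm pull oci://hub.nationtech.io/harmony/nt-kube-metrics --destination /tmp/harmony-chart
//     helm install nt-kube-metrics /tmp/harmony-chart/nt-kube-metrics-0.1.0.tgz \
//         --namespace example-monitoring --create-namespace --wait --atomic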
== "tgz" { + Some(path) + } else { + None + } + }) + .next() + .ok_or_else(|| InterpretError::new("Could not find pulled Helm chart".into()))?; + + debug!("Installing chart from: {}", tgz_path.display()); + + let install_output = Command::new("helm") + .args([ + "install", + &chart_name, + tgz_path.to_str().unwrap(), + "--namespace", + &self.sender.namespace.clone(), + "--create-namespace", + "--wait", + "--atomic", + ]) + .output() + .map_err(|e| InterpretError::new(format!("Helm install error: {}", e)))?; + + if !install_output.status.success() { + return Err(InterpretError::new(format!( + "Helm install failed: {}", + String::from_utf8_lossy(&install_output.stderr) + ))); + } + + debug!( + "Installed chart {}/{} in namespace: {}", + &chart_path, + &chart_name, + self.sender.namespace.clone() + ); + Ok(()) + } + + async fn ensure_grafana_operator(&self) -> Result { + if self.crd_exists("grafanas.grafana.integreatly.org").await { + debug!("grafana CRDs already exist — skipping install."); + return Ok(Outcome::success("Grafana CRDs already exist".to_string())); + } + + let _ = Command::new("helm") + .args([ + "repo", + "add", + "grafana-operator", + "https://grafana.github.io/helm-charts", + ]) + .output() + .unwrap(); + + let _ = Command::new("helm") + .args(["repo", "update"]) + .output() + .unwrap(); + + let output = Command::new("helm") + .args([ + "install", + "grafana-operator", + "grafana-operator/grafana-operator", + "--namespace", + &self.sender.namespace.clone(), + "--create-namespace", + "--set", + "namespaceScope=true", + ]) + .output() + .unwrap(); + + if !output.status.success() { + return Err(InterpretError::new(format!( + "helm install failed:\nstdout: {}\nstderr: {}", + String::from_utf8_lossy(&output.stdout), + String::from_utf8_lossy(&output.stderr) + ))); + } + + Ok(Outcome::success(format!( + "installed grafana operator in ns {}", + self.sender.namespace.clone() + ))) + } + + async fn install_prometheus(&self, client: &Arc) -> Result { + debug!( + "installing crd-prometheuses in namespace {}", + self.sender.namespace.clone() + ); + debug!("building role/rolebinding/serviceaccount for crd-prometheus"); + let rolename = format!("{}-prom", self.sender.namespace.clone()); + let sa_name = format!("{}-prom-sa", self.sender.namespace.clone()); + let role = build_prom_role(rolename.clone(), self.sender.namespace.clone()); + let rolebinding = build_prom_rolebinding( + rolename.clone(), + self.sender.namespace.clone(), + sa_name.clone(), + ); + let sa = build_prom_service_account(sa_name.clone(), self.sender.namespace.clone()); + let prom_spec = PrometheusSpec { + alerting: Some(PrometheusSpecAlerting { + alertmanagers: Some(vec![AlertmanagerEndpoints { + name: Some("alertmanager-operated".into()), + namespace: Some(self.sender.namespace.clone()), + port: Some("web".into()), + scheme: Some("http".into()), + }]), + }), + service_account_name: sa_name.clone(), + service_monitor_namespace_selector: Some(LabelSelector { + match_labels: BTreeMap::from([( + "kubernetes.io/metadata.name".to_string(), + self.sender.namespace.clone(), + )]), + match_expressions: vec![], + }), + service_monitor_selector: Some(LabelSelector { + match_labels: BTreeMap::from([("client".to_string(), "prometheus".to_string())]), + ..Default::default() + }), + + service_discovery_role: Some("Endpoints".into()), + + pod_monitor_selector: Some(LabelSelector { + match_labels: BTreeMap::from([("client".to_string(), "prometheus".to_string())]), + ..Default::default() + }), + + rule_selector: Some(LabelSelector { + 
match_labels: BTreeMap::from([("role".to_string(), "prometheus-rule".to_string())]), + ..Default::default() + }), + + rule_namespace_selector: Some(LabelSelector { + match_labels: BTreeMap::from([( + "kubernetes.io/metadata.name".to_string(), + self.sender.namespace.clone(), + )]), + match_expressions: vec![], + }), + }; + let prom = Prometheus { + metadata: ObjectMeta { + name: Some(self.sender.namespace.clone()), + labels: Some(std::collections::BTreeMap::from([ + ("alertmanagerConfig".to_string(), "enabled".to_string()), + ("client".to_string(), "prometheus".to_string()), + ])), + namespace: Some(self.sender.namespace.clone()), + ..Default::default() + }, + spec: prom_spec, + }; + client + .apply(&role, Some(&self.sender.namespace.clone())) + .await + .map_err(|e| InterpretError::new(e.to_string()))?; + info!( + "installed prometheus role: {:#?} in ns {:#?}", + role.metadata.name.unwrap(), + role.metadata.namespace.unwrap() + ); + client + .apply(&rolebinding, Some(&self.sender.namespace.clone())) + .await + .map_err(|e| InterpretError::new(e.to_string()))?; + info!( + "installed prometheus rolebinding: {:#?} in ns {:#?}", + rolebinding.metadata.name.unwrap(), + rolebinding.metadata.namespace.unwrap() + ); + client + .apply(&sa, Some(&self.sender.namespace.clone())) + .await + .map_err(|e| InterpretError::new(e.to_string()))?; + info!( + "installed prometheus service account: {:#?} in ns {:#?}", + sa.metadata.name.unwrap(), + sa.metadata.namespace.unwrap() + ); + client + .apply(&prom, Some(&self.sender.namespace.clone())) + .await + .map_err(|e| InterpretError::new(e.to_string()))?; + info!( + "installed prometheus: {:#?} in ns {:#?}", + &prom.metadata.name.clone().unwrap(), + &prom.metadata.namespace.clone().unwrap() + ); + + Ok(Outcome::success(format!( + "successfully deployed crd-prometheus {:#?}", + prom + ))) + } + + async fn install_alert_manager( + &self, + client: &Arc, + ) -> Result { + let am = Alertmanager { + metadata: ObjectMeta { + name: Some(self.sender.namespace.clone()), + labels: Some(std::collections::BTreeMap::from([( + "alertmanagerConfig".to_string(), + "enabled".to_string(), + )])), + namespace: Some(self.sender.namespace.clone()), + ..Default::default() + }, + spec: AlertmanagerSpec::default(), + }; + client + .apply(&am, Some(&self.sender.namespace.clone())) + .await + .map_err(|e| InterpretError::new(e.to_string()))?; + Ok(Outcome::success(format!( + "successfully deployed service monitor {:#?}", + am.metadata.name + ))) + } + async fn install_monitors( + &self, + mut monitors: Vec, + client: &Arc, + ) -> Result { + let default_service_monitor = ServiceMonitor { + metadata: ObjectMeta { + name: Some(self.sender.namespace.clone()), + labels: Some(std::collections::BTreeMap::from([ + ("alertmanagerConfig".to_string(), "enabled".to_string()), + ("client".to_string(), "prometheus".to_string()), + ( + "app.kubernetes.io/name".to_string(), + "kube-state-metrics".to_string(), + ), + ])), + namespace: Some(self.sender.namespace.clone()), + ..Default::default() + }, + spec: ServiceMonitorSpec::default(), + }; + monitors.push(default_service_monitor); + for monitor in monitors.iter() { + client + .apply(monitor, Some(&self.sender.namespace.clone())) + .await + .map_err(|e| InterpretError::new(e.to_string()))?; + } + Ok(Outcome::success( + "succesfully deployed service monitors".to_string(), + )) + } + + async fn install_rules( + &self, + rules: &Vec, + client: &Arc, + ) -> Result { + let mut prom_rule_spec = PrometheusRuleSpec { + groups: rules.clone(), + }; + + let 
default_rules_group = RuleGroup { + name: format!("default-rules"), + rules: build_default_application_rules(), + }; + + prom_rule_spec.groups.push(default_rules_group); + let prom_rules = PrometheusRule { + metadata: ObjectMeta { + name: Some(self.sender.namespace.clone()), + labels: Some(std::collections::BTreeMap::from([ + ("alertmanagerConfig".to_string(), "enabled".to_string()), + ("role".to_string(), "prometheus-rule".to_string()), + ])), + namespace: Some(self.sender.namespace.clone()), + ..Default::default() + }, + spec: prom_rule_spec, + }; + client + .apply(&prom_rules, Some(&self.sender.namespace.clone())) + .await + .map_err(|e| InterpretError::new(e.to_string()))?; + Ok(Outcome::success(format!( + "successfully deployed rules {:#?}", + prom_rules.metadata.name + ))) + } + + async fn install_client_kube_metrics(&self) -> Result { + self.install_chart( + "oci://hub.nationtech.io/harmony".to_string(), + "nt-kube-metrics".to_string(), + ) + .await?; + Ok(Outcome::success(format!( + "Installed client kube metrics in ns {}", + &self.sender.namespace.clone() + ))) + } + + async fn install_grafana(&self, client: &Arc) -> Result { + let mut label = BTreeMap::new(); + label.insert("dashboards".to_string(), "grafana".to_string()); + let labels = LabelSelector { + match_labels: label.clone(), + match_expressions: vec![], + }; + let mut json_data = BTreeMap::new(); + json_data.insert("timeInterval".to_string(), "5s".to_string()); + let namespace = self.sender.namespace.clone(); + + let json = build_default_dashboard(&namespace); + + let graf_data_source = GrafanaDatasource { + metadata: ObjectMeta { + name: Some(format!( + "grafana-datasource-{}", + self.sender.namespace.clone() + )), + namespace: Some(self.sender.namespace.clone()), + ..Default::default() + }, + spec: GrafanaDatasourceSpec { + instance_selector: labels.clone(), + allow_cross_namespace_import: Some(false), + datasource: GrafanaDatasourceConfig { + access: "proxy".to_string(), + database: Some("prometheus".to_string()), + json_data: Some(json_data), + //this is fragile + name: format!("prometheus-{}-0", self.sender.namespace.clone()), + r#type: "prometheus".to_string(), + url: format!( + "http://prometheus-operated.{}.svc.cluster.local:9090", + self.sender.namespace.clone() + ), + }, + }, + }; + + client + .apply(&graf_data_source, Some(&self.sender.namespace.clone())) + .await + .map_err(|e| InterpretError::new(e.to_string()))?; + + let graf_dashboard = GrafanaDashboard { + metadata: ObjectMeta { + name: Some(format!( + "grafana-dashboard-{}", + self.sender.namespace.clone() + )), + namespace: Some(self.sender.namespace.clone()), + ..Default::default() + }, + spec: GrafanaDashboardSpec { + resync_period: Some("30s".to_string()), + instance_selector: labels.clone(), + json, + }, + }; + + client + .apply(&graf_dashboard, Some(&self.sender.namespace.clone())) + .await + .map_err(|e| InterpretError::new(e.to_string()))?; + + let grafana = Grafana { + metadata: ObjectMeta { + name: Some(format!("grafana-{}", self.sender.namespace.clone())), + namespace: Some(self.sender.namespace.clone()), + labels: Some(label.clone()), + ..Default::default() + }, + spec: GrafanaSpec { + config: None, + admin_user: None, + admin_password: None, + ingress: None, + persistence: None, + resources: None, + }, + }; + client + .apply(&grafana, Some(&self.sender.namespace.clone())) + .await + .map_err(|e| InterpretError::new(e.to_string()))?; + Ok(Outcome::success(format!( + "successfully deployed grafana instance {:#?}", + grafana.metadata.name + ))) 
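// Editor's note: as the `//this is fragile` comment above hints, the datasource name
// `prometheus-{namespace}-0` mirrors the pod name the prometheus-operator generates for
// its StatefulSet (the Prometheus CR here is named after the namespace), while the URL
// targets the operator-managed `prometheus-operated` Service on port 9090. If either
// convention changes upstream, the Grafana datasource quietly stops pointing at a live
// endpoint, so this is a natural spot for future hardening.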
+ } + + async fn install_receivers( + &self, + sender: &CRDPrometheus, + receivers: &Vec<Box<dyn AlertReceiver<CRDPrometheus>>>, + ) -> Result<Outcome, InterpretError> { + for receiver in receivers.iter() { + receiver.install(sender).await.map_err(|err| { + InterpretError::new(format!("failed to install receiver: {}", err)) + })?; + } + Ok(Outcome::success("successfully deployed receivers".into())) + } +} diff --git a/harmony/src/modules/prometheus/mod.rs b/harmony/src/modules/prometheus/mod.rs index 3a0c1d7..a59eadc 100644 --- a/harmony/src/modules/prometheus/mod.rs +++ b/harmony/src/modules/prometheus/mod.rs @@ -1 +1,3 @@ pub mod alerts; +pub mod k8s_prometheus_alerting_score; +pub mod prometheus; diff --git a/harmony/src/modules/prometheus/prometheus.rs b/harmony/src/modules/prometheus/prometheus.rs new file mode 100644 index 0000000..865b9ca --- /dev/null +++ b/harmony/src/modules/prometheus/prometheus.rs @@ -0,0 +1,17 @@ +use async_trait::async_trait; + +use crate::{ + interpret::{InterpretError, Outcome}, + inventory::Inventory, + topology::oberservability::monitoring::{AlertReceiver, AlertSender}, +}; + +#[async_trait] +pub trait PrometheusApplicationMonitoring<S: AlertSender> { + async fn install_prometheus( + &self, + sender: &S, + inventory: &Inventory, + receivers: Option<Vec<Box<dyn AlertReceiver<S>>>>, + ) -> Result<Outcome, InterpretError>; +}
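Editor's note: a minimal sketch, not part of the patch, of how the pieces introduced in
this change are meant to compose. The helper name is hypothetical; the field list matches
the `K8sPrometheusCRDAlertingScore` definition earlier in the diff, and empty lists fall
back to what the interpret installs on its own (a built-in default ServiceMonitor and the
default application rules).

use harmony::modules::monitoring::kube_prometheus::crd::crd_alertmanager_config::CRDPrometheus;
use harmony::modules::prometheus::k8s_prometheus_alerting_score::K8sPrometheusCRDAlertingScore;

/// Hypothetical helper: wrap a CRDPrometheus sender into the CRD-based alerting score.
fn build_monitoring_score(sender: CRDPrometheus) -> K8sPrometheusCRDAlertingScore {
    K8sPrometheusCRDAlertingScore {
        sender,
        receivers: vec![],         // e.g. push Box::new(webhook_receiver) here
        service_monitors: vec![],  // extra ServiceMonitor CRs beyond the built-in default
        prometheus_rules: vec![],  // extra RuleGroups beyond the default application rules
    }
}

// The resulting score is then turned into an interpret with `create_interpret()`, whose
// `execute` applies the Prometheus, Alertmanager, Grafana and rule resources to the
// sender's namespace.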