diff --git a/examples/monitoring/src/main.rs b/examples/monitoring/src/main.rs
index c0fcf33..0f7fc37 100644
--- a/examples/monitoring/src/main.rs
+++ b/examples/monitoring/src/main.rs
@@ -3,7 +3,18 @@ use harmony::{
     maestro::Maestro,
     modules::monitoring::{
         alert_channel::discord_alert_channel::DiscordWebhook,
-        kube_prometheus::helm_prometheus_alert_score::HelmPrometheusAlertingScore,
+        alert_rule::prometheus_alert_rule::{AlertManagerRuleGroup, PrometheusAlertRule},
+        kube_prometheus::{
+            alerts::{
+                dell_server::{
+                    alert_global_storage_status_critical,
+                    alert_global_storage_status_non_recoverable,
+                    global_storage_status_degraded_non_critical,
+                },
+                pvc::high_pvc_fill_rate_over_two_days,
+            },
+            helm_prometheus_alert_score::HelmPrometheusAlertingScore,
+        },
     },
     topology::{K8sAnywhereTopology, Url},
 };
@@ -12,10 +23,28 @@ async fn main() {
     let discord_receiver = DiscordWebhook {
         name: "test-discord".to_string(),
-        url: Url::Url(url::Url::parse("discord.doesnt.exist.com").unwrap()),
+        url: Url::Url(url::Url::parse("https://discord.doesnt.exist.com").unwrap()),
     };
+
+    let high_pvc_fill_rate_over_two_days_alert = high_pvc_fill_rate_over_two_days();
+    let dell_system_storage_degraded = global_storage_status_degraded_non_critical();
+    let alert_global_storage_status_critical = alert_global_storage_status_critical();
+    let alert_global_storage_status_non_recoverable = alert_global_storage_status_non_recoverable();
+
+    let additional_rules =
+        AlertManagerRuleGroup::new("pvc-alerts", vec![high_pvc_fill_rate_over_two_days_alert]);
+    let additional_rules2 = AlertManagerRuleGroup::new(
+        "dell-server-alerts",
+        vec![
+            dell_system_storage_degraded,
+            alert_global_storage_status_critical,
+            alert_global_storage_status_non_recoverable,
+        ],
+    );
+
     let alerting_score = HelmPrometheusAlertingScore {
         receivers: vec![Box::new(discord_receiver)],
+        rules: vec![Box::new(additional_rules), Box::new(additional_rules2)],
     };
     let mut maestro = Maestro::<K8sAnywhereTopology>::initialize(
         Inventory::autoload(),
diff --git a/harmony/src/domain/topology/oberservability/monitoring.rs b/harmony/src/domain/topology/oberservability/monitoring.rs
index 7d65bf2..ed7e936 100644
--- a/harmony/src/domain/topology/oberservability/monitoring.rs
+++ b/harmony/src/domain/topology/oberservability/monitoring.rs
@@ -1,10 +1,11 @@
 use async_trait::async_trait;
+use log::debug;
 
 use crate::{
     data::{Id, Version},
     interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome},
     inventory::Inventory,
-    topology::{HelmCommand, Topology, installable::Installable},
+    topology::{Topology, installable::Installable},
 };
 
 #[async_trait]
@@ -16,6 +17,7 @@ pub struct AlertingInterpret<S: AlertSender> {
     pub sender: S,
     pub receivers: Vec<Box<dyn AlertReceiver<S>>>,
+    pub rules: Vec<Box<dyn AlertRule<S>>>,
 }
 
 #[async_trait]
@@ -28,6 +30,10 @@ impl<S: AlertSender, T: Topology> Interpret<T> for AlertingInterpret<S> {
         for receiver in self.receivers.iter() {
             receiver.install(&self.sender).await?;
         }
+        for rule in self.rules.iter() {
+            debug!("installing rule: {:#?}", rule);
+            rule.install(&self.sender).await?;
+        }
         self.sender.ensure_installed(inventory, topology).await?;
         Ok(Outcome::success(format!(
             "successfully installed alert sender {}",
@@ -59,8 +65,9 @@ pub trait AlertReceiver<S: AlertSender>: std::fmt::Debug + Send + Sync {
 }
 
 #[async_trait]
-pub trait AlertRule<S: AlertSender> {
-    async fn install(&self, sender: &S) -> Result<(), InterpretError>;
+pub trait AlertRule<S: AlertSender>: std::fmt::Debug + Send + Sync {
+    async fn install(&self, sender: &S) -> Result<Outcome, InterpretError>;
+    fn clone_box(&self) -> Box<dyn AlertRule<S>>;
 }
 
 #[async_trait]
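The new `clone_box` method is what lets `Box<dyn AlertRule<S>>` be cloned even though `Clone` itself is not object-safe; a `Clone` impl for the boxed trait object (added in prometheus.rs further down) simply forwards to it. A minimal, self-contained sketch of that pattern, using illustrative names rather than harmony's types:

```rust
// Object-safe cloning: the trait exposes clone_box(), and Clone for the boxed
// trait object forwards to it. Each concrete type just boxes a regular clone.
trait Rule: std::fmt::Debug {
    fn clone_box(&self) -> Box<dyn Rule>;
}

impl Clone for Box<dyn Rule> {
    fn clone(&self) -> Self {
        self.clone_box()
    }
}

#[derive(Debug, Clone)]
struct PvcRule {
    expr: String,
}

impl Rule for PvcRule {
    fn clone_box(&self) -> Box<dyn Rule> {
        Box::new(self.clone())
    }
}

fn main() {
    let rule: Box<dyn Rule> = Box::new(PvcRule { expr: "up == 0".into() });
    let copy = rule.clone(); // resolves through Clone for Box<dyn Rule>
    println!("{copy:?}");
}
```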
diff --git a/harmony/src/modules/monitoring/alert_rule/mod.rs b/harmony/src/modules/monitoring/alert_rule/mod.rs
new file mode 100644
index 0000000..846c769
--- /dev/null
+++ b/harmony/src/modules/monitoring/alert_rule/mod.rs
@@ -0,0 +1 @@
+pub mod prometheus_alert_rule;
diff --git a/harmony/src/modules/monitoring/alert_rule/prometheus_alert_rule.rs b/harmony/src/modules/monitoring/alert_rule/prometheus_alert_rule.rs
new file mode 100644
index 0000000..ccb63cc
--- /dev/null
+++ b/harmony/src/modules/monitoring/alert_rule/prometheus_alert_rule.rs
@@ -0,0 +1,99 @@
+use std::collections::{BTreeMap, HashMap};
+
+use async_trait::async_trait;
+use serde::Serialize;
+
+use crate::{
+    interpret::{InterpretError, Outcome},
+    modules::monitoring::kube_prometheus::{
+        prometheus::{Prometheus, PrometheusRule},
+        types::{AlertGroup, AlertManagerAdditionalPromRules},
+    },
+    topology::oberservability::monitoring::AlertRule,
+};
+
+#[async_trait]
+impl AlertRule<Prometheus> for AlertManagerRuleGroup {
+    async fn install(&self, sender: &Prometheus) -> Result<Outcome, InterpretError> {
+        sender.install_rule(&self).await
+    }
+    fn clone_box(&self) -> Box<dyn AlertRule<Prometheus>> {
+        Box::new(self.clone())
+    }
+}
+
+#[async_trait]
+impl PrometheusRule for AlertManagerRuleGroup {
+    fn name(&self) -> String {
+        self.name.clone()
+    }
+    async fn configure_rule(&self) -> AlertManagerAdditionalPromRules {
+        let mut additional_prom_rules = BTreeMap::new();
+
+        additional_prom_rules.insert(
+            self.name.clone(),
+            AlertGroup {
+                groups: vec![self.clone()],
+            },
+        );
+        AlertManagerAdditionalPromRules {
+            rules: additional_prom_rules,
+        }
+    }
+}
+
+impl AlertManagerRuleGroup {
+    pub fn new(name: &str, rules: Vec<PrometheusAlertRule>) -> AlertManagerRuleGroup {
+        AlertManagerRuleGroup {
+            name: name.to_string().to_lowercase(),
+            rules,
+        }
+    }
+}
+
+#[derive(Debug, Clone, Serialize)]
+///logical group of alert rules
+///evaluates to:
+///name:
+///  groups:
+///    - name: name
+///      rules: PrometheusAlertRule
+pub struct AlertManagerRuleGroup {
+    pub name: String,
+    pub rules: Vec<PrometheusAlertRule>,
+}
+
+#[derive(Debug, Clone, Serialize)]
+pub struct PrometheusAlertRule {
+    pub alert: String,
+    pub expr: String,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub r#for: Option<String>,
+    pub labels: HashMap<String, String>,
+    pub annotations: HashMap<String, String>,
+}
+
+impl PrometheusAlertRule {
+    pub fn new(alert_name: &str, expr: &str) -> Self {
+        Self {
+            alert: alert_name.into(),
+            expr: expr.into(),
+            r#for: Some("1m".into()),
+            labels: HashMap::new(),
+            annotations: HashMap::new(),
+        }
+    }
+    pub fn for_duration(mut self, duration: &str) -> Self {
+        self.r#for = Some(duration.into());
+        self
+    }
+    pub fn label(mut self, key: &str, value: &str) -> Self {
+        self.labels.insert(key.into(), value.into());
+        self
+    }
+
+    pub fn annotation(mut self, key: &str, value: &str) -> Self {
+        self.annotations.insert(key.into(), value.into());
+        self
+    }
+}
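For orientation, here is a small usage sketch of the builder defined above, assuming the harmony crate with this change applied; the alert name, expression, and group name are invented for illustration. `AlertManagerRuleGroup::new` lowercases the group name, and `configure_rule()` later nests the group under that name so the chart can place it in `additionalPrometheusRulesMap`.

```rust
use harmony::modules::monitoring::alert_rule::prometheus_alert_rule::{
    AlertManagerRuleGroup, PrometheusAlertRule,
};

fn main() {
    // Hypothetical rule: fires once a scrape target has been down for 5 minutes.
    let instance_down = PrometheusAlertRule::new("InstanceDown", "up == 0")
        .for_duration("5m")
        .label("severity", "critical")
        .annotation("summary", "Instance {{ $labels.instance }} is down");

    // The group name is lowercased by new(); when rendered by the chart it ends up as:
    //   node-alerts:
    //     groups:
    //       - name: node-alerts
    //         rules:
    //           - alert: InstanceDown
    //             expr: up == 0
    //             for: 5m
    //             ...
    let group = AlertManagerRuleGroup::new("node-alerts", vec![instance_down]);
    println!("{group:#?}");
}
```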
diff --git a/harmony/src/modules/monitoring/kube_prometheus/alerts/dell_server.rs b/harmony/src/modules/monitoring/kube_prometheus/alerts/dell_server.rs
new file mode 100644
index 0000000..1092c4c
--- /dev/null
+++ b/harmony/src/modules/monitoring/kube_prometheus/alerts/dell_server.rs
@@ -0,0 +1,40 @@
+use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;
+
+pub fn global_storage_status_degraded_non_critical() -> PrometheusAlertRule {
+    PrometheusAlertRule::new("GlobalStorageStatusNonCritical", "globalStorageStatus == 4")
+        .for_duration("5m")
+        .label("severity", "warning")
+        .annotation(
+            "description",
+            "- **system**: {{ $labels.instance }}\n- **Status**: nonCritical\n- **Value**: {{ $value }}\n- **Job**: {{ $labels.job }}",
+        )
+        .annotation("title", " System storage status is in degraded state")
+}
+
+pub fn alert_global_storage_status_critical() -> PrometheusAlertRule {
+    PrometheusAlertRule::new(
+        "GlobalStorageStatus critical",
+        "globalStorageStatus == 5",
+    )
+    .for_duration("5m")
+    .label("severity", "warning")
+    .annotation("title", "System storage status is critical at {{ $labels.instance }}")
+    .annotation(
+        "description",
+        "- **System**: {{ $labels.instance }}\n- **Status**: Critical\n- **Value**: {{ $value }}\n- **Job**: {{ $labels.job }}",
+    )
+}
+
+pub fn alert_global_storage_status_non_recoverable() -> PrometheusAlertRule {
+    PrometheusAlertRule::new(
+        "GlobalStorageStatus nonRecoverable",
+        "globalStorageStatus == 6",
+    )
+    .for_duration("5m")
+    .label("severity", "warning")
+    .annotation("title", "System storage status is nonRecoverable at {{ $labels.instance }}")
+    .annotation(
+        "description",
+        "- **System**: {{ $labels.instance }}\n- **Status**: nonRecoverable\n- **Value**: {{ $value }}\n- **Job**: {{ $labels.job }}",
+    )
+}
diff --git a/harmony/src/modules/monitoring/kube_prometheus/alerts/mod.rs b/harmony/src/modules/monitoring/kube_prometheus/alerts/mod.rs
new file mode 100644
index 0000000..50291ff
--- /dev/null
+++ b/harmony/src/modules/monitoring/kube_prometheus/alerts/mod.rs
@@ -0,0 +1,2 @@
+pub mod dell_server;
+pub mod pvc;
diff --git a/harmony/src/modules/monitoring/kube_prometheus/alerts/pvc.rs b/harmony/src/modules/monitoring/kube_prometheus/alerts/pvc.rs
new file mode 100644
index 0000000..f99ee39
--- /dev/null
+++ b/harmony/src/modules/monitoring/kube_prometheus/alerts/pvc.rs
@@ -0,0 +1,11 @@
+use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;
+
+pub fn high_pvc_fill_rate_over_two_days() -> PrometheusAlertRule {
+    PrometheusAlertRule::new(
+        "PVC Fill Over 95 Percent In 2 Days",
+        "(kubelet_volume_stats_used_bytes/kubelet_volume_stats_capacity_bytes) > 0.95 AND predict_linear(kubelet_volume_stats_used_bytes[2d], 2 * 24 * 60 * 60)/kubelet_volume_stats_capacity_bytes > 0.95",)
+    .for_duration("1m")
+    .label("severity", "warning")
+    .annotation("summary", "The PVC {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} is predicted to fill over 95% in less than 2 days.")
+    .annotation("description", "PVC {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} will fill over 95% in less than 2 days",)
+}
diff --git a/harmony/src/modules/monitoring/kube_prometheus/helm/config.rs b/harmony/src/modules/monitoring/kube_prometheus/helm/config.rs
index 741cd1b..ecbf8d8 100644
--- a/harmony/src/modules/monitoring/kube_prometheus/helm/config.rs
+++ b/harmony/src/modules/monitoring/kube_prometheus/helm/config.rs
@@ -1,6 +1,9 @@
 use serde::Serialize;
 
-use crate::modules::monitoring::kube_prometheus::types::AlertManagerChannelConfig;
+use crate::modules::monitoring::{
+    alert_rule::prometheus_alert_rule::AlertManagerRuleGroup,
+    kube_prometheus::types::{AlertManagerAdditionalPromRules, AlertManagerChannelConfig},
+};
 
 #[derive(Debug, Clone, Serialize)]
 pub struct KubePrometheusConfig {
@@ -22,6 +25,7 @@ pub struct KubePrometheusConfig {
     pub kube_state_metrics: bool,
     pub prometheus_operator: bool,
     pub alert_receiver_configs: Vec<AlertManagerChannelConfig>,
+    pub alert_rules: Vec<AlertManagerAdditionalPromRules>,
 }
 impl KubePrometheusConfig {
     pub fn new() -> Self {
@@ -44,6 +48,7 @@ impl KubePrometheusConfig {
             core_dns: false,
             kube_scheduler: false,
             alert_receiver_configs: vec![],
+            alert_rules: vec![],
         }
     }
 }
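Each helper above follows the same shape: a free function returning a configured `PrometheusAlertRule` that a caller registers in an `AlertManagerRuleGroup`. Adding a new alert is therefore one more function in this style; a hypothetical example (the metric and label names are standard node-exporter series, the threshold and wording are invented):

```rust
use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;

/// Hypothetical follow-up alert in the same style as dell_server.rs and pvc.rs:
/// warns when a node filesystem has had less than 5% free space for 10 minutes.
pub fn node_filesystem_almost_full() -> PrometheusAlertRule {
    PrometheusAlertRule::new(
        "NodeFilesystemAlmostFull",
        "(node_filesystem_avail_bytes / node_filesystem_size_bytes) < 0.05",
    )
    .for_duration("10m")
    .label("severity", "warning")
    .annotation(
        "summary",
        "Filesystem {{ $labels.mountpoint }} on {{ $labels.instance }} has less than 5% free space",
    )
}
```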
diff --git a/harmony/src/modules/monitoring/kube_prometheus/helm/kube_prometheus_helm_chart.rs b/harmony/src/modules/monitoring/kube_prometheus/helm/kube_prometheus_helm_chart.rs
index 94440c0..843a677 100644
--- a/harmony/src/modules/monitoring/kube_prometheus/helm/kube_prometheus_helm_chart.rs
+++ b/harmony/src/modules/monitoring/kube_prometheus/helm/kube_prometheus_helm_chart.rs
@@ -3,6 +3,7 @@ use log::debug;
 use non_blank_string_rs::NonBlankString;
 use serde_yaml::{Mapping, Value};
 use std::{
+    collections::BTreeMap,
     str::FromStr,
     sync::{Arc, Mutex},
 };
@@ -10,7 +11,8 @@ use std::{
 use crate::modules::{
     helm::chart::HelmChartScore,
     monitoring::kube_prometheus::types::{
-        AlertManager, AlertManagerConfig, AlertManagerRoute, AlertManagerValues,
+        AlertGroup, AlertManager, AlertManagerAdditionalPromRules, AlertManagerConfig,
+        AlertManagerRoute, AlertManagerValues,
     },
 };
 
@@ -18,15 +20,13 @@ pub fn kube_prometheus_helm_chart_score(
     config: Arc<Mutex<KubePrometheusConfig>>,
 ) -> HelmChartScore {
     let config = config.lock().unwrap();
-    //TODO this should be make into a rule with default formatting that can be easily passed as a vec
-    //to the overrides or something leaving the user to deal with formatting here seems bad
+
     let default_rules = config.default_rules.to_string();
     let windows_monitoring = config.windows_monitoring.to_string();
     let grafana = config.grafana.to_string();
     let kubernetes_service_monitors = config.kubernetes_service_monitors.to_string();
     let kubernetes_api_server = config.kubernetes_api_server.to_string();
     let kubelet = config.kubelet.to_string();
-    let alert_manager = config.alert_manager.to_string();
     let kube_controller_manager = config.kube_controller_manager.to_string();
     let core_dns = config.core_dns.to_string();
     let kube_etcd = config.kube_etcd.to_string();
@@ -38,56 +38,6 @@ pub fn kube_prometheus_helm_chart_score(
     let prometheus = config.prometheus.to_string();
     let mut values = format!(
         r#"
-additionalPrometheusRulesMap:
-  pods-status-alerts:
-    groups:
-      - name: pods
-        rules:
-          - alert: "[CRIT] POD not healthy"
-            expr: min_over_time(sum by (namespace, pod) (kube_pod_status_phase{{phase=~"Pending|Unknown|Failed"}})[15m:1m]) > 0
-            for: 0m
-            labels:
-              severity: critical
-            annotations:
-              title: "[CRIT] POD not healthy : {{{{ $labels.pod }}}}"
-              description: |
-                A POD is in a non-ready state!
-                - **Pod**: {{{{ $labels.pod }}}}
-                - **Namespace**: {{{{ $labels.namespace }}}}
-          - alert: "[CRIT] POD crash looping"
-            expr: increase(kube_pod_container_status_restarts_total[5m]) > 3
-            for: 0m
-            labels:
-              severity: critical
-            annotations:
-              title: "[CRIT] POD crash looping : {{{{ $labels.pod }}}}"
-              description: |
-                A POD is drowning in a crash loop!
-                - **Pod**: {{{{ $labels.pod }}}}
-                - **Namespace**: {{{{ $labels.namespace }}}}
-                - **Instance**: {{{{ $labels.instance }}}}
-  pvc-alerts:
-    groups:
-      - name: pvc-alerts
-        rules:
-          - alert: 'PVC Fill Over 95 Percent In 2 Days'
-            expr: |
-              (
-                kubelet_volume_stats_used_bytes
-                /
-                kubelet_volume_stats_capacity_bytes
-              ) > 0.95
-              AND
-              predict_linear(kubelet_volume_stats_used_bytes[2d], 2 * 24 * 60 * 60)
-              /
-              kubelet_volume_stats_capacity_bytes
-              > 0.95
-            for: 1m
-            labels:
-              severity: warning
-            annotations:
-              description: The PVC {{{{ $labels.persistentvolumeclaim }}}} in namespace {{{{ $labels.namespace }}}} is predicted to fill over 95% in less than 2 days.
-              title: PVC {{{{ $labels.persistentvolumeclaim }}}} in namespace {{{{ $labels.namespace }}}} will fill over 95% in less than 2 days
 defaultRules:
   create: {default_rules}
   rules:
@@ -156,6 +106,7 @@ prometheus:
 "#,
     );
 
+    // add required null receiver for prometheus alert manager
     let mut null_receiver = Mapping::new();
     null_receiver.insert(
         Value::String("receiver".to_string()),
@@ -167,6 +118,7 @@ prometheus:
     );
     null_receiver.insert(Value::String("continue".to_string()), Value::Bool(true));
 
+    //add alert channels
     let mut alert_manager_channel_config = AlertManagerConfig {
         global: Mapping::new(),
         route: AlertManagerRoute {
@@ -200,7 +152,38 @@ prometheus:
         serde_yaml::to_string(&alert_manager_values).expect("Failed to serialize YAML");
     debug!("serialized alert manager: \n {:#}", alert_manager_yaml);
     values.push_str(&alert_manager_yaml);
+
+    //format alert manager additional rules for helm chart
+    let mut merged_rules: BTreeMap<String, AlertGroup> = BTreeMap::new();
+
+    for additional_rule in config.alert_rules.clone() {
+        for (key, group) in additional_rule.rules {
+            merged_rules.insert(key, group);
+        }
+    }
+
+    let merged_rules = AlertManagerAdditionalPromRules {
+        rules: merged_rules,
+    };
+
+    let mut alert_manager_additional_rules = serde_yaml::Mapping::new();
+    let rules_value = serde_yaml::to_value(merged_rules).unwrap();
+
+    alert_manager_additional_rules.insert(
+        serde_yaml::Value::String("additionalPrometheusRulesMap".to_string()),
+        rules_value,
+    );
+
+    let alert_manager_additional_rules_yaml =
+        serde_yaml::to_string(&alert_manager_additional_rules).expect("Failed to serialize YAML");
+    debug!(
+        "alert_rules_yaml:\n{:#}",
+        alert_manager_additional_rules_yaml
+    );
+
+    values.push_str(&alert_manager_additional_rules_yaml);
     debug!("full values.yaml: \n {:#}", values);
+
     HelmChartScore {
         namespace: Some(NonBlankString::from_str(&config.namespace).unwrap()),
         release_name: NonBlankString::from_str("kube-prometheus").unwrap(),
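Note that the hardcoded `pods-status-alerts` group removed from the template above is not re-registered anywhere in this diff (the example only registers the PVC and Dell groups). If those alerts are still wanted, they can be rebuilt with the new builder; a sketch using the exact expressions from the removed YAML, not part of this change:

```rust
use crate::modules::monitoring::alert_rule::prometheus_alert_rule::{
    AlertManagerRuleGroup, PrometheusAlertRule,
};

/// Sketch: the two pod alerts that used to be hardcoded in values.yaml,
/// expressed with the new builder so they can be passed as a rule group.
pub fn pods_status_alerts() -> AlertManagerRuleGroup {
    let pod_not_healthy = PrometheusAlertRule::new(
        "[CRIT] POD not healthy",
        r#"min_over_time(sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"})[15m:1m]) > 0"#,
    )
    .for_duration("0m")
    .label("severity", "critical")
    .annotation("title", "[CRIT] POD not healthy : {{ $labels.pod }}")
    .annotation(
        "description",
        "A POD is in a non-ready state!\n- **Pod**: {{ $labels.pod }}\n- **Namespace**: {{ $labels.namespace }}",
    );

    let pod_crash_looping = PrometheusAlertRule::new(
        "[CRIT] POD crash looping",
        "increase(kube_pod_container_status_restarts_total[5m]) > 3",
    )
    .for_duration("0m")
    .label("severity", "critical")
    .annotation("title", "[CRIT] POD crash looping : {{ $labels.pod }}")
    .annotation(
        "description",
        "A POD is drowning in a crash loop!\n- **Pod**: {{ $labels.pod }}\n- **Namespace**: {{ $labels.namespace }}\n- **Instance**: {{ $labels.instance }}",
    );

    AlertManagerRuleGroup::new("pods-status-alerts", vec![pod_not_healthy, pod_crash_looping])
}
```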
diff --git a/harmony/src/modules/monitoring/kube_prometheus/helm_prometheus_alert_score.rs b/harmony/src/modules/monitoring/kube_prometheus/helm_prometheus_alert_score.rs
index f1f5322..8844309 100644
--- a/harmony/src/modules/monitoring/kube_prometheus/helm_prometheus_alert_score.rs
+++ b/harmony/src/modules/monitoring/kube_prometheus/helm_prometheus_alert_score.rs
@@ -2,19 +2,19 @@ use std::sync::{Arc, Mutex};
 
 use serde::Serialize;
 
+use super::{helm::config::KubePrometheusConfig, prometheus::Prometheus};
 use crate::{
     score::Score,
     topology::{
         HelmCommand, Topology,
-        oberservability::monitoring::{AlertReceiver, AlertingInterpret},
+        oberservability::monitoring::{AlertReceiver, AlertRule, AlertingInterpret},
     },
 };
 
-use super::{helm::config::KubePrometheusConfig, prometheus::Prometheus};
-
 #[derive(Clone, Debug, Serialize)]
 pub struct HelmPrometheusAlertingScore {
     pub receivers: Vec<Box<dyn AlertReceiver<Prometheus>>>,
+    pub rules: Vec<Box<dyn AlertRule<Prometheus>>>,
 }
 
 impl<T: Topology + HelmCommand> Score<T> for HelmPrometheusAlertingScore {
@@ -24,24 +24,10 @@ impl<T: Topology + HelmCommand> Score<T> for HelmPrometheusAlertingScore {
                 config: Arc::new(Mutex::new(KubePrometheusConfig::new())),
             },
             receivers: self.receivers.clone(),
+            rules: self.rules.clone(),
         })
     }
     fn name(&self) -> String {
         "HelmPrometheusAlertingScore".to_string()
     }
 }
-
-impl Serialize for Box<dyn AlertReceiver<Prometheus>> {
-    fn serialize<S>(&self, _serializer: S) -> Result<S::Ok, S::Error>
-    where
-        S: serde::Serializer,
-    {
-        todo!()
-    }
-}
-
-impl Clone for Box<dyn AlertReceiver<Prometheus>> {
-    fn clone(&self) -> Self {
-        self.clone_box()
-    }
-}
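The `Serialize` and `Clone` impls removed here reappear in prometheus.rs below, with `serialize` still a `todo!()` stub, so serializing a score that carries receivers or rules would panic. A possible non-panicking variant, sketched against this PR's types rather than taken from it, could emit the rule's Debug form instead:

```rust
use serde::Serialize;

use crate::{
    modules::monitoring::kube_prometheus::prometheus::Prometheus,
    topology::oberservability::monitoring::AlertRule,
};

// Sketch only: would replace the todo!() stub so a Serialize-derived score never panics.
impl Serialize for Box<dyn AlertRule<Prometheus>> {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        serializer.serialize_str(&format!("{:?}", self))
    }
}
```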
diff --git a/harmony/src/modules/monitoring/kube_prometheus/mod.rs b/harmony/src/modules/monitoring/kube_prometheus/mod.rs
index 7c8233a..a9180e9 100644
--- a/harmony/src/modules/monitoring/kube_prometheus/mod.rs
+++ b/harmony/src/modules/monitoring/kube_prometheus/mod.rs
@@ -1,3 +1,4 @@
+pub mod alerts;
 pub mod helm;
 pub mod helm_prometheus_alert_score;
 pub mod prometheus;
diff --git a/harmony/src/modules/monitoring/kube_prometheus/prometheus.rs b/harmony/src/modules/monitoring/kube_prometheus/prometheus.rs
index 554d319..148f91c 100644
--- a/harmony/src/modules/monitoring/kube_prometheus/prometheus.rs
+++ b/harmony/src/modules/monitoring/kube_prometheus/prometheus.rs
@@ -2,13 +2,17 @@ use std::sync::{Arc, Mutex};
 
 use async_trait::async_trait;
 use log::debug;
+use serde::Serialize;
 
 use crate::{
     interpret::{InterpretError, Outcome},
     inventory::Inventory,
+    modules::monitoring::alert_rule::prometheus_alert_rule::AlertManagerRuleGroup,
     score,
     topology::{
-        HelmCommand, Topology, installable::Installable, oberservability::monitoring::AlertSender,
+        HelmCommand, Topology,
+        installable::Installable,
+        oberservability::monitoring::{AlertReceiver, AlertRule, AlertSender},
     },
 };
 
@@ -18,7 +22,7 @@ use super::{
     helm::{
         config::KubePrometheusConfig, kube_prometheus_helm_chart::kube_prometheus_helm_chart_score,
     },
-    types::AlertManagerChannelConfig,
+    types::{AlertManagerAdditionalPromRules, AlertManagerChannelConfig},
 };
 
 #[async_trait]
@@ -35,7 +39,6 @@ impl<T: Topology + HelmCommand> Installable<T> for Prometheus {
         inventory: &Inventory,
         topology: &T,
     ) -> Result<(), InterpretError> {
-        //install_prometheus
         self.install_prometheus(inventory, topology).await?;
         Ok(())
     }
@@ -67,6 +70,20 @@ impl Prometheus {
         )))
     }
 
+    pub async fn install_rule(
+        &self,
+        prometheus_rule: &AlertManagerRuleGroup,
+    ) -> Result<Outcome, InterpretError> {
+        let prometheus_rule = prometheus_rule.configure_rule().await;
+        let mut config = self.config.lock().unwrap();
+
+        config.alert_rules.push(prometheus_rule.clone());
+        Ok(Outcome::success(format!(
+            "Successfully installed alert rule: {:#?},",
+            prometheus_rule
+        )))
+    }
+
     pub async fn install_prometheus(
         &self,
         inventory: &Inventory,
@@ -84,3 +101,39 @@ pub trait PrometheusReceiver: Send + Sync + std::fmt::Debug {
     fn name(&self) -> String;
     async fn configure_receiver(&self) -> AlertManagerChannelConfig;
 }
+
+impl Serialize for Box<dyn AlertReceiver<Prometheus>> {
+    fn serialize<S>(&self, _serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        todo!()
+    }
+}
+
+impl Clone for Box<dyn AlertReceiver<Prometheus>> {
+    fn clone(&self) -> Self {
+        self.clone_box()
+    }
+}
+
+#[async_trait]
+pub trait PrometheusRule: Send + Sync + std::fmt::Debug {
+    fn name(&self) -> String;
+    async fn configure_rule(&self) -> AlertManagerAdditionalPromRules;
+}
+
+impl Serialize for Box<dyn AlertRule<Prometheus>> {
+    fn serialize<S>(&self, _serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        todo!()
+    }
+}
+
+impl Clone for Box<dyn AlertRule<Prometheus>> {
+    fn clone(&self) -> Self {
+        self.clone_box()
+    }
+}
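`install_rule` never touches the cluster itself: it renders the group via `configure_rule()` and pushes the result into the shared `KubePrometheusConfig` behind `Arc<Mutex<…>>`; the chart score above reads `config.alert_rules` later when it renders values.yaml. A minimal, self-contained sketch of this accumulation pattern, with stand-in types rather than harmony's:

```rust
use std::sync::{Arc, Mutex};

// Stand-ins for KubePrometheusConfig and the rendered rule groups.
#[derive(Debug, Default)]
struct Config {
    alert_rules: Vec<String>,
}

// Mirrors the shape of Prometheus::install_rule: lock the shared config, append, return.
fn install_rule(config: &Arc<Mutex<Config>>, rendered_group: &str) {
    config.lock().unwrap().alert_rules.push(rendered_group.to_string());
}

fn main() {
    let config = Arc::new(Mutex::new(Config::default()));
    install_rule(&config, "pvc-alerts");
    install_rule(&config, "dell-server-alerts");
    // Later, chart rendering reads everything that was accumulated.
    assert_eq!(config.lock().unwrap().alert_rules.len(), 2);
}
```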
diff --git a/harmony/src/modules/monitoring/kube_prometheus/types.rs b/harmony/src/modules/monitoring/kube_prometheus/types.rs
index f237bba..878d527 100644
--- a/harmony/src/modules/monitoring/kube_prometheus/types.rs
+++ b/harmony/src/modules/monitoring/kube_prometheus/types.rs
@@ -1,7 +1,11 @@
+use std::collections::BTreeMap;
+
 use async_trait::async_trait;
 use serde::Serialize;
 use serde_yaml::{Mapping, Sequence, Value};
 
+use crate::modules::monitoring::alert_rule::prometheus_alert_rule::AlertManagerRuleGroup;
+
 #[async_trait]
 pub trait AlertChannelConfig {
     async fn get_config(&self) -> AlertManagerChannelConfig;
@@ -38,3 +42,14 @@ pub struct AlertManagerChannelConfig {
     pub channel_route: Value,
     pub channel_receiver: Value,
 }
+
+#[derive(Debug, Clone, Serialize)]
+pub struct AlertManagerAdditionalPromRules {
+    #[serde(flatten)]
+    pub rules: BTreeMap<String, AlertGroup>,
+}
+
+#[derive(Debug, Clone, Serialize)]
+pub struct AlertGroup {
+    pub groups: Vec<AlertManagerRuleGroup>,
+}
diff --git a/harmony/src/modules/monitoring/mod.rs b/harmony/src/modules/monitoring/mod.rs
index 7cdb3a9..0e4f8a4 100644
--- a/harmony/src/modules/monitoring/mod.rs
+++ b/harmony/src/modules/monitoring/mod.rs
@@ -1,2 +1,3 @@
 pub mod alert_channel;
+pub mod alert_rule;
 pub mod kube_prometheus;
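`AlertManagerAdditionalPromRules` relies on `#[serde(flatten)]` over a `BTreeMap`, so every rule group serializes as its own top-level key rather than under a `rules:` field, which is the shape `additionalPrometheusRulesMap` expects. A self-contained demo with simplified stand-in structs (only the group `name` is kept):

```rust
use std::collections::BTreeMap;

use serde::Serialize;

// Simplified stand-ins for AlertManagerAdditionalPromRules / AlertGroup /
// AlertManagerRuleGroup, just to show the effect of #[serde(flatten)].
#[derive(Serialize)]
struct AdditionalPromRules {
    #[serde(flatten)]
    rules: BTreeMap<String, AlertGroup>,
}

#[derive(Serialize)]
struct AlertGroup {
    groups: Vec<RuleGroup>,
}

#[derive(Serialize)]
struct RuleGroup {
    name: String,
}

fn main() {
    let mut rules = BTreeMap::new();
    rules.insert(
        "pvc-alerts".to_string(),
        AlertGroup {
            groups: vec![RuleGroup { name: "pvc-alerts".to_string() }],
        },
    );
    // Prints (roughly):
    // pvc-alerts:
    //   groups:
    //   - name: pvc-alerts
    println!("{}", serde_yaml::to_string(&AdditionalPromRules { rules }).unwrap());
}
```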