feat: added alert rule and impl for prometheus as well as a few preconfigured bmc alerts for dell server that are used in the monitoring example #67
| @ -43,14 +43,14 @@ async fn main() { | |||||||
|     // K8sAnywhereTopology as it is the most automatic one that enables you to easily deploy
 |     // K8sAnywhereTopology as it is the most automatic one that enables you to easily deploy
 | ||||||
|     // locally, to development environment from a CI, to staging, and to production with settings
 |     // locally, to development environment from a CI, to staging, and to production with settings
 | ||||||
|     // that automatically adapt to each environment grade.
 |     // that automatically adapt to each environment grade.
 | ||||||
|     let maestro = Maestro::<K8sAnywhereTopology>::initialize( |     let mut maestro = Maestro::<K8sAnywhereTopology>::initialize( | ||||||
|         Inventory::autoload(), |         Inventory::autoload(), | ||||||
|         K8sAnywhereTopology::from_env(), |         K8sAnywhereTopology::from_env(), | ||||||
|     ) |     ) | ||||||
|     .await |     .await | ||||||
|     .unwrap(); |     .unwrap(); | ||||||
| 
 | 
 | ||||||
|     // maestro.register_all(vec![Box::new(lamp_stack)]);
 |     maestro.register_all(vec![Box::new(lamp_stack)]); | ||||||
|     // Here we bootstrap the CLI, this gives some nice features if you need them
 |     // Here we bootstrap the CLI, this gives some nice features if you need them
 | ||||||
|     harmony_cli::init(maestro, None).await.unwrap(); |     harmony_cli::init(maestro, None).await.unwrap(); | ||||||
| } | } | ||||||
|  | |||||||
| @ -1,10 +1,20 @@ | |||||||
| use harmony::{ | use harmony::{ | ||||||
|     inventory::Inventory, |     inventory::Inventory, | ||||||
|     maestro::Maestro, |     maestro::Maestro, | ||||||
|     modules::monitoring::{ |     modules::{ | ||||||
|  |         monitoring::{ | ||||||
|             alert_channel::discord_alert_channel::DiscordWebhook, |             alert_channel::discord_alert_channel::DiscordWebhook, | ||||||
|  |             alert_rule::prometheus_alert_rule::AlertManagerRuleGroup, | ||||||
|             kube_prometheus::helm_prometheus_alert_score::HelmPrometheusAlertingScore, |             kube_prometheus::helm_prometheus_alert_score::HelmPrometheusAlertingScore, | ||||||
|         }, |         }, | ||||||
|  |         prometheus::alerts::{ | ||||||
|  |             infra::dell_server::{ | ||||||
|  |                 alert_global_storage_status_critical, alert_global_storage_status_non_recoverable, | ||||||
|  |                 global_storage_status_degraded_non_critical, | ||||||
|  |             }, | ||||||
|  |             k8s::pvc::high_pvc_fill_rate_over_two_days, | ||||||
|  |         }, | ||||||
|  |     }, | ||||||
|     topology::{K8sAnywhereTopology, Url}, |     topology::{K8sAnywhereTopology, Url}, | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
| @ -12,10 +22,28 @@ use harmony::{ | |||||||
| async fn main() { | async fn main() { | ||||||
|     let discord_receiver = DiscordWebhook { |     let discord_receiver = DiscordWebhook { | ||||||
|         name: "test-discord".to_string(), |         name: "test-discord".to_string(), | ||||||
|         url: Url::Url(url::Url::parse("discord.doesnt.exist.com").unwrap()), |         url: Url::Url(url::Url::parse("https://discord.doesnt.exist.com").unwrap()), | ||||||
|     }; |     }; | ||||||
|  | 
 | ||||||
|  |     let high_pvc_fill_rate_over_two_days_alert = high_pvc_fill_rate_over_two_days(); | ||||||
|  |     let dell_system_storage_degraded = global_storage_status_degraded_non_critical(); | ||||||
|  |     let alert_global_storage_status_critical = alert_global_storage_status_critical(); | ||||||
|  |     let alert_global_storage_status_non_recoverable = alert_global_storage_status_non_recoverable(); | ||||||
|  | 
 | ||||||
|  |     let additional_rules = | ||||||
|  |         AlertManagerRuleGroup::new("pvc-alerts", vec![high_pvc_fill_rate_over_two_days_alert]); | ||||||
|  |     let additional_rules2 = AlertManagerRuleGroup::new( | ||||||
|  |         "dell-server-alerts", | ||||||
|  |         vec![ | ||||||
|  |             dell_system_storage_degraded, | ||||||
|  |             alert_global_storage_status_critical, | ||||||
|  |             alert_global_storage_status_non_recoverable, | ||||||
|  |         ], | ||||||
|  |     ); | ||||||
|  | 
 | ||||||
|     let alerting_score = HelmPrometheusAlertingScore { |     let alerting_score = HelmPrometheusAlertingScore { | ||||||
|         receivers: vec![Box::new(discord_receiver)], |         receivers: vec![Box::new(discord_receiver)], | ||||||
|  |         rules: vec![Box::new(additional_rules), Box::new(additional_rules2)], | ||||||
|     }; |     }; | ||||||
|     let mut maestro = Maestro::<K8sAnywhereTopology>::initialize( |     let mut maestro = Maestro::<K8sAnywhereTopology>::initialize( | ||||||
|         Inventory::autoload(), |         Inventory::autoload(), | ||||||
|  | |||||||
| @ -1,10 +1,11 @@ | |||||||
| use async_trait::async_trait; | use async_trait::async_trait; | ||||||
|  | use log::debug; | ||||||
| 
 | 
 | ||||||
| use crate::{ | use crate::{ | ||||||
|     data::{Id, Version}, |     data::{Id, Version}, | ||||||
|     interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome}, |     interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome}, | ||||||
|     inventory::Inventory, |     inventory::Inventory, | ||||||
|     topology::{HelmCommand, Topology, installable::Installable}, |     topology::{Topology, installable::Installable}, | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
| #[async_trait] | #[async_trait] | ||||||
| @ -16,6 +17,7 @@ pub trait AlertSender: Send + Sync + std::fmt::Debug { | |||||||
| pub struct AlertingInterpret<S: AlertSender> { | pub struct AlertingInterpret<S: AlertSender> { | ||||||
|     pub sender: S, |     pub sender: S, | ||||||
|     pub receivers: Vec<Box<dyn AlertReceiver<S>>>, |     pub receivers: Vec<Box<dyn AlertReceiver<S>>>, | ||||||
|  |     pub rules: Vec<Box<dyn AlertRule<S>>>, | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| #[async_trait] | #[async_trait] | ||||||
| @ -28,6 +30,10 @@ impl<S: AlertSender + Installable<T>, T: Topology> Interpret<T> for AlertingInte | |||||||
|         for receiver in self.receivers.iter() { |         for receiver in self.receivers.iter() { | ||||||
|             receiver.install(&self.sender).await?; |             receiver.install(&self.sender).await?; | ||||||
|         } |         } | ||||||
|  |         for rule in self.rules.iter() { | ||||||
|  |             debug!("installing rule: {:#?}", rule); | ||||||
|  |             rule.install(&self.sender).await?; | ||||||
|  |         } | ||||||
|         self.sender.ensure_installed(inventory, topology).await?; |         self.sender.ensure_installed(inventory, topology).await?; | ||||||
|         Ok(Outcome::success(format!( |         Ok(Outcome::success(format!( | ||||||
|             "successfully installed alert sender {}", |             "successfully installed alert sender {}", | ||||||
| @ -59,8 +65,9 @@ pub trait AlertReceiver<S: AlertSender>: std::fmt::Debug + Send + Sync { | |||||||
| } | } | ||||||
| 
 | 
 | ||||||
| #[async_trait] | #[async_trait] | ||||||
| pub trait AlertRule<S: AlertSender> { | pub trait AlertRule<S: AlertSender>: std::fmt::Debug + Send + Sync { | ||||||
|     async fn install(&self, sender: &S) -> Result<(), InterpretError>; |     async fn install(&self, sender: &S) -> Result<Outcome, InterpretError>; | ||||||
|  |     fn clone_box(&self) -> Box<dyn AlertRule<S>>; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| #[async_trait] | #[async_trait] | ||||||
|  | |||||||
| @ -12,5 +12,6 @@ pub mod load_balancer; | |||||||
| pub mod monitoring; | pub mod monitoring; | ||||||
| pub mod okd; | pub mod okd; | ||||||
| pub mod opnsense; | pub mod opnsense; | ||||||
|  | pub mod prometheus; | ||||||
| pub mod tenant; | pub mod tenant; | ||||||
| pub mod tftp; | pub mod tftp; | ||||||
|  | |||||||
							
								
								
									
										1
									
								
								harmony/src/modules/monitoring/alert_rule/mod.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								harmony/src/modules/monitoring/alert_rule/mod.rs
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1 @@ | |||||||
|  | pub mod prometheus_alert_rule; | ||||||
| @ -0,0 +1,99 @@ | |||||||
|  | use std::collections::{BTreeMap, HashMap}; | ||||||
|  | 
 | ||||||
|  | use async_trait::async_trait; | ||||||
|  | use serde::Serialize; | ||||||
|  | 
 | ||||||
|  | use crate::{ | ||||||
|  |     interpret::{InterpretError, Outcome}, | ||||||
|  |     modules::monitoring::kube_prometheus::{ | ||||||
|  |         prometheus::{Prometheus, PrometheusRule}, | ||||||
|  |         types::{AlertGroup, AlertManagerAdditionalPromRules}, | ||||||
|  |     }, | ||||||
|  |     topology::oberservability::monitoring::AlertRule, | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | #[async_trait] | ||||||
|  | impl AlertRule<Prometheus> for AlertManagerRuleGroup { | ||||||
|  |     async fn install(&self, sender: &Prometheus) -> Result<Outcome, InterpretError> { | ||||||
|  |         sender.install_rule(&self).await | ||||||
|  |     } | ||||||
|  |     fn clone_box(&self) -> Box<dyn AlertRule<Prometheus>> { | ||||||
|  |         Box::new(self.clone()) | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | #[async_trait] | ||||||
|  | impl PrometheusRule for AlertManagerRuleGroup { | ||||||
|  |     fn name(&self) -> String { | ||||||
|  |         self.name.clone() | ||||||
|  |     } | ||||||
|  |     async fn configure_rule(&self) -> AlertManagerAdditionalPromRules { | ||||||
|  |         let mut additional_prom_rules = BTreeMap::new(); | ||||||
|  | 
 | ||||||
|  |         additional_prom_rules.insert( | ||||||
|  |             self.name.clone(), | ||||||
|  |             AlertGroup { | ||||||
|  |                 groups: vec![self.clone()], | ||||||
|  |             }, | ||||||
|  |         ); | ||||||
|  |         AlertManagerAdditionalPromRules { | ||||||
|  |             rules: additional_prom_rules, | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | impl AlertManagerRuleGroup { | ||||||
|  |     pub fn new(name: &str, rules: Vec<PrometheusAlertRule>) -> AlertManagerRuleGroup { | ||||||
|  |         AlertManagerRuleGroup { | ||||||
|  |             name: name.to_string().to_lowercase(), | ||||||
|  |             rules, | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | #[derive(Debug, Clone, Serialize)] | ||||||
|  | ///logical group of alert rules
 | ||||||
|  | ///evaluates to:
 | ||||||
|  | ///name:
 | ||||||
|  | ///  groups:
 | ||||||
|  | ///  - name: name
 | ||||||
|  | ///    rules: PrometheusAlertRule
 | ||||||
|  | pub struct AlertManagerRuleGroup { | ||||||
|  |     pub name: String, | ||||||
|  |     pub rules: Vec<PrometheusAlertRule>, | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | #[derive(Debug, Clone, Serialize)] | ||||||
|  | pub struct PrometheusAlertRule { | ||||||
|  |     pub alert: String, | ||||||
|  |     pub expr: String, | ||||||
|  |     #[serde(skip_serializing_if = "Option::is_none")] | ||||||
|  |     pub r#for: Option<String>, | ||||||
|  |     pub labels: HashMap<String, String>, | ||||||
|  |     pub annotations: HashMap<String, String>, | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | impl PrometheusAlertRule { | ||||||
|  |     pub fn new(alert_name: &str, expr: &str) -> Self { | ||||||
|  |         Self { | ||||||
|  |             alert: alert_name.into(), | ||||||
|  |             expr: expr.into(), | ||||||
|  |             r#for: Some("1m".into()), | ||||||
|  |             labels: HashMap::new(), | ||||||
|  |             annotations: HashMap::new(), | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |     pub fn for_duration(mut self, duration: &str) -> Self { | ||||||
|  |         self.r#for = Some(duration.into()); | ||||||
|  |         self | ||||||
|  |     } | ||||||
|  |     pub fn label(mut self, key: &str, value: &str) -> Self { | ||||||
|  |         self.labels.insert(key.into(), value.into()); | ||||||
|  |         self | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     pub fn annotation(mut self, key: &str, value: &str) -> Self { | ||||||
|  |         self.annotations.insert(key.into(), value.into()); | ||||||
|  |         self | ||||||
|  |     } | ||||||
|  | } | ||||||
| @ -1,6 +1,9 @@ | |||||||
| use serde::Serialize; | use serde::Serialize; | ||||||
| 
 | 
 | ||||||
| use crate::modules::monitoring::kube_prometheus::types::AlertManagerChannelConfig; | use crate::modules::monitoring::{ | ||||||
|  |     alert_rule::prometheus_alert_rule::AlertManagerRuleGroup, | ||||||
|  |     kube_prometheus::types::{AlertManagerAdditionalPromRules, AlertManagerChannelConfig}, | ||||||
|  | }; | ||||||
| 
 | 
 | ||||||
| #[derive(Debug, Clone, Serialize)] | #[derive(Debug, Clone, Serialize)] | ||||||
| pub struct KubePrometheusConfig { | pub struct KubePrometheusConfig { | ||||||
| @ -22,6 +25,7 @@ pub struct KubePrometheusConfig { | |||||||
|     pub kube_state_metrics: bool, |     pub kube_state_metrics: bool, | ||||||
|     pub prometheus_operator: bool, |     pub prometheus_operator: bool, | ||||||
|     pub alert_receiver_configs: Vec<AlertManagerChannelConfig>, |     pub alert_receiver_configs: Vec<AlertManagerChannelConfig>, | ||||||
|  |     pub alert_rules: Vec<AlertManagerAdditionalPromRules>, | ||||||
| } | } | ||||||
| impl KubePrometheusConfig { | impl KubePrometheusConfig { | ||||||
|     pub fn new() -> Self { |     pub fn new() -> Self { | ||||||
| @ -44,6 +48,7 @@ impl KubePrometheusConfig { | |||||||
|             core_dns: false, |             core_dns: false, | ||||||
|             kube_scheduler: false, |             kube_scheduler: false, | ||||||
|             alert_receiver_configs: vec![], |             alert_receiver_configs: vec![], | ||||||
|  |             alert_rules: vec![], | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  | |||||||
| @ -3,6 +3,7 @@ use log::debug; | |||||||
| use non_blank_string_rs::NonBlankString; | use non_blank_string_rs::NonBlankString; | ||||||
| use serde_yaml::{Mapping, Value}; | use serde_yaml::{Mapping, Value}; | ||||||
| use std::{ | use std::{ | ||||||
|  |     collections::BTreeMap, | ||||||
|     str::FromStr, |     str::FromStr, | ||||||
|     sync::{Arc, Mutex}, |     sync::{Arc, Mutex}, | ||||||
| }; | }; | ||||||
| @ -10,7 +11,8 @@ use std::{ | |||||||
| use crate::modules::{ | use crate::modules::{ | ||||||
|     helm::chart::HelmChartScore, |     helm::chart::HelmChartScore, | ||||||
|     monitoring::kube_prometheus::types::{ |     monitoring::kube_prometheus::types::{ | ||||||
|         AlertManager, AlertManagerConfig, AlertManagerRoute, AlertManagerValues, |         AlertGroup, AlertManager, AlertManagerAdditionalPromRules, AlertManagerConfig, | ||||||
|  |         AlertManagerRoute, AlertManagerValues, | ||||||
|     }, |     }, | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
| @ -18,15 +20,13 @@ pub fn kube_prometheus_helm_chart_score( | |||||||
|     config: Arc<Mutex<KubePrometheusConfig>>, |     config: Arc<Mutex<KubePrometheusConfig>>, | ||||||
| ) -> HelmChartScore { | ) -> HelmChartScore { | ||||||
|     let config = config.lock().unwrap(); |     let config = config.lock().unwrap(); | ||||||
|     //TODO this should be make into a rule with default formatting that can be easily passed as a vec
 | 
 | ||||||
|     //to the overrides or something leaving the user to deal with formatting here seems bad
 |  | ||||||
|     let default_rules = config.default_rules.to_string(); |     let default_rules = config.default_rules.to_string(); | ||||||
|     let windows_monitoring = config.windows_monitoring.to_string(); |     let windows_monitoring = config.windows_monitoring.to_string(); | ||||||
|     let grafana = config.grafana.to_string(); |     let grafana = config.grafana.to_string(); | ||||||
|     let kubernetes_service_monitors = config.kubernetes_service_monitors.to_string(); |     let kubernetes_service_monitors = config.kubernetes_service_monitors.to_string(); | ||||||
|     let kubernetes_api_server = config.kubernetes_api_server.to_string(); |     let kubernetes_api_server = config.kubernetes_api_server.to_string(); | ||||||
|     let kubelet = config.kubelet.to_string(); |     let kubelet = config.kubelet.to_string(); | ||||||
|     let alert_manager = config.alert_manager.to_string(); |  | ||||||
|     let kube_controller_manager = config.kube_controller_manager.to_string(); |     let kube_controller_manager = config.kube_controller_manager.to_string(); | ||||||
|     let core_dns = config.core_dns.to_string(); |     let core_dns = config.core_dns.to_string(); | ||||||
|     let kube_etcd = config.kube_etcd.to_string(); |     let kube_etcd = config.kube_etcd.to_string(); | ||||||
| @ -38,56 +38,6 @@ pub fn kube_prometheus_helm_chart_score( | |||||||
|     let prometheus = config.prometheus.to_string(); |     let prometheus = config.prometheus.to_string(); | ||||||
|     let mut values = format!( |     let mut values = format!( | ||||||
|         r#" |         r#" | ||||||
| additionalPrometheusRulesMap: |  | ||||||
|   pods-status-alerts: |  | ||||||
|     groups: |  | ||||||
|       - name: pods |  | ||||||
|         rules: |  | ||||||
|           - alert: "[CRIT] POD not healthy" |  | ||||||
|             expr: min_over_time(sum by (namespace, pod) (kube_pod_status_phase{{phase=~"Pending|Unknown|Failed"}})[15m:1m]) > 0 |  | ||||||
|             for: 0m |  | ||||||
|             labels: |  | ||||||
|               severity: critical |  | ||||||
|             annotations: |  | ||||||
|               title: "[CRIT] POD not healthy : {{{{ $labels.pod }}}}" |  | ||||||
|               description: | |  | ||||||
|                A POD is in a non-ready state! |  | ||||||
|                - **Pod**: {{{{ $labels.pod }}}} |  | ||||||
|                - **Namespace**: {{{{ $labels.namespace }}}} |  | ||||||
|           - alert: "[CRIT] POD crash looping" |  | ||||||
|             expr: increase(kube_pod_container_status_restarts_total[5m]) > 3 |  | ||||||
|             for: 0m |  | ||||||
|             labels: |  | ||||||
|               severity: critical |  | ||||||
|             annotations: |  | ||||||
|               title: "[CRIT] POD crash looping : {{{{ $labels.pod }}}}" |  | ||||||
|               description: | |  | ||||||
|                A POD is drowning in a crash loop! |  | ||||||
|                - **Pod**: {{{{ $labels.pod }}}} |  | ||||||
|                - **Namespace**: {{{{ $labels.namespace }}}} |  | ||||||
|                - **Instance**: {{{{ $labels.instance }}}} |  | ||||||
|   pvc-alerts: |  | ||||||
|     groups: |  | ||||||
|       - name: pvc-alerts |  | ||||||
|         rules: |  | ||||||
|           - alert: 'PVC Fill Over 95 Percent In 2 Days' |  | ||||||
|             expr: | |  | ||||||
|               ( |  | ||||||
|                 kubelet_volume_stats_used_bytes |  | ||||||
|                 / |  | ||||||
|                 kubelet_volume_stats_capacity_bytes |  | ||||||
|               ) > 0.95 |  | ||||||
|               AND |  | ||||||
|               predict_linear(kubelet_volume_stats_used_bytes[2d], 2 * 24 * 60 * 60) |  | ||||||
|               / |  | ||||||
|               kubelet_volume_stats_capacity_bytes |  | ||||||
|               > 0.95 |  | ||||||
|             for: 1m |  | ||||||
|             labels: |  | ||||||
|               severity: warning |  | ||||||
|             annotations: |  | ||||||
|               description: The PVC {{{{ $labels.persistentvolumeclaim }}}} in namespace {{{{ $labels.namespace }}}} is predicted to fill over 95% in less than 2 days. |  | ||||||
|               title: PVC {{{{ $labels.persistentvolumeclaim }}}} in namespace {{{{ $labels.namespace }}}} will fill over 95% in less than 2 days |  | ||||||
| defaultRules: | defaultRules: | ||||||
|   create: {default_rules} |   create: {default_rules} | ||||||
|   rules: |   rules: | ||||||
| @ -156,6 +106,7 @@ prometheus: | |||||||
| "#,
 | "#,
 | ||||||
|     ); |     ); | ||||||
| 
 | 
 | ||||||
|  |     // add required null receiver for prometheus alert manager
 | ||||||
|     let mut null_receiver = Mapping::new(); |     let mut null_receiver = Mapping::new(); | ||||||
|     null_receiver.insert( |     null_receiver.insert( | ||||||
|         Value::String("receiver".to_string()), |         Value::String("receiver".to_string()), | ||||||
| @ -167,6 +118,7 @@ prometheus: | |||||||
|     ); |     ); | ||||||
|     null_receiver.insert(Value::String("continue".to_string()), Value::Bool(true)); |     null_receiver.insert(Value::String("continue".to_string()), Value::Bool(true)); | ||||||
| 
 | 
 | ||||||
|  |     //add alert channels
 | ||||||
|     let mut alert_manager_channel_config = AlertManagerConfig { |     let mut alert_manager_channel_config = AlertManagerConfig { | ||||||
|         global: Mapping::new(), |         global: Mapping::new(), | ||||||
|         route: AlertManagerRoute { |         route: AlertManagerRoute { | ||||||
| @ -200,7 +152,38 @@ prometheus: | |||||||
|         serde_yaml::to_string(&alert_manager_values).expect("Failed to serialize YAML"); |         serde_yaml::to_string(&alert_manager_values).expect("Failed to serialize YAML"); | ||||||
|     debug!("serialized alert manager: \n {:#}", alert_manager_yaml); |     debug!("serialized alert manager: \n {:#}", alert_manager_yaml); | ||||||
|     values.push_str(&alert_manager_yaml); |     values.push_str(&alert_manager_yaml); | ||||||
|  | 
 | ||||||
|  |     //format alert manager additional rules for helm chart
 | ||||||
|  |     let mut merged_rules: BTreeMap<String, AlertGroup> = BTreeMap::new(); | ||||||
|  | 
 | ||||||
|  |     for additional_rule in config.alert_rules.clone() { | ||||||
|  |         for (key, group) in additional_rule.rules { | ||||||
|  |             merged_rules.insert(key, group); | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     let merged_rules = AlertManagerAdditionalPromRules { | ||||||
|  |         rules: merged_rules, | ||||||
|  |     }; | ||||||
|  | 
 | ||||||
|  |     let mut alert_manager_additional_rules = serde_yaml::Mapping::new(); | ||||||
|  |     let rules_value = serde_yaml::to_value(merged_rules).unwrap(); | ||||||
|  | 
 | ||||||
|  |     alert_manager_additional_rules.insert( | ||||||
|  |         serde_yaml::Value::String("additionalPrometheusRulesMap".to_string()), | ||||||
|  |         rules_value, | ||||||
|  |     ); | ||||||
|  | 
 | ||||||
|  |     let alert_manager_additional_rules_yaml = | ||||||
|  |         serde_yaml::to_string(&alert_manager_additional_rules).expect("Failed to serialize YAML"); | ||||||
|  |     debug!( | ||||||
|  |         "alert_rules_yaml:\n{:#}", | ||||||
|  |         alert_manager_additional_rules_yaml | ||||||
|  |     ); | ||||||
|  | 
 | ||||||
|  |     values.push_str(&alert_manager_additional_rules_yaml); | ||||||
|     debug!("full values.yaml: \n {:#}", values); |     debug!("full values.yaml: \n {:#}", values); | ||||||
|  | 
 | ||||||
|     HelmChartScore { |     HelmChartScore { | ||||||
|         namespace: Some(NonBlankString::from_str(&config.namespace).unwrap()), |         namespace: Some(NonBlankString::from_str(&config.namespace).unwrap()), | ||||||
|         release_name: NonBlankString::from_str("kube-prometheus").unwrap(), |         release_name: NonBlankString::from_str("kube-prometheus").unwrap(), | ||||||
|  | |||||||
| @ -2,19 +2,19 @@ use std::sync::{Arc, Mutex}; | |||||||
| 
 | 
 | ||||||
| use serde::Serialize; | use serde::Serialize; | ||||||
| 
 | 
 | ||||||
|  | use super::{helm::config::KubePrometheusConfig, prometheus::Prometheus}; | ||||||
| use crate::{ | use crate::{ | ||||||
|     score::Score, |     score::Score, | ||||||
|     topology::{ |     topology::{ | ||||||
|         HelmCommand, Topology, |         HelmCommand, Topology, | ||||||
|         oberservability::monitoring::{AlertReceiver, AlertingInterpret}, |         oberservability::monitoring::{AlertReceiver, AlertRule, AlertingInterpret}, | ||||||
|     }, |     }, | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
| use super::{helm::config::KubePrometheusConfig, prometheus::Prometheus}; |  | ||||||
| 
 |  | ||||||
| #[derive(Clone, Debug, Serialize)] | #[derive(Clone, Debug, Serialize)] | ||||||
| pub struct HelmPrometheusAlertingScore { | pub struct HelmPrometheusAlertingScore { | ||||||
|     pub receivers: Vec<Box<dyn AlertReceiver<Prometheus>>>, |     pub receivers: Vec<Box<dyn AlertReceiver<Prometheus>>>, | ||||||
|  |     pub rules: Vec<Box<dyn AlertRule<Prometheus>>>, | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| impl<T: Topology + HelmCommand> Score<T> for HelmPrometheusAlertingScore { | impl<T: Topology + HelmCommand> Score<T> for HelmPrometheusAlertingScore { | ||||||
| @ -24,24 +24,10 @@ impl<T: Topology + HelmCommand> Score<T> for HelmPrometheusAlertingScore { | |||||||
|                 config: Arc::new(Mutex::new(KubePrometheusConfig::new())), |                 config: Arc::new(Mutex::new(KubePrometheusConfig::new())), | ||||||
|             }, |             }, | ||||||
|             receivers: self.receivers.clone(), |             receivers: self.receivers.clone(), | ||||||
|  |             rules: self.rules.clone(), | ||||||
|         }) |         }) | ||||||
|     } |     } | ||||||
|     fn name(&self) -> String { |     fn name(&self) -> String { | ||||||
|         "HelmPrometheusAlertingScore".to_string() |         "HelmPrometheusAlertingScore".to_string() | ||||||
|     } |     } | ||||||
| } | } | ||||||
| 
 |  | ||||||
| impl Serialize for Box<dyn AlertReceiver<Prometheus>> { |  | ||||||
|     fn serialize<S>(&self, _serializer: S) -> Result<S::Ok, S::Error> |  | ||||||
|     where |  | ||||||
|         S: serde::Serializer, |  | ||||||
|     { |  | ||||||
|         todo!() |  | ||||||
|     } |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| impl Clone for Box<dyn AlertReceiver<Prometheus>> { |  | ||||||
|     fn clone(&self) -> Self { |  | ||||||
|         self.clone_box() |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  | |||||||
| @ -2,13 +2,17 @@ use std::sync::{Arc, Mutex}; | |||||||
| 
 | 
 | ||||||
| use async_trait::async_trait; | use async_trait::async_trait; | ||||||
| use log::debug; | use log::debug; | ||||||
|  | use serde::Serialize; | ||||||
| 
 | 
 | ||||||
| use crate::{ | use crate::{ | ||||||
|     interpret::{InterpretError, Outcome}, |     interpret::{InterpretError, Outcome}, | ||||||
|     inventory::Inventory, |     inventory::Inventory, | ||||||
|  |     modules::monitoring::alert_rule::prometheus_alert_rule::AlertManagerRuleGroup, | ||||||
|     score, |     score, | ||||||
|     topology::{ |     topology::{ | ||||||
|         HelmCommand, Topology, installable::Installable, oberservability::monitoring::AlertSender, |         HelmCommand, Topology, | ||||||
|  |         installable::Installable, | ||||||
|  |         oberservability::monitoring::{AlertReceiver, AlertRule, AlertSender}, | ||||||
|     }, |     }, | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
| @ -18,7 +22,7 @@ use super::{ | |||||||
|     helm::{ |     helm::{ | ||||||
|         config::KubePrometheusConfig, kube_prometheus_helm_chart::kube_prometheus_helm_chart_score, |         config::KubePrometheusConfig, kube_prometheus_helm_chart::kube_prometheus_helm_chart_score, | ||||||
|     }, |     }, | ||||||
|     types::AlertManagerChannelConfig, |     types::{AlertManagerAdditionalPromRules, AlertManagerChannelConfig}, | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
| #[async_trait] | #[async_trait] | ||||||
| @ -35,7 +39,6 @@ impl<T: Topology + HelmCommand> Installable<T> for Prometheus { | |||||||
|         inventory: &Inventory, |         inventory: &Inventory, | ||||||
|         topology: &T, |         topology: &T, | ||||||
|     ) -> Result<(), InterpretError> { |     ) -> Result<(), InterpretError> { | ||||||
|         //install_prometheus
 |  | ||||||
|         self.install_prometheus(inventory, topology).await?; |         self.install_prometheus(inventory, topology).await?; | ||||||
|         Ok(()) |         Ok(()) | ||||||
|     } |     } | ||||||
| @ -67,6 +70,20 @@ impl Prometheus { | |||||||
|         ))) |         ))) | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|  |     pub async fn install_rule( | ||||||
|  |         &self, | ||||||
|  |         prometheus_rule: &AlertManagerRuleGroup, | ||||||
|  |     ) -> Result<Outcome, InterpretError> { | ||||||
|  |         let prometheus_rule = prometheus_rule.configure_rule().await; | ||||||
|  |         let mut config = self.config.lock().unwrap(); | ||||||
|  | 
 | ||||||
|  |         config.alert_rules.push(prometheus_rule.clone()); | ||||||
|  |         Ok(Outcome::success(format!( | ||||||
|  |             "Successfully installed alert rule: {:#?},", | ||||||
|  |             prometheus_rule | ||||||
|  |         ))) | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|     pub async fn install_prometheus<T: Topology + HelmCommand + Send + Sync>( |     pub async fn install_prometheus<T: Topology + HelmCommand + Send + Sync>( | ||||||
|         &self, |         &self, | ||||||
|         inventory: &Inventory, |         inventory: &Inventory, | ||||||
| @ -84,3 +101,39 @@ pub trait PrometheusReceiver: Send + Sync + std::fmt::Debug { | |||||||
|     fn name(&self) -> String; |     fn name(&self) -> String; | ||||||
|     async fn configure_receiver(&self) -> AlertManagerChannelConfig; |     async fn configure_receiver(&self) -> AlertManagerChannelConfig; | ||||||
| } | } | ||||||
|  | 
 | ||||||
|  | impl Serialize for Box<dyn AlertReceiver<Prometheus>> { | ||||||
|  |     fn serialize<S>(&self, _serializer: S) -> Result<S::Ok, S::Error> | ||||||
|  |     where | ||||||
|  |         S: serde::Serializer, | ||||||
|  |     { | ||||||
|  |         todo!() | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | impl Clone for Box<dyn AlertReceiver<Prometheus>> { | ||||||
|  |     fn clone(&self) -> Self { | ||||||
|  |         self.clone_box() | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | #[async_trait] | ||||||
|  | pub trait PrometheusRule: Send + Sync + std::fmt::Debug { | ||||||
|  |     fn name(&self) -> String; | ||||||
|  |     async fn configure_rule(&self) -> AlertManagerAdditionalPromRules; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | impl Serialize for Box<dyn AlertRule<Prometheus>> { | ||||||
|  |     fn serialize<S>(&self, _serializer: S) -> Result<S::Ok, S::Error> | ||||||
|  |     where | ||||||
|  |         S: serde::Serializer, | ||||||
|  |     { | ||||||
|  |         todo!() | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | impl Clone for Box<dyn AlertRule<Prometheus>> { | ||||||
|  |     fn clone(&self) -> Self { | ||||||
|  |         self.clone_box() | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | |||||||
| @ -1,7 +1,11 @@ | |||||||
|  | use std::collections::BTreeMap; | ||||||
|  | 
 | ||||||
| use async_trait::async_trait; | use async_trait::async_trait; | ||||||
| use serde::Serialize; | use serde::Serialize; | ||||||
| use serde_yaml::{Mapping, Sequence, Value}; | use serde_yaml::{Mapping, Sequence, Value}; | ||||||
| 
 | 
 | ||||||
|  | use crate::modules::monitoring::alert_rule::prometheus_alert_rule::AlertManagerRuleGroup; | ||||||
|  | 
 | ||||||
| #[async_trait] | #[async_trait] | ||||||
| pub trait AlertChannelConfig { | pub trait AlertChannelConfig { | ||||||
|     async fn get_config(&self) -> AlertManagerChannelConfig; |     async fn get_config(&self) -> AlertManagerChannelConfig; | ||||||
| @ -38,3 +42,14 @@ pub struct AlertManagerChannelConfig { | |||||||
|     pub channel_route: Value, |     pub channel_route: Value, | ||||||
|     pub channel_receiver: Value, |     pub channel_receiver: Value, | ||||||
| } | } | ||||||
|  | 
 | ||||||
/// Additional Prometheus rule sets, keyed by rule-file/group name.
///
/// `#[serde(flatten)]` makes each map entry serialize as a top-level
/// `<name>: { groups: [...] }` mapping rather than nesting under a `rules`
/// key — presumably the shape the kube-prometheus "additional rules" values
/// expect (TODO: confirm against the chart/values consumer).
#[derive(Debug, Clone, Serialize)]
pub struct AlertManagerAdditionalPromRules {
    // Keyed map of rule-set name -> its alert groups; BTreeMap keeps the
    // serialized output deterministically ordered.
    #[serde(flatten)]
    pub rules: BTreeMap<String, AlertGroup>,
}
|  | 
 | ||||||
/// A collection of Prometheus rule groups, serialized as `groups: [...]` —
/// the standard top-level layout of a Prometheus rule file.
#[derive(Debug, Clone, Serialize)]
pub struct AlertGroup {
    pub groups: Vec<AlertManagerRuleGroup>,
}
|  | |||||||
| @ -1,2 +1,3 @@ | |||||||
| pub mod alert_channel; | pub mod alert_channel; | ||||||
|  | pub mod alert_rule; | ||||||
| pub mod kube_prometheus; | pub mod kube_prometheus; | ||||||
|  | |||||||
							
								
								
									
										40
									
								
								harmony/src/modules/prometheus/alerts/infra/dell_server.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										40
									
								
								harmony/src/modules/prometheus/alerts/infra/dell_server.rs
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,40 @@ | |||||||
|  | use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule; | ||||||
|  | 
 | ||||||
|  | pub fn global_storage_status_degraded_non_critical() -> PrometheusAlertRule { | ||||||
|  |     PrometheusAlertRule::new("GlobalStorageStatusNonCritical", "globalStorageStatus == 4") | ||||||
|  |         .for_duration("5m") | ||||||
|  |         .label("severity", "warning") | ||||||
|  |         .annotation( | ||||||
|  |             "description", | ||||||
|  |             "- **system**: {{ $labels.instance }}\n- **Status**: nonCritical\n- **Value**: {{ $value }}\n- **Job**: {{ $labels.job }}", | ||||||
|  |         ) | ||||||
|  |         .annotation("title", " System storage status is in degraded state") | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | pub fn alert_global_storage_status_critical() -> PrometheusAlertRule { | ||||||
|  |     PrometheusAlertRule::new( | ||||||
|  |         "GlobalStorageStatus critical", | ||||||
|  |         "globalStorageStatus == 5", | ||||||
|  |     ) | ||||||
|  |     .for_duration("5m") | ||||||
|  |     .label("severity", "warning") | ||||||
|  |     .annotation("title", "System storage status is critical at {{ $labels.instance }}") | ||||||
|  |     .annotation( | ||||||
|  |         "description", | ||||||
|  |         "- **System**: {{ $labels.instance }}\n- **Status**: Critical\n- **Value**: {{ $value }}\n- **Job**: {{ $labels.job }}", | ||||||
|  |     ) | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | pub fn alert_global_storage_status_non_recoverable() -> PrometheusAlertRule { | ||||||
|  |     PrometheusAlertRule::new( | ||||||
|  |         "GlobalStorageStatus nonRecoverable", | ||||||
|  |         "globalStorageStatus == 6", | ||||||
|  |     ) | ||||||
|  |     .for_duration("5m") | ||||||
|  |     .label("severity", "warning") | ||||||
|  |     .annotation("title", "System storage status is nonRecoverable at {{ $labels.instance }}") | ||||||
|  |     .annotation( | ||||||
|  |         "description", | ||||||
|  |         "- **System**: {{ $labels.instance }}\n- **Status**: nonRecoverable\n- **Value**: {{ $value }}\n- **Job**: {{ $labels.job }}", | ||||||
|  |     ) | ||||||
|  | } | ||||||
							
								
								
									
										1
									
								
								harmony/src/modules/prometheus/alerts/infra/mod.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								harmony/src/modules/prometheus/alerts/infra/mod.rs
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1 @@ | |||||||
|  | pub mod dell_server; | ||||||
							
								
								
									
										1
									
								
								harmony/src/modules/prometheus/alerts/k8s/mod.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								harmony/src/modules/prometheus/alerts/k8s/mod.rs
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1 @@ | |||||||
|  | pub mod pvc; | ||||||
							
								
								
									
										11
									
								
								harmony/src/modules/prometheus/alerts/k8s/pvc.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										11
									
								
								harmony/src/modules/prometheus/alerts/k8s/pvc.rs
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,11 @@ | |||||||
|  | use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule; | ||||||
|  | 
 | ||||||
|  | pub fn high_pvc_fill_rate_over_two_days() -> PrometheusAlertRule { | ||||||
|  |     PrometheusAlertRule::new( | ||||||
|  |         "PVC Fill Over 95 Percent In 2 Days", | ||||||
|  |         "(kubelet_volume_stats_used_bytes/kubelet_volume_stats_capacity_bytes) > 0.95 AND predict_linear(kubelet_volume_stats_used_bytes[2d], 2 * 24 * 60 * 60)/kubelet_volume_stats_capacity_bytes > 0.95",) | ||||||
|  |         .for_duration("1m") | ||||||
|  |         .label("severity", "warning") | ||||||
|  |         .annotation("summary", "The PVC {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} is predicted to fill over 95% in less than 2 days.") | ||||||
|  |         .annotation("description", "PVC {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} will fill over 95% in less than 2 days",) | ||||||
|  | } | ||||||
							
								
								
									
										2
									
								
								harmony/src/modules/prometheus/alerts/mod.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										2
									
								
								harmony/src/modules/prometheus/alerts/mod.rs
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,2 @@ | |||||||
|  | pub mod infra; | ||||||
|  | pub mod k8s; | ||||||
							
								
								
									
										1
									
								
								harmony/src/modules/prometheus/mod.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								harmony/src/modules/prometheus/mod.rs
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1 @@ | |||||||
|  | pub mod alerts; | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user