From e80752ea3fc6cdaa320dcc65c16f4585c5df73f1 Mon Sep 17 00:00:00 2001 From: Willem Date: Tue, 20 May 2025 15:51:03 -0400 Subject: [PATCH] feat: install discord alert manager helm chart when Discord is the chosen alerting channel --- harmony/src/modules/monitoring/config.rs | 42 ++-- .../monitoring/discord_alert_manager.rs | 37 ++++ .../src/modules/monitoring/kube_prometheus.rs | 179 ++++++++++++++---- harmony/src/modules/monitoring/mod.rs | 1 + .../modules/monitoring/monitoring_alerting.rs | 154 +++++++++++---- 5 files changed, 326 insertions(+), 87 deletions(-) create mode 100644 harmony/src/modules/monitoring/discord_alert_manager.rs diff --git a/harmony/src/modules/monitoring/config.rs b/harmony/src/modules/monitoring/config.rs index 412d713..7a073bc 100644 --- a/harmony/src/modules/monitoring/config.rs +++ b/harmony/src/modules/monitoring/config.rs @@ -2,46 +2,54 @@ use email_address::EmailAddress; use serde::Serialize; use url::Url; +use super::monitoring_alerting::WebhookServiceType; + #[derive(Debug, Clone, Serialize)] pub struct KubePrometheusConfig { pub namespace: String, - pub node_exporter: bool, + pub default_rules: bool, + pub windows_monitoring: bool, pub alert_manager: bool, + pub node_exporter: bool, pub prometheus: bool, pub grafana: bool, - pub windows_monitoring: bool, pub kubernetes_service_monitors: bool, + pub kubernetes_api_server: bool, pub kubelet: bool, pub kube_controller_manager: bool, + pub core_dns: bool, pub kube_etcd: bool, + pub kube_scheduler: bool, pub kube_proxy: bool, pub kube_state_metrics: bool, pub prometheus_operator: bool, pub webhook_url: Option, - pub webhook_service_name: Option, - pub smpt_email_address: Option, - pub smtp_service_name: Option, + pub webhook_service_type: Option, + pub discord_alert_manager_release_name: String, } impl KubePrometheusConfig { pub fn new() -> Self { Self { namespace: "monitoring".into(), - node_exporter: false, - alert_manager: false, - prometheus: true, - grafana: true, + default_rules: true, windows_monitoring: false, + alert_manager: true, + webhook_service_type: None, + webhook_url: None, + grafana: true, + node_exporter: false, + prometheus: true, kubernetes_service_monitors: true, - kubelet: true, - kube_controller_manager: true, - kube_etcd: true, - kube_proxy: true, + kubernetes_api_server: false, + kubelet: false, + kube_controller_manager: false, + kube_etcd: false, + kube_proxy: false, kube_state_metrics: true, prometheus_operator: true, - webhook_url: None, - webhook_service_name: None, - smpt_email_address: None, - smtp_service_name: None, + core_dns: false, + kube_scheduler: false, + discord_alert_manager_release_name: "discord-alert-manager".into(), } } } diff --git a/harmony/src/modules/monitoring/discord_alert_manager.rs b/harmony/src/modules/monitoring/discord_alert_manager.rs new file mode 100644 index 0000000..868b1b7 --- /dev/null +++ b/harmony/src/modules/monitoring/discord_alert_manager.rs @@ -0,0 +1,37 @@ +use std::str::FromStr; + +use non_blank_string_rs::NonBlankString; + +use crate::modules::helm::chart::HelmChartScore; + +use super::config::KubePrometheusConfig; + +pub fn discord_alert_manager_score(config: &KubePrometheusConfig) -> HelmChartScore { + let url = if let Some(url) = &config.webhook_url { + url.to_string() + } else { + "None".to_string() + }; + + let values = format!( + r#" + +environment: + - name: "DISCORD_WEBHOOK" + value: "{url}" + "#, + ); + + HelmChartScore { + namespace: Some(NonBlankString::from_str(&config.namespace).unwrap()), + release_name: NonBlankString::from_str(&config.discord_alert_manager_release_name).unwrap(), + chart_name: NonBlankString::from_str("oci://hub.nationtech.io/nt/alertmanager-discord") + .unwrap(), + chart_version: None, + values_overrides: None, + values_yaml: Some(values.to_string()), + create_namespace: true, + install_only: true, + repository: None, + } +} diff --git a/harmony/src/modules/monitoring/kube_prometheus.rs b/harmony/src/modules/monitoring/kube_prometheus.rs index 296891e..ed7916e 100644 --- a/harmony/src/modules/monitoring/kube_prometheus.rs +++ b/harmony/src/modules/monitoring/kube_prometheus.rs @@ -1,4 +1,4 @@ -use super::config::KubePrometheusConfig; +use super::{config::KubePrometheusConfig, monitoring_alerting::WebhookServiceType}; use non_blank_string_rs::NonBlankString; use std::{collections::HashMap, str::FromStr}; @@ -7,8 +7,61 @@ use crate::modules::helm::chart::HelmChartScore; pub fn kube_prometheus_helm_chart_score(config: &KubePrometheusConfig) -> HelmChartScore { //TODO this should be make into a rule with default formatting that can be easily passed as a vec //to the overrides or something leaving the user to deal with formatting here seems bad - let values = r#" + let default_rules = config.default_rules.to_string(); + let windows_monitoring = config.windows_monitoring.to_string(); + let alert_manager = config.alert_manager.to_string(); + let webhook_service_type = if let Some(service) = &config.webhook_service_type { + match service { + WebhookServiceType::Discord => "Discord".to_string(), + WebhookServiceType::Slack => "Slack".to_string(), + } + } else { + "None".to_string() + }; + let grafana = config.grafana.to_string(); + let kubernetes_service_monitors = config.kubernetes_service_monitors.to_string(); + let kubernetes_api_server = config.kubernetes_api_server.to_string(); + let kubelet = config.kubelet.to_string(); + let kube_controller_manager = config.kube_controller_manager.to_string(); + let core_dns = config.core_dns.to_string(); + let kube_etcd = config.kube_etcd.to_string(); + let kube_scheduler = config.kube_scheduler.to_string(); + let kube_proxy = config.kube_proxy.to_string(); + let kube_state_metrics = config.kube_state_metrics.to_string(); + let node_exporter = config.node_exporter.to_string(); + let prometheus_operator = config.prometheus_operator.to_string(); + let prometheus = config.prometheus.to_string(); + let discord_alert_manager_release_name = config.discord_alert_manager_release_name.to_string(); + let values = format!( + r#" additionalPrometheusRulesMap: + pods-status-alerts: + groups: + - name: pods + rules: + - alert: "[CRIT] POD not healthy" + expr: min_over_time(sum by (namespace, pod) (kube_pod_status_phase{{phase=~"Pending|Unknown|Failed"}})[15m:1m]) > 0 + for: 0m + labels: + severity: critical + annotations: + title: "[CRIT] POD not healthy : {{ $labels.pod }}" + description: | + A POD is in a non-ready state! + - **Pod**: {{ $labels.pod }} + - **Namespace**: {{ $labels.namespace }} + - alert: "[CRIT] POD crash looping" + expr: increase(kube_pod_container_status_restarts_total[5m]) > 3 + for: 0m + labels: + severity: critical + annotations: + title: "[CRIT] POD crash looping : {{ $labels.pod }}" + description: | + A POD is drowning in a crash loop! + - **Pod**: {{ $labels.pod }} + - **Namespace**: {{ $labels.namespace }} + - **Instance**: {{ $labels.instance }} pvc-alerts: groups: - name: pvc-alerts @@ -31,49 +84,103 @@ additionalPrometheusRulesMap: annotations: description: The PVC {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} is predicted to fill over 95% in less than 2 days. title: PVC {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} will fill over 95% in less than 2 days -"#; - let mut values_overrides: HashMap = HashMap::new(); - - macro_rules! insert_flag { - ($key:expr, $val:expr) => { - values_overrides.insert(NonBlankString::from_str($key).unwrap(), $val.to_string()); - }; - } - - insert_flag!("nodeExporter.enabled", config.node_exporter); - insert_flag!("windowsMonitoring.enabled", config.windows_monitoring); - insert_flag!("grafana.enabled", config.grafana); - insert_flag!("alertmanager.enabled", config.alert_manager); - insert_flag!("prometheus.enabled", config.prometheus); - insert_flag!( - "kubernetes_service_monitors.enabled", - config.kubernetes_service_monitors +defaultRules: + create: {default_rules} + rules: + alertmanager: true + etcd: true + configReloaders: true + general: true + k8sContainerCpuUsageSecondsTotal: true + k8sContainerMemoryCache: true + k8sContainerMemoryRss: true + k8sContainerMemorySwap: true + k8sContainerResource: true + k8sContainerMemoryWorkingSetBytes: true + k8sPodOwner: true + kubeApiserverAvailability: true + kubeApiserverBurnrate: true + kubeApiserverHistogram: true + kubeApiserverSlos: true + kubeControllerManager: true + kubelet: true + kubeProxy: true + kubePrometheusGeneral: true + kubePrometheusNodeRecording: true + kubernetesApps: true + kubernetesResources: true + kubernetesStorage: true + kubernetesSystem: true + kubeSchedulerAlerting: true + kubeSchedulerRecording: true + kubeStateMetrics: true + network: true + node: true + nodeExporterAlerting: true + nodeExporterRecording: true + prometheus: true + prometheusOperator: true + windows: true +windowsMonitoring: + enabled: {windows_monitoring} +alertmanager: + enabled: {alert_manager} + config: + route: + group_by: ['job'] + group_wait: 30s + group_interval: 5m + repeat_interval: 12h + receiver: '{webhook_service_type}' + routes: + - receiver: 'null' + matchers: + - alertname="Watchdog" + continue: false + receivers: + - name: 'null' + - name: '{webhook_service_type}' + webhook_configs: + - url: 'http://{discord_alert_manager_release_name}-alertmanager-discord:9094' + send_resolved: true +grafana: + enabled: {grafana} +kubernetesServiceMonitors: + enabled: {kubernetes_service_monitors} +kubeApiServer: + enabled: {kubernetes_api_server} +kubelet: + enabled: {kubelet} +kubeControllerManager: + enabled: {kube_controller_manager} +coreDns: + enabled: {core_dns} +kubeEtcd: + enabled: {kube_etcd} +kubeScheduler: + enabled: {kube_scheduler} +kubeProxy: + enabled: {kube_proxy} +kubeStateMetrics: + enabled: {kube_state_metrics} +nodeExporter: + enabled: {node_exporter} +prometheusOperator: + enabled: {prometheus_operator} +prometheus: + enabled: {prometheus} +"#, ); - insert_flag!("kubelet.enabled", config.kubelet); - insert_flag!( - "kubeControllerManager.enabled", - config.kube_controller_manager - ); - insert_flag!("kubeProxy.enabled", config.kube_proxy); - insert_flag!("kubeEtcd.enabled", config.kube_etcd); - insert_flag!("kubeStateMetrics.enabled", config.kube_state_metrics); - insert_flag!("prometheusOperator.enabled", config.prometheus_operator); - - if let (Some(url), Some(name)) = (&config.webhook_url, &config.webhook_service_name) { - insert_flag!("alertmanager.config.receivers.webhook_configs.url", url.as_str()); - insert_flag!("alertmanager.config.receivers.name", name.as_str()); - } HelmChartScore { namespace: Some(NonBlankString::from_str(&config.namespace).unwrap()), release_name: NonBlankString::from_str("kube-prometheus").unwrap(), chart_name: NonBlankString::from_str( - "oci://ghcr.io/prometheus-community/charts/kube-prometheus-stack", //use kube prometheus chart which includes grafana, prometheus, alert - //manager, etc + "oci://ghcr.io/prometheus-community/charts/kube-prometheus-stack", ) .unwrap(), chart_version: None, - values_overrides: Some(values_overrides), + values_overrides: None, values_yaml: Some(values.to_string()), create_namespace: true, install_only: true, diff --git a/harmony/src/modules/monitoring/mod.rs b/harmony/src/modules/monitoring/mod.rs index 01bb194..d880a67 100644 --- a/harmony/src/modules/monitoring/mod.rs +++ b/harmony/src/modules/monitoring/mod.rs @@ -1,3 +1,4 @@ mod kube_prometheus; pub mod monitoring_alerting; +mod discord_alert_manager; mod config; diff --git a/harmony/src/modules/monitoring/monitoring_alerting.rs b/harmony/src/modules/monitoring/monitoring_alerting.rs index 0ec6adf..6bb6e83 100644 --- a/harmony/src/modules/monitoring/monitoring_alerting.rs +++ b/harmony/src/modules/monitoring/monitoring_alerting.rs @@ -1,22 +1,31 @@ +use async_trait::async_trait; use email_address::EmailAddress; +use log::info; use serde::Serialize; use url::Url; use crate::{ - interpret::Interpret, + data::{Id, Version}, + interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome}, + inventory::Inventory, score::Score, topology::{HelmCommand, Topology}, }; -use super::{config::KubePrometheusConfig, kube_prometheus::kube_prometheus_helm_chart_score}; +use super::{ + config::KubePrometheusConfig, discord_alert_manager::discord_alert_manager_score, + kube_prometheus::kube_prometheus_helm_chart_score, +}; #[derive(Debug, Clone, Serialize)] pub enum AlertChannel { WebHookUrl { url: Url, - service_name: String, + webhook_service_type: WebhookServiceType, }, + //TODO test and implement in helm chart + //currently does not work Smpt { email_address: EmailAddress, service_name: String, @@ -24,15 +33,15 @@ pub enum AlertChannel { } #[derive(Debug, Clone, Serialize)] -pub enum Stack { - KubePrometheusStack, - OtherStack, +pub enum WebhookServiceType { + Discord, + //TODO test slack notifications + Slack, } #[derive(Debug, Clone, Serialize)] pub struct MonitoringAlertingStackScore { pub alert_channel: Option, - pub monitoring_stack: Stack, pub namespace: Option, } @@ -40,54 +49,131 @@ impl MonitoringAlertingStackScore { pub fn new() -> Self { Self { alert_channel: None, - monitoring_stack: Stack::KubePrometheusStack, namespace: None, } } - fn match_alert_channel(&self, config: &mut KubePrometheusConfig) { + fn set_alert_channel(&self, config: &mut KubePrometheusConfig) { if let Some(alert_channel) = &self.alert_channel { match alert_channel { - //opt1 - AlertChannel::WebHookUrl { url, service_name } => { + AlertChannel::WebHookUrl { + url, + webhook_service_type, + } => { config.webhook_url = Some(url.clone()); - config.webhook_service_name = Some(service_name.clone()); + config.webhook_service_type = Some(webhook_service_type.clone()); } - //opt2 AlertChannel::Smpt { + //TODO setup smpt alerts email_address, service_name, } => { - config.smpt_email_address = Some(email_address.clone()); - config.smtp_service_name = Some(service_name.clone()); + todo!() } } } } - fn build_kube_prometheus_helm_chart_config(&self) -> KubePrometheusConfig { - let mut config = KubePrometheusConfig::new(); - self.match_alert_channel(&mut config); - if let Some(ns) = &self.namespace { - config.namespace = ns.clone(); - } - config - } } impl Score for MonitoringAlertingStackScore { fn create_interpret(&self) -> Box> { - match &self.monitoring_stack { - Stack::KubePrometheusStack => { - let config = self.build_kube_prometheus_helm_chart_config(); - let helm_chart = kube_prometheus_helm_chart_score(&config); - helm_chart.create_interpret() - } - Stack::OtherStack => { - todo!() - } - } + Box::new(MonitoringAlertingStackInterpret { + score: self.clone(), + }) } - fn name(&self) -> String { format!("MonitoringAlertingStackScore") } } + +#[derive(Debug, Clone, Serialize)] +struct MonitoringAlertingStackInterpret { + score: MonitoringAlertingStackScore, +} + +impl MonitoringAlertingStackInterpret { + async fn build_kube_prometheus_helm_chart_config(&self) -> KubePrometheusConfig { + let mut config = KubePrometheusConfig::new(); + self.score.set_alert_channel(&mut config); + if let Some(ns) = &self.score.namespace { + config.namespace = ns.clone(); + } + config + } + + async fn deploy_kube_prometheus_helm_chart_score( + &self, + inventory: &Inventory, + topology: &T, + config: &KubePrometheusConfig, + ) -> Result { + let helm_chart = kube_prometheus_helm_chart_score(config); + helm_chart + .create_interpret() + .execute(inventory, topology) + .await + } + + async fn deploy_alert_channel_service( + &self, + inventory: &Inventory, + topology: &T, + config: &KubePrometheusConfig, + ) -> Result { + match &self.score.alert_channel { + Some(AlertChannel::WebHookUrl { + webhook_service_type, + .. + }) => match webhook_service_type { + WebhookServiceType::Discord => { + discord_alert_manager_score(config) + .create_interpret() + .execute(inventory, topology) + .await + } + WebhookServiceType::Slack => Ok(Outcome::success( + "No extra configs for slack alerting".to_string(), + )), + }, + Some(AlertChannel::Smpt { .. }) => { + todo!() + } + None => Ok(Outcome::success("No alert channel configured".to_string())), + } + } +} + +#[async_trait] +impl Interpret for MonitoringAlertingStackInterpret { + async fn execute( + &self, + inventory: &Inventory, + topology: &T, + ) -> Result { + let config = self.build_kube_prometheus_helm_chart_config().await; + info!("Built kube prometheus config"); + info!("Installing kube prometheus chart"); + self.deploy_kube_prometheus_helm_chart_score(inventory, topology, &config) + .await?; + info!("Installing alert channel service"); + self.deploy_alert_channel_service(inventory, topology, &config).await?; + Ok(Outcome::success(format!( + "succesfully deployed monitoring and alerting stack" + ))) + } + + fn get_name(&self) -> InterpretName { + todo!() + } + + fn get_version(&self) -> Version { + todo!() + } + + fn get_status(&self) -> InterpretStatus { + todo!() + } + + fn get_children(&self) -> Vec { + todo!() + } +}