diff --git a/harmony/src/modules/monitoring/config.rs b/harmony/src/modules/monitoring/config.rs index 7a073bc..c06377c 100644 --- a/harmony/src/modules/monitoring/config.rs +++ b/harmony/src/modules/monitoring/config.rs @@ -1,8 +1,7 @@ -use email_address::EmailAddress; use serde::Serialize; -use url::Url; -use super::monitoring_alerting::WebhookServiceType; +use super::monitoring_alerting::AlertChannel; + #[derive(Debug, Clone, Serialize)] pub struct KubePrometheusConfig { @@ -23,9 +22,7 @@ pub struct KubePrometheusConfig { pub kube_proxy: bool, pub kube_state_metrics: bool, pub prometheus_operator: bool, - pub webhook_url: Option<Url>, - pub webhook_service_type: Option<WebhookServiceType>, - pub discord_alert_manager_release_name: String, + pub alert_channel: Vec<AlertChannel>, } impl KubePrometheusConfig { pub fn new() -> Self { @@ -34,8 +31,7 @@ impl KubePrometheusConfig { default_rules: true, windows_monitoring: false, alert_manager: true, - webhook_service_type: None, - webhook_url: None, + alert_channel: Vec::new(), grafana: true, node_exporter: false, prometheus: true, @@ -49,7 +45,6 @@ impl KubePrometheusConfig { prometheus_operator: true, core_dns: false, kube_scheduler: false, - discord_alert_manager_release_name: "discord-alert-manager".into(), } } } diff --git a/harmony/src/modules/monitoring/discord_alert_manager.rs b/harmony/src/modules/monitoring/discord_alert_manager.rs index 868b1b7..5eaffa0 100644 --- a/harmony/src/modules/monitoring/discord_alert_manager.rs +++ b/harmony/src/modules/monitoring/discord_alert_manager.rs @@ -4,28 +4,29 @@ use non_blank_string_rs::NonBlankString; use crate::modules::helm::chart::HelmChartScore; -use super::config::KubePrometheusConfig; +use super::{config::KubePrometheusConfig, monitoring_alerting::AlertChannel}; -pub fn discord_alert_manager_score(config: &KubePrometheusConfig) -> HelmChartScore { - let url = if let Some(url) = &config.webhook_url { - url.to_string() - } else { - "None".to_string() - }; +fn get_discord_alert_manager_score(config: 
&KubePrometheusConfig) -> Option<HelmChartScore> { + let (url, name) = config.alert_channel.iter().find_map(|channel| { + if let AlertChannel::Discord { webhook_url, name } = channel { + Some((webhook_url, name)) + } else { + None + } + })?; let values = format!( - r#" - + r#" environment: - name: "DISCORD_WEBHOOK" value: "{url}" - "#, +"#, ); - HelmChartScore { + Some(HelmChartScore { namespace: Some(NonBlankString::from_str(&config.namespace).unwrap()), - release_name: NonBlankString::from_str(&config.discord_alert_manager_release_name).unwrap(), - chart_name: NonBlankString::from_str("oci://hub.nationtech.io/nt/alertmanager-discord") + release_name: NonBlankString::from_str(&name).unwrap(), + chart_name: NonBlankString::from_str("oci://hub.nationtech.io/library/alertmanager-discord") .unwrap(), chart_version: None, values_overrides: None, @@ -33,5 +34,13 @@ environment: create_namespace: true, install_only: true, repository: None, + }) +} + +pub fn discord_alert_manager_score(config: &KubePrometheusConfig) -> HelmChartScore { + if let Some(chart) = get_discord_alert_manager_score(config) { + chart + } else { + panic!("Expected discord alert manager helm chart"); } } diff --git a/harmony/src/modules/monitoring/kube_prometheus.rs b/harmony/src/modules/monitoring/kube_prometheus.rs index ed7916e..b694f51 100644 --- a/harmony/src/modules/monitoring/kube_prometheus.rs +++ b/harmony/src/modules/monitoring/kube_prometheus.rs @@ -1,6 +1,8 @@ -use super::{config::KubePrometheusConfig, monitoring_alerting::WebhookServiceType}; +use super::{config::KubePrometheusConfig, monitoring_alerting::AlertChannel}; +use log::info; use non_blank_string_rs::NonBlankString; use std::{collections::HashMap, str::FromStr}; +use url::Url; use crate::modules::helm::chart::HelmChartScore; @@ -10,14 +12,6 @@ pub fn kube_prometheus_helm_chart_score(config: &KubePrometheusConfig) -> HelmCh let default_rules = config.default_rules.to_string(); let windows_monitoring = config.windows_monitoring.to_string(); let 
alert_manager = config.alert_manager.to_string(); - let webhook_service_type = if let Some(service) = &config.webhook_service_type { - match service { - WebhookServiceType::Discord => "Discord".to_string(), - WebhookServiceType::Slack => "Slack".to_string(), - } - } else { - "None".to_string() - }; let grafana = config.grafana.to_string(); let kubernetes_service_monitors = config.kubernetes_service_monitors.to_string(); let kubernetes_api_server = config.kubernetes_api_server.to_string(); @@ -31,8 +25,7 @@ pub fn kube_prometheus_helm_chart_score(config: &KubePrometheusConfig) -> HelmCh let node_exporter = config.node_exporter.to_string(); let prometheus_operator = config.prometheus_operator.to_string(); let prometheus = config.prometheus.to_string(); - let discord_alert_manager_release_name = config.discord_alert_manager_release_name.to_string(); - let values = format!( + let mut values = format!( r#" additionalPrometheusRulesMap: pods-status-alerts: @@ -45,23 +38,23 @@ additionalPrometheusRulesMap: labels: severity: critical annotations: - title: "[CRIT] POD not healthy : {{ $labels.pod }}" + title: "[CRIT] POD not healthy : {{{{ $labels.pod }}}}" description: | A POD is in a non-ready state! - - **Pod**: {{ $labels.pod }} - - **Namespace**: {{ $labels.namespace }} + - **Pod**: {{{{ $labels.pod }}}} + - **Namespace**: {{{{ $labels.namespace }}}} - alert: "[CRIT] POD crash looping" expr: increase(kube_pod_container_status_restarts_total[5m]) > 3 for: 0m labels: severity: critical annotations: - title: "[CRIT] POD crash looping : {{ $labels.pod }}" + title: "[CRIT] POD crash looping : {{{{ $labels.pod }}}}" description: | A POD is drowning in a crash loop! 
- - **Pod**: {{ $labels.pod }} - - **Namespace**: {{ $labels.namespace }} - - **Instance**: {{ $labels.instance }} + - **Pod**: {{{{ $labels.pod }}}} + - **Namespace**: {{{{ $labels.namespace }}}} + - **Instance**: {{{{ $labels.instance }}}} pvc-alerts: groups: - name: pvc-alerts @@ -82,8 +75,8 @@ additionalPrometheusRulesMap: labels: severity: warning annotations: - description: The PVC {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} is predicted to fill over 95% in less than 2 days. - title: PVC {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} will fill over 95% in less than 2 days + description: The PVC {{{{ $labels.persistentvolumeclaim }}}} in namespace {{{{ $labels.namespace }}}} is predicted to fill over 95% in less than 2 days. + title: PVC {{{{ $labels.persistentvolumeclaim }}}} in namespace {{{{ $labels.namespace }}}} will fill over 95% in less than 2 days defaultRules: create: {default_rules} rules: @@ -123,26 +116,6 @@ defaultRules: windows: true windowsMonitoring: enabled: {windows_monitoring} -alertmanager: - enabled: {alert_manager} - config: - route: - group_by: ['job'] - group_wait: 30s - group_interval: 5m - repeat_interval: 12h - receiver: '{webhook_service_type}' - routes: - - receiver: 'null' - matchers: - - alertname="Watchdog" - continue: false - receivers: - - name: 'null' - - name: '{webhook_service_type}' - webhook_configs: - - url: 'http://{discord_alert_manager_release_name}-alertmanager-discord:9094' - send_resolved: true grafana: enabled: {grafana} kubernetesServiceMonitors: @@ -172,6 +145,66 @@ prometheus: "#, ); + let alertmanager_config = alert_manager_yaml_builder(&config); + values.push_str(&alertmanager_config); + + fn alert_manager_yaml_builder(config: &KubePrometheusConfig) -> String { + let mut receivers = String::new(); + let mut routes = String::new(); + let mut global_configs = String::new(); + let alert_manager = config.alert_manager; + for alert_channel in 
&config.alert_channel { + match alert_channel { + AlertChannel::Discord { name, .. } => { + let (receiver, route) = discord_alert_builder(name); + info!("discord receiver: {} \nroute: {}", receiver, route); + receivers.push_str(&receiver); + routes.push_str(&route); + } + AlertChannel::Slack { + slack_channel, + webhook_url, + } => { + let (receiver, route) = slack_alert_builder(slack_channel); + info!("slack receiver: {} \nroute: {}", receiver, route); + receivers.push_str(&receiver); + + routes.push_str(&route); + let global_config = format!( + r#" + global: + slack_api_url: {webhook_url}"# + ); + + global_configs.push_str(&global_config); + } + AlertChannel::Smpt { .. } => todo!(), + } + } + info!("after alert receiver: {}", receivers); + info!("after alert routes: {}", routes); + + let alertmanager_config = format!( + r#" +alertmanager: + enabled: {alert_manager} + config: {global_configs} + route: + group_by: ['job'] + group_wait: 30s + group_interval: 5m + repeat_interval: 12h + routes: +{routes} + receivers: + - name: 'null' +{receivers}"# + ); + + info!("alert manager config: {}", alertmanager_config); + alertmanager_config + } + HelmChartScore { namespace: Some(NonBlankString::from_str(&config.namespace).unwrap()), release_name: NonBlankString::from_str("kube-prometheus").unwrap(), @@ -187,3 +220,43 @@ prometheus: repository: None, } } + +fn discord_alert_builder(release_name: &String) -> (String, String) { + let discord_receiver_name = format!("Discord-{}", release_name); + let receiver = format!( + r#" + - name: '{discord_receiver_name}' + webhook_configs: + - url: 'http://{release_name}-alertmanager-discord:9094' + send_resolved: true"#, + ); + let route = format!( + r#" + - receiver: '{discord_receiver_name}' + matchers: + - alertname!=Watchdog + continue: true"#, + ); + (receiver, route) +} + +fn slack_alert_builder(slack_channel: &String) -> (String, String) { + let slack_receiver_name = format!("Slack-{}", slack_channel); + let receiver = format!( + 
r#" + - name: '{slack_receiver_name}' + slack_configs: + - channel: '{slack_channel}' + send_resolved: true + title: '{{{{ .CommonAnnotations.title }}}}' + text: '{{{{ .CommonAnnotations.description }}}}'"#, + ); + let route = format!( + r#" + - receiver: '{slack_receiver_name}' + matchers: + - alertname!=Watchdog + continue: true"#, + ); + (receiver, route) +} diff --git a/harmony/src/modules/monitoring/monitoring_alerting.rs b/harmony/src/modules/monitoring/monitoring_alerting.rs index 6bb6e83..0e7c2d4 100644 --- a/harmony/src/modules/monitoring/monitoring_alerting.rs +++ b/harmony/src/modules/monitoring/monitoring_alerting.rs @@ -20,9 +20,13 @@ use super::{ #[derive(Debug, Clone, Serialize)] pub enum AlertChannel { - WebHookUrl { - url: Url, - webhook_service_type: WebhookServiceType, + Discord { + name: String, + webhook_url: Url, + }, + Slack { + slack_channel: String, + webhook_url: Url, }, //TODO test and implement in helm chart //currently does not work @@ -32,46 +36,19 @@ pub enum AlertChannel { }, } -#[derive(Debug, Clone, Serialize)] -pub enum WebhookServiceType { - Discord, - //TODO test slack notifications - Slack, -} - #[derive(Debug, Clone, Serialize)] pub struct MonitoringAlertingStackScore { - pub alert_channel: Option<AlertChannel>, + pub alert_channel: Vec<AlertChannel>, pub namespace: Option<String>, } impl MonitoringAlertingStackScore { pub fn new() -> Self { Self { - alert_channel: None, + alert_channel: Vec::new(), namespace: None, } } - fn set_alert_channel(&self, config: &mut KubePrometheusConfig) { - if let Some(alert_channel) = &self.alert_channel { - match alert_channel { - AlertChannel::WebHookUrl { - url, - webhook_service_type, - } => { - config.webhook_url = Some(url.clone()); - config.webhook_service_type = Some(webhook_service_type.clone()); - } - AlertChannel::Smpt { - //TODO setup smpt alerts - email_address, - service_name, - } => { - todo!() - } - } - } - } } impl Score for MonitoringAlertingStackScore { @@ -93,10 +70,10 @@ struct MonitoringAlertingStackInterpret 
{ impl MonitoringAlertingStackInterpret { async fn build_kube_prometheus_helm_chart_config(&self) -> KubePrometheusConfig { let mut config = KubePrometheusConfig::new(); - self.score.set_alert_channel(&mut config); if let Some(ns) = &self.score.namespace { config.namespace = ns.clone(); } + config.alert_channel = self.score.alert_channel.clone(); config } @@ -119,26 +96,30 @@ impl MonitoringAlertingStackInterpret { topology: &T, config: &KubePrometheusConfig, ) -> Result<Outcome, InterpretError> { - match &self.score.alert_channel { - Some(AlertChannel::WebHookUrl { - webhook_service_type, - .. - }) => match webhook_service_type { - WebhookServiceType::Discord => { + let mut outcomes = vec![]; + + for channel in &self.score.alert_channel { + let outcome = match channel { + AlertChannel::Discord { .. } => { discord_alert_manager_score(config) .create_interpret() .execute(inventory, topology) .await } - WebhookServiceType::Slack => Ok(Outcome::success( + AlertChannel::Slack { .. } => Ok(Outcome::success( "No extra configs for slack alerting".to_string(), )), - }, - Some(AlertChannel::Smpt { .. }) => { - todo!() - } - None => Ok(Outcome::success("No alert channel configured".to_string())), + AlertChannel::Smpt { .. } => { + todo!() + } + }; + outcomes.push(outcome); } + for result in outcomes { + result?; + } + + Ok(Outcome::success("All alert channels deployed".to_string())) } } @@ -155,7 +136,8 @@ impl Interpret for MonitoringAlertingStackInterpre self.deploy_kube_prometheus_helm_chart_score(inventory, topology, &config) .await?; info!("Installing alert channel service"); - self.deploy_alert_channel_service(inventory, topology, &config).await?; + self.deploy_alert_channel_service(inventory, topology, &config) + .await?; Ok(Outcome::success(format!( "succesfully deployed monitoring and alerting stack" )))