diff --git a/Cargo.lock b/Cargo.lock index f0558a5..63c8897 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -930,6 +930,15 @@ dependencies = [ "zeroize", ] +[[package]] +name = "email_address" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e079f19b08ca6239f47f8ba8509c11cf3ea30095831f7fed61441475edd8c449" +dependencies = [ + "serde", +] + [[package]] name = "encoding_rs" version = "0.8.35" @@ -1400,6 +1409,7 @@ dependencies = [ "derive-new", "directories", "dockerfile_builder", + "email_address", "env_logger", "fqdn", "harmony_macros", diff --git a/examples/lamp/src/main.rs b/examples/lamp/src/main.rs index 06d8534..feac05d 100644 --- a/examples/lamp/src/main.rs +++ b/examples/lamp/src/main.rs @@ -4,7 +4,9 @@ use harmony::{ maestro::Maestro, modules::{ lamp::{LAMPConfig, LAMPScore}, - monitoring::monitoring_alerting::MonitoringAlertingStackScore, + monitoring::monitoring_alerting::{ + AlertChannel, MonitoringAlertingStackScore, WebhookServiceType, + }, }, topology::{K8sAnywhereTopology, Url}, }; @@ -43,8 +45,15 @@ async fn main() { .await .unwrap(); - let monitoring_stack_score = - MonitoringAlertingStackScore::new_with_ns(&lamp_stack.config.namespace); + let url = url::Url::parse("https://discord.com/api/webhooks/dummy_channel/dummy_token") + .expect("invalid URL"); + + let mut monitoring_stack_score = MonitoringAlertingStackScore::new(); + monitoring_stack_score.namespace = Some(lamp_stack.config.namespace.clone()); + monitoring_stack_score.alert_channel = Some(AlertChannel::WebHookUrl { + url: url, + webhook_service_type: WebhookServiceType::Discord, + }); maestro.register_all(vec![Box::new(lamp_stack), Box::new(monitoring_stack_score)]); // Here we bootstrap the CLI, this gives some nice features if you need them diff --git a/harmony/Cargo.toml b/harmony/Cargo.toml index 7128ee7..5bc88b1 100644 --- a/harmony/Cargo.toml +++ b/harmony/Cargo.toml @@ -39,6 +39,7 @@ lazy_static = "1.5.0" dockerfile_builder = "0.1.5" temp-file = "0.1.9" convert_case.workspace = true +email_address = "0.2.9" fqdn = { version = "0.4.6", features = [ "domain-label-cannot-start-or-end-with-hyphen", "domain-label-length-limited-to-63", diff --git a/harmony/src/modules/monitoring/config.rs b/harmony/src/modules/monitoring/config.rs new file mode 100644 index 0000000..7a073bc --- /dev/null +++ b/harmony/src/modules/monitoring/config.rs @@ -0,0 +1,55 @@ +use email_address::EmailAddress; +use serde::Serialize; +use url::Url; + +use super::monitoring_alerting::WebhookServiceType; + +#[derive(Debug, Clone, Serialize)] +pub struct KubePrometheusConfig { + pub namespace: String, + pub default_rules: bool, + pub windows_monitoring: bool, + pub alert_manager: bool, + pub node_exporter: bool, + pub prometheus: bool, + pub grafana: bool, + pub kubernetes_service_monitors: bool, + pub kubernetes_api_server: bool, + pub kubelet: bool, + pub kube_controller_manager: bool, + pub core_dns: bool, + pub kube_etcd: bool, + pub kube_scheduler: bool, + pub kube_proxy: bool, + pub kube_state_metrics: bool, + pub prometheus_operator: bool, + pub webhook_url: Option, + pub webhook_service_type: Option, + pub discord_alert_manager_release_name: String, +} +impl KubePrometheusConfig { + pub fn new() -> Self { + Self { + namespace: "monitoring".into(), + default_rules: true, + windows_monitoring: false, + alert_manager: true, + webhook_service_type: None, + webhook_url: None, + grafana: true, + node_exporter: false, + prometheus: true, + kubernetes_service_monitors: true, + kubernetes_api_server: false, + kubelet: false, + kube_controller_manager: false, + kube_etcd: false, + kube_proxy: false, + kube_state_metrics: true, + prometheus_operator: true, + core_dns: false, + kube_scheduler: false, + discord_alert_manager_release_name: "discord-alert-manager".into(), + } + } +} diff --git a/harmony/src/modules/monitoring/discord_alert_manager.rs b/harmony/src/modules/monitoring/discord_alert_manager.rs new file mode 100644 index 0000000..868b1b7 --- /dev/null +++ b/harmony/src/modules/monitoring/discord_alert_manager.rs @@ -0,0 +1,37 @@ +use std::str::FromStr; + +use non_blank_string_rs::NonBlankString; + +use crate::modules::helm::chart::HelmChartScore; + +use super::config::KubePrometheusConfig; + +pub fn discord_alert_manager_score(config: &KubePrometheusConfig) -> HelmChartScore { + let url = if let Some(url) = &config.webhook_url { + url.to_string() + } else { + "None".to_string() + }; + + let values = format!( + r#" + +environment: + - name: "DISCORD_WEBHOOK" + value: "{url}" + "#, + ); + + HelmChartScore { + namespace: Some(NonBlankString::from_str(&config.namespace).unwrap()), + release_name: NonBlankString::from_str(&config.discord_alert_manager_release_name).unwrap(), + chart_name: NonBlankString::from_str("oci://hub.nationtech.io/nt/alertmanager-discord") + .unwrap(), + chart_version: None, + values_overrides: None, + values_yaml: Some(values.to_string()), + create_namespace: true, + install_only: true, + repository: None, + } +} diff --git a/harmony/src/modules/monitoring/kube_prometheus.rs b/harmony/src/modules/monitoring/kube_prometheus.rs index c729c96..ed7916e 100644 --- a/harmony/src/modules/monitoring/kube_prometheus.rs +++ b/harmony/src/modules/monitoring/kube_prometheus.rs @@ -1,14 +1,67 @@ -use std::str::FromStr; - +use super::{config::KubePrometheusConfig, monitoring_alerting::WebhookServiceType}; use non_blank_string_rs::NonBlankString; +use std::{collections::HashMap, str::FromStr}; use crate::modules::helm::chart::HelmChartScore; -pub fn kube_prometheus_score(ns: &str) -> HelmChartScore { +pub fn kube_prometheus_helm_chart_score(config: &KubePrometheusConfig) -> HelmChartScore { //TODO this should be make into a rule with default formatting that can be easily passed as a vec //to the overrides or something leaving the user to deal with formatting here seems bad - let values = r#" + let default_rules = config.default_rules.to_string(); + let windows_monitoring = config.windows_monitoring.to_string(); + let alert_manager = config.alert_manager.to_string(); + let webhook_service_type = if let Some(service) = &config.webhook_service_type { + match service { + WebhookServiceType::Discord => "Discord".to_string(), + WebhookServiceType::Slack => "Slack".to_string(), + } + } else { + "None".to_string() + }; + let grafana = config.grafana.to_string(); + let kubernetes_service_monitors = config.kubernetes_service_monitors.to_string(); + let kubernetes_api_server = config.kubernetes_api_server.to_string(); + let kubelet = config.kubelet.to_string(); + let kube_controller_manager = config.kube_controller_manager.to_string(); + let core_dns = config.core_dns.to_string(); + let kube_etcd = config.kube_etcd.to_string(); + let kube_scheduler = config.kube_scheduler.to_string(); + let kube_proxy = config.kube_proxy.to_string(); + let kube_state_metrics = config.kube_state_metrics.to_string(); + let node_exporter = config.node_exporter.to_string(); + let prometheus_operator = config.prometheus_operator.to_string(); + let prometheus = config.prometheus.to_string(); + let discord_alert_manager_release_name = config.discord_alert_manager_release_name.to_string(); + let values = format!( + r#" additionalPrometheusRulesMap: + pods-status-alerts: + groups: + - name: pods + rules: + - alert: "[CRIT] POD not healthy" + expr: min_over_time(sum by (namespace, pod) (kube_pod_status_phase{{phase=~"Pending|Unknown|Failed"}})[15m:1m]) > 0 + for: 0m + labels: + severity: critical + annotations: + title: "[CRIT] POD not healthy : {{ $labels.pod }}" + description: | + A POD is in a non-ready state! + - **Pod**: {{ $labels.pod }} + - **Namespace**: {{ $labels.namespace }} + - alert: "[CRIT] POD crash looping" + expr: increase(kube_pod_container_status_restarts_total[5m]) > 3 + for: 0m + labels: + severity: critical + annotations: + title: "[CRIT] POD crash looping : {{ $labels.pod }}" + description: | + A POD is drowning in a crash loop! + - **Pod**: {{ $labels.pod }} + - **Namespace**: {{ $labels.namespace }} + - **Instance**: {{ $labels.instance }} pvc-alerts: groups: - name: pvc-alerts @@ -31,13 +84,99 @@ additionalPrometheusRulesMap: annotations: description: The PVC {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} is predicted to fill over 95% in less than 2 days. title: PVC {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} will fill over 95% in less than 2 days -"#; +defaultRules: + create: {default_rules} + rules: + alertmanager: true + etcd: true + configReloaders: true + general: true + k8sContainerCpuUsageSecondsTotal: true + k8sContainerMemoryCache: true + k8sContainerMemoryRss: true + k8sContainerMemorySwap: true + k8sContainerResource: true + k8sContainerMemoryWorkingSetBytes: true + k8sPodOwner: true + kubeApiserverAvailability: true + kubeApiserverBurnrate: true + kubeApiserverHistogram: true + kubeApiserverSlos: true + kubeControllerManager: true + kubelet: true + kubeProxy: true + kubePrometheusGeneral: true + kubePrometheusNodeRecording: true + kubernetesApps: true + kubernetesResources: true + kubernetesStorage: true + kubernetesSystem: true + kubeSchedulerAlerting: true + kubeSchedulerRecording: true + kubeStateMetrics: true + network: true + node: true + nodeExporterAlerting: true + nodeExporterRecording: true + prometheus: true + prometheusOperator: true + windows: true +windowsMonitoring: + enabled: {windows_monitoring} +alertmanager: + enabled: {alert_manager} + config: + route: + group_by: ['job'] + group_wait: 30s + group_interval: 5m + repeat_interval: 12h + receiver: '{webhook_service_type}' + routes: + - receiver: 'null' + matchers: + - alertname="Watchdog" + continue: false + receivers: + - name: 'null' + - name: '{webhook_service_type}' + webhook_configs: + - url: 'http://{discord_alert_manager_release_name}-alertmanager-discord:9094' + send_resolved: true +grafana: + enabled: {grafana} +kubernetesServiceMonitors: + enabled: {kubernetes_service_monitors} +kubeApiServer: + enabled: {kubernetes_api_server} +kubelet: + enabled: {kubelet} +kubeControllerManager: + enabled: {kube_controller_manager} +coreDns: + enabled: {core_dns} +kubeEtcd: + enabled: {kube_etcd} +kubeScheduler: + enabled: {kube_scheduler} +kubeProxy: + enabled: {kube_proxy} +kubeStateMetrics: + enabled: {kube_state_metrics} +nodeExporter: + enabled: {node_exporter} +prometheusOperator: + enabled: {prometheus_operator} +prometheus: + enabled: {prometheus} +"#, + ); + HelmChartScore { - namespace: Some(NonBlankString::from_str(ns).unwrap()), + namespace: Some(NonBlankString::from_str(&config.namespace).unwrap()), release_name: NonBlankString::from_str("kube-prometheus").unwrap(), chart_name: NonBlankString::from_str( - "oci://ghcr.io/prometheus-community/charts/kube-prometheus-stack", //use kube prometheus chart which includes grafana, prometheus, alert - //manager, etc + "oci://ghcr.io/prometheus-community/charts/kube-prometheus-stack", ) .unwrap(), chart_version: None, diff --git a/harmony/src/modules/monitoring/mod.rs b/harmony/src/modules/monitoring/mod.rs index 3c73fad..d880a67 100644 --- a/harmony/src/modules/monitoring/mod.rs +++ b/harmony/src/modules/monitoring/mod.rs @@ -1,2 +1,4 @@ mod kube_prometheus; pub mod monitoring_alerting; +mod discord_alert_manager; +mod config; diff --git a/harmony/src/modules/monitoring/monitoring_alerting.rs b/harmony/src/modules/monitoring/monitoring_alerting.rs index a08b038..6bb6e83 100644 --- a/harmony/src/modules/monitoring/monitoring_alerting.rs +++ b/harmony/src/modules/monitoring/monitoring_alerting.rs @@ -1,47 +1,179 @@ +use async_trait::async_trait; +use email_address::EmailAddress; + +use log::info; use serde::Serialize; +use url::Url; use crate::{ - interpret::Interpret, - modules::helm::chart::HelmChartScore, + data::{Id, Version}, + interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome}, + inventory::Inventory, score::Score, topology::{HelmCommand, Topology}, }; -use super::kube_prometheus::kube_prometheus_score; +use super::{ + config::KubePrometheusConfig, discord_alert_manager::discord_alert_manager_score, + kube_prometheus::kube_prometheus_helm_chart_score, +}; + +#[derive(Debug, Clone, Serialize)] +pub enum AlertChannel { + WebHookUrl { + url: Url, + webhook_service_type: WebhookServiceType, + }, + //TODO test and implement in helm chart + //currently does not work + Smpt { + email_address: EmailAddress, + service_name: String, + }, +} + +#[derive(Debug, Clone, Serialize)] +pub enum WebhookServiceType { + Discord, + //TODO test slack notifications + Slack, +} #[derive(Debug, Clone, Serialize)] pub struct MonitoringAlertingStackScore { - // TODO Support other components in our monitoring/alerting stack instead of a single helm - // chart - pub monitoring_stack: HelmChartScore, - pub namespace: String, + pub alert_channel: Option, + pub namespace: Option, } impl MonitoringAlertingStackScore { - pub fn new_with_ns(ns: &str) -> Self { + pub fn new() -> Self { Self { - monitoring_stack: kube_prometheus_score(ns), - namespace: ns.to_string(), + alert_channel: None, + namespace: None, } } -} - -impl Default for MonitoringAlertingStackScore { - fn default() -> Self { - let ns = "monitoring"; - Self { - monitoring_stack: kube_prometheus_score(ns), - namespace: ns.to_string(), + fn set_alert_channel(&self, config: &mut KubePrometheusConfig) { + if let Some(alert_channel) = &self.alert_channel { + match alert_channel { + AlertChannel::WebHookUrl { + url, + webhook_service_type, + } => { + config.webhook_url = Some(url.clone()); + config.webhook_service_type = Some(webhook_service_type.clone()); + } + AlertChannel::Smpt { + //TODO setup smpt alerts + email_address, + service_name, + } => { + todo!() + } + } } } } impl Score for MonitoringAlertingStackScore { fn create_interpret(&self) -> Box> { - self.monitoring_stack.create_interpret() + Box::new(MonitoringAlertingStackInterpret { + score: self.clone(), + }) } - fn name(&self) -> String { format!("MonitoringAlertingStackScore") } } + +#[derive(Debug, Clone, Serialize)] +struct MonitoringAlertingStackInterpret { + score: MonitoringAlertingStackScore, +} + +impl MonitoringAlertingStackInterpret { + async fn build_kube_prometheus_helm_chart_config(&self) -> KubePrometheusConfig { + let mut config = KubePrometheusConfig::new(); + self.score.set_alert_channel(&mut config); + if let Some(ns) = &self.score.namespace { + config.namespace = ns.clone(); + } + config + } + + async fn deploy_kube_prometheus_helm_chart_score( + &self, + inventory: &Inventory, + topology: &T, + config: &KubePrometheusConfig, + ) -> Result { + let helm_chart = kube_prometheus_helm_chart_score(config); + helm_chart + .create_interpret() + .execute(inventory, topology) + .await + } + + async fn deploy_alert_channel_service( + &self, + inventory: &Inventory, + topology: &T, + config: &KubePrometheusConfig, + ) -> Result { + match &self.score.alert_channel { + Some(AlertChannel::WebHookUrl { + webhook_service_type, + .. + }) => match webhook_service_type { + WebhookServiceType::Discord => { + discord_alert_manager_score(config) + .create_interpret() + .execute(inventory, topology) + .await + } + WebhookServiceType::Slack => Ok(Outcome::success( + "No extra configs for slack alerting".to_string(), + )), + }, + Some(AlertChannel::Smpt { .. }) => { + todo!() + } + None => Ok(Outcome::success("No alert channel configured".to_string())), + } + } +} + +#[async_trait] +impl Interpret for MonitoringAlertingStackInterpret { + async fn execute( + &self, + inventory: &Inventory, + topology: &T, + ) -> Result { + let config = self.build_kube_prometheus_helm_chart_config().await; + info!("Built kube prometheus config"); + info!("Installing kube prometheus chart"); + self.deploy_kube_prometheus_helm_chart_score(inventory, topology, &config) + .await?; + info!("Installing alert channel service"); + self.deploy_alert_channel_service(inventory, topology, &config).await?; + Ok(Outcome::success(format!( + "succesfully deployed monitoring and alerting stack" + ))) + } + + fn get_name(&self) -> InterpretName { + todo!() + } + + fn get_version(&self) -> Version { + todo!() + } + + fn get_status(&self) -> InterpretStatus { + todo!() + } + + fn get_children(&self) -> Vec { + todo!() + } +}