From b9e208f4cf36dc6ebbf036ceef7a7fba253075a0 Mon Sep 17 00:00:00 2001
From: Willem
Date: Tue, 22 Jul 2025 13:26:03 -0400
Subject: [PATCH] feat: add default Prometheus rules and Grafana dashboard for application monitoring

---
 .../application/features/monitoring.rs        |  24 +--
 ...=> crd_application_monitoring_alerting.rs} |  55 ++---
 .../monitoring/application_monitoring/mod.rs  |   2 +-
 .../kube_prometheus/crd/crd_default_rules.rs  |  58 +++--
 .../kube_prometheus/crd/crd_grafana.rs        |   2 -
 .../crd/crd_prometheus_rules.rs               |  21 +-
 .../crd/grafana_default_dashboard.rs          | 203 ++++++++++++++++++
 .../monitoring/kube_prometheus/crd/mod.rs     |   1 +
 .../monitoring/ntfy/helm/ntfy_helm_chart.rs   |   1 +
 .../prometheus/alerts/k8s/deployment.rs       |  23 ++
 .../prometheus/alerts/k8s/memory_usage.rs     |  37 ++++
 .../src/modules/prometheus/alerts/k8s/mod.rs  |   4 +
 .../src/modules/prometheus/alerts/k8s/pod.rs  |  55 +++++
 .../modules/prometheus/alerts/k8s/service.rs  |  19 ++
 14 files changed, 400 insertions(+), 105 deletions(-)
 rename harmony/src/modules/monitoring/application_monitoring/{helm_prometheus_application_alerting.rs => crd_application_monitoring_alerting.rs} (92%)
 create mode 100644 harmony/src/modules/monitoring/kube_prometheus/crd/grafana_default_dashboard.rs
 create mode 100644 harmony/src/modules/prometheus/alerts/k8s/deployment.rs
 create mode 100644 harmony/src/modules/prometheus/alerts/k8s/memory_usage.rs
 create mode 100644 harmony/src/modules/prometheus/alerts/k8s/pod.rs
 create mode 100644 harmony/src/modules/prometheus/alerts/k8s/service.rs

diff --git a/harmony/src/modules/application/features/monitoring.rs b/harmony/src/modules/application/features/monitoring.rs
index 8c01b2d..a700c7c 100644
--- a/harmony/src/modules/application/features/monitoring.rs
+++ b/harmony/src/modules/application/features/monitoring.rs
@@ -1,10 +1,8 @@
 use std::sync::Arc;
 
-use crate::modules::monitoring::application_monitoring::helm_prometheus_application_alerting::HelmPrometheusApplicationAlertingScore;
+use crate::modules::monitoring::application_monitoring::crd_application_monitoring_alerting::CRDApplicationAlertingScore;
 use crate::modules::monitoring::kube_prometheus::crd::crd_alertmanager_config::CRDAlertManagerReceiver;
-use crate::modules::monitoring::kube_prometheus::crd::crd_default_rules::{
-    build_rule_container_restarting, build_rule_pod_failed,
-};
+use crate::modules::monitoring::kube_prometheus::crd::crd_default_rules::build_default_application_rules;
 use crate::modules::monitoring::kube_prometheus::crd::crd_prometheus_rules::RuleGroup;
 use crate::modules::monitoring::kube_prometheus::crd::service_monitor::{
     ServiceMonitor, ServiceMonitorSpec,
@@ -39,7 +37,7 @@ impl
     ) -> Result<(), String> {
         info!("Ensuring monitoring is available for application");
         let namespace = self.application.name().clone();
-        let mut alerting_score = HelmPrometheusApplicationAlertingScore {
+        let mut alerting_score = CRDApplicationAlertingScore {
             namespace: namespace.clone(),
             receivers: self.alert_receiver.clone(),
             service_monitors: self.service_monitors.clone(),
@@ -92,9 +90,7 @@ impl
             prometheus_rules: vec![RuleGroup {
                 name: "default.rules".into(),
-                rules: vec![
-                    build_rule_container_restarting(),
-                    build_rule_pod_failed(),
-                ],
+                rules: build_default_application_rules(),
             }],
         };

diff --git a/harmony/src/modules/monitoring/application_monitoring/helm_prometheus_application_alerting.rs b/harmony/src/modules/monitoring/application_monitoring/crd_application_monitoring_alerting.rs
similarity index 92%
rename from harmony/src/modules/monitoring/application_monitoring/helm_prometheus_application_alerting.rs
rename to harmony/src/modules/monitoring/application_monitoring/crd_application_monitoring_alerting.rs
--- a/harmony/src/modules/monitoring/application_monitoring/helm_prometheus_application_alerting.rs
+++ b/harmony/src/modules/monitoring/application_monitoring/crd_application_monitoring_alerting.rs
@@ -42,13 +43,13 @@
-pub struct HelmPrometheusApplicationAlertingScore {
+pub struct CRDApplicationAlertingScore {
     pub namespace: String,
     pub receivers: Vec<Box<dyn CRDAlertManagerReceiver>>,
     pub service_monitors: Vec<ServiceMonitor>,
     pub prometheus_rules: Vec<RuleGroup>,
 }
 
-impl<T: Topology + K8sclient> Score<T> for HelmPrometheusApplicationAlertingScore {
+impl<T: Topology + K8sclient> Score<T> for CRDApplicationAlertingScore {
     fn create_interpret(&self) -> Box<dyn Interpret<T>> {
-        Box::new(HelmPrometheusApplicationAlertingInterpret {
+        Box::new(CRDApplicationAlertingInterpret {
             namespace: self.namespace.clone(),
             receivers: self.receivers.clone(),
             service_monitors: self.service_monitors.clone(),
@@ -55,12 +56,12 @@ impl<T: Topology + K8sclient> Score<T> for HelmPrometheusApplicationAlertingScor
     }
 
     fn name(&self) -> String {
-        "HelmPrometheusApplicationAlertingScore".into()
+        "CRDApplicationAlertingScore".into()
     }
 }
 
 #[derive(Clone, Debug)]
-pub struct HelmPrometheusApplicationAlertingInterpret {
+pub struct CRDApplicationAlertingInterpret {
     pub namespace: String,
     pub receivers: Vec<Box<dyn CRDAlertManagerReceiver>>,
     pub service_monitors: Vec<ServiceMonitor>,
@@ -68,7 +69,7 @@ pub struct HelmPrometheusApplicationAlertingInterpret {
 }
 
 #[async_trait]
-impl<T: Topology + K8sclient> Interpret<T> for HelmPrometheusApplicationAlertingInterpret {
+impl<T: Topology + K8sclient> Interpret<T> for CRDApplicationAlertingInterpret {
     async fn execute(
         &self,
         _inventory: &Inventory,
@@ -85,7 +86,7 @@ impl<T: Topology + K8sclient> Interpret<T> for HelmPrometheusApplicationAlerting
         self.install_monitors(self.service_monitors.clone(), &client)
             .await?;
         Ok(Outcome::success(format!(
-            "deployed application monitoring composants channels"
+            "deployed application monitoring components"
         )))
     }
 
@@ -106,7 +107,7 @@ impl<T: Topology + K8sclient> Interpret<T> for HelmPrometheusApplicationAlerting
     }
 }
 
-impl HelmPrometheusApplicationAlertingInterpret {
+impl CRDApplicationAlertingInterpret {
     async fn crd_exists(&self, crd: &str) -> bool {
         let output = Command::new("kubectl")
             .args(["get", "crd", crd])
@@ -428,41 +429,11 @@ impl HelmPrometheusApplicationAlertingInterpret {
         json_data.insert("timeInterval".to_string(), "5s".to_string());
 
         let namespace = self.namespace.clone();
-        let json = format!(
-            r#"{{
-    "title": "UP Status Dashboard",
-    "timezone": "browser",
-    "panels": [
-        {{
-            "type": "table",
-            "title": "Service UP Status",
-            "gridPos": {{ "x": 0, "y": 0, "w": 24, "h": 10 }},
-            "targets": [
-                {{
-                    "expr": "up{{namespace=\"{namespace}\"}}",
-                    "format": "table",
-                    "refId": "A"
-                }}
-            ],
-            "options": {{
-                "showHeader": true
-            }},
-            "fieldConfig": {{
-                "defaults": {{
-                    "custom": {{}}
-                }},
-                "overrides": []
-            }}
-        }}
-    ],
-    "schemaVersion": 30,
-    "version": 1
-    }}"#
-        );
+        let json = build_default_dashboard(&namespace);
 
         let graf_data_source = GrafanaDatasource {
             metadata: ObjectMeta {
-                name: Some(self.namespace.clone()),
+                name: Some(format!("grafana-datasource-{}", self.namespace.clone())),
                 namespace: Some(self.namespace.clone()),
                 ..Default::default()
             },
@@ -491,7 +462,7 @@ impl HelmPrometheusApplicationAlertingInterpret {
 
         let graf_dashboard = GrafanaDashboard {
             metadata: ObjectMeta {
-                name: Some(self.namespace.clone()),
+                name: Some(format!("grafana-dashboard-{}", self.namespace.clone())),
                 namespace: Some(self.namespace.clone()),
                 ..Default::default()
             },
@@ -509,7 +480,7 @@ impl HelmPrometheusApplicationAlertingInterpret {
         let grafana = Grafana {
             metadata: ObjectMeta {
-                name: Some(self.namespace.clone()),
+                name: Some(format!("grafana-{}", self.namespace.clone())),
                 namespace: Some(self.namespace.clone()),
                 labels: Some(label.clone()),
                 ..Default::default()
             },

diff --git a/harmony/src/modules/monitoring/application_monitoring/mod.rs b/harmony/src/modules/monitoring/application_monitoring/mod.rs
index 6274032..77364ed 100644
--- a/harmony/src/modules/monitoring/application_monitoring/mod.rs
+++ b/harmony/src/modules/monitoring/application_monitoring/mod.rs
@@ -1,2 +1,2 @@
-pub mod helm_prometheus_application_alerting;
+pub mod crd_application_monitoring_alerting;
 pub mod k8s_application_monitoring_score;

diff --git a/harmony/src/modules/monitoring/kube_prometheus/crd/crd_default_rules.rs b/harmony/src/modules/monitoring/kube_prometheus/crd/crd_default_rules.rs
index 1feb5ed..a245a86 100644
--- a/harmony/src/modules/monitoring/kube_prometheus/crd/crd_default_rules.rs
+++ b/harmony/src/modules/monitoring/kube_prometheus/crd/crd_default_rules.rs
@@ -1,38 +1,30 @@
 use std::collections::BTreeMap;
 
+use crate::modules::{
+    monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule,
+    prometheus::alerts::k8s::{
+        deployment::alert_deployment_unavailable,
+        pod::{alert_container_restarting, alert_pod_not_ready, pod_failed},
+        pvc::high_pvc_fill_rate_over_two_days,
+        service::alert_service_down,
+    },
+};
+
 use super::crd_prometheus_rules::Rule;
 
-pub fn build_rule_container_restarting() -> Rule {
-    Rule {
-        alert: Some("ContainerRestarting".into()),
-        expr: Some("increase(kube_pod_container_status_restarts_total[5m]) > 3".into()),
-        for_: Some("5m".into()),
-        labels: Some(BTreeMap::from([("severity".into(), "warning".into())])),
-        annotations: Some(BTreeMap::from([
-            (
-                "summary".into(),
-                "Container is restarting frequently".into(),
-            ),
-            (
-                "description".into(),
-                "Container in this namespace is restarting more than 3 times in 5 minutes.".into(),
-            ),
-        ])),
-    }
-}
-
-pub fn build_rule_pod_failed() -> Rule {
-    Rule {
-        alert: Some("PodFailed".into()),
-        expr: Some("kube_pod_status_phase{phase=\"Failed\"} > 0".into()),
-        for_: Some("0m".into()),
-        labels: Some(BTreeMap::from([("severity".into(), "critical".into())])),
-        annotations: Some(BTreeMap::from([
-            ("summary".into(), "A pod has failed".into()),
-            (
-                "description".into(),
-                "One or more pods are in Failed phase.".into(),
-            ),
-        ])),
-    }
+pub fn build_default_application_rules() -> Vec<Rule> {
+    let pod_failed: Rule = pod_failed().into();
+    let container_restarting: Rule = alert_container_restarting().into();
+    let pod_not_ready: Rule = alert_pod_not_ready().into();
+    let service_down: Rule = alert_service_down().into();
+    let deployment_unavailable: Rule = alert_deployment_unavailable().into();
+    let high_pvc_fill_rate: Rule = high_pvc_fill_rate_over_two_days().into();
+    vec![
+        pod_failed,
+        container_restarting,
+        pod_not_ready,
+        service_down,
+        deployment_unavailable,
+        high_pvc_fill_rate,
+    ]
 }

diff --git a/harmony/src/modules/monitoring/kube_prometheus/crd/crd_grafana.rs b/harmony/src/modules/monitoring/kube_prometheus/crd/crd_grafana.rs
index 74e76fe..793f639 100644
--- a/harmony/src/modules/monitoring/kube_prometheus/crd/crd_grafana.rs
+++ b/harmony/src/modules/monitoring/kube_prometheus/crd/crd_grafana.rs
@@ -4,8 +4,6 @@
 use kube::CustomResource;
 use schemars::JsonSchema;
 use serde::{Deserialize, Serialize};
-use crate::modules::monitoring::kube_prometheus::types::Operator;
-
 use super::crd_prometheuses::LabelSelector;
 
 #[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)]

diff --git a/harmony/src/modules/monitoring/kube_prometheus/crd/crd_prometheus_rules.rs b/harmony/src/modules/monitoring/kube_prometheus/crd/crd_prometheus_rules.rs
index 49bee3b..c0ee69e 100644
--- a/harmony/src/modules/monitoring/kube_prometheus/crd/crd_prometheus_rules.rs
+++ b/harmony/src/modules/monitoring/kube_prometheus/crd/crd_prometheus_rules.rs
@@ -1,8 +1,12 @@
+use std::collections::BTreeMap;
+
 use kube::CustomResource;
 use schemars::JsonSchema;
 use serde::{Deserialize, Serialize};
 
-use super::crd_default_rules::{build_rule_container_restarting, build_rule_pod_failed};
+use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;
+
+use super::crd_default_rules::build_default_application_rules;
 
 #[derive(CustomResource, Debug, Serialize, Deserialize, Clone, JsonSchema)]
 #[kube(
@@ -42,13 +46,14 @@ pub struct Rule {
     pub annotations: Option<BTreeMap<String, String>>,
 }
 
-impl PrometheusRuleSpec {
-    pub fn with_default_rules() -> Self {
-        PrometheusRuleSpec {
-            groups: vec![RuleGroup {
-                name: "default.rules".into(),
-                rules: vec![build_rule_container_restarting(), build_rule_pod_failed()],
-            }],
+impl From<PrometheusAlertRule> for Rule {
+    fn from(value: PrometheusAlertRule) -> Self {
+        Rule {
+            alert: Some(value.alert),
+            expr: Some(value.expr),
+            for_: value.r#for,
+            labels: Some(value.labels.into_iter().collect::<BTreeMap<_, _>>()),
+            annotations: Some(value.annotations.into_iter().collect::<BTreeMap<_, _>>()),
         }
     }
 }

diff --git a/harmony/src/modules/monitoring/kube_prometheus/crd/grafana_default_dashboard.rs b/harmony/src/modules/monitoring/kube_prometheus/crd/grafana_default_dashboard.rs
new file mode 100644
index 0000000..63fffa9
--- /dev/null
+++ b/harmony/src/modules/monitoring/kube_prometheus/crd/grafana_default_dashboard.rs
@@ -0,0 +1,203 @@
+pub fn build_default_dashboard(namespace: &str) -> String {
+    let dashboard = format!(
+        r#"{{
+  "annotations": {{
+    "list": []
+  }},
+  "editable": true,
+  "gnetId": null,
+  "graphTooltip": 0,
+  "id": null,
+  "iteration": 171105,
+  "panels": [
+    {{
+      "datasource": "$datasource",
+      "fieldConfig": {{
+        "defaults": {{
+          "unit": "short"
+        }},
+        "overrides": []
+      }},
+      "gridPos": {{
+        "h": 6,
+        "w": 6,
+        "x": 0,
+        "y": 0
+      }},
+      "id": 1,
+      "options": {{
+        "reduceOptions": {{
+          "calcs": ["lastNotNull"],
+          "fields": "",
+          "values": false
+        }}
+      }},
+      "pluginVersion": "9.0.0",
+      "targets": [
+        {{
+          "expr": "sum(kube_pod_status_phase{{namespace=\"{namespace}\", phase=\"Running\"}})",
+          "legendFormat": "",
+          "refId": "A"
+        }}
+      ],
+      "title": "Pods in Namespace",
+      "type": "stat"
+    }},
+    {{
+      "datasource": "$datasource",
+      "fieldConfig": {{
+        "defaults": {{
+          "unit": "short"
+        }},
+        "overrides": []
+      }},
+      "gridPos": {{
+        "h": 6,
+        "w": 6,
+        "x": 6,
+        "y": 0
+      }},
+      "id": 2,
+      "options": {{
+        "reduceOptions": {{
+          "calcs": ["lastNotNull"],
+          "fields": "",
+          "values": false
+        }}
+      }},
+      "pluginVersion": "9.0.0",
+      "targets": [
+        {{
+          "expr": "sum(kube_pod_status_phase{{phase=\"Failed\", namespace=\"{namespace}\"}})",
+          "legendFormat": "",
+          "refId": "A"
+        }}
+      ],
+      "title": "Pods in Failed State",
+      "type": "stat"
+    }},
+    {{
+      "datasource": "$datasource",
+      "fieldConfig": {{
+        "defaults": {{
+          "unit": "percentunit"
+        }},
+        "overrides": []
+      }},
+      "gridPos": {{
+        "h": 6,
+        "w": 12,
+        "x": 0,
+        "y": 6
+      }},
+      "id": 3,
+      "options": {{
+        "reduceOptions": {{
+          "calcs": ["lastNotNull"],
+          "fields": "",
+          "values": false
+        }}
+      }},
+      "pluginVersion": "9.0.0",
+      "targets": [
+        {{
+          "expr": "sum(kube_deployment_status_replicas_available{{namespace=\"{namespace}\"}}) / sum(kube_deployment_spec_replicas{{namespace=\"{namespace}\"}})",
+          "legendFormat": "",
+          "refId": "A"
+        }}
+      ],
+      "title": "Deployment Health (Available / Desired)",
+      "type": "stat"
+    }},
+    {{
+      "datasource": "$datasource",
+      "fieldConfig": {{
+        "defaults": {{
+          "unit": "short"
+        }},
+        "overrides": []
+      }},
+      "gridPos": {{
+        "h": 6,
+        "w": 12,
+        "x": 0,
+        "y": 12
+      }},
+      "id": 4,
+      "options": {{
+        "reduceOptions": {{
+          "calcs": ["lastNotNull"],
+          "fields": "",
+          "values": false
+        }}
+      }},
+      "pluginVersion": "9.0.0",
+      "targets": [
+        {{
+          "expr": "sum by(pod) (rate(kube_pod_container_status_restarts_total{{namespace=\"{namespace}\"}}[5m]))",
+          "legendFormat": "{{{{pod}}}}",
+          "refId": "A"
+        }}
+      ],
+      "title": "Container Restarts (per pod)",
+      "type": "timeseries"
+    }},
+    {{
+      "datasource": "$datasource",
+      "fieldConfig": {{
+        "defaults": {{
+          "unit": "short"
+        }},
+        "overrides": []
+      }},
+      "gridPos": {{
+        "h": 6,
+        "w": 12,
+        "x": 0,
+        "y": 18
+      }},
+      "id": 5,
+      "options": {{
+        "reduceOptions": {{
+          "calcs": ["lastNotNull"],
+          "fields": "",
+          "values": false
+        }}
+      }},
+      "pluginVersion": "9.0.0",
+      "targets": [
+        {{
+          "expr": "sum(ALERTS{{alertstate=\"firing\", namespace=\"{namespace}\"}}) or vector(0)",
+          "legendFormat": "",
+          "refId": "A"
+        }}
+      ],
+      "title": "Firing Alerts in Namespace",
+      "type": "stat"
+    }}
+  ],
+  "schemaVersion": 36,
+  "templating": {{
+    "list": [
+      {{
+        "name": "datasource",
+        "type": "datasource",
+        "pluginId": "prometheus",
+        "label": "Prometheus",
+        "query": "prometheus",
+        "refresh": 1,
+        "hide": 0,
+        "current": {{
+          "selected": true,
+          "text": "Prometheus",
+          "value": "Prometheus"
+        }}
+      }}
+    ]
+  }},
+  "title": "Tenant Namespace Overview",
+  "version": 1
+}}"#
+    );
+    dashboard
+}

diff --git a/harmony/src/modules/monitoring/kube_prometheus/crd/mod.rs b/harmony/src/modules/monitoring/kube_prometheus/crd/mod.rs
index 85ddf35..236a2de 100644
--- a/harmony/src/modules/monitoring/kube_prometheus/crd/mod.rs
+++ b/harmony/src/modules/monitoring/kube_prometheus/crd/mod.rs
@@ -4,6 +4,7 @@ pub mod crd_default_rules;
 pub mod crd_grafana;
 pub mod crd_prometheus_rules;
 pub mod crd_prometheuses;
+pub mod grafana_default_dashboard;
 pub mod grafana_operator;
 pub mod prometheus_operator;
 pub mod role;

diff --git a/harmony/src/modules/monitoring/ntfy/helm/ntfy_helm_chart.rs b/harmony/src/modules/monitoring/ntfy/helm/ntfy_helm_chart.rs
index 076a8a3..d94a78d 100644
--- a/harmony/src/modules/monitoring/ntfy/helm/ntfy_helm_chart.rs
+++ b/harmony/src/modules/monitoring/ntfy/helm/ntfy_helm_chart.rs
@@ -58,6 +58,7 @@ config:
   # web-root: "disable"
   enable-signup: false
   enable-login: "true"
+  enable-metrics: "true"
 
 persistence:
   enabled: true

diff --git a/harmony/src/modules/prometheus/alerts/k8s/deployment.rs b/harmony/src/modules/prometheus/alerts/k8s/deployment.rs
new file mode 100644
index 0000000..6e30f5f
--- /dev/null
+++ b/harmony/src/modules/prometheus/alerts/k8s/deployment.rs
@@ -0,0 +1,23 @@
+use std::collections::HashMap;
+
+use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;
+
+pub fn alert_deployment_unavailable() -> PrometheusAlertRule {
+    PrometheusAlertRule {
+        alert: "DeploymentUnavailable".into(),
+        expr: "kube_deployment_status_replicas_unavailable > 0".into(),
+        r#for: Some("2m".into()),
+        labels: HashMap::from([("severity".into(), "warning".into())]),
+        annotations: HashMap::from([
+            (
+                "summary".into(),
+                "Deployment has unavailable replicas".into(),
+            ),
+            (
+                "description".into(),
+                "A deployment in this namespace has unavailable replicas for over 2 minutes."
+                    .into(),
+            ),
+        ]),
+    }
+}

diff --git a/harmony/src/modules/prometheus/alerts/k8s/memory_usage.rs b/harmony/src/modules/prometheus/alerts/k8s/memory_usage.rs
new file mode 100644
index 0000000..11d65c9
--- /dev/null
+++ b/harmony/src/modules/prometheus/alerts/k8s/memory_usage.rs
@@ -0,0 +1,37 @@
+use std::collections::HashMap;
+
+use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;
+
+pub fn alert_high_memory_usage() -> PrometheusAlertRule {
+    PrometheusAlertRule {
+        alert: "HighMemoryUsage".into(),
+        expr: "container_memory_working_set_bytes{container!=\"\",namespace!=\"\"} > 500000000"
+            .into(),
+        r#for: Some("2m".into()),
+        labels: HashMap::from([("severity".into(), "warning".into())]),
+        annotations: HashMap::from([
+            ("summary".into(), "Pod is using high memory".into()),
+            (
+                "description".into(),
+                "A pod is consuming more than 500Mi of memory.".into(),
+            ),
+        ]),
+    }
+}
+
+pub fn alert_high_cpu_usage() -> PrometheusAlertRule {
+    PrometheusAlertRule {
+        alert: "HighCPUUsage".into(),
+        expr: "rate(container_cpu_usage_seconds_total{container!=\"\",namespace!=\"\"}[1m]) > 0.9"
+            .into(),
+        r#for: Some("1m".into()),
+        labels: HashMap::from([("severity".into(), "warning".into())]),
+        annotations: HashMap::from([
+            ("summary".into(), "Pod is using high CPU".into()),
+            (
+                "description".into(),
+                "A pod is using more than 90% of a core over 1 minute.".into(),
+            ),
+        ]),
+    }
+}

diff --git a/harmony/src/modules/prometheus/alerts/k8s/mod.rs b/harmony/src/modules/prometheus/alerts/k8s/mod.rs
index f01a9c8..0e3314b 100644
--- a/harmony/src/modules/prometheus/alerts/k8s/mod.rs
+++ b/harmony/src/modules/prometheus/alerts/k8s/mod.rs
@@ -1 +1,5 @@
+pub mod deployment;
+pub mod memory_usage;
+pub mod pod;
 pub mod pvc;
+pub mod service;

diff --git a/harmony/src/modules/prometheus/alerts/k8s/pod.rs b/harmony/src/modules/prometheus/alerts/k8s/pod.rs
new file mode 100644
index 0000000..152ec2f
--- /dev/null
+++ b/harmony/src/modules/prometheus/alerts/k8s/pod.rs
@@ -0,0 +1,55 @@
+use std::collections::HashMap;
+
+use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;
+
+pub fn pod_failed() -> PrometheusAlertRule {
+    PrometheusAlertRule {
+        alert: "PodFailed".into(),
+        expr: "kube_pod_status_phase{phase=\"Failed\"} > 2".into(),
+        r#for: Some("2m".into()),
+        labels: HashMap::from([("severity".into(), "critical".into())]),
+        annotations: HashMap::from([
+            ("summary".into(), "A pod has failed".into()),
+            (
+                "description".into(),
+                "One or more pods are in Failed phase.".into(),
+            ),
+        ]),
+    }
+}
+
+pub fn alert_container_restarting() -> PrometheusAlertRule {
+    PrometheusAlertRule {
+        alert: "ContainerRestarting".into(),
+        expr: "increase(kube_pod_container_status_restarts_total[5m]) > 3".into(),
+        r#for: Some("5m".into()),
+        labels: HashMap::from([("severity".into(), "warning".into())]),
+        annotations: HashMap::from([
+            (
+                "summary".into(),
+                "Container is restarting frequently".into(),
+            ),
+            (
+                "description".into(),
+                "A container in this namespace has restarted more than 3 times in 5 minutes."
+                    .into(),
+            ),
+        ]),
+    }
+}
+
+pub fn alert_pod_not_ready() -> PrometheusAlertRule {
+    PrometheusAlertRule {
+        alert: "PodNotReady".into(),
+        expr: "kube_pod_status_ready{condition=\"true\"} == 0".into(),
+        r#for: Some("2m".into()),
+        labels: HashMap::from([("severity".into(), "warning".into())]),
+        annotations: HashMap::from([
+            ("summary".into(), "Pod is not ready".into()),
+            (
+                "description".into(),
+                "A pod in the namespace is not reporting Ready status.".into(),
+            ),
+        ]),
+    }
+}

diff --git a/harmony/src/modules/prometheus/alerts/k8s/service.rs b/harmony/src/modules/prometheus/alerts/k8s/service.rs
new file mode 100644
index 0000000..5a56761
--- /dev/null
+++ b/harmony/src/modules/prometheus/alerts/k8s/service.rs
@@ -0,0 +1,19 @@
+use std::collections::HashMap;
+
+use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;
+
+pub fn alert_service_down() -> PrometheusAlertRule {
+    PrometheusAlertRule {
+        alert: "ServiceDown".into(),
+        expr: "up == 0".into(),
+        r#for: Some("1m".into()),
+        labels: HashMap::from([("severity".into(), "critical".into())]),
+        annotations: HashMap::from([
+            ("summary".into(), "Service is down".into()),
+            (
+                "description".into(),
+                "A target service in the namespace is not responding to Prometheus scrapes.".into(),
+            ),
+        ]),
+    }
+}
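
Reviewer note: with `PrometheusRuleSpec::with_default_rules()` removed above, a caller now
composes the spec from the new builder. A minimal sketch, assuming the crate layout in this
patch (`default_rule_spec` is a hypothetical helper, not something this patch adds):

    use crate::modules::monitoring::kube_prometheus::crd::{
        crd_default_rules::build_default_application_rules,
        crd_prometheus_rules::{PrometheusRuleSpec, RuleGroup},
    };

    // Hypothetical helper: groups the six default application alerts under a
    // single rule group, mirroring the removed `with_default_rules()`.
    fn default_rule_spec() -> PrometheusRuleSpec {
        PrometheusRuleSpec {
            groups: vec![RuleGroup {
                name: "default.rules".into(),
                rules: build_default_application_rules(),
            }],
        }
    }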
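Reviewer note: `build_default_dashboard` assembles the dashboard JSON with `format!`, so a
brace-escaping slip only surfaces at runtime. A smoke test like the sketch below would catch
that at `cargo test` time (assumes `serde_json` is available as a dev-dependency; the test is
not part of this patch):

    #[cfg(test)]
    mod tests {
        use super::build_default_dashboard;

        #[test]
        fn dashboard_is_valid_json_and_scoped_to_namespace() {
            let json = build_default_dashboard("my-namespace");
            // `{{`/`}}` in the template must collapse to valid JSON braces.
            let value: serde_json::Value =
                serde_json::from_str(&json).expect("dashboard must be valid JSON");
            assert_eq!(value["title"], "Tenant Namespace Overview");
            // Every PromQL target is scoped to the requested namespace.
            assert!(json.contains(r#"namespace=\"my-namespace\""#));
        }
    }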