Monitor an application within a tenant #86
| @ -1,10 +1,8 @@ | |||||||
| use std::sync::Arc; | use std::sync::Arc; | ||||||
| 
 | 
 | ||||||
| use crate::modules::monitoring::application_monitoring::helm_prometheus_application_alerting::HelmPrometheusApplicationAlertingScore; | use crate::modules::monitoring::application_monitoring::crd_application_monitoring_alerting::CRDApplicationAlertingScore; | ||||||
| use crate::modules::monitoring::kube_prometheus::crd::crd_alertmanager_config::CRDAlertManagerReceiver; | use crate::modules::monitoring::kube_prometheus::crd::crd_alertmanager_config::CRDAlertManagerReceiver; | ||||||
| use crate::modules::monitoring::kube_prometheus::crd::crd_default_rules::{ | use crate::modules::monitoring::kube_prometheus::crd::crd_default_rules::build_default_application_rules; | ||||||
|     build_rule_container_restarting, build_rule_pod_failed, |  | ||||||
| }; |  | ||||||
| use crate::modules::monitoring::kube_prometheus::crd::crd_prometheus_rules::RuleGroup; | use crate::modules::monitoring::kube_prometheus::crd::crd_prometheus_rules::RuleGroup; | ||||||
| use crate::modules::monitoring::kube_prometheus::crd::service_monitor::{ | use crate::modules::monitoring::kube_prometheus::crd::service_monitor::{ | ||||||
|     ServiceMonitor, ServiceMonitorSpec, |     ServiceMonitor, ServiceMonitorSpec, | ||||||
| @ -39,7 +37,7 @@ impl<T: Topology + HelmCommand + 'static + TenantManager + K8sclient + std::fmt: | |||||||
|     async fn ensure_installed(&self, topology: &T) -> Result<(), String> { |     async fn ensure_installed(&self, topology: &T) -> Result<(), String> { | ||||||
|         info!("Ensuring monitoring is available for application"); |         info!("Ensuring monitoring is available for application"); | ||||||
|         let namespace = self.application.name().clone(); |         let namespace = self.application.name().clone(); | ||||||
|         let mut alerting_score = HelmPrometheusApplicationAlertingScore { |         let mut alerting_score = CRDApplicationAlertingScore { | ||||||
|             namespace: namespace.clone(), |             namespace: namespace.clone(), | ||||||
|             receivers: self.alert_receiver.clone(), |             receivers: self.alert_receiver.clone(), | ||||||
|             service_monitors: self.service_monitors.clone(), |             service_monitors: self.service_monitors.clone(), | ||||||
| @ -92,9 +90,7 @@ impl<T: Topology + HelmCommand + 'static + TenantManager + K8sclient + std::fmt: | |||||||
| 
 | 
 | ||||||
|         alerting_score.receivers.push(Box::new(ntfy_receiver)); |         alerting_score.receivers.push(Box::new(ntfy_receiver)); | ||||||
| 
 | 
 | ||||||
|         //TODO add service monitors to PrometheusApplicationMonitoring which can be
 |         let service_monitor = ServiceMonitor { | ||||||
|         //deployed for the namespace using prometheus crd-servicemonitors
 |  | ||||||
|         let mut service_monitor = ServiceMonitor { |  | ||||||
|             metadata: ObjectMeta { |             metadata: ObjectMeta { | ||||||
|                 name: Some(self.application.name().clone()), |                 name: Some(self.application.name().clone()), | ||||||
|                 labels: Some(std::collections::BTreeMap::from([ |                 labels: Some(std::collections::BTreeMap::from([ | ||||||
| @ -110,22 +106,12 @@ impl<T: Topology + HelmCommand + 'static + TenantManager + K8sclient + std::fmt: | |||||||
|             }, |             }, | ||||||
|             spec: ServiceMonitorSpec::default(), |             spec: ServiceMonitorSpec::default(), | ||||||
|         }; |         }; | ||||||
|         let service_mon_endpoint = ServiceMonitorEndpoint { |  | ||||||
|             port: Some("http".into()), |  | ||||||
|             interval: Some("30s".into()), |  | ||||||
|             path: Some("/metrics".into()), |  | ||||||
|             scheme: None, |  | ||||||
|             relabelings: vec![], |  | ||||||
|             metric_relabelings: vec![], |  | ||||||
|         }; |  | ||||||
| 
 |  | ||||||
|         service_monitor.spec.endpoints.push(service_mon_endpoint); |  | ||||||
| 
 | 
 | ||||||
|         alerting_score.service_monitors.push(service_monitor); |         alerting_score.service_monitors.push(service_monitor); | ||||||
| 
 | 
 | ||||||
|         let rules_group = RuleGroup { |         let rules_group = RuleGroup { | ||||||
|             name: format!("{}-rules", self.application.name().clone()), |             name: format!("{}-rules", self.application.name().clone()), | ||||||
|             rules: vec![build_rule_container_restarting(), build_rule_pod_failed()], |             rules: build_default_application_rules(), | ||||||
|         }; |         }; | ||||||
| 
 | 
 | ||||||
|         alerting_score.prometheus_rules.push(rules_group); |         alerting_score.prometheus_rules.push(rules_group); | ||||||
|  | |||||||
| @ -19,6 +19,7 @@ use crate::modules::monitoring::kube_prometheus::crd::crd_grafana::{ | |||||||
| use crate::modules::monitoring::kube_prometheus::crd::crd_prometheus_rules::{ | use crate::modules::monitoring::kube_prometheus::crd::crd_prometheus_rules::{ | ||||||
|     PrometheusRule, PrometheusRuleSpec, RuleGroup, |     PrometheusRule, PrometheusRuleSpec, RuleGroup, | ||||||
| }; | }; | ||||||
|  | use crate::modules::monitoring::kube_prometheus::crd::grafana_default_dashboard::build_default_dashboard; | ||||||
| use crate::modules::monitoring::kube_prometheus::crd::service_monitor::ServiceMonitor; | use crate::modules::monitoring::kube_prometheus::crd::service_monitor::ServiceMonitor; | ||||||
| use crate::topology::{K8sclient, Topology, k8s::K8sClient}; | use crate::topology::{K8sclient, Topology, k8s::K8sClient}; | ||||||
| use crate::{ | use crate::{ | ||||||
| @ -37,16 +38,16 @@ use crate::{ | |||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
| #[derive(Clone, Debug, Serialize)] | #[derive(Clone, Debug, Serialize)] | ||||||
| pub struct HelmPrometheusApplicationAlertingScore { | pub struct CRDApplicationAlertingScore { | ||||||
|     pub namespace: String, |     pub namespace: String, | ||||||
|     pub receivers: Vec<Box<dyn CRDAlertManagerReceiver>>, |     pub receivers: Vec<Box<dyn CRDAlertManagerReceiver>>, | ||||||
|     pub service_monitors: Vec<ServiceMonitor>, |     pub service_monitors: Vec<ServiceMonitor>, | ||||||
|     pub prometheus_rules: Vec<RuleGroup>, |     pub prometheus_rules: Vec<RuleGroup>, | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| impl<T: Topology + K8sclient> Score<T> for HelmPrometheusApplicationAlertingScore { | impl<T: Topology + K8sclient> Score<T> for CRDApplicationAlertingScore { | ||||||
|     fn create_interpret(&self) -> Box<dyn crate::interpret::Interpret<T>> { |     fn create_interpret(&self) -> Box<dyn crate::interpret::Interpret<T>> { | ||||||
|         Box::new(HelmPrometheusApplicationAlertingInterpret { |         Box::new(CRDApplicationAlertingInterpret { | ||||||
|             namespace: self.namespace.clone(), |             namespace: self.namespace.clone(), | ||||||
|             receivers: self.receivers.clone(), |             receivers: self.receivers.clone(), | ||||||
|             service_monitors: self.service_monitors.clone(), |             service_monitors: self.service_monitors.clone(), | ||||||
| @ -55,12 +56,12 @@ impl<T: Topology + K8sclient> Score<T> for HelmPrometheusApplicationAlertingScor | |||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|     fn name(&self) -> String { |     fn name(&self) -> String { | ||||||
|         "HelmPrometheusApplicationAlertingScore".into() |         "CRDApplicationAlertingScore".into() | ||||||
|     } |     } | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| #[derive(Clone, Debug)] | #[derive(Clone, Debug)] | ||||||
| pub struct HelmPrometheusApplicationAlertingInterpret { | pub struct CRDApplicationAlertingInterpret { | ||||||
|     pub namespace: String, |     pub namespace: String, | ||||||
|     pub receivers: Vec<Box<dyn CRDAlertManagerReceiver>>, |     pub receivers: Vec<Box<dyn CRDAlertManagerReceiver>>, | ||||||
|     pub service_monitors: Vec<ServiceMonitor>, |     pub service_monitors: Vec<ServiceMonitor>, | ||||||
| @ -68,7 +69,7 @@ pub struct HelmPrometheusApplicationAlertingInterpret { | |||||||
| } | } | ||||||
| 
 | 
 | ||||||
| #[async_trait] | #[async_trait] | ||||||
| impl<T: Topology + K8sclient> Interpret<T> for HelmPrometheusApplicationAlertingInterpret { | impl<T: Topology + K8sclient> Interpret<T> for CRDApplicationAlertingInterpret { | ||||||
|     async fn execute( |     async fn execute( | ||||||
|         &self, |         &self, | ||||||
|         _inventory: &Inventory, |         _inventory: &Inventory, | ||||||
| @ -85,7 +86,7 @@ impl<T: Topology + K8sclient> Interpret<T> for HelmPrometheusApplicationAlerting | |||||||
|         self.install_monitors(self.service_monitors.clone(), &client) |         self.install_monitors(self.service_monitors.clone(), &client) | ||||||
|             .await?; |             .await?; | ||||||
|         Ok(Outcome::success(format!( |         Ok(Outcome::success(format!( | ||||||
|             "deployed application monitoring composants channels" |             "deployed application monitoring composants" | ||||||
|         ))) |         ))) | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
| @ -106,7 +107,7 @@ impl<T: Topology + K8sclient> Interpret<T> for HelmPrometheusApplicationAlerting | |||||||
|     } |     } | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| impl HelmPrometheusApplicationAlertingInterpret { | impl CRDApplicationAlertingInterpret { | ||||||
|     async fn crd_exists(&self, crd: &str) -> bool { |     async fn crd_exists(&self, crd: &str) -> bool { | ||||||
|         let output = Command::new("kubectl") |         let output = Command::new("kubectl") | ||||||
|             .args(["get", "crd", crd]) |             .args(["get", "crd", crd]) | ||||||
| @ -428,41 +429,11 @@ impl HelmPrometheusApplicationAlertingInterpret { | |||||||
|         json_data.insert("timeInterval".to_string(), "5s".to_string()); |         json_data.insert("timeInterval".to_string(), "5s".to_string()); | ||||||
|         let namespace = self.namespace.clone(); |         let namespace = self.namespace.clone(); | ||||||
| 
 | 
 | ||||||
|         let json = format!( |         let json = build_default_dashboard(&namespace); | ||||||
|             r#"{{
 |  | ||||||
|       "title": "UP Status Dashboard", |  | ||||||
|       "timezone": "browser", |  | ||||||
|       "panels": [ |  | ||||||
|         {{ |  | ||||||
|           "type": "table", |  | ||||||
|           "title": "Service UP Status", |  | ||||||
|           "gridPos": {{ "x": 0, "y": 0, "w": 24, "h": 10 }}, |  | ||||||
|           "targets": [ |  | ||||||
|             {{ |  | ||||||
|               "expr": "up{{namespace=\"{namespace}\"}}", |  | ||||||
|               "format": "table", |  | ||||||
|               "refId": "A" |  | ||||||
|             }} |  | ||||||
|           ], |  | ||||||
|           "options": {{ |  | ||||||
|             "showHeader": true |  | ||||||
|           }}, |  | ||||||
|           "fieldConfig": {{ |  | ||||||
|             "defaults": {{ |  | ||||||
|               "custom": {{}} |  | ||||||
|             }}, |  | ||||||
|             "overrides": [] |  | ||||||
|           }} |  | ||||||
|         }} |  | ||||||
|       ], |  | ||||||
|       "schemaVersion": 30, |  | ||||||
|       "version": 1 |  | ||||||
|     }}"#
 |  | ||||||
|         ); |  | ||||||
| 
 | 
 | ||||||
|         let graf_data_source = GrafanaDatasource { |         let graf_data_source = GrafanaDatasource { | ||||||
|             metadata: ObjectMeta { |             metadata: ObjectMeta { | ||||||
|                 name: Some(self.namespace.clone()), |                 name: Some(format!("grafana-datasource-{}", self.namespace.clone())), | ||||||
|                 namespace: Some(self.namespace.clone()), |                 namespace: Some(self.namespace.clone()), | ||||||
|                 ..Default::default() |                 ..Default::default() | ||||||
|             }, |             }, | ||||||
| @ -491,7 +462,7 @@ impl HelmPrometheusApplicationAlertingInterpret { | |||||||
| 
 | 
 | ||||||
|         let graf_dashboard = GrafanaDashboard { |         let graf_dashboard = GrafanaDashboard { | ||||||
|             metadata: ObjectMeta { |             metadata: ObjectMeta { | ||||||
|                 name: Some(self.namespace.clone()), |                 name: Some(format!("grafana-dashboard-{}", self.namespace.clone())), | ||||||
|                 namespace: Some(self.namespace.clone()), |                 namespace: Some(self.namespace.clone()), | ||||||
|                 ..Default::default() |                 ..Default::default() | ||||||
|             }, |             }, | ||||||
| @ -509,7 +480,7 @@ impl HelmPrometheusApplicationAlertingInterpret { | |||||||
| 
 | 
 | ||||||
|         let grafana = Grafana { |         let grafana = Grafana { | ||||||
|             metadata: ObjectMeta { |             metadata: ObjectMeta { | ||||||
|                 name: Some(self.namespace.clone()), |                 name: Some(format!("grafana-{}", self.namespace.clone())), | ||||||
|                 namespace: Some(self.namespace.clone()), |                 namespace: Some(self.namespace.clone()), | ||||||
|                 labels: Some(label.clone()), |                 labels: Some(label.clone()), | ||||||
|                 ..Default::default() |                 ..Default::default() | ||||||
| @ -1,2 +1,2 @@ | |||||||
| pub mod helm_prometheus_application_alerting; | pub mod crd_application_monitoring_alerting; | ||||||
| pub mod k8s_application_monitoring_score; | pub mod k8s_application_monitoring_score; | ||||||
|  | |||||||
| @ -1,38 +1,30 @@ | |||||||
| use std::collections::BTreeMap; | use std::collections::BTreeMap; | ||||||
| 
 | 
 | ||||||
|  | use crate::modules::{ | ||||||
|  |     monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule, | ||||||
|  |     prometheus::alerts::k8s::{ | ||||||
|  |         deployment::alert_deployment_unavailable, | ||||||
|  |         pod::{alert_container_restarting, alert_pod_not_ready, pod_failed}, | ||||||
|  |         pvc::high_pvc_fill_rate_over_two_days, | ||||||
|  |         service::alert_service_down, | ||||||
|  |     }, | ||||||
|  | }; | ||||||
|  | 
 | ||||||
| use super::crd_prometheus_rules::Rule; | use super::crd_prometheus_rules::Rule; | ||||||
| 
 | 
 | ||||||
| pub fn build_rule_container_restarting() -> Rule { | pub fn build_default_application_rules() -> Vec<Rule> { | ||||||
|     Rule { |     let pod_failed: Rule = pod_failed().into(); | ||||||
|         alert: Some("ContainerRestarting".into()), |     let container_restarting: Rule = alert_container_restarting().into(); | ||||||
|         expr: Some("increase(kube_pod_container_status_restarts_total[5m]) > 3".into()), |     let pod_not_ready: Rule = alert_pod_not_ready().into(); | ||||||
|         for_: Some("5m".into()), |     let service_down: Rule = alert_service_down().into(); | ||||||
|         labels: Some(BTreeMap::from([("severity".into(), "warning".into())])), |     let deployment_unavailable: Rule = alert_deployment_unavailable().into(); | ||||||
|         annotations: Some(BTreeMap::from([ |     let high_pvc_fill_rate: Rule = high_pvc_fill_rate_over_two_days().into(); | ||||||
|             ( |     vec![ | ||||||
|                 "summary".into(), |         pod_failed, | ||||||
|                 "Container is restarting frequently".into(), |         container_restarting, | ||||||
|             ), |         pod_not_ready, | ||||||
|             ( |         service_down, | ||||||
|                 "description".into(), |         deployment_unavailable, | ||||||
|                 "Container in this namespace is restarting more than 3 times in 5 minutes.".into(), |         high_pvc_fill_rate, | ||||||
|             ), |     ] | ||||||
|         ])), |  | ||||||
|     } |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| pub fn build_rule_pod_failed() -> Rule { |  | ||||||
|     Rule { |  | ||||||
|         alert: Some("PodFailed".into()), |  | ||||||
|         expr: Some("kube_pod_status_phase{phase=\"Failed\"} > 0".into()), |  | ||||||
|         for_: Some("0m".into()), |  | ||||||
|         labels: Some(BTreeMap::from([("severity".into(), "critical".into())])), |  | ||||||
|         annotations: Some(BTreeMap::from([ |  | ||||||
|             ("summary".into(), "A pod has failed".into()), |  | ||||||
|             ( |  | ||||||
|                 "description".into(), |  | ||||||
|                 "One or more pods are in Failed phase.".into(), |  | ||||||
|             ), |  | ||||||
|         ])), |  | ||||||
|     } |  | ||||||
| } | } | ||||||
|  | |||||||
| @ -4,8 +4,6 @@ use kube::CustomResource; | |||||||
| use schemars::JsonSchema; | use schemars::JsonSchema; | ||||||
| use serde::{Deserialize, Serialize}; | use serde::{Deserialize, Serialize}; | ||||||
| 
 | 
 | ||||||
| use crate::modules::monitoring::kube_prometheus::types::Operator; |  | ||||||
| 
 |  | ||||||
| use super::crd_prometheuses::LabelSelector; | use super::crd_prometheuses::LabelSelector; | ||||||
| 
 | 
 | ||||||
| #[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)] | #[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)] | ||||||
|  | |||||||
| @ -1,8 +1,12 @@ | |||||||
|  | use std::collections::BTreeMap; | ||||||
|  | 
 | ||||||
| use kube::CustomResource; | use kube::CustomResource; | ||||||
| use schemars::JsonSchema; | use schemars::JsonSchema; | ||||||
| use serde::{Deserialize, Serialize}; | use serde::{Deserialize, Serialize}; | ||||||
| 
 | 
 | ||||||
| use super::crd_default_rules::{build_rule_container_restarting, build_rule_pod_failed}; | use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule; | ||||||
|  | 
 | ||||||
|  | use super::crd_default_rules::build_default_application_rules; | ||||||
| 
 | 
 | ||||||
| #[derive(CustomResource, Debug, Serialize, Deserialize, Clone, JsonSchema)] | #[derive(CustomResource, Debug, Serialize, Deserialize, Clone, JsonSchema)] | ||||||
| #[kube(
 | #[kube(
 | ||||||
| @ -42,13 +46,14 @@ pub struct Rule { | |||||||
|     pub annotations: Option<std::collections::BTreeMap<String, String>>, |     pub annotations: Option<std::collections::BTreeMap<String, String>>, | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| impl PrometheusRuleSpec { | impl From<PrometheusAlertRule> for Rule { | ||||||
|     pub fn with_default_rules() -> Self { |     fn from(value: PrometheusAlertRule) -> Self { | ||||||
|         PrometheusRuleSpec { |         Rule { | ||||||
|             groups: vec![RuleGroup { |             alert: Some(value.alert), | ||||||
|                 name: "default.rules".into(), |             expr: Some(value.expr), | ||||||
|                 rules: vec![build_rule_container_restarting(), build_rule_pod_failed()], |             for_: value.r#for, | ||||||
|             }], |             labels: Some(value.labels.into_iter().collect::<BTreeMap<_, _>>()), | ||||||
|  |             annotations: Some(value.annotations.into_iter().collect::<BTreeMap<_, _>>()), | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  | |||||||
| @ -0,0 +1,203 @@ | |||||||
/// Builds the JSON model of the default Grafana dashboard for a single namespace.
///
/// The returned document is titled "Tenant Namespace Overview" and contains five
/// panels, all bound to the `$datasource` template variable and filtered on
/// `namespace`:
///
/// 1. "Pods in Namespace" (stat) - pods in the `Running` phase
/// 2. "Pods in Failed State" (stat) - pods in the `Failed` phase
/// 3. "Deployment Health (Available / Desired)" (stat) - available / desired replicas
/// 4. "Container Restarts (per pod)" (timeseries) - restart rate over 5m, per pod
/// 5. "Firing Alerts in Namespace" (stat) - count of firing alerts, `0` when none
///
/// # Parameters
///
/// * `namespace` - value substituted into every `namespace="..."` label matcher.
///   It is inserted verbatim (no JSON or PromQL escaping), so a value containing
///   `"` or `\` would produce a malformed document.
///   NOTE(review): callers presumably pass a Kubernetes namespace name, which
///   cannot contain those characters - confirm at the call site.
///
/// # Returns
///
/// The dashboard as a JSON string.
pub fn build_default_dashboard(namespace: &str) -> String {
    // Inside `format!`, literal JSON/PromQL braces are written doubled (`{{` / `}}`);
    // `{{{{pod}}}}` therefore renders as Grafana's `{{pod}}` legend placeholder.
    format!(
        r#"{{
  "annotations": {{
    "list": []
  }},
  "editable": true,
  "gnetId": null,
  "graphTooltip": 0,
  "id": null,
  "iteration": 171105,
  "panels": [
    {{
      "datasource": "$datasource",
      "fieldConfig": {{
        "defaults": {{
          "unit": "short"
        }},
        "overrides": []
      }},
      "gridPos": {{
        "h": 6,
        "w": 6,
        "x": 0,
        "y": 0
      }},
      "id": 1,
      "options": {{
        "reduceOptions": {{
          "calcs": ["lastNotNull"],
          "fields": "",
          "values": false
        }}
      }},
      "pluginVersion": "9.0.0",
      "targets": [
        {{
          "expr": "sum(kube_pod_status_phase{{namespace=\"{namespace}\",  phase=\"Running\"}})",
          "legendFormat": "",
          "refId": "A"
        }}
      ],
      "title": "Pods in Namespace",
      "type": "stat"
    }},
    {{
      "datasource": "$datasource",
      "fieldConfig": {{
        "defaults": {{
          "unit": "short"
        }},
        "overrides": []
      }},
      "gridPos": {{
        "h": 6,
        "w": 6,
        "x": 6,
        "y": 0
      }},
      "id": 2,
      "options": {{
        "reduceOptions": {{
          "calcs": ["lastNotNull"],
          "fields": "",
          "values": false
        }}
      }},
      "pluginVersion": "9.0.0",
      "targets": [
        {{
          "expr": "sum(kube_pod_status_phase{{phase=\"Failed\", namespace=\"{namespace}\"}})",
          "legendFormat": "",
          "refId": "A"
        }}
      ],
      "title": "Pods in Failed State",
      "type": "stat"
    }},
    {{
      "datasource": "$datasource",
      "fieldConfig": {{
        "defaults": {{
          "unit": "percentunit"
        }},
        "overrides": []
      }},
      "gridPos": {{
        "h": 6,
        "w": 12,
        "x": 0,
        "y": 6
      }},
      "id": 3,
      "options": {{
        "reduceOptions": {{
          "calcs": ["lastNotNull"],
          "fields": "",
          "values": false
        }}
      }},
      "pluginVersion": "9.0.0",
      "targets": [
        {{
          "expr": "sum(kube_deployment_status_replicas_available{{namespace=\"{namespace}\"}}) / sum(kube_deployment_spec_replicas{{namespace=\"{namespace}\"}})",
          "legendFormat": "",
          "refId": "A"
        }}
      ],
      "title": "Deployment Health (Available / Desired)",
      "type": "stat"
    }},
    {{
      "datasource": "$datasource",
      "fieldConfig": {{
        "defaults": {{
          "unit": "short"
        }},
        "overrides": []
      }},
      "gridPos": {{
        "h": 6,
        "w": 12,
        "x": 0,
        "y": 12
      }},
      "id": 4,
      "options": {{
        "reduceOptions": {{
          "calcs": ["lastNotNull"],
          "fields": "",
          "values": false
        }}
      }},
      "pluginVersion": "9.0.0",
      "targets": [
        {{
          "expr": "sum by(pod) (rate(kube_pod_container_status_restarts_total{{namespace=\"{namespace}\"}}[5m]))",
          "legendFormat": "{{{{pod}}}}",
          "refId": "A"
        }}
      ],
      "title": "Container Restarts (per pod)",
      "type": "timeseries"
    }},
    {{
      "datasource": "$datasource",
      "fieldConfig": {{
        "defaults": {{
          "unit": "short"
        }},
        "overrides": []
      }},
      "gridPos": {{
        "h": 6,
        "w": 12,
        "x": 0,
        "y": 18
      }},
      "id": 5,
      "options": {{
        "reduceOptions": {{
          "calcs": ["lastNotNull"],
          "fields": "",
          "values": false
        }}
      }},
      "pluginVersion": "9.0.0",
      "targets": [
        {{
          "expr": "sum(ALERTS{{alertstate=\"firing\", namespace=\"{namespace}\"}}) or vector(0)",
          "legendFormat": "",
          "refId": "A"
        }}
      ],
      "title": "Firing Alerts in Namespace",
      "type": "stat"
    }}
  ],
  "schemaVersion": 36,
  "templating": {{
    "list": [
      {{
        "name": "datasource",
        "type": "datasource",
        "pluginId": "prometheus",
        "label": "Prometheus",
        "query": "prometheus",
        "refresh": 1,
        "hide": 0,
        "current": {{
          "selected": true,
          "text": "Prometheus",
          "value": "Prometheus"
        }}
      }}
    ]
  }},
  "title": "Tenant Namespace Overview",
  "version": 1
}}"#
    )
}
| @ -4,6 +4,7 @@ pub mod crd_default_rules; | |||||||
| pub mod crd_grafana; | pub mod crd_grafana; | ||||||
| pub mod crd_prometheus_rules; | pub mod crd_prometheus_rules; | ||||||
| pub mod crd_prometheuses; | pub mod crd_prometheuses; | ||||||
|  | pub mod grafana_default_dashboard; | ||||||
| pub mod grafana_operator; | pub mod grafana_operator; | ||||||
| pub mod prometheus_operator; | pub mod prometheus_operator; | ||||||
| pub mod role; | pub mod role; | ||||||
|  | |||||||
| @ -58,6 +58,7 @@ config: | |||||||
|     # web-root: "disable" |     # web-root: "disable" | ||||||
|     enable-signup: false |     enable-signup: false | ||||||
|     enable-login: "true" |     enable-login: "true" | ||||||
|  |     enable-metrics: "true" | ||||||
| 
 | 
 | ||||||
| persistence: | persistence: | ||||||
|   enabled: true |   enabled: true | ||||||
|  | |||||||
							
								
								
									
										23
									
								
								harmony/src/modules/prometheus/alerts/k8s/deployment.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										23
									
								
								harmony/src/modules/prometheus/alerts/k8s/deployment.rs
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,23 @@ | |||||||
|  | use std::collections::HashMap; | ||||||
|  | 
 | ||||||
|  | use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule; | ||||||
|  | 
 | ||||||
|  | pub fn alert_deployment_unavailable() -> PrometheusAlertRule { | ||||||
|  |     PrometheusAlertRule { | ||||||
|  |         alert: "DeploymentUnavailable".into(), | ||||||
|  |         expr: "kube_deployment_status_replicas_unavailable > 0".into(), | ||||||
|  |         r#for: Some("2m".into()), | ||||||
|  |         labels: HashMap::from([("severity".into(), "warning".into())]), | ||||||
|  |         annotations: HashMap::from([ | ||||||
|  |             ( | ||||||
|  |                 "summary".into(), | ||||||
|  |                 "Deployment has unavailable replicas".into(), | ||||||
|  |             ), | ||||||
|  |             ( | ||||||
|  |                 "description".into(), | ||||||
|  |                 "A deployment in this namespace has unavailable replicas for over 2 minutes." | ||||||
|  |                     .into(), | ||||||
|  |             ), | ||||||
|  |         ]), | ||||||
|  |     } | ||||||
|  | } | ||||||
							
								
								
									
										37
									
								
								harmony/src/modules/prometheus/alerts/k8s/memory_usage.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										37
									
								
								harmony/src/modules/prometheus/alerts/k8s/memory_usage.rs
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,37 @@ | |||||||
|  | use std::collections::HashMap; | ||||||
|  | 
 | ||||||
|  | use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule; | ||||||
|  | 
 | ||||||
|  | pub fn alert_high_memory_usage() -> PrometheusAlertRule { | ||||||
|  |     PrometheusAlertRule { | ||||||
|  |         alert: "HighMemoryUsage".into(), | ||||||
|  |         expr: "container_memory_working_set_bytes{container!=\"\",namespace!=\"\"} > 500000000" | ||||||
|  |             .into(), | ||||||
|  |         r#for: Some("2m".into()), | ||||||
|  |         labels: HashMap::from([("severity".into(), "warning".into())]), | ||||||
|  |         annotations: HashMap::from([ | ||||||
|  |             ("summary".into(), "Pod is using high memory".into()), | ||||||
|  |             ( | ||||||
|  |                 "description".into(), | ||||||
|  |                 "A pod is consuming more than 500Mi of memory.".into(), | ||||||
|  |             ), | ||||||
|  |         ]), | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | pub fn alert_high_cpu_usage() -> PrometheusAlertRule { | ||||||
|  |     PrometheusAlertRule { | ||||||
|  |         alert: "HighCPUUsage".into(), | ||||||
|  |         expr: "rate(container_cpu_usage_seconds_total{container!=\"\",namespace!=\"\"}[1m]) > 0.9" | ||||||
|  |             .into(), | ||||||
|  |         r#for: Some("1m".into()), | ||||||
|  |         labels: HashMap::from([("severity".into(), "warning".into())]), | ||||||
|  |         annotations: HashMap::from([ | ||||||
|  |             ("summary".into(), "Pod is using high CPU".into()), | ||||||
|  |             ( | ||||||
|  |                 "description".into(), | ||||||
|  |                 "A pod is using more than 90% of a core over 1 minute.".into(), | ||||||
|  |             ), | ||||||
|  |         ]), | ||||||
|  |     } | ||||||
|  | } | ||||||
| @ -1 +1,5 @@ | |||||||
|  | pub mod deployment; | ||||||
|  | pub mod memory_usage; | ||||||
|  | pub mod pod; | ||||||
| pub mod pvc; | pub mod pvc; | ||||||
|  | pub mod service; | ||||||
|  | |||||||
							
								
								
									
										55
									
								
								harmony/src/modules/prometheus/alerts/k8s/pod.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										55
									
								
								harmony/src/modules/prometheus/alerts/k8s/pod.rs
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,55 @@ | |||||||
|  | use std::collections::HashMap; | ||||||
|  | 
 | ||||||
|  | use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule; | ||||||
|  | 
 | ||||||
|  | pub fn pod_failed() -> PrometheusAlertRule { | ||||||
|  |     PrometheusAlertRule { | ||||||
|  |         alert: "PodFailed".into(), | ||||||
|  |         expr: "kube_pod_status_phase{phase=\"Failed\"} > 2".into(), | ||||||
|  |         r#for: Some("2m".into()), | ||||||
|  |         labels: HashMap::from([("severity".into(), "critical".into())]), | ||||||
|  |         annotations: HashMap::from([ | ||||||
|  |             ("summary".into(), "A pod has failed".into()), | ||||||
|  |             ( | ||||||
|  |                 "description".into(), | ||||||
|  |                 "One or more pods are in Failed phase.".into(), | ||||||
|  |             ), | ||||||
|  |         ]), | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | pub fn alert_container_restarting() -> PrometheusAlertRule { | ||||||
|  |     PrometheusAlertRule { | ||||||
|  |         alert: "ContainerRestarting".into(), | ||||||
|  |         expr: "increase(kube_pod_container_status_restarts_total[5m]) > 3".into(), | ||||||
|  |         r#for: Some("5m".into()), | ||||||
|  |         labels: HashMap::from([("severity".into(), "warning".into())]), | ||||||
|  |         annotations: HashMap::from([ | ||||||
|  |             ( | ||||||
|  |                 "summary".into(), | ||||||
|  |                 "Container is restarting frequently".into(), | ||||||
|  |             ), | ||||||
|  |             ( | ||||||
|  |                 "description".into(), | ||||||
|  |                 "A container in this namespace has restarted more than 3 times in 5 minutes." | ||||||
|  |                     .into(), | ||||||
|  |             ), | ||||||
|  |         ]), | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | pub fn alert_pod_not_ready() -> PrometheusAlertRule { | ||||||
|  |     PrometheusAlertRule { | ||||||
|  |         alert: "PodNotReady".into(), | ||||||
|  |         expr: "kube_pod_status_ready{condition=\"true\"} == 0".into(), | ||||||
|  |         r#for: Some("2m".into()), | ||||||
|  |         labels: HashMap::from([("severity".into(), "warning".into())]), | ||||||
|  |         annotations: HashMap::from([ | ||||||
|  |             ("summary".into(), "Pod is not ready".into()), | ||||||
|  |             ( | ||||||
|  |                 "description".into(), | ||||||
|  |                 "A pod in the namespace is not reporting Ready status.".into(), | ||||||
|  |             ), | ||||||
|  |         ]), | ||||||
|  |     } | ||||||
|  | } | ||||||
							
								
								
									
										19
									
								
								harmony/src/modules/prometheus/alerts/k8s/service.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										19
									
								
								harmony/src/modules/prometheus/alerts/k8s/service.rs
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,19 @@ | |||||||
|  | use std::collections::HashMap; | ||||||
|  | 
 | ||||||
|  | use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule; | ||||||
|  | 
 | ||||||
|  | pub fn alert_service_down() -> PrometheusAlertRule { | ||||||
|  |     PrometheusAlertRule { | ||||||
|  |         alert: "ServiceDown".into(), | ||||||
|  |         expr: "up == 0".into(), | ||||||
|  |         r#for: Some("1m".into()), | ||||||
|  |         labels: HashMap::from([("severity".into(), "critical".into())]), | ||||||
|  |         annotations: HashMap::from([ | ||||||
|  |             ("summary".into(), "Service is down".into()), | ||||||
|  |             ( | ||||||
|  |                 "description".into(), | ||||||
|  |                 "A target service in the namespace is not responding to Prometheus scrapes.".into(), | ||||||
|  |             ), | ||||||
|  |         ]), | ||||||
|  |     } | ||||||
|  | } | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user