diff --git a/examples/rust/src/main.rs b/examples/rust/src/main.rs
index c44ce88..6a3fe81 100644
--- a/examples/rust/src/main.rs
+++ b/examples/rust/src/main.rs
@@ -3,9 +3,18 @@ use std::{path::PathBuf, sync::Arc};
 use harmony::{
     inventory::Inventory,
     maestro::Maestro,
-    modules::application::{
-        ApplicationScore, RustWebFramework, RustWebapp,
-        features::{ContinuousDelivery, Monitoring},
+    modules::{
+        application::{
+            ApplicationScore, RustWebFramework, RustWebapp,
+            features::{ContinuousDelivery, PrometheusMonitoring},
+        },
+        monitoring::{
+            alert_channel::discord_alert_channel::DiscordWebhook,
+            alert_rule::prometheus_alert_rule::AlertManagerRuleGroup,
+        },
+        prometheus::alerts::k8s::{
+            pod::pod_in_failed_state, pvc::high_pvc_fill_rate_over_two_days,
+        },
     },
     topology::{K8sAnywhereTopology, Url},
 };
@@ -20,12 +29,25 @@ async fn main() {
         framework: Some(RustWebFramework::Leptos),
     });
 
+    let pod_failed = pod_in_failed_state();
+    let pod_failed_2 = pod_in_failed_state();
+    let pod_failed_3 = pod_in_failed_state();
+
+    let additional_rules = AlertManagerRuleGroup::new("pod-alerts", vec![pod_failed]);
+    let additional_rules_2 = AlertManagerRuleGroup::new("pod-alerts-2", vec![pod_failed_2, pod_failed_3]);
     let app = ApplicationScore {
         features: vec![
-            Box::new(ContinuousDelivery {
+            //Box::new(ContinuousDelivery {
+            //    application: application.clone(),
+            //}),
+            Box::new(PrometheusMonitoring {
                 application: application.clone(),
+                alert_receivers: vec![Box::new(DiscordWebhook {
+                    name: "dummy-discord".to_string(),
+                    url: Url::Url(url::Url::parse("https://discord.doesnt.exist.com").unwrap()),
+                })],
+                alert_rules: vec![Box::new(additional_rules), Box::new(additional_rules_2)],
             }),
-            Box::new(Monitoring {}),
             // TODO add monitoring, backups, multisite ha, etc
         ],
         application,
diff --git a/harmony/src/domain/topology/oberservability/monitoring.rs b/harmony/src/domain/topology/oberservability/monitoring.rs
index 6d60c7a..05b6621 100644
--- a/harmony/src/domain/topology/oberservability/monitoring.rs
+++ b/harmony/src/domain/topology/oberservability/monitoring.rs
@@ -1,3 +1,5 @@
+use std::any::Any;
+
 use async_trait::async_trait;
 use log::debug;
 
@@ -9,7 +11,7 @@ use crate::{
 };
 
 #[async_trait]
-pub trait AlertSender: Send + Sync + std::fmt::Debug {
+pub trait AlertSender: Any + Send + Sync + std::fmt::Debug {
     fn name(&self) -> String;
 }
diff --git a/harmony/src/modules/application/features/monitoring.rs b/harmony/src/modules/application/features/monitoring.rs
index dd16226..5d0f596 100644
--- a/harmony/src/modules/application/features/monitoring.rs
+++ b/harmony/src/modules/application/features/monitoring.rs
@@ -9,32 +9,36 @@ use crate::{
         application::{Application, ApplicationFeature},
         monitoring::{
             application_monitoring::k8s_application_monitoring_score::ApplicationPrometheusMonitoringScore,
-            kube_prometheus::types::{NamespaceSelector, ServiceMonitor},
+            kube_prometheus::types::{NamespaceSelector, ServiceMonitor}, prometheus::prometheus::Prometheus,
         },
     },
     score::Score,
-    topology::{HelmCommand, Topology, tenant::TenantManager},
+    topology::{oberservability::monitoring::{AlertReceiver, AlertRule, AlertSender}, tenant::TenantManager, HelmCommand, K8sclient, Topology},
 };
 
 #[derive(Debug, Clone)]
-pub struct Monitoring {
+pub struct PrometheusMonitoring {
     pub application: Arc<dyn Application>,
+    pub alert_receivers: Vec<Box<dyn AlertReceiver<Prometheus>>>,
+    pub alert_rules: Vec<Box<dyn AlertRule<Prometheus>>>,
 }
 
 #[async_trait]
-impl<T: Topology + HelmCommand + TenantManager> ApplicationFeature<T> for Monitoring {
+impl<T: Topology + HelmCommand + TenantManager + K8sclient> ApplicationFeature<T> for PrometheusMonitoring {
     async fn ensure_installed(&self, topology: &T) -> Result<(), String> {
         info!("Ensuring monitoring is available for application");
         let ns = self.application.name();
         let mut service_monitor = ServiceMonitor::default();
+        service_monitor.name = ns.clone();
+        service_monitor.namespace = ns.clone();
         service_monitor.namespace_selector = Some(NamespaceSelector {
             any: true,
             match_names: vec![ns.clone()],
         });
         let alerting_score = ApplicationPrometheusMonitoringScore {
             namespace: ns,
-            receivers: vec![],
-            rules: vec![],
+            receivers: self.alert_receivers.clone(),
+            rules: self.alert_rules.clone(),
             service_monitors: vec![service_monitor],
         };
diff --git a/harmony/src/modules/helm/chart.rs b/harmony/src/modules/helm/chart.rs
index 309bd1e..e1518c2 100644
--- a/harmony/src/modules/helm/chart.rs
+++ b/harmony/src/modules/helm/chart.rs
@@ -220,7 +220,6 @@ impl Interpret for HelmChartInterpret {
             yaml_path,
             Some(&helm_options),
         );
-
         let status = match res {
             Ok(status) => status,
             Err(err) => return Err(InterpretError::new(err.to_string())),
diff --git a/harmony/src/modules/monitoring/alert_channel/discord_alert_channel.rs b/harmony/src/modules/monitoring/alert_channel/discord_alert_channel.rs
index be8f0e3..a2e1a9e 100644
--- a/harmony/src/modules/monitoring/alert_channel/discord_alert_channel.rs
+++ b/harmony/src/modules/monitoring/alert_channel/discord_alert_channel.rs
@@ -1,3 +1,5 @@
+use std::any::Any;
+
 use async_trait::async_trait;
 use serde::Serialize;
 use serde_yaml::{Mapping, Value};
@@ -11,7 +13,10 @@ use crate::{
         },
         prometheus::prometheus::{Prometheus, PrometheusReceiver},
     },
-    topology::{Url, oberservability::monitoring::AlertReceiver},
+    topology::{
+        Url,
+        oberservability::monitoring::{AlertReceiver, AlertSender},
+    },
 };
 
 #[derive(Debug, Clone, Serialize)]
diff --git a/harmony/src/modules/monitoring/application_monitoring/k8s_application_monitoring_score.rs b/harmony/src/modules/monitoring/application_monitoring/k8s_application_monitoring_score.rs
index dc0b9d7..dd7c7e4 100644
--- a/harmony/src/modules/monitoring/application_monitoring/k8s_application_monitoring_score.rs
+++ b/harmony/src/modules/monitoring/application_monitoring/k8s_application_monitoring_score.rs
@@ -6,13 +6,11 @@ use serde::Serialize;
 use crate::{
     modules::monitoring::{
         kube_prometheus::types::ServiceMonitor,
-        prometheus::{prometheus::Prometheus, prometheus_config::PrometheusConfig},
+        prometheus::{prometheus::Prometheus, prometheus_config::HelmPrometheusConfig},
     },
     score::Score,
     topology::{
-        HelmCommand, Topology,
-        oberservability::monitoring::{AlertReceiver, AlertRule, AlertingInterpret},
-        tenant::TenantManager,
+        oberservability::monitoring::{AlertReceiver, AlertRule, AlertingInterpret}, tenant::TenantManager, HelmCommand, K8sclient, Topology
     },
 };
 
@@ -26,7 +24,7 @@ pub struct ApplicationPrometheusMonitoringScore {
 impl<T: Topology + HelmCommand + TenantManager> Score<T> for ApplicationPrometheusMonitoringScore {
     fn create_interpret(&self) -> Box<dyn Interpret<T>> {
-        let config = Arc::new(Mutex::new(PrometheusConfig::new()));
+        let config = Arc::new(Mutex::new(HelmPrometheusConfig::new()));
         config
             .try_lock()
             .expect("couldn't lock config")
diff --git a/harmony/src/modules/monitoring/grafana/helm/helm_grafana.rs b/harmony/src/modules/monitoring/grafana/helm/helm_grafana.rs
index 7e24747..a1ef9b2 100644
--- a/harmony/src/modules/monitoring/grafana/helm/helm_grafana.rs
+++ b/harmony/src/modules/monitoring/grafana/helm/helm_grafana.rs
@@ -1,6 +1,5 @@
-use non_blank_string_rs::NonBlankString;
 use std::str::FromStr;
-
+use non_blank_string_rs::NonBlankString;
 use crate::modules::helm::chart::HelmChartScore;
 
 pub fn grafana_helm_chart_score(ns: &str) -> HelmChartScore {
@@ -8,35 +7,46 @@ pub fn grafana_helm_chart_score(ns: &str) -> HelmChartScore {
         r#"
 rbac:
   namespaced: true
+
 datasources:
   datasources.yaml:
     apiVersion: 1
     datasources:
-    - name: Prometheus
-      type: prometheus
-      access: proxy
-      url: http://prometheus-server.{ns}.svc.cluster.local
-      isDefault: true
-downloadDashboards:
-  dashboards:
-    - url: https://raw.githubusercontent.com/grafana/grafana/main/devenv/dev-dashboards/node-exporter-full_rev1.json
-      file: node-exporter-full.json
-
-    - url: https://grafana.com/api/dashboards/7685/revisions/1/download
-      file: kubernetes-pvs-usage.json
+      - name: Prometheus
+        type: prometheus
+        access: proxy
+        url: http://prometheus-server.{ns}.svc.cluster.local
+        isDefault: true
-
-    # Namespace resource usage vs quotas
-    - url: https://grafana.com/api/dashboards/17044/revisions/1/download
-      file: namespace-resources-vs-quotas.json
+
+dashboardProviders:
+  dashboardproviders.yaml:
+    apiVersion: 1
+    providers:
+      - name: 'default'
+        orgId: 1
+        folder: ''
+        type: file
+        disableDeletion: false
+        updateIntervalSeconds: 10
+        allowUiUpdates: true
+        editable: true
+        options:
+          path: /var/lib/grafana/dashboards/default
-
-    # Kubernetes namespace resources (CPU, RAM, network)
-    - url: https://grafana.com/api/dashboards/9809/revisions/1/download
-      file: kubernetes-namespace-resources.json
-
-    # Top 10 namespaces by memory usage
-    - url: https://grafana.com/api/dashboards/10678/revisions/1/download
-      file: top10-namespace-memory.json
-    "#
+
+dashboards:
+  default:
+    compute-usage:
+      url: https://grafana.com/api/dashboards/315/revisions/1/download
+    pod-health:
+      url: https://grafana.com/api/dashboards/15758/revisions/1/download
+    namespace-resources:
+      url: https://grafana.com/api/dashboards/9809/revisions/1/download
+    namespace-resources-vs-quotas:
+      url: https://grafana.com/api/dashboards/17044/revisions/1/download
+    persistent-volume-usage:
+      url: https://grafana.com/api/dashboards/7685/revisions/1/download
+"#,
+        ns = ns
     );
 
     HelmChartScore {
@@ -45,9 +55,10 @@ downloadDashboards:
         chart_name: NonBlankString::from_str("oci://ghcr.io/grafana/helm-charts/grafana").unwrap(),
         chart_version: None,
         values_overrides: None,
-        values_yaml: Some(values.to_string()),
+        values_yaml: Some(values),
         create_namespace: true,
         install_only: false,
         repository: None,
     }
 }
+
diff --git a/harmony/src/modules/monitoring/kube_prometheus/types.rs b/harmony/src/modules/monitoring/kube_prometheus/types.rs
index 33bfcc3..8884f62 100644
--- a/harmony/src/modules/monitoring/kube_prometheus/types.rs
+++ b/harmony/src/modules/monitoring/kube_prometheus/types.rs
@@ -211,6 +211,8 @@ pub struct Selector {
 
 pub struct ServiceMonitor {
     pub name: String,
+    pub namespace: String,
+
     // # Additional labels to set used for the ServiceMonitorSelector. Together with standard labels from the chart
     pub additional_labels: Option<BTreeMap<String, String>>,
 
@@ -261,6 +263,7 @@ impl Default for ServiceMonitor {
     fn default() -> Self {
         Self {
             name: Default::default(),
+            namespace: Default::default(),
             additional_labels: Default::default(),
             job_label: Default::default(),
             target_labels: Default::default(),
diff --git a/harmony/src/modules/monitoring/prometheus/helm/mod.rs b/harmony/src/modules/monitoring/prometheus/helm/mod.rs
index 431fc6c..3246f0f 100644
--- a/harmony/src/modules/monitoring/prometheus/helm/mod.rs
+++ b/harmony/src/modules/monitoring/prometheus/helm/mod.rs
@@ -1 +1,2 @@
 pub mod prometheus_helm;
+pub mod types;
diff --git a/harmony/src/modules/monitoring/prometheus/helm/prometheus_helm.rs b/harmony/src/modules/monitoring/prometheus/helm/prometheus_helm.rs
index 7afde8d..8df32ef 100644
--- a/harmony/src/modules/monitoring/prometheus/helm/prometheus_helm.rs
+++ b/harmony/src/modules/monitoring/prometheus/helm/prometheus_helm.rs
@@ -1,56 +1,145 @@
+use std::collections::BTreeMap;
 use std::str::FromStr;
 use std::sync::{Arc, Mutex};
 
+use log::debug;
 use non_blank_string_rs::NonBlankString;
+use serde_yaml::{Mapping, Value};
 
-use crate::modules::{
-    helm::chart::HelmChartScore, monitoring::prometheus::prometheus_config::PrometheusConfig,
+use crate::modules::helm::chart::HelmChartScore;
+use crate::modules::monitoring::kube_prometheus::types::{
+    AlertGroup, AlertManager, AlertManagerConfig, AlertManagerRoute, AlertManagerSpec,
+    ConfigReloader, Limits, Requests, Resources,
 };
+use crate::modules::monitoring::prometheus::helm::types::{
+    AlertFile, EnabledConfig, KsmRbacConfig, KubeStateMetricsConfig, LabelSelector, Monitor,
+    Prometheus, PrometheusHelmValues, RbacConfig, ServerConfig, ServerRbacConfig,
+};
+use crate::modules::monitoring::prometheus::prometheus_config::HelmPrometheusConfig;
 
-pub fn prometheus_helm_chart_score(config: Arc<Mutex<PrometheusConfig>>) -> HelmChartScore {
+pub fn prometheus_helm_chart_score(config: Arc<Mutex<HelmPrometheusConfig>>) -> HelmChartScore {
     let config = config.lock().unwrap();
     let ns = config.namespace.clone().unwrap();
 
-    let values = format!(
-        r#"
-rbac:
-  create: false
-kube-state-metrics:
-  enabled: false
-prometheus-node-exporter:
-  enabled: false
-prometheus-pushgateway:
-  enabled: false
-server:
-  releaseNamespace: true
-  clusterRole: false
-  clusterRoleBinding: false
-  rbac:
-    create: true
-    namespaced: true
+    let rbac_config = RbacConfig { create: false };
 
-serverFiles:
-  prometheus.yml:
-    scrape_configs:
-      - job_name: 'prometheus'
-        static_configs:
-          - targets: ['localhost:9090']
-  serviceMonitorNamespaceSelector:
-    matchLabels:
-      kubernetes.io/metadata.name: {ns}
-  podMonitorNamespaceSelector:
-    matchLabels:
-      kubernetes.io/metadata.name: {ns}
+    let ksm_config = KubeStateMetricsConfig {
+        enabled: true,
+        rbac: KsmRbacConfig {
+            use_cluster_role: false,
+        },
+        prometheus: Prometheus {
+            monitor: Monitor { enabled: true },
+        },
+    };
 
-alertmanager:
-  enabled: true
-  rbac:
-    create: true
-    namespaced: true
-"#
+    let mut selector_labels = BTreeMap::new();
+    selector_labels.insert("kubernetes.io/metadata.name".to_string(), ns.clone());
+    let mut kube_state_metrics_labels = BTreeMap::new();
+    kube_state_metrics_labels.insert(
+        "app.kubernetes.io/name".to_string(),
+        "kube-state-metrics".to_string(),
     );
+    let selector = LabelSelector {
+        match_labels: selector_labels,
+    };
+
+    let server_config = ServerConfig {
+        namespaces: vec![ns.clone()],
+        use_existing_cluster_role_name: false,
+    };
+
+    let mut null_receiver = Mapping::new();
+    null_receiver.insert(
+        Value::String("receiver".to_string()),
+        Value::String("default-receiver".to_string()),
+    );
+    null_receiver.insert(
+        Value::String("matchers".to_string()),
+        Value::Sequence(vec![Value::String("alertname!=Watchdog".to_string())]),
+    );
+    null_receiver.insert(Value::String("continue".to_string()), Value::Bool(true));
+
+    let mut alert_manager_channel_config = AlertManagerConfig {
+        global: Mapping::new(),
+        route: AlertManagerRoute {
+            routes: vec![Value::Mapping(null_receiver)],
+        },
+        receivers: vec![serde_yaml::from_str("name: 'default-receiver'").unwrap()],
+    };
+    for receiver in config.alert_receiver_configs.iter() {
+        if let Some(global) = receiver.channel_global_config.clone() {
+            alert_manager_channel_config
+                .global
+                .insert(global.0, global.1);
+        }
+        alert_manager_channel_config
+            .route
+            .routes
+            .push(receiver.channel_route.clone());
+        alert_manager_channel_config
+            .receivers
+            .push(receiver.channel_receiver.clone());
+    }
+    let alert_manager_values = AlertManager {
+        enabled: config.alert_manager,
+        config: alert_manager_channel_config,
+        alertmanager_spec: AlertManagerSpec {
+            resources: Resources {
+                limits: Limits {
+                    memory: "100Mi".to_string(),
+                    cpu: "100m".to_string(),
+                },
+                requests: Requests {
+                    memory: "100Mi".to_string(),
+                    cpu: "100m".to_string(),
+                },
+            },
+        },
+        init_config_reloader: ConfigReloader {
+            resources: Resources {
+                limits: Limits {
+                    memory: "100Mi".to_string(),
+                    cpu: "100m".to_string(),
+                },
+                requests: Requests {
+                    memory: "100Mi".to_string(),
+                    cpu: "100m".to_string(),
+                },
+            },
+        },
+    };
+
+    let mut result: BTreeMap<String, AlertFile> = BTreeMap::new();
+    for rule in config.alert_rules.clone().iter() {
+        for (name, group) in &rule.rules {
+            result
+                .entry("alerting_rules.yml".to_string())
+                .and_modify(|e| e.groups.extend(group.groups.clone()))
+                .or_insert(AlertFile {
+                    groups: group.groups.clone(),
+                });
+        }
+    }
+
+    let final_values = PrometheusHelmValues {
+        rbac: rbac_config,
+        kube_state_metrics: ksm_config,
+        server: server_config,
+        alertmanager: alert_manager_values,
+        server_files: result,
+        additional_service_monitors: config.additional_service_monitors.clone(),
+        prometheus_node_exporter: EnabledConfig { enabled: false },
+        prometheus_pushgateway: EnabledConfig { enabled: false },
+    };
+
+    let values_yaml =
+        serde_yaml::to_string(&final_values).expect("Failed to serialize final Helm values");
+
+    debug!("full values.yaml: \n{}", values_yaml);
 
     HelmChartScore {
-        namespace: Some(NonBlankString::from_str(&config.namespace.clone().unwrap()).unwrap()),
+        namespace: Some(NonBlankString::from_str(&ns).unwrap()),
         release_name: NonBlankString::from_str("prometheus").unwrap(),
         chart_name: NonBlankString::from_str(
             "oci://ghcr.io/prometheus-community/charts/prometheus",
@@ -58,9 +147,9 @@ alertmanager:
         .unwrap(),
         chart_version: None,
         values_overrides: None,
-        values_yaml: Some(values.to_string()),
+        values_yaml: Some(values_yaml),
         create_namespace: true,
-        install_only: false,
+        install_only: true,
         repository: None,
     }
 }
diff --git a/harmony/src/modules/monitoring/prometheus/helm/types.rs b/harmony/src/modules/monitoring/prometheus/helm/types.rs
new file mode 100644
index 0000000..89c96fe
--- /dev/null
+++ b/harmony/src/modules/monitoring/prometheus/helm/types.rs
@@ -0,0 +1,94 @@
+use std::collections::BTreeMap;
+
+use serde::Serialize;
+
+use crate::modules::monitoring::{alert_rule::prometheus_alert_rule::AlertManagerRuleGroup, kube_prometheus::types::{
+    AlertGroup, AlertManager, AlertManagerAdditionalPromRules, AlertManagerValues, ServiceMonitor
+}};
+
+#[derive(Debug, Clone, Serialize)]
+pub struct RuleFilesConfig {
+    #[serde(rename = "ruleFiles")]
+    pub files: BTreeMap<String, AlertFile>,
+}
+
+#[derive(Serialize, Debug)]
+#[serde(rename_all = "camelCase")]
+pub struct PrometheusHelmValues {
+    pub rbac: RbacConfig,
+    #[serde(rename = "kube-state-metrics")]
+    pub kube_state_metrics: KubeStateMetricsConfig,
+    pub server: ServerConfig,
+    pub alertmanager: AlertManager,
+    #[serde(rename = "serverFiles")]
+    pub server_files: BTreeMap<String, AlertFile>,
+    pub additional_service_monitors: Vec<ServiceMonitor>,
+    #[serde(rename = "prometheus-node-exporter")]
+    pub prometheus_node_exporter: EnabledConfig,
+    #[serde(rename = "prometheus-pushgateway")]
+    pub prometheus_pushgateway: EnabledConfig,
+}
+
+#[derive(Serialize, Debug, Clone)]
+pub struct AlertFile {
+    pub groups: Vec<AlertGroup>,
+}
+
+#[derive(Serialize, Debug)]
+#[serde(rename_all = "camelCase")]
+pub struct RbacConfig {
+    pub create: bool,
+}
+
+#[derive(Serialize, Debug)]
+#[serde(rename_all = "camelCase")]
+pub struct KubeStateMetricsConfig {
+    pub enabled: bool,
+    pub rbac: KsmRbacConfig,
+    pub prometheus: Prometheus,
+}
+
+#[derive(Serialize, Debug)]
+#[serde(rename_all = "camelCase")]
+pub struct Prometheus {
+    pub monitor: Monitor,
+}
+
+#[derive(Serialize, Debug)]
+#[serde(rename_all = "camelCase")]
+pub struct Monitor {
+    pub enabled: bool,
+}
+
+#[derive(Serialize, Debug)]
+#[serde(rename_all = "camelCase")]
+pub struct KsmRbacConfig {
+    pub use_cluster_role: bool,
+}
+
+#[derive(Serialize, Debug)]
+#[serde(rename_all = "camelCase")]
+pub struct ServerConfig {
+    pub namespaces: Vec<String>,
+    pub use_existing_cluster_role_name: bool,
+}
+
+#[derive(Serialize, Debug)]
+#[serde(rename_all = "camelCase")]
+pub struct ServerRbacConfig {
+    pub create: bool,
+    pub use_cluster_role: bool,
+    pub namespaced: bool,
+}
+
+#[derive(Serialize, Debug, Clone)]
+#[serde(rename_all = "camelCase")]
+pub struct LabelSelector {
+    #[serde(rename = "matchLabels")]
+    pub match_labels: BTreeMap<String, String>,
+}
+
+#[derive(Serialize, Debug)]
+pub struct EnabledConfig {
+    pub enabled: bool,
+}
diff --git a/harmony/src/modules/monitoring/prometheus/prometheus.rs b/harmony/src/modules/monitoring/prometheus/prometheus.rs
index da955a3..b44175b 100644
--- a/harmony/src/modules/monitoring/prometheus/prometheus.rs
+++ b/harmony/src/modules/monitoring/prometheus/prometheus.rs
@@ -14,7 +14,7 @@ use crate::{
     },
     score::Score,
     topology::{
-        HelmCommand, Topology,
+        HelmCommand, K8sclient, Topology,
         installable::Installable,
         oberservability::monitoring::{AlertReceiver, AlertRule, AlertSender},
         tenant::TenantManager,
@@ -22,12 +22,12 @@ use crate::{
 };
 
 use super::{
-    helm::prometheus_helm::prometheus_helm_chart_score, prometheus_config::PrometheusConfig,
+    helm::prometheus_helm::prometheus_helm_chart_score, prometheus_config::HelmPrometheusConfig,
 };
 
 #[derive(Debug)]
 pub struct Prometheus {
-    pub config: Arc<Mutex<PrometheusConfig>>,
+    pub config: Arc<Mutex<HelmPrometheusConfig>>,
 }
 
 #[async_trait]
@@ -40,7 +40,7 @@ impl AlertSender for Prometheus {
 impl Prometheus {
     pub fn new() -> Self {
         Self {
-            config: Arc::new(Mutex::new(PrometheusConfig::new())),
+            config: Arc::new(Mutex::new(HelmPrometheusConfig::new())),
         }
     }
     pub async fn configure_with_topology<T: Topology + TenantManager>(&self, topology: &T) {
diff --git a/harmony/src/modules/monitoring/prometheus/prometheus_config.rs b/harmony/src/modules/monitoring/prometheus/prometheus_config.rs
index fc5449c..b65711f 100644
--- a/harmony/src/modules/monitoring/prometheus/prometheus_config.rs
+++ b/harmony/src/modules/monitoring/prometheus/prometheus_config.rs
@@ -3,9 +3,8 @@ use crate::modules::monitoring::kube_prometheus::types::{
 };
 
 #[derive(Debug)]
-pub struct PrometheusConfig {
+pub struct HelmPrometheusConfig {
     pub namespace: Option<String>,
-    pub default_rules: bool,
     pub alert_manager: bool,
     pub node_exporter: bool,
     pub kube_state_metrics: bool,
@@ -16,11 +15,10 @@ pub struct PrometheusConfig {
     pub additional_service_monitors: Vec<ServiceMonitor>,
 }
 
-impl PrometheusConfig {
+impl HelmPrometheusConfig {
     pub fn new() -> Self {
         Self {
             namespace: None,
-            default_rules: true,
             alert_manager: true,
             node_exporter: false,
             kube_state_metrics: false,
diff --git a/harmony/src/modules/prometheus/alerts/k8s/mod.rs b/harmony/src/modules/prometheus/alerts/k8s/mod.rs
index f01a9c8..60e68b9 100644
--- a/harmony/src/modules/prometheus/alerts/k8s/mod.rs
+++ b/harmony/src/modules/prometheus/alerts/k8s/mod.rs
@@ -1 +1,2 @@
 pub mod pvc;
+pub mod pod;
diff --git a/harmony/src/modules/prometheus/alerts/k8s/pod.rs b/harmony/src/modules/prometheus/alerts/k8s/pod.rs
new file mode 100644
index 0000000..f06b6c4
--- /dev/null
+++ b/harmony/src/modules/prometheus/alerts/k8s/pod.rs
@@ -0,0 +1,38 @@
+use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;
+
+pub fn pod_in_failed_state() -> PrometheusAlertRule {
+    PrometheusAlertRule::new(
+        "PodInFailedState",
+        // This expression checks for any pod where the status phase is 'Failed' and the value is 1 (true).
+        "kube_pod_status_phase{phase=\"Failed\"} == 1",
+    )
+    .for_duration("1m") // Fire if the pod is in this state for 1 minute.
+    .label("severity", "critical") // A failed pod is a critical issue.
+    .annotation(
+        "summary",
+        "Pod {{ $labels.pod }} in namespace {{ $labels.namespace }} has failed.",
+    )
+    .annotation(
+        "description",
+        "The pod {{ $labels.pod }} in namespace {{ $labels.namespace }} has entered the 'Failed' state. This is a terminal error and the pod will not be automatically restarted. Please check the pod logs to diagnose the issue.",
+    )
+}
+
+pub fn pod_restarting_frequently() -> PrometheusAlertRule {
+    PrometheusAlertRule::new(
+        "PodRestartingFrequently",
+        // This expression calculates the increase in the restart count over the last 30 minutes.
+        // Alert if a container has restarted more than 5 times.
+        "increase(kube_pod_container_status_restarts_total[30m]) > 5",
+    )
+    .for_duration("15m") // The condition must persist for 15 minutes to avoid alerts for minor flaps.
+    .label("severity", "critical") // A crash-looping pod is effectively down.
+    .annotation(
+        "summary",
+        "Container {{ $labels.container }} in pod {{ $labels.pod }} is restarting frequently.",
+    )
+    .annotation(
+        "description",
+        "The container '{{ $labels.container }}' in pod '{{ $labels.pod }}' (namespace '{{ $labels.namespace }}') has restarted more than 5 times in the last 30 minutes. The pod is likely in a CrashLoopBackOff state.",
+    )
+}
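
Usage sketch (not part of the patch): pod.rs adds pod_restarting_frequently(), but examples/rust/src/main.rs above only wires up pod_in_failed_state(). Assuming the AlertManagerRuleGroup and PrometheusMonitoring APIs introduced in this change, both alerts could be grouped and registered the same way as `additional_rules` in main.rs; the snippet below is illustrative only and does not appear in the repository.

    use harmony::modules::{
        monitoring::alert_rule::prometheus_alert_rule::AlertManagerRuleGroup,
        prometheus::alerts::k8s::pod::{pod_in_failed_state, pod_restarting_frequently},
    };

    // One rule group holding both pod alerts so Alertmanager evaluates them together.
    let pod_rules = AlertManagerRuleGroup::new(
        "pod-alerts",
        vec![pod_in_failed_state(), pod_restarting_frequently()],
    );

    // Then pass it to the feature exactly like `additional_rules` in main.rs:
    // alert_rules: vec![Box::new(pod_rules)],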