Compare commits

...

3 Commits

15 changed files with 389 additions and 72 deletions

View File

@@ -3,9 +3,18 @@ use std::{path::PathBuf, sync::Arc};
 use harmony::{
     inventory::Inventory,
     maestro::Maestro,
-    modules::application::{
-        ApplicationScore, RustWebFramework, RustWebapp,
-        features::{ContinuousDelivery, Monitoring},
+    modules::{
+        application::{
+            ApplicationScore, RustWebFramework, RustWebapp,
+            features::{ContinuousDelivery, PrometheusMonitoring},
+        },
+        monitoring::{
+            alert_channel::discord_alert_channel::DiscordWebhook,
+            alert_rule::prometheus_alert_rule::AlertManagerRuleGroup,
+        },
+        prometheus::alerts::k8s::{
+            pod::pod_in_failed_state, pvc::high_pvc_fill_rate_over_two_days,
+        },
     },
     topology::{K8sAnywhereTopology, Url},
 };
@@ -20,12 +29,25 @@ async fn main() {
         framework: Some(RustWebFramework::Leptos),
     });
 
+    let pod_failed = pod_in_failed_state();
+    let pod_failed_2 = pod_in_failed_state();
+    let pod_failed_3 = pod_in_failed_state();
+    let additional_rules = AlertManagerRuleGroup::new("pod-alerts", vec![pod_failed]);
+    let additional_rules_2 = AlertManagerRuleGroup::new("pod-alerts-2", vec![pod_failed_2, pod_failed_3]);
+
     let app = ApplicationScore {
         features: vec![
-            Box::new(ContinuousDelivery {
-                application: application.clone(),
-            }),
-            Box::new(Monitoring {}),
+            //Box::new(ContinuousDelivery {
+            //    application: application.clone(),
+            //}),
+            Box::new(PrometheusMonitoring {
+                application: application.clone(),
+                alert_receivers: vec![Box::new(DiscordWebhook {
+                    name: "dummy-discord".to_string(),
+                    url: Url::Url(url::Url::parse("https://discord.doesnt.exist.com").unwrap()),
+                })],
+                alert_rules: vec![Box::new(additional_rules), Box::new(additional_rules_2)],
+            }),
             // TODO add monitoring, backups, multisite ha, etc
         ],
         application,

View File

@@ -1,3 +1,5 @@
+use std::any::Any;
+
 use async_trait::async_trait;
 use log::debug;
@@ -9,7 +11,7 @@ use crate::{
 };
 
 #[async_trait]
-pub trait AlertSender: Send + Sync + std::fmt::Debug {
+pub trait AlertSender: Any + Send + Sync + std::fmt::Debug {
     fn name(&self) -> String;
 }
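Making AlertSender require Any is what allows a boxed sender to be downcast back to its concrete type at runtime. A minimal, self-contained sketch of that pattern; the as_any helper and the local Prometheus struct below are illustrative only and are not part of this PR:

use std::any::Any;

trait AlertSender: Any + Send + Sync + std::fmt::Debug {
    fn name(&self) -> String;
    // Hypothetical helper: expose &dyn Any so callers can downcast.
    fn as_any(&self) -> &dyn Any;
}

#[derive(Debug)]
struct Prometheus;

impl AlertSender for Prometheus {
    fn name(&self) -> String {
        "prometheus".to_string()
    }
    fn as_any(&self) -> &dyn Any {
        self
    }
}

fn main() {
    let sender: Box<dyn AlertSender> = Box::new(Prometheus);
    // Downcasting is only possible because AlertSender: Any.
    if sender.as_any().downcast_ref::<Prometheus>().is_some() {
        println!("sender {} is a Prometheus instance", sender.name());
    }
}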

View File

@@ -1,3 +1,5 @@
+use std::sync::Arc;
+
 use async_trait::async_trait;
 use log::info;
@@ -7,31 +9,36 @@ use crate::{
         application::{Application, ApplicationFeature},
         monitoring::{
             application_monitoring::k8s_application_monitoring_score::ApplicationPrometheusMonitoringScore,
-            kube_prometheus::{
-                helm_prometheus_alert_score::HelmPrometheusAlertingScore,
-                types::{NamespaceSelector, ServiceMonitor},
-            },
+            kube_prometheus::types::{NamespaceSelector, ServiceMonitor}, prometheus::prometheus::Prometheus,
         },
     },
     score::Score,
-    topology::{HelmCommand, Topology, tenant::TenantManager},
+    topology::{oberservability::monitoring::{AlertReceiver, AlertRule, AlertSender}, tenant::TenantManager, HelmCommand, K8sclient, Topology},
 };
 
-#[derive(Debug, Default, Clone)]
-pub struct Monitoring {}
+#[derive(Debug, Clone)]
+pub struct PrometheusMonitoring {
+    pub application: Arc<dyn Application>,
+    pub alert_receivers: Vec<Box<dyn AlertReceiver<Prometheus>>>,
+    pub alert_rules: Vec<Box<dyn AlertRule<Prometheus>>>,
+}
 
 #[async_trait]
-impl<T: Topology + HelmCommand + 'static + TenantManager> ApplicationFeature<T> for Monitoring {
+impl<T: Topology + HelmCommand + 'static + TenantManager> ApplicationFeature<T> for PrometheusMonitoring {
     async fn ensure_installed(&self, topology: &T) -> Result<(), String> {
         info!("Ensuring monitoring is available for application");
+        let ns = self.application.name();
         let mut service_monitor = ServiceMonitor::default();
+        service_monitor.name = ns.clone();
+        service_monitor.namespace = ns.clone();
         service_monitor.namespace_selector = Some(NamespaceSelector {
             any: true,
-            match_names: vec![],
+            match_names: vec![ns.clone()],
         });
         let alerting_score = ApplicationPrometheusMonitoringScore {
-            receivers: vec![],
-            rules: vec![],
+            namespace: ns,
+            receivers: self.alert_receivers.clone(),
+            rules: self.alert_rules.clone(),
             service_monitors: vec![service_monitor],
         };

View File

@@ -220,7 +220,6 @@ impl<T: Topology + HelmCommand> Interpret<T> for HelmChartInterpret {
             yaml_path,
             Some(&helm_options),
         );
-
         let status = match res {
             Ok(status) => status,
             Err(err) => return Err(InterpretError::new(err.to_string())),

View File

@@ -1,3 +1,5 @@
+use std::any::Any;
+
 use async_trait::async_trait;
 use serde::Serialize;
 use serde_yaml::{Mapping, Value};
@@ -11,7 +13,10 @@ use crate::{
         },
         prometheus::prometheus::{Prometheus, PrometheusReceiver},
     },
-    topology::{Url, oberservability::monitoring::AlertReceiver},
+    topology::{
+        Url,
+        oberservability::monitoring::{AlertReceiver, AlertSender},
+    },
 };
 
 #[derive(Debug, Clone, Serialize)]

View File

@@ -1,22 +1,22 @@
 use std::sync::{Arc, Mutex};
 
+use log::debug;
 use serde::Serialize;
 
 use crate::{
     modules::monitoring::{
         kube_prometheus::types::ServiceMonitor,
-        prometheus::{prometheus::Prometheus, prometheus_config::PrometheusConfig},
+        prometheus::{prometheus::Prometheus, prometheus_config::HelmPrometheusConfig},
     },
     score::Score,
     topology::{
-        HelmCommand, Topology,
-        oberservability::monitoring::{AlertReceiver, AlertRule, AlertingInterpret},
-        tenant::TenantManager,
+        oberservability::monitoring::{AlertReceiver, AlertRule, AlertingInterpret}, tenant::TenantManager, HelmCommand, K8sclient, Topology
     },
 };
 
 #[derive(Clone, Debug, Serialize)]
 pub struct ApplicationPrometheusMonitoringScore {
+    pub namespace: String,
     pub receivers: Vec<Box<dyn AlertReceiver<Prometheus>>>,
     pub rules: Vec<Box<dyn AlertRule<Prometheus>>>,
     pub service_monitors: Vec<ServiceMonitor>,
@@ -24,13 +24,17 @@ pub struct ApplicationPrometheusMonitoringScore {
 impl<T: Topology + HelmCommand + TenantManager> Score<T> for ApplicationPrometheusMonitoringScore {
     fn create_interpret(&self) -> Box<dyn crate::interpret::Interpret<T>> {
-        let config = Arc::new(Mutex::new(PrometheusConfig::new()));
+        let config = Arc::new(Mutex::new(HelmPrometheusConfig::new()));
         config
             .try_lock()
             .expect("couldn't lock config")
             .additional_service_monitors = self.service_monitors.clone();
+        let ns = self.namespace.clone();
+        config.try_lock().expect("couldn't lock config").namespace = Some(ns.clone());
+        debug!("set namespace to {}", ns);
         Box::new(AlertingInterpret {
-            sender: Prometheus::new(),
+            sender: Prometheus { config },
             receivers: self.receivers.clone(),
             rules: self.rules.clone(),
         })
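The interpret now hands an already-populated config to the sender (Prometheus { config }) instead of calling Prometheus::new(). A minimal sketch of this shared Arc<Mutex<...>> pattern, using simplified stand-in types rather than the real HelmPrometheusConfig and Prometheus:

use std::sync::{Arc, Mutex};

// Simplified stand-in for HelmPrometheusConfig.
#[derive(Debug, Default)]
struct Config {
    namespace: Option<String>,
}

// Simplified stand-in for the Prometheus sender: it only holds the shared handle.
#[derive(Debug)]
struct Sender {
    config: Arc<Mutex<Config>>,
}

fn main() {
    let config = Arc::new(Mutex::new(Config::default()));
    // The score mutates the config before building the sender...
    config.lock().unwrap().namespace = Some("my-app".to_string());
    // ...and the sender observes the same data because it shares the Arc.
    let sender = Sender { config: Arc::clone(&config) };
    assert_eq!(
        sender.config.lock().unwrap().namespace.as_deref(),
        Some("my-app")
    );
}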

View File

@@ -1,6 +1,5 @@
-use non_blank_string_rs::NonBlankString;
 use std::str::FromStr;
 
+use non_blank_string_rs::NonBlankString;
 use crate::modules::helm::chart::HelmChartScore;
 
 pub fn grafana_helm_chart_score(ns: &str) -> HelmChartScore {
@@ -8,10 +7,46 @@ pub fn grafana_helm_chart_score(ns: &str) -> HelmChartScore {
         r#"
 rbac:
   namespaced: true
-sidecar:
-  dashboards:
-    enabled: true
-"#
+datasources:
+  datasources.yaml:
+    apiVersion: 1
+    datasources:
+      - name: Prometheus
+        type: prometheus
+        access: proxy
+        url: http://prometheus-server.{ns}.svc.cluster.local
+        isDefault: true
+dashboardProviders:
+  dashboardproviders.yaml:
+    apiVersion: 1
+    providers:
+      - name: 'default'
+        orgId: 1
+        folder: ''
+        type: file
+        disableDeletion: false
+        updateIntervalSeconds: 10
+        allowUiUpdates: true
+        editable: true
+        options:
+          path: /var/lib/grafana/dashboards/default
+dashboards:
+  default:
+    compute-usage:
+      url: https://grafana.com/api/dashboards/315/revisions/1/download
+    pod-health:
+      url: https://grafana.com/api/dashboards/15758/revisions/1/download
+    namespace-resources:
+      url: https://grafana.com/api/dashboards/9809/revisions/1/download
+    namespace-resources-vs-quotas:
+      url: https://grafana.com/api/dashboards/17044/revisions/1/download
+    persistent-volume-usage:
+      url: https://grafana.com/api/dashboards/7685/revisions/1/download
+"#,
+        ns = ns
     );
 
     HelmChartScore {
@@ -20,9 +55,10 @@ sidecar:
         chart_name: NonBlankString::from_str("oci://ghcr.io/grafana/helm-charts/grafana").unwrap(),
         chart_version: None,
         values_overrides: None,
-        values_yaml: Some(values.to_string()),
+        values_yaml: Some(values),
         create_namespace: true,
-        install_only: true,
+        install_only: false,
         repository: None,
     }
 }
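The ns = ns argument above uses format!'s named-argument form: {ns} inside the raw string is substituted while the rest of the YAML passes through verbatim. A small runnable illustration of just that mechanic:

fn main() {
    let ns = "my-app";
    // Named argument: {ns} is replaced, everything else is left untouched.
    let values = format!(
        r#"url: http://prometheus-server.{ns}.svc.cluster.local"#,
        ns = ns
    );
    assert_eq!(values, "url: http://prometheus-server.my-app.svc.cluster.local");
}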

View File

@@ -211,6 +211,8 @@ pub struct Selector {
 pub struct ServiceMonitor {
     pub name: String,
+    pub namespace: String,
+
     // # Additional labels to set used for the ServiceMonitorSelector. Together with standard labels from the chart
     pub additional_labels: Option<HashMap<String, String>>,
@@ -261,6 +263,7 @@ impl Default for ServiceMonitor {
     fn default() -> Self {
         Self {
             name: Default::default(),
+            namespace: Default::default(),
             additional_labels: Default::default(),
             job_label: Default::default(),
             target_labels: Default::default(),

View File

@@ -1 +1,2 @@
 pub mod prometheus_helm;
+pub mod types;

View File

@@ -1,37 +1,145 @@
+use std::collections::BTreeMap;
 use std::str::FromStr;
 use std::sync::{Arc, Mutex};
 
+use log::debug;
 use non_blank_string_rs::NonBlankString;
+use serde_yaml::{Mapping, Value};
 
-use crate::modules::{
-    helm::chart::HelmChartScore, monitoring::prometheus::prometheus_config::PrometheusConfig,
-};
+use crate::modules::helm::chart::HelmChartScore;
+use crate::modules::monitoring::kube_prometheus::types::{
+    AlertGroup, AlertManager, AlertManagerConfig, AlertManagerRoute, AlertManagerSpec,
+    ConfigReloader, Limits, Requests, Resources,
+};
+use crate::modules::monitoring::prometheus::helm::types::{
+    AlertFile, EnabledConfig, KsmRbacConfig, KubeStateMetricsConfig, LabelSelector, Monitor,
+    Prometheus, PrometheusHelmValues, RbacConfig, ServerConfig, ServerRbacConfig,
+};
+use crate::modules::monitoring::prometheus::prometheus_config::HelmPrometheusConfig;
 
-pub fn prometheus_helm_chart_score(config: Arc<Mutex<PrometheusConfig>>) -> HelmChartScore {
+pub fn prometheus_helm_chart_score(config: Arc<Mutex<HelmPrometheusConfig>>) -> HelmChartScore {
     let config = config.lock().unwrap();
     let ns = config.namespace.clone().unwrap();
-    let values = format!(
-        r#"
-rbac:
-  create: true
-kube-state-metrics:
-  enabled: false
-nodeExporter:
-  enabled: false
-alertmanager:
-  enabled: false
-pushgateway:
-  enabled: false
-server:
-  serviceAccount:
-    create: false
-  rbac:
-    create: true
-fullnameOverride: prometheus-{ns}
-"#
-    );
+
+    let rbac_config = RbacConfig { create: false };
+
+    let ksm_config = KubeStateMetricsConfig {
+        enabled: true,
+        rbac: KsmRbacConfig {
+            use_cluster_role: false,
+        },
+        prometheus: Prometheus {
+            monitor: Monitor { enabled: true },
+        },
+    };
+
+    let mut selector_labels = BTreeMap::new();
+    selector_labels.insert("kubernetes.io/metadata.name".to_string(), ns.clone());
+    let mut kube_state_metrics_labels = BTreeMap::new();
+    kube_state_metrics_labels.insert(
+        "app.kubernetes.io/name".to_string(),
+        "kube-state-metrics".to_string(),
+    );
+    let selector = LabelSelector {
+        match_labels: selector_labels,
+    };
+
+    let server_config = ServerConfig {
+        namespaces: vec![ns.clone()],
+        use_existing_cluster_role_name: false,
+    };
+
+    let mut null_receiver = Mapping::new();
+    null_receiver.insert(
+        Value::String("receiver".to_string()),
+        Value::String("default-receiver".to_string()),
+    );
+    null_receiver.insert(
+        Value::String("matchers".to_string()),
+        Value::Sequence(vec![Value::String("alertname!=Watchdog".to_string())]),
+    );
+    null_receiver.insert(Value::String("continue".to_string()), Value::Bool(true));
+
+    let mut alert_manager_channel_config = AlertManagerConfig {
+        global: Mapping::new(),
+        route: AlertManagerRoute {
+            routes: vec![Value::Mapping(null_receiver)],
+        },
+        receivers: vec![serde_yaml::from_str("name: 'default-receiver'").unwrap()],
+    };
+
+    for receiver in config.alert_receiver_configs.iter() {
+        if let Some(global) = receiver.channel_global_config.clone() {
+            alert_manager_channel_config
+                .global
+                .insert(global.0, global.1);
+        }
+        alert_manager_channel_config
+            .route
+            .routes
+            .push(receiver.channel_route.clone());
+        alert_manager_channel_config
+            .receivers
+            .push(receiver.channel_receiver.clone());
+    }
+
+    let alert_manager_values = AlertManager {
+        enabled: config.alert_manager,
+        config: alert_manager_channel_config,
+        alertmanager_spec: AlertManagerSpec {
+            resources: Resources {
+                limits: Limits {
+                    memory: "100Mi".to_string(),
+                    cpu: "100m".to_string(),
+                },
+                requests: Requests {
+                    memory: "100Mi".to_string(),
+                    cpu: "100m".to_string(),
+                },
+            },
+        },
+        init_config_reloader: ConfigReloader {
+            resources: Resources {
+                limits: Limits {
+                    memory: "100Mi".to_string(),
+                    cpu: "100m".to_string(),
+                },
+                requests: Requests {
+                    memory: "100Mi".to_string(),
+                    cpu: "100m".to_string(),
+                },
+            },
+        },
+    };
+
+    let mut result: BTreeMap<String, AlertFile> = BTreeMap::new();
+    for rule in config.alert_rules.clone().iter() {
+        for (name, group) in &rule.rules {
+            result
+                .entry("alerting_rules.yml".to_string())
+                .and_modify(|e| e.groups.extend(group.groups.clone()))
+                .or_insert(AlertFile {
+                    groups: group.groups.clone(),
+                });
+        }
+    }
+
+    let final_values = PrometheusHelmValues {
+        rbac: rbac_config,
+        kube_state_metrics: ksm_config,
+        server: server_config,
+        alertmanager: alert_manager_values,
+        server_files: result,
+        additional_service_monitors: config.additional_service_monitors.clone(),
+        prometheus_node_exporter: EnabledConfig { enabled: false },
+        prometheus_pushgateway: EnabledConfig { enabled: false },
+    };
+
+    let values_yaml =
+        serde_yaml::to_string(&final_values).expect("Failed to serialize final Helm values");
+    debug!("full values.yaml: \n{}", values_yaml);
     HelmChartScore {
-        namespace: Some(NonBlankString::from_str(&config.namespace.clone().unwrap()).unwrap()),
+        namespace: Some(NonBlankString::from_str(&ns).unwrap()),
         release_name: NonBlankString::from_str("prometheus").unwrap(),
         chart_name: NonBlankString::from_str(
             "oci://ghcr.io/prometheus-community/charts/prometheus",
@@ -39,7 +147,7 @@ fullnameOverride: prometheus-{ns}
         .unwrap(),
         chart_version: None,
         values_overrides: None,
-        values_yaml: Some(values.to_string()),
+        values_yaml: Some(values_yaml),
         create_namespace: true,
         install_only: true,
         repository: None,
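The chart values are now built as typed structs and serialized with serde_yaml instead of being assembled with format!. A trimmed-down sketch of how the serde renames used in this PR map Rust fields onto the hyphenated and camelCase Helm keys; the Values and Enabled structs below are illustrative stand-ins, not the real PrometheusHelmValues:

use serde::Serialize;

#[derive(Serialize)]
#[serde(rename_all = "camelCase")]
struct Values {
    #[serde(rename = "kube-state-metrics")]
    kube_state_metrics: Enabled,
    #[serde(rename = "prometheus-node-exporter")]
    prometheus_node_exporter: Enabled,
    #[serde(rename = "prometheus-pushgateway")]
    prometheus_pushgateway: Enabled,
    // rename_all = "camelCase" turns this into additionalServiceMonitors.
    additional_service_monitors: Vec<String>,
}

#[derive(Serialize)]
struct Enabled {
    enabled: bool,
}

fn main() {
    let v = Values {
        kube_state_metrics: Enabled { enabled: true },
        prometheus_node_exporter: Enabled { enabled: false },
        prometheus_pushgateway: Enabled { enabled: false },
        additional_service_monitors: vec![],
    };
    // Rendered keys: kube-state-metrics, prometheus-node-exporter,
    // prometheus-pushgateway, additionalServiceMonitors.
    println!("{}", serde_yaml::to_string(&v).unwrap());
}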

View File

@@ -0,0 +1,94 @@
+use std::collections::BTreeMap;
+
+use serde::Serialize;
+
+use crate::modules::monitoring::{alert_rule::prometheus_alert_rule::AlertManagerRuleGroup, kube_prometheus::types::{
+    AlertGroup, AlertManager, AlertManagerAdditionalPromRules, AlertManagerValues, ServiceMonitor
+}};
+
+#[derive(Debug, Clone, Serialize)]
+pub struct RuleFilesConfig {
+    #[serde(rename = "ruleFiles")]
+    pub files: BTreeMap<String, AlertGroup>,
+}
+
+#[derive(Serialize, Debug)]
+#[serde(rename_all = "camelCase")]
+pub struct PrometheusHelmValues {
+    pub rbac: RbacConfig,
+    #[serde(rename = "kube-state-metrics")]
+    pub kube_state_metrics: KubeStateMetricsConfig,
+    pub server: ServerConfig,
+    pub alertmanager: AlertManager,
+    #[serde(rename = "serverFiles")]
+    pub server_files: BTreeMap<String, AlertFile>,
+    pub additional_service_monitors: Vec<ServiceMonitor>,
+    #[serde(rename = "prometheus-node-exporter")]
+    pub prometheus_node_exporter: EnabledConfig,
+    #[serde(rename = "prometheus-pushgateway")]
+    pub prometheus_pushgateway: EnabledConfig,
+}
+
+#[derive(Serialize, Debug, Clone)]
+pub struct AlertFile {
+    pub groups: Vec<AlertManagerRuleGroup>,
+}
+
+#[derive(Serialize, Debug)]
+#[serde(rename_all = "camelCase")]
+pub struct RbacConfig {
+    pub create: bool,
+}
+
+#[derive(Serialize, Debug)]
+#[serde(rename_all = "camelCase")]
+pub struct KubeStateMetricsConfig {
+    pub enabled: bool,
+    pub rbac: KsmRbacConfig,
+    pub prometheus: Prometheus,
+}
+
+#[derive(Serialize, Debug)]
+#[serde(rename_all = "camelCase")]
+pub struct Prometheus {
+    pub monitor: Monitor,
+}
+
+#[derive(Serialize, Debug)]
+#[serde(rename_all = "camelCase")]
+pub struct Monitor {
+    pub enabled: bool,
+}
+
+#[derive(Serialize, Debug)]
+#[serde(rename_all = "camelCase")]
+pub struct KsmRbacConfig {
+    pub use_cluster_role: bool,
+}
+
+#[derive(Serialize, Debug)]
+#[serde(rename_all = "camelCase")]
+pub struct ServerConfig {
+    pub namespaces: Vec<String>,
+    pub use_existing_cluster_role_name: bool,
+}
+
+#[derive(Serialize, Debug)]
+#[serde(rename_all = "camelCase")]
+pub struct ServerRbacConfig {
+    pub create: bool,
+    pub use_cluster_role: bool,
+    pub namespaced: bool,
+}
+
+#[derive(Serialize, Debug, Clone)]
+#[serde(rename_all = "camelCase")]
+pub struct LabelSelector {
+    #[serde(rename = "matchLabels")]
+    pub match_labels: BTreeMap<String, String>,
+}
+
+#[derive(Serialize, Debug)]
+pub struct EnabledConfig {
+    pub enabled: bool,
+}

View File

@@ -14,7 +14,7 @@ use crate::{
     },
     score::Score,
     topology::{
-        HelmCommand, Topology,
+        HelmCommand, K8sclient, Topology,
         installable::Installable,
         oberservability::monitoring::{AlertReceiver, AlertRule, AlertSender},
         tenant::TenantManager,
@@ -22,12 +22,12 @@ use crate::{
 };
 
 use super::{
-    helm::prometheus_helm::prometheus_helm_chart_score, prometheus_config::PrometheusConfig,
+    helm::prometheus_helm::prometheus_helm_chart_score, prometheus_config::HelmPrometheusConfig,
 };
 
 #[derive(Debug)]
 pub struct Prometheus {
-    pub config: Arc<Mutex<PrometheusConfig>>,
+    pub config: Arc<Mutex<HelmPrometheusConfig>>,
 }
 
 #[async_trait]
@@ -40,18 +40,17 @@ impl AlertSender for Prometheus {
 impl Prometheus {
     pub fn new() -> Self {
         Self {
-            config: Arc::new(Mutex::new(PrometheusConfig::new())),
+            config: Arc::new(Mutex::new(HelmPrometheusConfig::new())),
         }
     }
 
     pub async fn configure_with_topology<T: TenantManager>(&self, topology: &T) {
-        let ns = topology
-            .get_tenant_config()
-            .await
-            .map(|cfg| cfg.name.clone())
-            .unwrap_or_else(|| "monitoring".to_string());
+        if let Some(cfg) = topology.get_tenant_config().await {
+            debug!("Overriding namespace with tenant config: {}", cfg.name);
+            self.config.lock().unwrap().namespace = Some(cfg.name.clone());
+        } else {
+            debug!("No tenant config found; keeping existing namespace.");
+        }
 
         error!("This must be refactored, see comments in pr #74");
-        debug!("NS: {}", ns);
-        self.config.lock().unwrap().namespace = Some(ns);
     }
 
     pub async fn install_receiver(

View File

@@ -3,9 +3,8 @@ use crate::modules::monitoring::kube_prometheus::types::{
 };
 
 #[derive(Debug)]
-pub struct PrometheusConfig {
+pub struct HelmPrometheusConfig {
     pub namespace: Option<String>,
-    pub default_rules: bool,
     pub alert_manager: bool,
     pub node_exporter: bool,
     pub kube_state_metrics: bool,
@@ -16,11 +15,10 @@ pub struct PrometheusConfig {
     pub additional_service_monitors: Vec<ServiceMonitor>,
 }
 
-impl PrometheusConfig {
+impl HelmPrometheusConfig {
     pub fn new() -> Self {
         Self {
             namespace: None,
-            default_rules: true,
             alert_manager: true,
             node_exporter: false,
             kube_state_metrics: false,

View File

@@ -1 +1,2 @@
 pub mod pvc;
+pub mod pod;

View File

@@ -0,0 +1,38 @@
+use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;
+
+pub fn pod_in_failed_state() -> PrometheusAlertRule {
+    PrometheusAlertRule::new(
+        "PodInFailedState",
+        // This expression checks for any pod where the status phase is 'Failed' and the value is 1 (true).
+        "kube_pod_status_phase{phase=\"Failed\"} == 1",
+    )
+    .for_duration("1m") // Fire if the pod is in this state for 1 minute.
+    .label("severity", "critical") // A failed pod is a critical issue.
+    .annotation(
+        "summary",
+        "Pod {{ $labels.pod }} in namespace {{ $labels.namespace }} has failed.",
+    )
+    .annotation(
+        "description",
+        "The pod {{ $labels.pod }} in namespace {{ $labels.namespace }} has entered the 'Failed' state. This is a terminal error and the pod will not be automatically restarted. Please check the pod logs to diagnose the issue.",
+    )
+}
+
+pub fn pod_restarting_frequently() -> PrometheusAlertRule {
+    PrometheusAlertRule::new(
+        "PodRestartingFrequently",
+        // This expression calculates the increase in the restart count over the last 30 minutes.
+        // Alert if a container has restarted more than 5 times.
+        "increase(kube_pod_container_status_restarts_total[30m]) > 5",
+    )
+    .for_duration("15m") // The condition must persist for 15 minutes to avoid alerts for minor flaps.
+    .label("severity", "critical") // A crash-looping pod is effectively down.
+    .annotation(
+        "summary",
+        "Container {{ $labels.container }} in pod {{ $labels.pod }} is restarting frequently.",
+    )
+    .annotation(
+        "description",
+        "The container '{{ $labels.container }}' in pod '{{ $labels.pod }}' (namespace '{{ $labels.namespace }}') has restarted more than 5 times in the last 30 minutes. The pod is likely in a CrashLoopBackOff state.",
+    )
+}
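Both helpers rely on the chained builder on PrometheusAlertRule (new, for_duration, label, annotation), and the example at the top of this compare groups their results with AlertManagerRuleGroup::new. A self-contained sketch of that builder shape, where Rule is a simplified stand-in and not the actual PrometheusAlertRule:

use std::collections::BTreeMap;

#[derive(Debug, Default)]
struct Rule {
    alert: String,
    expr: String,
    r#for: Option<String>,
    labels: BTreeMap<String, String>,
    annotations: BTreeMap<String, String>,
}

impl Rule {
    fn new(alert: &str, expr: &str) -> Self {
        Self { alert: alert.into(), expr: expr.into(), ..Default::default() }
    }
    fn for_duration(mut self, d: &str) -> Self {
        self.r#for = Some(d.into());
        self
    }
    fn label(mut self, k: &str, v: &str) -> Self {
        self.labels.insert(k.into(), v.into());
        self
    }
    fn annotation(mut self, k: &str, v: &str) -> Self {
        self.annotations.insert(k.into(), v.into());
        self
    }
}

fn main() {
    // Same call shape as pod_in_failed_state() above.
    let rule = Rule::new("PodInFailedState", "kube_pod_status_phase{phase=\"Failed\"} == 1")
        .for_duration("1m")
        .label("severity", "critical")
        .annotation("summary", "Pod {{ $labels.pod }} has failed.");
    println!("{rule:#?}");
}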