Compare commits

...

3 Commits

15 changed files with 389 additions and 72 deletions

View File

@@ -3,9 +3,18 @@ use std::{path::PathBuf, sync::Arc};
 use harmony::{
     inventory::Inventory,
     maestro::Maestro,
-    modules::application::{
-        ApplicationScore, RustWebFramework, RustWebapp,
-        features::{ContinuousDelivery, Monitoring},
+    modules::{
+        application::{
+            ApplicationScore, RustWebFramework, RustWebapp,
+            features::{ContinuousDelivery, PrometheusMonitoring},
+        },
+        monitoring::{
+            alert_channel::discord_alert_channel::DiscordWebhook,
+            alert_rule::prometheus_alert_rule::AlertManagerRuleGroup,
+        },
+        prometheus::alerts::k8s::{
+            pod::pod_in_failed_state, pvc::high_pvc_fill_rate_over_two_days,
+        },
     },
     topology::{K8sAnywhereTopology, Url},
 };
@@ -20,12 +29,25 @@ async fn main() {
         framework: Some(RustWebFramework::Leptos),
     });
 
+    let pod_failed = pod_in_failed_state();
+    let pod_failed_2 = pod_in_failed_state();
+    let pod_failed_3 = pod_in_failed_state();
+    let additional_rules = AlertManagerRuleGroup::new("pod-alerts", vec![pod_failed]);
+    let additional_rules_2 = AlertManagerRuleGroup::new("pod-alerts-2", vec![pod_failed_2, pod_failed_3]);
+
     let app = ApplicationScore {
         features: vec![
-            Box::new(ContinuousDelivery {
-                application: application.clone(),
-            }),
-            Box::new(Monitoring {}),
+            //Box::new(ContinuousDelivery {
+            //    application: application.clone(),
+            //}),
+            Box::new(PrometheusMonitoring {
+                application: application.clone(),
+                alert_receivers: vec![Box::new(DiscordWebhook {
+                    name: "dummy-discord".to_string(),
+                    url: Url::Url(url::Url::parse("https://discord.doesnt.exist.com").unwrap()),
+                })],
+                alert_rules: vec![Box::new(additional_rules), Box::new(additional_rules_2)],
+            }),
             // TODO add monitoring, backups, multisite ha, etc
         ],
         application,

View File

@@ -1,3 +1,5 @@
+use std::any::Any;
+
 use async_trait::async_trait;
 use log::debug;
@@ -9,7 +11,7 @@ use crate::{
 };
 
 #[async_trait]
-pub trait AlertSender: Send + Sync + std::fmt::Debug {
+pub trait AlertSender: Any + Send + Sync + std::fmt::Debug {
     fn name(&self) -> String;
 }
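Making AlertSender require Any is what allows a boxed sender to be downcast back to its concrete type at runtime. A minimal, self-contained sketch of that pattern; the as_any helper and the local Prometheus struct below are illustrative only and are not part of this PR:

use std::any::Any;

trait AlertSender: Any + Send + Sync + std::fmt::Debug {
    fn name(&self) -> String;
    // Hypothetical helper: expose &dyn Any so callers can downcast.
    fn as_any(&self) -> &dyn Any;
}

#[derive(Debug)]
struct Prometheus;

impl AlertSender for Prometheus {
    fn name(&self) -> String {
        "prometheus".to_string()
    }
    fn as_any(&self) -> &dyn Any {
        self
    }
}

fn main() {
    let sender: Box<dyn AlertSender> = Box::new(Prometheus);
    // Downcasting is only possible because AlertSender: Any.
    if sender.as_any().downcast_ref::<Prometheus>().is_some() {
        println!("sender {} is a Prometheus instance", sender.name());
    }
}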

View File

@@ -1,3 +1,5 @@
+use std::sync::Arc;
+
 use async_trait::async_trait;
 use log::info;
@@ -7,31 +9,36 @@ use crate::{
         application::{Application, ApplicationFeature},
         monitoring::{
             application_monitoring::k8s_application_monitoring_score::ApplicationPrometheusMonitoringScore,
-            kube_prometheus::{
-                helm_prometheus_alert_score::HelmPrometheusAlertingScore,
-                types::{NamespaceSelector, ServiceMonitor},
-            },
+            kube_prometheus::types::{NamespaceSelector, ServiceMonitor}, prometheus::prometheus::Prometheus,
         },
     },
     score::Score,
-    topology::{HelmCommand, Topology, tenant::TenantManager},
+    topology::{oberservability::monitoring::{AlertReceiver, AlertRule, AlertSender}, tenant::TenantManager, HelmCommand, K8sclient, Topology},
 };
 
-#[derive(Debug, Default, Clone)]
-pub struct Monitoring {}
+#[derive(Debug, Clone)]
+pub struct PrometheusMonitoring {
+    pub application: Arc<dyn Application>,
+    pub alert_receivers: Vec<Box<dyn AlertReceiver<Prometheus>>>,
+    pub alert_rules: Vec<Box<dyn AlertRule<Prometheus>>>,
+}
 
 #[async_trait]
-impl<T: Topology + HelmCommand + 'static + TenantManager> ApplicationFeature<T> for Monitoring {
+impl<T: Topology + HelmCommand + 'static + TenantManager> ApplicationFeature<T> for PrometheusMonitoring {
     async fn ensure_installed(&self, topology: &T) -> Result<(), String> {
         info!("Ensuring monitoring is available for application");
+        let ns = self.application.name();
         let mut service_monitor = ServiceMonitor::default();
+        service_monitor.name = ns.clone();
+        service_monitor.namespace = ns.clone();
         service_monitor.namespace_selector = Some(NamespaceSelector {
             any: true,
-            match_names: vec![],
+            match_names: vec![ns.clone()],
         });
         let alerting_score = ApplicationPrometheusMonitoringScore {
-            receivers: vec![],
-            rules: vec![],
+            namespace: ns,
+            receivers: self.alert_receivers.clone(),
+            rules: self.alert_rules.clone(),
             service_monitors: vec![service_monitor],
         };

View File

@@ -220,7 +220,6 @@ impl<T: Topology + HelmCommand> Interpret<T> for HelmChartInterpret {
             yaml_path,
             Some(&helm_options),
         );
-
         let status = match res {
             Ok(status) => status,
             Err(err) => return Err(InterpretError::new(err.to_string())),

View File

@@ -1,3 +1,5 @@
+use std::any::Any;
+
 use async_trait::async_trait;
 use serde::Serialize;
 use serde_yaml::{Mapping, Value};
@@ -11,7 +13,10 @@ use crate::{
         },
         prometheus::prometheus::{Prometheus, PrometheusReceiver},
     },
-    topology::{Url, oberservability::monitoring::AlertReceiver},
+    topology::{
+        Url,
+        oberservability::monitoring::{AlertReceiver, AlertSender},
+    },
 };
 
 #[derive(Debug, Clone, Serialize)]

View File

@@ -1,22 +1,22 @@
 use std::sync::{Arc, Mutex};
 
+use log::debug;
 use serde::Serialize;
 
 use crate::{
     modules::monitoring::{
         kube_prometheus::types::ServiceMonitor,
-        prometheus::{prometheus::Prometheus, prometheus_config::PrometheusConfig},
+        prometheus::{prometheus::Prometheus, prometheus_config::HelmPrometheusConfig},
     },
     score::Score,
     topology::{
-        HelmCommand, Topology,
-        oberservability::monitoring::{AlertReceiver, AlertRule, AlertingInterpret},
-        tenant::TenantManager,
+        oberservability::monitoring::{AlertReceiver, AlertRule, AlertingInterpret}, tenant::TenantManager, HelmCommand, K8sclient, Topology
     },
 };
 
 #[derive(Clone, Debug, Serialize)]
 pub struct ApplicationPrometheusMonitoringScore {
+    pub namespace: String,
     pub receivers: Vec<Box<dyn AlertReceiver<Prometheus>>>,
     pub rules: Vec<Box<dyn AlertRule<Prometheus>>>,
     pub service_monitors: Vec<ServiceMonitor>,
@@ -24,13 +24,17 @@ pub struct ApplicationPrometheusMonitoringScore {
 impl<T: Topology + HelmCommand + TenantManager> Score<T> for ApplicationPrometheusMonitoringScore {
     fn create_interpret(&self) -> Box<dyn crate::interpret::Interpret<T>> {
-        let config = Arc::new(Mutex::new(PrometheusConfig::new()));
+        let config = Arc::new(Mutex::new(HelmPrometheusConfig::new()));
         config
             .try_lock()
             .expect("couldn't lock config")
             .additional_service_monitors = self.service_monitors.clone();
+        let ns = self.namespace.clone();
+        config.try_lock().expect("couldn't lock config").namespace = Some(ns.clone());
+        debug!("set namespace to {}", ns);
         Box::new(AlertingInterpret {
-            sender: Prometheus::new(),
+            sender: Prometheus { config },
             receivers: self.receivers.clone(),
             rules: self.rules.clone(),
         })
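The interpret now hands an already-populated config to the sender (Prometheus { config }) instead of calling Prometheus::new(). A minimal sketch of this shared Arc<Mutex<...>> pattern, using simplified stand-in types rather than the real HelmPrometheusConfig and Prometheus:

use std::sync::{Arc, Mutex};

// Simplified stand-in for HelmPrometheusConfig.
#[derive(Debug, Default)]
struct Config {
    namespace: Option<String>,
}

// Simplified stand-in for the Prometheus sender: it only holds the shared handle.
#[derive(Debug)]
struct Sender {
    config: Arc<Mutex<Config>>,
}

fn main() {
    let config = Arc::new(Mutex::new(Config::default()));
    // The score mutates the config before building the sender...
    config.lock().unwrap().namespace = Some("my-app".to_string());
    // ...and the sender observes the same data because it shares the Arc.
    let sender = Sender { config: Arc::clone(&config) };
    assert_eq!(
        sender.config.lock().unwrap().namespace.as_deref(),
        Some("my-app")
    );
}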

View File

@@ -1,6 +1,5 @@
-use non_blank_string_rs::NonBlankString;
 use std::str::FromStr;
 
+use non_blank_string_rs::NonBlankString;
 use crate::modules::helm::chart::HelmChartScore;
 
 pub fn grafana_helm_chart_score(ns: &str) -> HelmChartScore {
@@ -8,10 +7,46 @@ pub fn grafana_helm_chart_score(ns: &str) -> HelmChartScore {
         r#"
 rbac:
   namespaced: true
-sidecar:
-  dashboards:
-    enabled: true
-"#
+datasources:
+  datasources.yaml:
+    apiVersion: 1
+    datasources:
+      - name: Prometheus
+        type: prometheus
+        access: proxy
+        url: http://prometheus-server.{ns}.svc.cluster.local
+        isDefault: true
+dashboardProviders:
+  dashboardproviders.yaml:
+    apiVersion: 1
+    providers:
+      - name: 'default'
+        orgId: 1
+        folder: ''
+        type: file
+        disableDeletion: false
+        updateIntervalSeconds: 10
+        allowUiUpdates: true
+        editable: true
+        options:
+          path: /var/lib/grafana/dashboards/default
+dashboards:
+  default:
+    compute-usage:
+      url: https://grafana.com/api/dashboards/315/revisions/1/download
+    pod-health:
+      url: https://grafana.com/api/dashboards/15758/revisions/1/download
+    namespace-resources:
+      url: https://grafana.com/api/dashboards/9809/revisions/1/download
+    namespace-resources-vs-quotas:
+      url: https://grafana.com/api/dashboards/17044/revisions/1/download
+    persistent-volume-usage:
+      url: https://grafana.com/api/dashboards/7685/revisions/1/download
+"#,
+        ns = ns
     );
 
     HelmChartScore {
@@ -20,9 +55,10 @@ sidecar:
         chart_name: NonBlankString::from_str("oci://ghcr.io/grafana/helm-charts/grafana").unwrap(),
         chart_version: None,
         values_overrides: None,
-        values_yaml: Some(values.to_string()),
+        values_yaml: Some(values),
         create_namespace: true,
-        install_only: true,
+        install_only: false,
         repository: None,
     }
 }
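The ns = ns argument above uses format!'s named-argument form: {ns} inside the raw string is substituted while the rest of the YAML passes through verbatim. A small runnable illustration of just that mechanic:

fn main() {
    let ns = "my-app";
    // Named argument: {ns} is replaced, everything else is left untouched.
    let values = format!(
        r#"url: http://prometheus-server.{ns}.svc.cluster.local"#,
        ns = ns
    );
    assert_eq!(values, "url: http://prometheus-server.my-app.svc.cluster.local");
}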

View File

@@ -211,6 +211,8 @@ pub struct Selector {
 pub struct ServiceMonitor {
     pub name: String,
+    pub namespace: String,
+
     // # Additional labels to set used for the ServiceMonitorSelector. Together with standard labels from the chart
     pub additional_labels: Option<HashMap<String, String>>,
@@ -261,6 +263,7 @@ impl Default for ServiceMonitor {
     fn default() -> Self {
         Self {
             name: Default::default(),
+            namespace: Default::default(),
             additional_labels: Default::default(),
             job_label: Default::default(),
             target_labels: Default::default(),

View File

@@ -1 +1,2 @@
 pub mod prometheus_helm;
+pub mod types;

View File

@@ -1,37 +1,145 @@
+use std::collections::BTreeMap;
 use std::str::FromStr;
 use std::sync::{Arc, Mutex};
 
+use log::debug;
 use non_blank_string_rs::NonBlankString;
+use serde_yaml::{Mapping, Value};
 
-use crate::modules::{
-    helm::chart::HelmChartScore, monitoring::prometheus::prometheus_config::PrometheusConfig,
-};
+use crate::modules::helm::chart::HelmChartScore;
+use crate::modules::monitoring::kube_prometheus::types::{
+    AlertGroup, AlertManager, AlertManagerConfig, AlertManagerRoute, AlertManagerSpec,
+    ConfigReloader, Limits, Requests, Resources,
+};
+use crate::modules::monitoring::prometheus::helm::types::{
+    AlertFile, EnabledConfig, KsmRbacConfig, KubeStateMetricsConfig, LabelSelector, Monitor,
+    Prometheus, PrometheusHelmValues, RbacConfig, ServerConfig, ServerRbacConfig,
+};
+use crate::modules::monitoring::prometheus::prometheus_config::HelmPrometheusConfig;
 
-pub fn prometheus_helm_chart_score(config: Arc<Mutex<PrometheusConfig>>) -> HelmChartScore {
+pub fn prometheus_helm_chart_score(config: Arc<Mutex<HelmPrometheusConfig>>) -> HelmChartScore {
     let config = config.lock().unwrap();
     let ns = config.namespace.clone().unwrap();
-    let values = format!(
-        r#"
-rbac:
-  create: true
-kube-state-metrics:
-  enabled: false
-nodeExporter:
-  enabled: false
-alertmanager:
-  enabled: false
-pushgateway:
-  enabled: false
-server:
-  serviceAccount:
-    create: false
-  rbac:
-    create: true
-fullnameOverride: prometheus-{ns}
-"#
-    );
+
+    let rbac_config = RbacConfig { create: false };
+
+    let ksm_config = KubeStateMetricsConfig {
+        enabled: true,
+        rbac: KsmRbacConfig {
+            use_cluster_role: false,
+        },
+        prometheus: Prometheus {
+            monitor: Monitor { enabled: true },
+        },
+    };
+
+    let mut selector_labels = BTreeMap::new();
+    selector_labels.insert("kubernetes.io/metadata.name".to_string(), ns.clone());
+    let mut kube_state_metrics_labels = BTreeMap::new();
+    kube_state_metrics_labels.insert(
+        "app.kubernetes.io/name".to_string(),
+        "kube-state-metrics".to_string(),
+    );
+    let selector = LabelSelector {
+        match_labels: selector_labels,
+    };
+
+    let server_config = ServerConfig {
+        namespaces: vec![ns.clone()],
+        use_existing_cluster_role_name: false,
+    };
+
+    let mut null_receiver = Mapping::new();
+    null_receiver.insert(
+        Value::String("receiver".to_string()),
+        Value::String("default-receiver".to_string()),
+    );
+    null_receiver.insert(
+        Value::String("matchers".to_string()),
+        Value::Sequence(vec![Value::String("alertname!=Watchdog".to_string())]),
+    );
+    null_receiver.insert(Value::String("continue".to_string()), Value::Bool(true));
+
+    let mut alert_manager_channel_config = AlertManagerConfig {
+        global: Mapping::new(),
+        route: AlertManagerRoute {
+            routes: vec![Value::Mapping(null_receiver)],
+        },
+        receivers: vec![serde_yaml::from_str("name: 'default-receiver'").unwrap()],
+    };
+
+    for receiver in config.alert_receiver_configs.iter() {
+        if let Some(global) = receiver.channel_global_config.clone() {
+            alert_manager_channel_config
+                .global
+                .insert(global.0, global.1);
+        }
+        alert_manager_channel_config
+            .route
+            .routes
+            .push(receiver.channel_route.clone());
+        alert_manager_channel_config
+            .receivers
+            .push(receiver.channel_receiver.clone());
+    }
+
+    let alert_manager_values = AlertManager {
+        enabled: config.alert_manager,
+        config: alert_manager_channel_config,
+        alertmanager_spec: AlertManagerSpec {
+            resources: Resources {
+                limits: Limits {
+                    memory: "100Mi".to_string(),
+                    cpu: "100m".to_string(),
+                },
+                requests: Requests {
+                    memory: "100Mi".to_string(),
+                    cpu: "100m".to_string(),
+                },
+            },
+        },
+        init_config_reloader: ConfigReloader {
+            resources: Resources {
+                limits: Limits {
+                    memory: "100Mi".to_string(),
+                    cpu: "100m".to_string(),
+                },
+                requests: Requests {
+                    memory: "100Mi".to_string(),
+                    cpu: "100m".to_string(),
+                },
+            },
+        },
+    };
+
+    let mut result: BTreeMap<String, AlertFile> = BTreeMap::new();
+    for rule in config.alert_rules.clone().iter() {
+        for (name, group) in &rule.rules {
+            result
+                .entry("alerting_rules.yml".to_string())
+                .and_modify(|e| e.groups.extend(group.groups.clone()))
+                .or_insert(AlertFile {
+                    groups: group.groups.clone(),
+                });
+        }
+    }
+
+    let final_values = PrometheusHelmValues {
+        rbac: rbac_config,
+        kube_state_metrics: ksm_config,
+        server: server_config,
+        alertmanager: alert_manager_values,
+        server_files: result,
+        additional_service_monitors: config.additional_service_monitors.clone(),
+        prometheus_node_exporter: EnabledConfig { enabled: false },
+        prometheus_pushgateway: EnabledConfig { enabled: false },
+    };
+
+    let values_yaml =
+        serde_yaml::to_string(&final_values).expect("Failed to serialize final Helm values");
+    debug!("full values.yaml: \n{}", values_yaml);
     HelmChartScore {
-        namespace: Some(NonBlankString::from_str(&config.namespace.clone().unwrap()).unwrap()),
+        namespace: Some(NonBlankString::from_str(&ns).unwrap()),
         release_name: NonBlankString::from_str("prometheus").unwrap(),
         chart_name: NonBlankString::from_str(
             "oci://ghcr.io/prometheus-community/charts/prometheus",
@@ -39,7 +147,7 @@ fullnameOverride: prometheus-{ns}
         .unwrap(),
         chart_version: None,
         values_overrides: None,
-        values_yaml: Some(values.to_string()),
+        values_yaml: Some(values_yaml),
         create_namespace: true,
         install_only: true,
         repository: None,
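The chart values are now built as typed structs and serialized with serde_yaml instead of being assembled with format!. A trimmed-down sketch of how the serde renames used in this PR map Rust fields onto the hyphenated and camelCase Helm keys; the Values and Enabled structs below are illustrative stand-ins, not the real PrometheusHelmValues:

use serde::Serialize;

#[derive(Serialize)]
#[serde(rename_all = "camelCase")]
struct Values {
    #[serde(rename = "kube-state-metrics")]
    kube_state_metrics: Enabled,
    #[serde(rename = "prometheus-node-exporter")]
    prometheus_node_exporter: Enabled,
    #[serde(rename = "prometheus-pushgateway")]
    prometheus_pushgateway: Enabled,
    // rename_all = "camelCase" turns this into additionalServiceMonitors.
    additional_service_monitors: Vec<String>,
}

#[derive(Serialize)]
struct Enabled {
    enabled: bool,
}

fn main() {
    let v = Values {
        kube_state_metrics: Enabled { enabled: true },
        prometheus_node_exporter: Enabled { enabled: false },
        prometheus_pushgateway: Enabled { enabled: false },
        additional_service_monitors: vec![],
    };
    // Rendered keys: kube-state-metrics, prometheus-node-exporter,
    // prometheus-pushgateway, additionalServiceMonitors.
    println!("{}", serde_yaml::to_string(&v).unwrap());
}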

View File

@@ -0,0 +1,94 @@
+use std::collections::BTreeMap;
+
+use serde::Serialize;
+
+use crate::modules::monitoring::{alert_rule::prometheus_alert_rule::AlertManagerRuleGroup, kube_prometheus::types::{
+    AlertGroup, AlertManager, AlertManagerAdditionalPromRules, AlertManagerValues, ServiceMonitor
+}};
+
+#[derive(Debug, Clone, Serialize)]
+pub struct RuleFilesConfig {
+    #[serde(rename = "ruleFiles")]
+    pub files: BTreeMap<String, AlertGroup>,
+}
+
+#[derive(Serialize, Debug)]
+#[serde(rename_all = "camelCase")]
+pub struct PrometheusHelmValues {
+    pub rbac: RbacConfig,
+    #[serde(rename = "kube-state-metrics")]
+    pub kube_state_metrics: KubeStateMetricsConfig,
+    pub server: ServerConfig,
+    pub alertmanager: AlertManager,
+    #[serde(rename = "serverFiles")]
+    pub server_files: BTreeMap<String, AlertFile>,
+    pub additional_service_monitors: Vec<ServiceMonitor>,
+    #[serde(rename = "prometheus-node-exporter")]
+    pub prometheus_node_exporter: EnabledConfig,
+    #[serde(rename = "prometheus-pushgateway")]
+    pub prometheus_pushgateway: EnabledConfig,
+}
+
+#[derive(Serialize, Debug, Clone)]
+pub struct AlertFile {
+    pub groups: Vec<AlertManagerRuleGroup>,
+}
+
+#[derive(Serialize, Debug)]
+#[serde(rename_all = "camelCase")]
+pub struct RbacConfig {
+    pub create: bool,
+}
+
+#[derive(Serialize, Debug)]
+#[serde(rename_all = "camelCase")]
+pub struct KubeStateMetricsConfig {
+    pub enabled: bool,
+    pub rbac: KsmRbacConfig,
+    pub prometheus: Prometheus,
+}
+
+#[derive(Serialize, Debug)]
+#[serde(rename_all = "camelCase")]
+pub struct Prometheus {
+    pub monitor: Monitor,
+}
+
+#[derive(Serialize, Debug)]
+#[serde(rename_all = "camelCase")]
+pub struct Monitor {
+    pub enabled: bool,
+}
+
+#[derive(Serialize, Debug)]
+#[serde(rename_all = "camelCase")]
+pub struct KsmRbacConfig {
+    pub use_cluster_role: bool,
+}
+
+#[derive(Serialize, Debug)]
+#[serde(rename_all = "camelCase")]
+pub struct ServerConfig {
+    pub namespaces: Vec<String>,
+    pub use_existing_cluster_role_name: bool,
+}
+
+#[derive(Serialize, Debug)]
+#[serde(rename_all = "camelCase")]
+pub struct ServerRbacConfig {
+    pub create: bool,
+    pub use_cluster_role: bool,
+    pub namespaced: bool,
+}
+
+#[derive(Serialize, Debug, Clone)]
+#[serde(rename_all = "camelCase")]
+pub struct LabelSelector {
+    #[serde(rename = "matchLabels")]
+    pub match_labels: BTreeMap<String, String>,
+}
+
+#[derive(Serialize, Debug)]
+pub struct EnabledConfig {
+    pub enabled: bool,
+}

View File

@@ -14,7 +14,7 @@ use crate::{
     },
     score::Score,
     topology::{
-        HelmCommand, Topology,
+        HelmCommand, K8sclient, Topology,
         installable::Installable,
         oberservability::monitoring::{AlertReceiver, AlertRule, AlertSender},
         tenant::TenantManager,
@@ -22,12 +22,12 @@ use crate::{
 };
 
 use super::{
-    helm::prometheus_helm::prometheus_helm_chart_score, prometheus_config::PrometheusConfig,
+    helm::prometheus_helm::prometheus_helm_chart_score, prometheus_config::HelmPrometheusConfig,
 };
 
 #[derive(Debug)]
 pub struct Prometheus {
-    pub config: Arc<Mutex<PrometheusConfig>>,
+    pub config: Arc<Mutex<HelmPrometheusConfig>>,
 }
 
 #[async_trait]
@@ -40,18 +40,17 @@ impl AlertSender for Prometheus {
 impl Prometheus {
     pub fn new() -> Self {
         Self {
-            config: Arc::new(Mutex::new(PrometheusConfig::new())),
+            config: Arc::new(Mutex::new(HelmPrometheusConfig::new())),
         }
     }
 
     pub async fn configure_with_topology<T: TenantManager>(&self, topology: &T) {
-        let ns = topology
-            .get_tenant_config()
-            .await
-            .map(|cfg| cfg.name.clone())
-            .unwrap_or_else(|| "monitoring".to_string());
+        if let Some(cfg) = topology.get_tenant_config().await {
+            debug!("Overriding namespace with tenant config: {}", cfg.name);
+            self.config.lock().unwrap().namespace = Some(cfg.name.clone());
+        } else {
+            debug!("No tenant config found; keeping existing namespace.");
+        }
 
         error!("This must be refactored, see comments in pr #74");
-        debug!("NS: {}", ns);
-        self.config.lock().unwrap().namespace = Some(ns);
     }
 
     pub async fn install_receiver(

View File

@@ -3,9 +3,8 @@ use crate::modules::monitoring::kube_prometheus::types::{
 };
 
 #[derive(Debug)]
-pub struct PrometheusConfig {
+pub struct HelmPrometheusConfig {
     pub namespace: Option<String>,
-    pub default_rules: bool,
     pub alert_manager: bool,
     pub node_exporter: bool,
     pub kube_state_metrics: bool,
@@ -16,11 +15,10 @@ pub struct PrometheusConfig {
     pub additional_service_monitors: Vec<ServiceMonitor>,
 }
 
-impl PrometheusConfig {
+impl HelmPrometheusConfig {
     pub fn new() -> Self {
         Self {
             namespace: None,
-            default_rules: true,
             alert_manager: true,
             node_exporter: false,
             kube_state_metrics: false,

View File

@@ -1 +1,2 @@
 pub mod pvc;
+pub mod pod;

View File

@@ -0,0 +1,38 @@
+use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;
+
+pub fn pod_in_failed_state() -> PrometheusAlertRule {
+    PrometheusAlertRule::new(
+        "PodInFailedState",
+        // This expression checks for any pod where the status phase is 'Failed' and the value is 1 (true).
+        "kube_pod_status_phase{phase=\"Failed\"} == 1",
+    )
+    .for_duration("1m") // Fire if the pod is in this state for 1 minute.
+    .label("severity", "critical") // A failed pod is a critical issue.
+    .annotation(
+        "summary",
+        "Pod {{ $labels.pod }} in namespace {{ $labels.namespace }} has failed.",
+    )
+    .annotation(
+        "description",
+        "The pod {{ $labels.pod }} in namespace {{ $labels.namespace }} has entered the 'Failed' state. This is a terminal error and the pod will not be automatically restarted. Please check the pod logs to diagnose the issue.",
+    )
+}
+
+pub fn pod_restarting_frequently() -> PrometheusAlertRule {
+    PrometheusAlertRule::new(
+        "PodRestartingFrequently",
+        // This expression calculates the increase in the restart count over the last 30 minutes.
+        // Alert if a container has restarted more than 5 times.
+        "increase(kube_pod_container_status_restarts_total[30m]) > 5",
+    )
+    .for_duration("15m") // The condition must persist for 15 minutes to avoid alerts for minor flaps.
+    .label("severity", "critical") // A crash-looping pod is effectively down.
+    .annotation(
+        "summary",
+        "Container {{ $labels.container }} in pod {{ $labels.pod }} is restarting frequently.",
+    )
+    .annotation(
+        "description",
+        "The container '{{ $labels.container }}' in pod '{{ $labels.pod }}' (namespace '{{ $labels.namespace }}') has restarted more than 5 times in the last 30 minutes. The pod is likely in a CrashLoopBackOff state.",
+    )
+}
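Both helpers rely on the chained builder on PrometheusAlertRule (new, for_duration, label, annotation), and the example at the top of this compare groups their results with AlertManagerRuleGroup::new. A self-contained sketch of that builder shape, where Rule is a simplified stand-in and not the actual PrometheusAlertRule:

use std::collections::BTreeMap;

#[derive(Debug, Default)]
struct Rule {
    alert: String,
    expr: String,
    r#for: Option<String>,
    labels: BTreeMap<String, String>,
    annotations: BTreeMap<String, String>,
}

impl Rule {
    fn new(alert: &str, expr: &str) -> Self {
        Self { alert: alert.into(), expr: expr.into(), ..Default::default() }
    }
    fn for_duration(mut self, d: &str) -> Self {
        self.r#for = Some(d.into());
        self
    }
    fn label(mut self, k: &str, v: &str) -> Self {
        self.labels.insert(k.into(), v.into());
        self
    }
    fn annotation(mut self, k: &str, v: &str) -> Self {
        self.annotations.insert(k.into(), v.into());
        self
    }
}

fn main() {
    // Same call shape as pod_in_failed_state() above.
    let rule = Rule::new("PodInFailedState", "kube_pod_status_phase{phase=\"Failed\"} == 1")
        .for_duration("1m")
        .label("severity", "critical")
        .annotation("summary", "Pod {{ $labels.pod }} has failed.");
    println!("{rule:#?}");
}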