feat: add default Prometheus rules and Grafana dashboard for application monitoring
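Replaces the Helm-based HelmPrometheusApplicationAlertingScore with a CRD-based CRDApplicationAlertingScore, and swaps the two inline default rules for build_default_application_rules(), which aggregates pod-failed, container-restart, pod-not-ready, service-down, deployment-unavailable and PVC fill-rate alerts built from shared PrometheusAlertRule definitions. The hard-coded UP-status dashboard JSON moves into build_default_dashboard(), a per-namespace tenant overview dashboard; Grafana resources now get namespace-prefixed names, and the affected chart's values enable its metrics endpoint.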

This commit is contained in:
Willem 2025-07-22 13:26:03 -04:00
parent 1d8b503bd2
commit b9e208f4cf
14 changed files with 400 additions and 105 deletions

View File

@ -1,10 +1,8 @@
use std::sync::Arc;
use crate::modules::monitoring::application_monitoring::helm_prometheus_application_alerting::HelmPrometheusApplicationAlertingScore;
use crate::modules::monitoring::application_monitoring::crd_application_monitoring_alerting::CRDApplicationAlertingScore;
use crate::modules::monitoring::kube_prometheus::crd::crd_alertmanager_config::CRDAlertManagerReceiver;
use crate::modules::monitoring::kube_prometheus::crd::crd_default_rules::{
build_rule_container_restarting, build_rule_pod_failed,
};
use crate::modules::monitoring::kube_prometheus::crd::crd_default_rules::build_default_application_rules;
use crate::modules::monitoring::kube_prometheus::crd::crd_prometheus_rules::RuleGroup;
use crate::modules::monitoring::kube_prometheus::crd::service_monitor::{
ServiceMonitor, ServiceMonitorSpec,
@ -39,7 +37,7 @@ impl<T: Topology + HelmCommand + 'static + TenantManager + K8sclient + std::fmt:
async fn ensure_installed(&self, topology: &T) -> Result<(), String> {
info!("Ensuring monitoring is available for application");
let namespace = self.application.name().clone();
let mut alerting_score = HelmPrometheusApplicationAlertingScore {
let mut alerting_score = CRDApplicationAlertingScore {
namespace: namespace.clone(),
receivers: self.alert_receiver.clone(),
service_monitors: self.service_monitors.clone(),
@ -92,9 +90,7 @@ impl<T: Topology + HelmCommand + 'static + TenantManager + K8sclient + std::fmt:
alerting_score.receivers.push(Box::new(ntfy_receiver));
//TODO add service monitors to PrometheusApplicationMonitoring which can be
//deployed for the namespace using prometheus crd-servicemonitors
let mut service_monitor = ServiceMonitor {
let service_monitor = ServiceMonitor {
metadata: ObjectMeta {
name: Some(self.application.name().clone()),
labels: Some(std::collections::BTreeMap::from([
@ -110,22 +106,12 @@ impl<T: Topology + HelmCommand + 'static + TenantManager + K8sclient + std::fmt:
},
spec: ServiceMonitorSpec::default(),
};
let service_mon_endpoint = ServiceMonitorEndpoint {
port: Some("http".into()),
interval: Some("30s".into()),
path: Some("/metrics".into()),
scheme: None,
relabelings: vec![],
metric_relabelings: vec![],
};
service_monitor.spec.endpoints.push(service_mon_endpoint);
alerting_score.service_monitors.push(service_monitor);
let rules_group = RuleGroup {
name: format!("{}-rules", self.application.name().clone()),
rules: vec![build_rule_container_restarting(), build_rule_pod_failed()],
rules: build_default_application_rules(),
};
alerting_score.prometheus_rules.push(rules_group);

View File

@ -19,6 +19,7 @@ use crate::modules::monitoring::kube_prometheus::crd::crd_grafana::{
use crate::modules::monitoring::kube_prometheus::crd::crd_prometheus_rules::{
PrometheusRule, PrometheusRuleSpec, RuleGroup,
};
use crate::modules::monitoring::kube_prometheus::crd::grafana_default_dashboard::build_default_dashboard;
use crate::modules::monitoring::kube_prometheus::crd::service_monitor::ServiceMonitor;
use crate::topology::{K8sclient, Topology, k8s::K8sClient};
use crate::{
@ -37,16 +38,16 @@ use crate::{
};
#[derive(Clone, Debug, Serialize)]
pub struct HelmPrometheusApplicationAlertingScore {
pub struct CRDApplicationAlertingScore {
pub namespace: String,
pub receivers: Vec<Box<dyn CRDAlertManagerReceiver>>,
pub service_monitors: Vec<ServiceMonitor>,
pub prometheus_rules: Vec<RuleGroup>,
}
impl<T: Topology + K8sclient> Score<T> for HelmPrometheusApplicationAlertingScore {
impl<T: Topology + K8sclient> Score<T> for CRDApplicationAlertingScore {
fn create_interpret(&self) -> Box<dyn crate::interpret::Interpret<T>> {
Box::new(HelmPrometheusApplicationAlertingInterpret {
Box::new(CRDApplicationAlertingInterpret {
namespace: self.namespace.clone(),
receivers: self.receivers.clone(),
service_monitors: self.service_monitors.clone(),
@ -55,12 +56,12 @@ impl<T: Topology + K8sclient> Score<T> for HelmPrometheusApplicationAlertingScor
}
fn name(&self) -> String {
"HelmPrometheusApplicationAlertingScore".into()
"CRDApplicationAlertingScore".into()
}
}
#[derive(Clone, Debug)]
pub struct HelmPrometheusApplicationAlertingInterpret {
pub struct CRDApplicationAlertingInterpret {
pub namespace: String,
pub receivers: Vec<Box<dyn CRDAlertManagerReceiver>>,
pub service_monitors: Vec<ServiceMonitor>,
@ -68,7 +69,7 @@ pub struct HelmPrometheusApplicationAlertingInterpret {
}
#[async_trait]
impl<T: Topology + K8sclient> Interpret<T> for HelmPrometheusApplicationAlertingInterpret {
impl<T: Topology + K8sclient> Interpret<T> for CRDApplicationAlertingInterpret {
async fn execute(
&self,
_inventory: &Inventory,
@ -85,7 +86,7 @@ impl<T: Topology + K8sclient> Interpret<T> for HelmPrometheusApplicationAlerting
self.install_monitors(self.service_monitors.clone(), &client)
.await?;
Ok(Outcome::success(format!(
"deployed application monitoring composants channels"
"deployed application monitoring composants"
)))
}
@ -106,7 +107,7 @@ impl<T: Topology + K8sclient> Interpret<T> for HelmPrometheusApplicationAlerting
}
}
impl HelmPrometheusApplicationAlertingInterpret {
impl CRDApplicationAlertingInterpret {
async fn crd_exists(&self, crd: &str) -> bool {
let output = Command::new("kubectl")
.args(["get", "crd", crd])
@ -428,41 +429,11 @@ impl HelmPrometheusApplicationAlertingInterpret {
json_data.insert("timeInterval".to_string(), "5s".to_string());
let namespace = self.namespace.clone();
let json = format!(
r#"{{
"title": "UP Status Dashboard",
"timezone": "browser",
"panels": [
{{
"type": "table",
"title": "Service UP Status",
"gridPos": {{ "x": 0, "y": 0, "w": 24, "h": 10 }},
"targets": [
{{
"expr": "up{{namespace=\"{namespace}\"}}",
"format": "table",
"refId": "A"
}}
],
"options": {{
"showHeader": true
}},
"fieldConfig": {{
"defaults": {{
"custom": {{}}
}},
"overrides": []
}}
}}
],
"schemaVersion": 30,
"version": 1
}}"#
);
let json = build_default_dashboard(&namespace);
let graf_data_source = GrafanaDatasource {
metadata: ObjectMeta {
name: Some(self.namespace.clone()),
name: Some(format!("grafana-datasource-{}", self.namespace.clone())),
namespace: Some(self.namespace.clone()),
..Default::default()
},
@ -491,7 +462,7 @@ impl HelmPrometheusApplicationAlertingInterpret {
let graf_dashboard = GrafanaDashboard {
metadata: ObjectMeta {
name: Some(self.namespace.clone()),
name: Some(format!("grafana-dashboard-{}", self.namespace.clone())),
namespace: Some(self.namespace.clone()),
..Default::default()
},
@ -509,7 +480,7 @@ impl HelmPrometheusApplicationAlertingInterpret {
let grafana = Grafana {
metadata: ObjectMeta {
name: Some(self.namespace.clone()),
name: Some(format!("grafana-{}", self.namespace.clone())),
namespace: Some(self.namespace.clone()),
labels: Some(label.clone()),
..Default::default()

View File

@ -1,2 +1,2 @@
pub mod helm_prometheus_application_alerting;
pub mod crd_application_monitoring_alerting;
pub mod k8s_application_monitoring_score;

View File

@ -1,38 +1,30 @@
use std::collections::BTreeMap;
use crate::modules::{
monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule,
prometheus::alerts::k8s::{
deployment::alert_deployment_unavailable,
pod::{alert_container_restarting, alert_pod_not_ready, pod_failed},
pvc::high_pvc_fill_rate_over_two_days,
service::alert_service_down,
},
};
use super::crd_prometheus_rules::Rule;
pub fn build_rule_container_restarting() -> Rule {
Rule {
alert: Some("ContainerRestarting".into()),
expr: Some("increase(kube_pod_container_status_restarts_total[5m]) > 3".into()),
for_: Some("5m".into()),
labels: Some(BTreeMap::from([("severity".into(), "warning".into())])),
annotations: Some(BTreeMap::from([
(
"summary".into(),
"Container is restarting frequently".into(),
),
(
"description".into(),
"Container in this namespace is restarting more than 3 times in 5 minutes.".into(),
),
])),
}
}
pub fn build_rule_pod_failed() -> Rule {
Rule {
alert: Some("PodFailed".into()),
expr: Some("kube_pod_status_phase{phase=\"Failed\"} > 0".into()),
for_: Some("0m".into()),
labels: Some(BTreeMap::from([("severity".into(), "critical".into())])),
annotations: Some(BTreeMap::from([
("summary".into(), "A pod has failed".into()),
(
"description".into(),
"One or more pods are in Failed phase.".into(),
),
])),
}
pub fn build_default_application_rules() -> Vec<Rule> {
let pod_failed: Rule = pod_failed().into();
let container_restarting: Rule = alert_container_restarting().into();
let pod_not_ready: Rule = alert_pod_not_ready().into();
let service_down: Rule = alert_service_down().into();
let deployment_unavailable: Rule = alert_deployment_unavailable().into();
let high_pvc_fill_rate: Rule = high_pvc_fill_rate_over_two_days().into();
vec![
pod_failed,
container_restarting,
pod_not_ready,
service_down,
deployment_unavailable,
high_pvc_fill_rate,
]
}
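For reference, a minimal usage sketch of the aggregated defaults, mirroring the call site in the application score above (`app_name` stands in for the application's name):

// Wrap the shared defaults in a namespaced rule group and hand them to the
// CRD alerting score (see the interpret change earlier in this commit).
let rules_group = RuleGroup {
    name: format!("{}-rules", app_name), // `app_name` assumed to be the application's name
    rules: build_default_application_rules(),
};
alerting_score.prometheus_rules.push(rules_group);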

View File

@ -4,8 +4,6 @@ use kube::CustomResource;
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use crate::modules::monitoring::kube_prometheus::types::Operator;
use super::crd_prometheuses::LabelSelector;
#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)]

View File

@ -1,8 +1,12 @@
use std::collections::BTreeMap;
use kube::CustomResource;
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use super::crd_default_rules::{build_rule_container_restarting, build_rule_pod_failed};
use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;
use super::crd_default_rules::build_default_application_rules;
#[derive(CustomResource, Debug, Serialize, Deserialize, Clone, JsonSchema)]
#[kube(
@ -42,13 +46,14 @@ pub struct Rule {
pub annotations: Option<std::collections::BTreeMap<String, String>>,
}
impl PrometheusRuleSpec {
pub fn with_default_rules() -> Self {
PrometheusRuleSpec {
groups: vec![RuleGroup {
name: "default.rules".into(),
rules: vec![build_rule_container_restarting(), build_rule_pod_failed()],
}],
impl From<PrometheusAlertRule> for Rule {
fn from(value: PrometheusAlertRule) -> Self {
Rule {
alert: Some(value.alert),
expr: Some(value.expr),
for_: value.r#for,
labels: Some(value.labels.into_iter().collect::<BTreeMap<_, _>>()),
annotations: Some(value.annotations.into_iter().collect::<BTreeMap<_, _>>()),
}
}
}
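As an illustration, any of the shared PrometheusAlertRule builders can now feed the CRD Rule type directly; the values below follow from alert_service_down() as defined later in this commit, and the assertions are only for demonstration:

// Illustrative only: convert a shared alert-rule builder into the CRD `Rule`.
let rule: Rule = alert_service_down().into();
assert_eq!(rule.alert.as_deref(), Some("ServiceDown"));
assert_eq!(rule.for_.as_deref(), Some("1m"));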

View File

@ -0,0 +1,203 @@
pub fn build_default_dashboard(namespace: &str) -> String {
let dashboard = format!(
r#"{{
"annotations": {{
"list": []
}},
"editable": true,
"gnetId": null,
"graphTooltip": 0,
"id": null,
"iteration": 171105,
"panels": [
{{
"datasource": "$datasource",
"fieldConfig": {{
"defaults": {{
"unit": "short"
}},
"overrides": []
}},
"gridPos": {{
"h": 6,
"w": 6,
"x": 0,
"y": 0
}},
"id": 1,
"options": {{
"reduceOptions": {{
"calcs": ["lastNotNull"],
"fields": "",
"values": false
}}
}},
"pluginVersion": "9.0.0",
"targets": [
{{
"expr": "sum(kube_pod_status_phase{{namespace=\"{namespace}\", phase=\"Running\"}})",
"legendFormat": "",
"refId": "A"
}}
],
"title": "Pods in Namespace",
"type": "stat"
}},
{{
"datasource": "$datasource",
"fieldConfig": {{
"defaults": {{
"unit": "short"
}},
"overrides": []
}},
"gridPos": {{
"h": 6,
"w": 6,
"x": 6,
"y": 0
}},
"id": 2,
"options": {{
"reduceOptions": {{
"calcs": ["lastNotNull"],
"fields": "",
"values": false
}}
}},
"pluginVersion": "9.0.0",
"targets": [
{{
"expr": "sum(kube_pod_status_phase{{phase=\"Failed\", namespace=\"{namespace}\"}})",
"legendFormat": "",
"refId": "A"
}}
],
"title": "Pods in Failed State",
"type": "stat"
}},
{{
"datasource": "$datasource",
"fieldConfig": {{
"defaults": {{
"unit": "percentunit"
}},
"overrides": []
}},
"gridPos": {{
"h": 6,
"w": 12,
"x": 0,
"y": 6
}},
"id": 3,
"options": {{
"reduceOptions": {{
"calcs": ["lastNotNull"],
"fields": "",
"values": false
}}
}},
"pluginVersion": "9.0.0",
"targets": [
{{
"expr": "sum(kube_deployment_status_replicas_available{{namespace=\"{namespace}\"}}) / sum(kube_deployment_spec_replicas{{namespace=\"{namespace}\"}})",
"legendFormat": "",
"refId": "A"
}}
],
"title": "Deployment Health (Available / Desired)",
"type": "stat"
}},
{{
"datasource": "$datasource",
"fieldConfig": {{
"defaults": {{
"unit": "short"
}},
"overrides": []
}},
"gridPos": {{
"h": 6,
"w": 12,
"x": 0,
"y": 12
}},
"id": 4,
"options": {{
"reduceOptions": {{
"calcs": ["lastNotNull"],
"fields": "",
"values": false
}}
}},
"pluginVersion": "9.0.0",
"targets": [
{{
"expr": "sum by(pod) (rate(kube_pod_container_status_restarts_total{{namespace=\"{namespace}\"}}[5m]))",
"legendFormat": "{{{{pod}}}}",
"refId": "A"
}}
],
"title": "Container Restarts (per pod)",
"type": "timeseries"
}},
{{
"datasource": "$datasource",
"fieldConfig": {{
"defaults": {{
"unit": "short"
}},
"overrides": []
}},
"gridPos": {{
"h": 6,
"w": 12,
"x": 0,
"y": 18
}},
"id": 5,
"options": {{
"reduceOptions": {{
"calcs": ["lastNotNull"],
"fields": "",
"values": false
}}
}},
"pluginVersion": "9.0.0",
"targets": [
{{
"expr": "sum(ALERTS{{alertstate=\"firing\", namespace=\"{namespace}\"}}) or vector(0)",
"legendFormat": "",
"refId": "A"
}}
],
"title": "Firing Alerts in Namespace",
"type": "stat"
}}
],
"schemaVersion": 36,
"templating": {{
"list": [
{{
"name": "datasource",
"type": "datasource",
"pluginId": "prometheus",
"label": "Prometheus",
"query": "prometheus",
"refresh": 1,
"hide": 0,
"current": {{
"selected": true,
"text": "Prometheus",
"value": "Prometheus"
}}
}}
]
}},
"title": "Tenant Namespace Overview",
"version": 1
}}"#
);
dashboard
}
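A small sketch of how the generator behaves; the interpret above passes the resulting JSON to the GrafanaDashboard resource, and the string is templated per namespace ("my-app" is just an example value):

// Illustrative only: the dashboard JSON is rendered per tenant namespace.
let json = build_default_dashboard("my-app");
assert!(json.contains(r#"namespace=\"my-app\""#));
assert!(json.contains("\"title\": \"Tenant Namespace Overview\""));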

View File

@ -4,6 +4,7 @@ pub mod crd_default_rules;
pub mod crd_grafana;
pub mod crd_prometheus_rules;
pub mod crd_prometheuses;
pub mod grafana_default_dashboard;
pub mod grafana_operator;
pub mod prometheus_operator;
pub mod role;

View File

@ -58,6 +58,7 @@ config:
# web-root: "disable"
enable-signup: false
enable-login: "true"
enable-metrics: "true"
persistence:
enabled: true

View File

@ -0,0 +1,23 @@
use std::collections::HashMap;
use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;
pub fn alert_deployment_unavailable() -> PrometheusAlertRule {
PrometheusAlertRule {
alert: "DeploymentUnavailable".into(),
expr: "kube_deployment_status_replicas_unavailable > 0".into(),
r#for: Some("2m".into()),
labels: HashMap::from([("severity".into(), "warning".into())]),
annotations: HashMap::from([
(
"summary".into(),
"Deployment has unavailable replicas".into(),
),
(
"description".into(),
"A deployment in this namespace has unavailable replicas for over 2 minutes."
.into(),
),
]),
}
}

View File

@ -0,0 +1,37 @@
use std::collections::HashMap;
use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;
pub fn alert_high_memory_usage() -> PrometheusAlertRule {
PrometheusAlertRule {
alert: "HighMemoryUsage".into(),
expr: "container_memory_working_set_bytes{container!=\"\",namespace!=\"\"} > 500000000"
.into(),
r#for: Some("2m".into()),
labels: HashMap::from([("severity".into(), "warning".into())]),
annotations: HashMap::from([
("summary".into(), "Pod is using high memory".into()),
(
"description".into(),
"A pod is consuming more than 500Mi of memory.".into(),
),
]),
}
}
pub fn alert_high_cpu_usage() -> PrometheusAlertRule {
PrometheusAlertRule {
alert: "HighCPUUsage".into(),
expr: "rate(container_cpu_usage_seconds_total{container!=\"\",namespace!=\"\"}[1m]) > 0.9"
.into(),
r#for: Some("1m".into()),
labels: HashMap::from([("severity".into(), "warning".into())]),
annotations: HashMap::from([
("summary".into(), "Pod is using high CPU".into()),
(
"description".into(),
"A pod is using more than 90% of a core over 1 minute.".into(),
),
]),
}
}

View File

@ -1 +1,5 @@
pub mod deployment;
pub mod memory_usage;
pub mod pod;
pub mod pvc;
pub mod service;

View File

@ -0,0 +1,55 @@
use std::collections::HashMap;
use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;
pub fn pod_failed() -> PrometheusAlertRule {
PrometheusAlertRule {
alert: "PodFailed".into(),
expr: "kube_pod_status_phase{phase=\"Failed\"} > 2".into(),
r#for: Some("2m".into()),
labels: HashMap::from([("severity".into(), "critical".into())]),
annotations: HashMap::from([
("summary".into(), "A pod has failed".into()),
(
"description".into(),
"One or more pods are in Failed phase.".into(),
),
]),
}
}
pub fn alert_container_restarting() -> PrometheusAlertRule {
PrometheusAlertRule {
alert: "ContainerRestarting".into(),
expr: "increase(kube_pod_container_status_restarts_total[5m]) > 3".into(),
r#for: Some("5m".into()),
labels: HashMap::from([("severity".into(), "warning".into())]),
annotations: HashMap::from([
(
"summary".into(),
"Container is restarting frequently".into(),
),
(
"description".into(),
"A container in this namespace has restarted more than 3 times in 5 minutes."
.into(),
),
]),
}
}
pub fn alert_pod_not_ready() -> PrometheusAlertRule {
PrometheusAlertRule {
alert: "PodNotReady".into(),
expr: "kube_pod_status_ready{condition=\"true\"} == 0".into(),
r#for: Some("2m".into()),
labels: HashMap::from([("severity".into(), "warning".into())]),
annotations: HashMap::from([
("summary".into(), "Pod is not ready".into()),
(
"description".into(),
"A pod in the namespace is not reporting Ready status.".into(),
),
]),
}
}

View File

@ -0,0 +1,19 @@
use std::collections::HashMap;
use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;
pub fn alert_service_down() -> PrometheusAlertRule {
PrometheusAlertRule {
alert: "ServiceDown".into(),
expr: "up == 0".into(),
r#for: Some("1m".into()),
labels: HashMap::from([("severity".into(), "critical".into())]),
annotations: HashMap::from([
("summary".into(), "Service is down".into()),
(
"description".into(),
"A target service in the namespace is not responding to Prometheus scrapes.".into(),
),
]),
}
}