Merge pull request 'feat: add alert rule support and a Prometheus implementation, plus a few preconfigured BMC alerts for Dell servers used in the monitoring example' (#67) from feat/alert_rules into master
Reviewed-on: https://git.nationtech.io/NationTech/harmony/pulls/67
Commit: 29e74a2712
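For orientation before the diff: a minimal, hedged sketch of how the new pieces are meant to be combined, condensed from the example and module changes below. It is not a file in this PR, and the rule name and PromQL expression are placeholders.

use harmony::modules::monitoring::{
    alert_rule::prometheus_alert_rule::{AlertManagerRuleGroup, PrometheusAlertRule},
    kube_prometheus::helm_prometheus_alert_score::HelmPrometheusAlertingScore,
};

fn custom_alerting_score() -> HelmPrometheusAlertingScore {
    // Illustrative rule: the alert name and expression are placeholders.
    let node_down = PrometheusAlertRule::new("NodeDown", "up{job=\"node-exporter\"} == 0")
        .for_duration("5m")
        .label("severity", "critical")
        .annotation("title", "Node exporter target is down");

    // Each group becomes one entry under additionalPrometheusRulesMap in the
    // generated kube-prometheus Helm values.
    let group = AlertManagerRuleGroup::new("custom-alerts", vec![node_down]);

    HelmPrometheusAlertingScore {
        receivers: vec![],
        rules: vec![Box::new(group)],
    }
}

The returned score can then be registered on a Maestro instance, exactly as the monitoring example below does with its pvc-alerts and dell-server-alerts groups.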
@@ -43,14 +43,14 @@ async fn main() {
    // K8sAnywhereTopology as it is the most automatic one that enables you to easily deploy
    // locally, to development environment from a CI, to staging, and to production with settings
    // that automatically adapt to each environment grade.
    let maestro = Maestro::<K8sAnywhereTopology>::initialize(
    let mut maestro = Maestro::<K8sAnywhereTopology>::initialize(
        Inventory::autoload(),
        K8sAnywhereTopology::from_env(),
    )
    .await
    .unwrap();

    // maestro.register_all(vec![Box::new(lamp_stack)]);
    maestro.register_all(vec![Box::new(lamp_stack)]);
    // Here we bootstrap the CLI, this gives some nice features if you need them
    harmony_cli::init(maestro, None).await.unwrap();
}
@@ -1,9 +1,19 @@
use harmony::{
    inventory::Inventory,
    maestro::Maestro,
    modules::monitoring::{
        alert_channel::discord_alert_channel::DiscordWebhook,
        kube_prometheus::helm_prometheus_alert_score::HelmPrometheusAlertingScore,
    modules::{
        monitoring::{
            alert_channel::discord_alert_channel::DiscordWebhook,
            alert_rule::prometheus_alert_rule::AlertManagerRuleGroup,
            kube_prometheus::helm_prometheus_alert_score::HelmPrometheusAlertingScore,
        },
        prometheus::alerts::{
            infra::dell_server::{
                alert_global_storage_status_critical, alert_global_storage_status_non_recoverable,
                global_storage_status_degraded_non_critical,
            },
            k8s::pvc::high_pvc_fill_rate_over_two_days,
        },
    },
    topology::{K8sAnywhereTopology, Url},
};
@@ -12,10 +22,28 @@ use harmony::{
async fn main() {
    let discord_receiver = DiscordWebhook {
        name: "test-discord".to_string(),
        url: Url::Url(url::Url::parse("discord.doesnt.exist.com").unwrap()),
        url: Url::Url(url::Url::parse("https://discord.doesnt.exist.com").unwrap()),
    };

    let high_pvc_fill_rate_over_two_days_alert = high_pvc_fill_rate_over_two_days();
    let dell_system_storage_degraded = global_storage_status_degraded_non_critical();
    let alert_global_storage_status_critical = alert_global_storage_status_critical();
    let alert_global_storage_status_non_recoverable = alert_global_storage_status_non_recoverable();

    let additional_rules =
        AlertManagerRuleGroup::new("pvc-alerts", vec![high_pvc_fill_rate_over_two_days_alert]);
    let additional_rules2 = AlertManagerRuleGroup::new(
        "dell-server-alerts",
        vec![
            dell_system_storage_degraded,
            alert_global_storage_status_critical,
            alert_global_storage_status_non_recoverable,
        ],
    );

    let alerting_score = HelmPrometheusAlertingScore {
        receivers: vec![Box::new(discord_receiver)],
        rules: vec![Box::new(additional_rules), Box::new(additional_rules2)],
    };
    let mut maestro = Maestro::<K8sAnywhereTopology>::initialize(
        Inventory::autoload(),
@@ -1,10 +1,11 @@
use async_trait::async_trait;
use log::debug;

use crate::{
    data::{Id, Version},
    interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome},
    inventory::Inventory,
    topology::{HelmCommand, Topology, installable::Installable},
    topology::{Topology, installable::Installable},
};

#[async_trait]
@@ -16,6 +17,7 @@ pub trait AlertSender: Send + Sync + std::fmt::Debug {
pub struct AlertingInterpret<S: AlertSender> {
    pub sender: S,
    pub receivers: Vec<Box<dyn AlertReceiver<S>>>,
    pub rules: Vec<Box<dyn AlertRule<S>>>,
}

#[async_trait]
@@ -28,6 +30,10 @@ impl<S: AlertSender + Installable<T>, T: Topology> Interpret<T> for AlertingInte
        for receiver in self.receivers.iter() {
            receiver.install(&self.sender).await?;
        }
        for rule in self.rules.iter() {
            debug!("installing rule: {:#?}", rule);
            rule.install(&self.sender).await?;
        }
        self.sender.ensure_installed(inventory, topology).await?;
        Ok(Outcome::success(format!(
            "successfully installed alert sender {}",
@@ -59,8 +65,9 @@ pub trait AlertReceiver<S: AlertSender>: std::fmt::Debug + Send + Sync {
}

#[async_trait]
pub trait AlertRule<S: AlertSender> {
    async fn install(&self, sender: &S) -> Result<(), InterpretError>;
pub trait AlertRule<S: AlertSender>: std::fmt::Debug + Send + Sync {
    async fn install(&self, sender: &S) -> Result<Outcome, InterpretError>;
    fn clone_box(&self) -> Box<dyn AlertRule<S>>;
}

#[async_trait]
@@ -12,5 +12,6 @@ pub mod load_balancer;
pub mod monitoring;
pub mod okd;
pub mod opnsense;
pub mod prometheus;
pub mod tenant;
pub mod tftp;
harmony/src/modules/monitoring/alert_rule/mod.rs (new file, +1)
@@ -0,0 +1 @@
pub mod prometheus_alert_rule;
@@ -0,0 +1,99 @@
use std::collections::{BTreeMap, HashMap};

use async_trait::async_trait;
use serde::Serialize;

use crate::{
    interpret::{InterpretError, Outcome},
    modules::monitoring::kube_prometheus::{
        prometheus::{Prometheus, PrometheusRule},
        types::{AlertGroup, AlertManagerAdditionalPromRules},
    },
    topology::oberservability::monitoring::AlertRule,
};

#[async_trait]
impl AlertRule<Prometheus> for AlertManagerRuleGroup {
    async fn install(&self, sender: &Prometheus) -> Result<Outcome, InterpretError> {
        sender.install_rule(&self).await
    }
    fn clone_box(&self) -> Box<dyn AlertRule<Prometheus>> {
        Box::new(self.clone())
    }
}

#[async_trait]
impl PrometheusRule for AlertManagerRuleGroup {
    fn name(&self) -> String {
        self.name.clone()
    }
    async fn configure_rule(&self) -> AlertManagerAdditionalPromRules {
        let mut additional_prom_rules = BTreeMap::new();

        additional_prom_rules.insert(
            self.name.clone(),
            AlertGroup {
                groups: vec![self.clone()],
            },
        );
        AlertManagerAdditionalPromRules {
            rules: additional_prom_rules,
        }
    }
}

impl AlertManagerRuleGroup {
    pub fn new(name: &str, rules: Vec<PrometheusAlertRule>) -> AlertManagerRuleGroup {
        AlertManagerRuleGroup {
            name: name.to_string().to_lowercase(),
            rules,
        }
    }
}

#[derive(Debug, Clone, Serialize)]
/// Logical group of alert rules.
/// Evaluates to:
/// name:
///   groups:
///     - name: name
///       rules: PrometheusAlertRule
pub struct AlertManagerRuleGroup {
    pub name: String,
    pub rules: Vec<PrometheusAlertRule>,
}

#[derive(Debug, Clone, Serialize)]
pub struct PrometheusAlertRule {
    pub alert: String,
    pub expr: String,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub r#for: Option<String>,
    pub labels: HashMap<String, String>,
    pub annotations: HashMap<String, String>,
}

impl PrometheusAlertRule {
    pub fn new(alert_name: &str, expr: &str) -> Self {
        Self {
            alert: alert_name.into(),
            expr: expr.into(),
            r#for: Some("1m".into()),
            labels: HashMap::new(),
            annotations: HashMap::new(),
        }
    }
    pub fn for_duration(mut self, duration: &str) -> Self {
        self.r#for = Some(duration.into());
        self
    }
    pub fn label(mut self, key: &str, value: &str) -> Self {
        self.labels.insert(key.into(), value.into());
        self
    }

    pub fn annotation(mut self, key: &str, value: &str) -> Self {
        self.annotations.insert(key.into(), value.into());
        self
    }
}
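As a rough illustration of what these Serialize derives produce: a sketch assuming serde_yaml is available as a dependency; the alert name and expression are placeholders.

use harmony::modules::monitoring::alert_rule::prometheus_alert_rule::{
    AlertManagerRuleGroup, PrometheusAlertRule,
};

fn main() {
    let group = AlertManagerRuleGroup::new(
        "pvc-alerts",
        vec![PrometheusAlertRule::new("ExampleAlert", "up == 0").for_duration("5m")],
    );

    // Prints roughly:
    //   name: pvc-alerts
    //   rules:
    //   - alert: ExampleAlert
    //     expr: up == 0
    //     for: 5m
    //     labels: {}
    //     annotations: {}
    // configure_rule() then nests this group under its own name
    // (pvc-alerts: { groups: [...] }) before the Helm chart score merges it
    // into additionalPrometheusRulesMap.
    println!("{}", serde_yaml::to_string(&group).unwrap());
}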
@@ -1,6 +1,9 @@
use serde::Serialize;

use crate::modules::monitoring::kube_prometheus::types::AlertManagerChannelConfig;
use crate::modules::monitoring::{
    alert_rule::prometheus_alert_rule::AlertManagerRuleGroup,
    kube_prometheus::types::{AlertManagerAdditionalPromRules, AlertManagerChannelConfig},
};

#[derive(Debug, Clone, Serialize)]
pub struct KubePrometheusConfig {
@@ -22,6 +25,7 @@ pub struct KubePrometheusConfig {
    pub kube_state_metrics: bool,
    pub prometheus_operator: bool,
    pub alert_receiver_configs: Vec<AlertManagerChannelConfig>,
    pub alert_rules: Vec<AlertManagerAdditionalPromRules>,
}
impl KubePrometheusConfig {
    pub fn new() -> Self {
@@ -44,6 +48,7 @@ impl KubePrometheusConfig {
            core_dns: false,
            kube_scheduler: false,
            alert_receiver_configs: vec![],
            alert_rules: vec![],
        }
    }
}
@@ -3,6 +3,7 @@ use log::debug;
use non_blank_string_rs::NonBlankString;
use serde_yaml::{Mapping, Value};
use std::{
    collections::BTreeMap,
    str::FromStr,
    sync::{Arc, Mutex},
};
@@ -10,7 +11,8 @@ use std::{
use crate::modules::{
    helm::chart::HelmChartScore,
    monitoring::kube_prometheus::types::{
        AlertManager, AlertManagerConfig, AlertManagerRoute, AlertManagerValues,
        AlertGroup, AlertManager, AlertManagerAdditionalPromRules, AlertManagerConfig,
        AlertManagerRoute, AlertManagerValues,
    },
};
@@ -18,15 +20,13 @@ pub fn kube_prometheus_helm_chart_score(
    config: Arc<Mutex<KubePrometheusConfig>>,
) -> HelmChartScore {
    let config = config.lock().unwrap();
    // TODO: this should be made into a rule with default formatting that can be easily passed as a vec
    // to the overrides or something; leaving the user to deal with formatting here seems bad

    let default_rules = config.default_rules.to_string();
    let windows_monitoring = config.windows_monitoring.to_string();
    let grafana = config.grafana.to_string();
    let kubernetes_service_monitors = config.kubernetes_service_monitors.to_string();
    let kubernetes_api_server = config.kubernetes_api_server.to_string();
    let kubelet = config.kubelet.to_string();
    let alert_manager = config.alert_manager.to_string();
    let kube_controller_manager = config.kube_controller_manager.to_string();
    let core_dns = config.core_dns.to_string();
    let kube_etcd = config.kube_etcd.to_string();
@@ -38,56 +38,6 @@
    let prometheus = config.prometheus.to_string();
    let mut values = format!(
        r#"
additionalPrometheusRulesMap:
  pods-status-alerts:
    groups:
      - name: pods
        rules:
          - alert: "[CRIT] POD not healthy"
            expr: min_over_time(sum by (namespace, pod) (kube_pod_status_phase{{phase=~"Pending|Unknown|Failed"}})[15m:1m]) > 0
            for: 0m
            labels:
              severity: critical
            annotations:
              title: "[CRIT] POD not healthy : {{{{ $labels.pod }}}}"
              description: |
                A POD is in a non-ready state!
                - **Pod**: {{{{ $labels.pod }}}}
                - **Namespace**: {{{{ $labels.namespace }}}}
          - alert: "[CRIT] POD crash looping"
            expr: increase(kube_pod_container_status_restarts_total[5m]) > 3
            for: 0m
            labels:
              severity: critical
            annotations:
              title: "[CRIT] POD crash looping : {{{{ $labels.pod }}}}"
              description: |
                A POD is drowning in a crash loop!
                - **Pod**: {{{{ $labels.pod }}}}
                - **Namespace**: {{{{ $labels.namespace }}}}
                - **Instance**: {{{{ $labels.instance }}}}
  pvc-alerts:
    groups:
      - name: pvc-alerts
        rules:
          - alert: 'PVC Fill Over 95 Percent In 2 Days'
            expr: |
              (
                kubelet_volume_stats_used_bytes
                /
                kubelet_volume_stats_capacity_bytes
              ) > 0.95
              AND
              predict_linear(kubelet_volume_stats_used_bytes[2d], 2 * 24 * 60 * 60)
              /
              kubelet_volume_stats_capacity_bytes
              > 0.95
            for: 1m
            labels:
              severity: warning
            annotations:
              description: The PVC {{{{ $labels.persistentvolumeclaim }}}} in namespace {{{{ $labels.namespace }}}} is predicted to fill over 95% in less than 2 days.
              title: PVC {{{{ $labels.persistentvolumeclaim }}}} in namespace {{{{ $labels.namespace }}}} will fill over 95% in less than 2 days
defaultRules:
  create: {default_rules}
  rules:
@@ -156,6 +106,7 @@ prometheus:
"#,
    );

    // add required null receiver for prometheus alert manager
    let mut null_receiver = Mapping::new();
    null_receiver.insert(
        Value::String("receiver".to_string()),
@@ -167,6 +118,7 @@ prometheus:
    );
    null_receiver.insert(Value::String("continue".to_string()), Value::Bool(true));

    // add alert channels
    let mut alert_manager_channel_config = AlertManagerConfig {
        global: Mapping::new(),
        route: AlertManagerRoute {
@@ -200,7 +152,38 @@ prometheus:
        serde_yaml::to_string(&alert_manager_values).expect("Failed to serialize YAML");
    debug!("serialized alert manager: \n {:#}", alert_manager_yaml);
    values.push_str(&alert_manager_yaml);

    // format alert manager additional rules for helm chart
    let mut merged_rules: BTreeMap<String, AlertGroup> = BTreeMap::new();

    for additional_rule in config.alert_rules.clone() {
        for (key, group) in additional_rule.rules {
            merged_rules.insert(key, group);
        }
    }

    let merged_rules = AlertManagerAdditionalPromRules {
        rules: merged_rules,
    };

    let mut alert_manager_additional_rules = serde_yaml::Mapping::new();
    let rules_value = serde_yaml::to_value(merged_rules).unwrap();

    alert_manager_additional_rules.insert(
        serde_yaml::Value::String("additionalPrometheusRulesMap".to_string()),
        rules_value,
    );

    let alert_manager_additional_rules_yaml =
        serde_yaml::to_string(&alert_manager_additional_rules).expect("Failed to serialize YAML");
    debug!(
        "alert_rules_yaml:\n{:#}",
        alert_manager_additional_rules_yaml
    );

    values.push_str(&alert_manager_additional_rules_yaml);
    debug!("full values.yaml: \n {:#}", values);

    HelmChartScore {
        namespace: Some(NonBlankString::from_str(&config.namespace).unwrap()),
        release_name: NonBlankString::from_str("kube-prometheus").unwrap(),
@@ -2,19 +2,19 @@ use std::sync::{Arc, Mutex};

use serde::Serialize;

use super::{helm::config::KubePrometheusConfig, prometheus::Prometheus};
use crate::{
    score::Score,
    topology::{
        HelmCommand, Topology,
        oberservability::monitoring::{AlertReceiver, AlertingInterpret},
        oberservability::monitoring::{AlertReceiver, AlertRule, AlertingInterpret},
    },
};

use super::{helm::config::KubePrometheusConfig, prometheus::Prometheus};

#[derive(Clone, Debug, Serialize)]
pub struct HelmPrometheusAlertingScore {
    pub receivers: Vec<Box<dyn AlertReceiver<Prometheus>>>,
    pub rules: Vec<Box<dyn AlertRule<Prometheus>>>,
}

impl<T: Topology + HelmCommand> Score<T> for HelmPrometheusAlertingScore {
@@ -24,24 +24,10 @@ impl<T: Topology + HelmCommand> Score<T> for HelmPrometheusAlertingScore {
                config: Arc::new(Mutex::new(KubePrometheusConfig::new())),
            },
            receivers: self.receivers.clone(),
            rules: self.rules.clone(),
        })
    }
    fn name(&self) -> String {
        "HelmPrometheusAlertingScore".to_string()
    }
}

impl Serialize for Box<dyn AlertReceiver<Prometheus>> {
    fn serialize<S>(&self, _serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        todo!()
    }
}

impl Clone for Box<dyn AlertReceiver<Prometheus>> {
    fn clone(&self) -> Self {
        self.clone_box()
    }
}
@@ -2,13 +2,17 @@ use std::sync::{Arc, Mutex};

use async_trait::async_trait;
use log::debug;
use serde::Serialize;

use crate::{
    interpret::{InterpretError, Outcome},
    inventory::Inventory,
    modules::monitoring::alert_rule::prometheus_alert_rule::AlertManagerRuleGroup,
    score,
    topology::{
        HelmCommand, Topology, installable::Installable, oberservability::monitoring::AlertSender,
        HelmCommand, Topology,
        installable::Installable,
        oberservability::monitoring::{AlertReceiver, AlertRule, AlertSender},
    },
};

@@ -18,7 +22,7 @@ use super::{
    helm::{
        config::KubePrometheusConfig, kube_prometheus_helm_chart::kube_prometheus_helm_chart_score,
    },
    types::AlertManagerChannelConfig,
    types::{AlertManagerAdditionalPromRules, AlertManagerChannelConfig},
};

#[async_trait]
@@ -35,7 +39,6 @@ impl<T: Topology + HelmCommand> Installable<T> for Prometheus {
        inventory: &Inventory,
        topology: &T,
    ) -> Result<(), InterpretError> {
        //install_prometheus
        self.install_prometheus(inventory, topology).await?;
        Ok(())
    }
@@ -67,6 +70,20 @@ impl Prometheus {
        )))
    }

    pub async fn install_rule(
        &self,
        prometheus_rule: &AlertManagerRuleGroup,
    ) -> Result<Outcome, InterpretError> {
        let prometheus_rule = prometheus_rule.configure_rule().await;
        let mut config = self.config.lock().unwrap();

        config.alert_rules.push(prometheus_rule.clone());
        Ok(Outcome::success(format!(
            "Successfully installed alert rule: {:#?},",
            prometheus_rule
        )))
    }

    pub async fn install_prometheus<T: Topology + HelmCommand + Send + Sync>(
        &self,
        inventory: &Inventory,
@@ -84,3 +101,39 @@ pub trait PrometheusReceiver: Send + Sync + std::fmt::Debug {
    fn name(&self) -> String;
    async fn configure_receiver(&self) -> AlertManagerChannelConfig;
}

impl Serialize for Box<dyn AlertReceiver<Prometheus>> {
    fn serialize<S>(&self, _serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        todo!()
    }
}

impl Clone for Box<dyn AlertReceiver<Prometheus>> {
    fn clone(&self) -> Self {
        self.clone_box()
    }
}

#[async_trait]
pub trait PrometheusRule: Send + Sync + std::fmt::Debug {
    fn name(&self) -> String;
    async fn configure_rule(&self) -> AlertManagerAdditionalPromRules;
}

impl Serialize for Box<dyn AlertRule<Prometheus>> {
    fn serialize<S>(&self, _serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        todo!()
    }
}

impl Clone for Box<dyn AlertRule<Prometheus>> {
    fn clone(&self) -> Self {
        self.clone_box()
    }
}
@@ -1,7 +1,11 @@
use std::collections::BTreeMap;

use async_trait::async_trait;
use serde::Serialize;
use serde_yaml::{Mapping, Sequence, Value};

use crate::modules::monitoring::alert_rule::prometheus_alert_rule::AlertManagerRuleGroup;

#[async_trait]
pub trait AlertChannelConfig {
    async fn get_config(&self) -> AlertManagerChannelConfig;
@@ -38,3 +42,14 @@ pub struct AlertManagerChannelConfig {
    pub channel_route: Value,
    pub channel_receiver: Value,
}

#[derive(Debug, Clone, Serialize)]
pub struct AlertManagerAdditionalPromRules {
    #[serde(flatten)]
    pub rules: BTreeMap<String, AlertGroup>,
}

#[derive(Debug, Clone, Serialize)]
pub struct AlertGroup {
    pub groups: Vec<AlertManagerRuleGroup>,
}
@@ -1,2 +1,3 @@
pub mod alert_channel;
pub mod alert_rule;
pub mod kube_prometheus;
harmony/src/modules/prometheus/alerts/infra/dell_server.rs (new file, +40)
@@ -0,0 +1,40 @@
use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;

pub fn global_storage_status_degraded_non_critical() -> PrometheusAlertRule {
    PrometheusAlertRule::new("GlobalStorageStatusNonCritical", "globalStorageStatus == 4")
        .for_duration("5m")
        .label("severity", "warning")
        .annotation(
            "description",
            "- **system**: {{ $labels.instance }}\n- **Status**: nonCritical\n- **Value**: {{ $value }}\n- **Job**: {{ $labels.job }}",
        )
        .annotation("title", " System storage status is in degraded state")
}

pub fn alert_global_storage_status_critical() -> PrometheusAlertRule {
    PrometheusAlertRule::new(
        "GlobalStorageStatus critical",
        "globalStorageStatus == 5",
    )
    .for_duration("5m")
    .label("severity", "warning")
    .annotation("title", "System storage status is critical at {{ $labels.instance }}")
    .annotation(
        "description",
        "- **System**: {{ $labels.instance }}\n- **Status**: Critical\n- **Value**: {{ $value }}\n- **Job**: {{ $labels.job }}",
    )
}

pub fn alert_global_storage_status_non_recoverable() -> PrometheusAlertRule {
    PrometheusAlertRule::new(
        "GlobalStorageStatus nonRecoverable",
        "globalStorageStatus == 6",
    )
    .for_duration("5m")
    .label("severity", "warning")
    .annotation("title", "System storage status is nonRecoverable at {{ $labels.instance }}")
    .annotation(
        "description",
        "- **System**: {{ $labels.instance }}\n- **Status**: nonRecoverable\n- **Value**: {{ $value }}\n- **Job**: {{ $labels.job }}",
    )
}
harmony/src/modules/prometheus/alerts/infra/mod.rs (new file, +1)
@@ -0,0 +1 @@
pub mod dell_server;

harmony/src/modules/prometheus/alerts/k8s/mod.rs (new file, +1)
@@ -0,0 +1 @@
pub mod pvc;
harmony/src/modules/prometheus/alerts/k8s/pvc.rs (new file, +11)
@@ -0,0 +1,11 @@
use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;

pub fn high_pvc_fill_rate_over_two_days() -> PrometheusAlertRule {
    PrometheusAlertRule::new(
        "PVC Fill Over 95 Percent In 2 Days",
        "(kubelet_volume_stats_used_bytes/kubelet_volume_stats_capacity_bytes) > 0.95 AND predict_linear(kubelet_volume_stats_used_bytes[2d], 2 * 24 * 60 * 60)/kubelet_volume_stats_capacity_bytes > 0.95",)
    .for_duration("1m")
    .label("severity", "warning")
    .annotation("summary", "The PVC {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} is predicted to fill over 95% in less than 2 days.")
    .annotation("description", "PVC {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} will fill over 95% in less than 2 days",)
}
harmony/src/modules/prometheus/alerts/mod.rs (new file, +2)
@@ -0,0 +1,2 @@
pub mod infra;
pub mod k8s;
harmony/src/modules/prometheus/mod.rs (new file, +1)
@@ -0,0 +1 @@
pub mod alerts;