Merge pull request 'feat: added alert rule and impl for prometheus as well as a few preconfigured bmc alerts for dell server that are used in the monitoring example' (#67) from feat/alert_rules into master

Reviewed-on: https://git.nationtech.io/NationTech/harmony/pulls/67
This commit is contained in:
wjro 2025-06-26 13:16:38 +00:00
commit 29e74a2712
18 changed files with 320 additions and 85 deletions

View File

@ -43,14 +43,14 @@ async fn main() {
// K8sAnywhereTopology as it is the most automatic one that enables you to easily deploy
// locally, to development environment from a CI, to staging, and to production with settings
// that automatically adapt to each environment grade.
let maestro = Maestro::<K8sAnywhereTopology>::initialize(
let mut maestro = Maestro::<K8sAnywhereTopology>::initialize(
Inventory::autoload(),
K8sAnywhereTopology::from_env(),
)
.await
.unwrap();
// maestro.register_all(vec![Box::new(lamp_stack)]);
maestro.register_all(vec![Box::new(lamp_stack)]);
// Here we bootstrap the CLI, this gives some nice features if you need them
harmony_cli::init(maestro, None).await.unwrap();
}

View File

@ -1,9 +1,19 @@
use harmony::{
inventory::Inventory,
maestro::Maestro,
modules::monitoring::{
alert_channel::discord_alert_channel::DiscordWebhook,
kube_prometheus::helm_prometheus_alert_score::HelmPrometheusAlertingScore,
modules::{
monitoring::{
alert_channel::discord_alert_channel::DiscordWebhook,
alert_rule::prometheus_alert_rule::AlertManagerRuleGroup,
kube_prometheus::helm_prometheus_alert_score::HelmPrometheusAlertingScore,
},
prometheus::alerts::{
infra::dell_server::{
alert_global_storage_status_critical, alert_global_storage_status_non_recoverable,
global_storage_status_degraded_non_critical,
},
k8s::pvc::high_pvc_fill_rate_over_two_days,
},
},
topology::{K8sAnywhereTopology, Url},
};
@ -12,10 +22,28 @@ use harmony::{
async fn main() {
let discord_receiver = DiscordWebhook {
name: "test-discord".to_string(),
url: Url::Url(url::Url::parse("discord.doesnt.exist.com").unwrap()),
url: Url::Url(url::Url::parse("https://discord.doesnt.exist.com").unwrap()),
};
let high_pvc_fill_rate_over_two_days_alert = high_pvc_fill_rate_over_two_days();
let dell_system_storage_degraded = global_storage_status_degraded_non_critical();
let alert_global_storage_status_critical = alert_global_storage_status_critical();
let alert_global_storage_status_non_recoverable = alert_global_storage_status_non_recoverable();
let additional_rules =
AlertManagerRuleGroup::new("pvc-alerts", vec![high_pvc_fill_rate_over_two_days_alert]);
let additional_rules2 = AlertManagerRuleGroup::new(
"dell-server-alerts",
vec![
dell_system_storage_degraded,
alert_global_storage_status_critical,
alert_global_storage_status_non_recoverable,
],
);
let alerting_score = HelmPrometheusAlertingScore {
receivers: vec![Box::new(discord_receiver)],
rules: vec![Box::new(additional_rules), Box::new(additional_rules2)],
};
let mut maestro = Maestro::<K8sAnywhereTopology>::initialize(
Inventory::autoload(),

View File

@ -1,10 +1,11 @@
use async_trait::async_trait;
use log::debug;
use crate::{
data::{Id, Version},
interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome},
inventory::Inventory,
topology::{HelmCommand, Topology, installable::Installable},
topology::{Topology, installable::Installable},
};
#[async_trait]
@ -16,6 +17,7 @@ pub trait AlertSender: Send + Sync + std::fmt::Debug {
/// Interpret state pairing an alert sender with the receivers and rules to install on it.
pub struct AlertingInterpret<S: AlertSender> {
/// Sender that the receivers and rules are installed against.
pub sender: S,
/// Receivers whose `install` is called with `sender`.
pub receivers: Vec<Box<dyn AlertReceiver<S>>>,
/// Rules whose `install` is called with `sender`.
pub rules: Vec<Box<dyn AlertRule<S>>>,
}
#[async_trait]
@ -28,6 +30,10 @@ impl<S: AlertSender + Installable<T>, T: Topology> Interpret<T> for AlertingInte
for receiver in self.receivers.iter() {
receiver.install(&self.sender).await?;
}
for rule in self.rules.iter() {
debug!("installing rule: {:#?}", rule);
rule.install(&self.sender).await?;
}
self.sender.ensure_installed(inventory, topology).await?;
Ok(Outcome::success(format!(
"successfully installed alert sender {}",
@ -59,8 +65,9 @@ pub trait AlertReceiver<S: AlertSender>: std::fmt::Debug + Send + Sync {
}
#[async_trait]
pub trait AlertRule<S: AlertSender> {
async fn install(&self, sender: &S) -> Result<(), InterpretError>;
pub trait AlertRule<S: AlertSender>: std::fmt::Debug + Send + Sync {
async fn install(&self, sender: &S) -> Result<Outcome, InterpretError>;
fn clone_box(&self) -> Box<dyn AlertRule<S>>;
}
#[async_trait]

View File

@ -12,5 +12,6 @@ pub mod load_balancer;
pub mod monitoring;
pub mod okd;
pub mod opnsense;
pub mod prometheus;
pub mod tenant;
pub mod tftp;

View File

@ -0,0 +1 @@
pub mod prometheus_alert_rule;

View File

@ -0,0 +1,99 @@
use std::collections::{BTreeMap, HashMap};
use async_trait::async_trait;
use serde::Serialize;
use crate::{
interpret::{InterpretError, Outcome},
modules::monitoring::kube_prometheus::{
prometheus::{Prometheus, PrometheusRule},
types::{AlertGroup, AlertManagerAdditionalPromRules},
},
topology::oberservability::monitoring::AlertRule,
};
// Lets an `AlertManagerRuleGroup` act as an alert rule for a `Prometheus` sender.
#[async_trait]
impl AlertRule<Prometheus> for AlertManagerRuleGroup {
    /// Hands this rule group to `sender.install_rule` and returns its outcome.
    async fn install(&self, sender: &Prometheus) -> Result<Outcome, InterpretError> {
        // `self` is already a reference; `&self` would only add a needless `&&Self`.
        sender.install_rule(self).await
    }

    /// Returns a boxed copy of this rule group as an `AlertRule<Prometheus>` trait object.
    fn clone_box(&self) -> Box<dyn AlertRule<Prometheus>> {
        Box::new(self.clone())
    }
}
// Renders an `AlertManagerRuleGroup` into the additional-rules structure.
#[async_trait]
impl PrometheusRule for AlertManagerRuleGroup {
    /// Returns an owned copy of the group's name.
    fn name(&self) -> String {
        self.name.to_owned()
    }

    /// Builds a one-entry rules map: the group's name maps to an `AlertGroup`
    /// that contains a clone of this group.
    async fn configure_rule(&self) -> AlertManagerAdditionalPromRules {
        let group = AlertGroup {
            groups: vec![self.clone()],
        };
        AlertManagerAdditionalPromRules {
            rules: BTreeMap::from([(self.name.clone(), group)]),
        }
    }
}
impl AlertManagerRuleGroup {
    /// Creates a rule group from `name` and `rules`.
    ///
    /// The name is stored lowercased.
    pub fn new(name: &str, rules: Vec<PrometheusAlertRule>) -> AlertManagerRuleGroup {
        AlertManagerRuleGroup {
            // `str::to_lowercase` already returns an owned `String`; no `to_string()` needed first.
            name: name.to_lowercase(),
            rules,
        }
    }
}
#[derive(Debug, Clone, Serialize)]
/// A named, logical group of Prometheus alert rules.
///
/// `configure_rule` wraps the group in an `AlertGroup` keyed by the group's
/// own name, so the serialized additional-rules entry has this shape:
///
/// ```yaml
/// name:
///   groups:
///     - name: name
///       rules: [PrometheusAlertRule, ...]
/// ```
pub struct AlertManagerRuleGroup {
/// Group name; `AlertManagerRuleGroup::new` stores it lowercased.
pub name: String,
/// Alert rules that belong to this group.
pub rules: Vec<PrometheusAlertRule>,
}
#[derive(Debug, Clone, Serialize)]
/// A single alert rule entry, serialized field by field inside a rule group.
pub struct PrometheusAlertRule {
/// Name of the alert.
pub alert: String,
/// Expression evaluated for the alert (e.g. `globalStorageStatus == 4`).
pub expr: String,
/// Optional `for` duration (e.g. `"5m"`); left out of the serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub r#for: Option<String>,
/// Labels attached to the alert, e.g. `severity`.
pub labels: HashMap<String, String>,
/// Annotations attached to the alert, e.g. `title` and `description`.
pub annotations: HashMap<String, String>,
}
impl PrometheusAlertRule {
    /// Creates a rule named `alert_name` that evaluates `expr`.
    ///
    /// The `for` duration starts at `"1m"`; labels and annotations start empty.
    pub fn new(alert_name: &str, expr: &str) -> Self {
        let alert = alert_name.to_owned();
        let expr = expr.to_owned();
        Self {
            alert,
            expr,
            r#for: Some(String::from("1m")),
            labels: HashMap::default(),
            annotations: HashMap::default(),
        }
    }

    /// Returns the rule with its `for` duration replaced by `duration`.
    pub fn for_duration(self, duration: &str) -> Self {
        Self {
            r#for: Some(duration.to_owned()),
            ..self
        }
    }

    /// Returns the rule with label `key` set to `value`, replacing any previous value.
    pub fn label(mut self, key: &str, value: &str) -> Self {
        let (key, value) = (key.to_owned(), value.to_owned());
        self.labels.insert(key, value);
        self
    }

    /// Returns the rule with annotation `key` set to `value`, replacing any previous value.
    pub fn annotation(mut self, key: &str, value: &str) -> Self {
        let (key, value) = (key.to_owned(), value.to_owned());
        self.annotations.insert(key, value);
        self
    }
}

View File

@ -1,6 +1,9 @@
use serde::Serialize;
use crate::modules::monitoring::kube_prometheus::types::AlertManagerChannelConfig;
use crate::modules::monitoring::{
alert_rule::prometheus_alert_rule::AlertManagerRuleGroup,
kube_prometheus::types::{AlertManagerAdditionalPromRules, AlertManagerChannelConfig},
};
#[derive(Debug, Clone, Serialize)]
pub struct KubePrometheusConfig {
@ -22,6 +25,7 @@ pub struct KubePrometheusConfig {
pub kube_state_metrics: bool,
pub prometheus_operator: bool,
pub alert_receiver_configs: Vec<AlertManagerChannelConfig>,
pub alert_rules: Vec<AlertManagerAdditionalPromRules>,
}
impl KubePrometheusConfig {
pub fn new() -> Self {
@ -44,6 +48,7 @@ impl KubePrometheusConfig {
core_dns: false,
kube_scheduler: false,
alert_receiver_configs: vec![],
alert_rules: vec![],
}
}
}

View File

@ -3,6 +3,7 @@ use log::debug;
use non_blank_string_rs::NonBlankString;
use serde_yaml::{Mapping, Value};
use std::{
collections::BTreeMap,
str::FromStr,
sync::{Arc, Mutex},
};
@ -10,7 +11,8 @@ use std::{
use crate::modules::{
helm::chart::HelmChartScore,
monitoring::kube_prometheus::types::{
AlertManager, AlertManagerConfig, AlertManagerRoute, AlertManagerValues,
AlertGroup, AlertManager, AlertManagerAdditionalPromRules, AlertManagerConfig,
AlertManagerRoute, AlertManagerValues,
},
};
@ -18,15 +20,13 @@ pub fn kube_prometheus_helm_chart_score(
config: Arc<Mutex<KubePrometheusConfig>>,
) -> HelmChartScore {
let config = config.lock().unwrap();
//TODO this should be make into a rule with default formatting that can be easily passed as a vec
//to the overrides or something leaving the user to deal with formatting here seems bad
let default_rules = config.default_rules.to_string();
let windows_monitoring = config.windows_monitoring.to_string();
let grafana = config.grafana.to_string();
let kubernetes_service_monitors = config.kubernetes_service_monitors.to_string();
let kubernetes_api_server = config.kubernetes_api_server.to_string();
let kubelet = config.kubelet.to_string();
let alert_manager = config.alert_manager.to_string();
let kube_controller_manager = config.kube_controller_manager.to_string();
let core_dns = config.core_dns.to_string();
let kube_etcd = config.kube_etcd.to_string();
@ -38,56 +38,6 @@ pub fn kube_prometheus_helm_chart_score(
let prometheus = config.prometheus.to_string();
let mut values = format!(
r#"
additionalPrometheusRulesMap:
pods-status-alerts:
groups:
- name: pods
rules:
- alert: "[CRIT] POD not healthy"
expr: min_over_time(sum by (namespace, pod) (kube_pod_status_phase{{phase=~"Pending|Unknown|Failed"}})[15m:1m]) > 0
for: 0m
labels:
severity: critical
annotations:
title: "[CRIT] POD not healthy : {{{{ $labels.pod }}}}"
description: |
A POD is in a non-ready state!
- **Pod**: {{{{ $labels.pod }}}}
- **Namespace**: {{{{ $labels.namespace }}}}
- alert: "[CRIT] POD crash looping"
expr: increase(kube_pod_container_status_restarts_total[5m]) > 3
for: 0m
labels:
severity: critical
annotations:
title: "[CRIT] POD crash looping : {{{{ $labels.pod }}}}"
description: |
A POD is drowning in a crash loop!
- **Pod**: {{{{ $labels.pod }}}}
- **Namespace**: {{{{ $labels.namespace }}}}
- **Instance**: {{{{ $labels.instance }}}}
pvc-alerts:
groups:
- name: pvc-alerts
rules:
- alert: 'PVC Fill Over 95 Percent In 2 Days'
expr: |
(
kubelet_volume_stats_used_bytes
/
kubelet_volume_stats_capacity_bytes
) > 0.95
AND
predict_linear(kubelet_volume_stats_used_bytes[2d], 2 * 24 * 60 * 60)
/
kubelet_volume_stats_capacity_bytes
> 0.95
for: 1m
labels:
severity: warning
annotations:
description: The PVC {{{{ $labels.persistentvolumeclaim }}}} in namespace {{{{ $labels.namespace }}}} is predicted to fill over 95% in less than 2 days.
title: PVC {{{{ $labels.persistentvolumeclaim }}}} in namespace {{{{ $labels.namespace }}}} will fill over 95% in less than 2 days
defaultRules:
create: {default_rules}
rules:
@ -156,6 +106,7 @@ prometheus:
"#,
);
// add required null receiver for prometheus alert manager
let mut null_receiver = Mapping::new();
null_receiver.insert(
Value::String("receiver".to_string()),
@ -167,6 +118,7 @@ prometheus:
);
null_receiver.insert(Value::String("continue".to_string()), Value::Bool(true));
//add alert channels
let mut alert_manager_channel_config = AlertManagerConfig {
global: Mapping::new(),
route: AlertManagerRoute {
@ -200,7 +152,38 @@ prometheus:
serde_yaml::to_string(&alert_manager_values).expect("Failed to serialize YAML");
debug!("serialized alert manager: \n {:#}", alert_manager_yaml);
values.push_str(&alert_manager_yaml);
//format alert manager additional rules for helm chart
let mut merged_rules: BTreeMap<String, AlertGroup> = BTreeMap::new();
for additional_rule in config.alert_rules.clone() {
for (key, group) in additional_rule.rules {
merged_rules.insert(key, group);
}
}
let merged_rules = AlertManagerAdditionalPromRules {
rules: merged_rules,
};
let mut alert_manager_additional_rules = serde_yaml::Mapping::new();
let rules_value = serde_yaml::to_value(merged_rules).unwrap();
alert_manager_additional_rules.insert(
serde_yaml::Value::String("additionalPrometheusRulesMap".to_string()),
rules_value,
);
let alert_manager_additional_rules_yaml =
serde_yaml::to_string(&alert_manager_additional_rules).expect("Failed to serialize YAML");
debug!(
"alert_rules_yaml:\n{:#}",
alert_manager_additional_rules_yaml
);
values.push_str(&alert_manager_additional_rules_yaml);
debug!("full values.yaml: \n {:#}", values);
HelmChartScore {
namespace: Some(NonBlankString::from_str(&config.namespace).unwrap()),
release_name: NonBlankString::from_str("kube-prometheus").unwrap(),

View File

@ -2,19 +2,19 @@ use std::sync::{Arc, Mutex};
use serde::Serialize;
use super::{helm::config::KubePrometheusConfig, prometheus::Prometheus};
use crate::{
score::Score,
topology::{
HelmCommand, Topology,
oberservability::monitoring::{AlertReceiver, AlertingInterpret},
oberservability::monitoring::{AlertReceiver, AlertRule, AlertingInterpret},
},
};
use super::{helm::config::KubePrometheusConfig, prometheus::Prometheus};
#[derive(Clone, Debug, Serialize)]
/// Score that bundles the alert receivers and alert rules to set up for a `Prometheus` sender.
pub struct HelmPrometheusAlertingScore {
/// Alert receivers handed (as clones) to the interpret created from this score.
pub receivers: Vec<Box<dyn AlertReceiver<Prometheus>>>,
/// Alert rules handed (as clones) to the interpret created from this score.
pub rules: Vec<Box<dyn AlertRule<Prometheus>>>,
}
impl<T: Topology + HelmCommand> Score<T> for HelmPrometheusAlertingScore {
@ -24,24 +24,10 @@ impl<T: Topology + HelmCommand> Score<T> for HelmPrometheusAlertingScore {
config: Arc::new(Mutex::new(KubePrometheusConfig::new())),
},
receivers: self.receivers.clone(),
rules: self.rules.clone(),
})
}
fn name(&self) -> String {
"HelmPrometheusAlertingScore".to_string()
}
}
impl Serialize for Box<dyn AlertReceiver<Prometheus>> {
fn serialize<S>(&self, _serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
todo!()
}
}
impl Clone for Box<dyn AlertReceiver<Prometheus>> {
fn clone(&self) -> Self {
self.clone_box()
}
}

View File

@ -2,13 +2,17 @@ use std::sync::{Arc, Mutex};
use async_trait::async_trait;
use log::debug;
use serde::Serialize;
use crate::{
interpret::{InterpretError, Outcome},
inventory::Inventory,
modules::monitoring::alert_rule::prometheus_alert_rule::AlertManagerRuleGroup,
score,
topology::{
HelmCommand, Topology, installable::Installable, oberservability::monitoring::AlertSender,
HelmCommand, Topology,
installable::Installable,
oberservability::monitoring::{AlertReceiver, AlertRule, AlertSender},
},
};
@ -18,7 +22,7 @@ use super::{
helm::{
config::KubePrometheusConfig, kube_prometheus_helm_chart::kube_prometheus_helm_chart_score,
},
types::AlertManagerChannelConfig,
types::{AlertManagerAdditionalPromRules, AlertManagerChannelConfig},
};
#[async_trait]
@ -35,7 +39,6 @@ impl<T: Topology + HelmCommand> Installable<T> for Prometheus {
inventory: &Inventory,
topology: &T,
) -> Result<(), InterpretError> {
//install_prometheus
self.install_prometheus(inventory, topology).await?;
Ok(())
}
@ -67,6 +70,20 @@ impl Prometheus {
)))
}
/// Registers an alert rule group in this Prometheus instance's configuration.
///
/// The group is rendered with `configure_rule` and appended to `alert_rules`
/// in the shared config; nothing else is done with it in this method.
///
/// # Panics
///
/// Panics if the config mutex is poisoned.
pub async fn install_rule(
    &self,
    prometheus_rule: &AlertManagerRuleGroup,
) -> Result<Outcome, InterpretError> {
    let prometheus_rule = prometheus_rule.configure_rule().await;
    // Render the message before moving the rule into the config, so no clone is needed.
    let message = format!("Successfully installed alert rule: {:#?}", prometheus_rule);
    // The lock is taken after the `.await` above and released at the end of this statement.
    self.config.lock().unwrap().alert_rules.push(prometheus_rule);
    Ok(Outcome::success(message))
}
pub async fn install_prometheus<T: Topology + HelmCommand + Send + Sync>(
&self,
inventory: &Inventory,
@ -84,3 +101,39 @@ pub trait PrometheusReceiver: Send + Sync + std::fmt::Debug {
fn name(&self) -> String;
async fn configure_receiver(&self) -> AlertManagerChannelConfig;
}
// Serialization for boxed alert receivers, needed by types that hold them and derive `Serialize`.
impl Serialize for Box<dyn AlertReceiver<Prometheus>> {
    /// Serializes the receiver as a string holding its `Debug` representation.
    ///
    /// The trait object has no structured `Serialize` form, but `AlertReceiver`
    /// requires `Debug`, so this emits that text instead of panicking through `todo!()`.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        serializer.serialize_str(&format!("{:?}", self))
    }
}
// Cloning a boxed alert receiver is delegated to its `clone_box` method.
impl Clone for Box<dyn AlertReceiver<Prometheus>> {
fn clone(&self) -> Self {
self.clone_box()
}
}
/// A rule that can be rendered into the additional-rules structure used for Prometheus.
#[async_trait]
pub trait PrometheusRule: Send + Sync + std::fmt::Debug {
/// Returns the rule's name.
fn name(&self) -> String;
/// Builds the `AlertManagerAdditionalPromRules` value for this rule.
async fn configure_rule(&self) -> AlertManagerAdditionalPromRules;
}
// Serialization for boxed alert rules, needed by types that hold them and derive `Serialize`.
impl Serialize for Box<dyn AlertRule<Prometheus>> {
    /// Serializes the rule as a string holding its `Debug` representation.
    ///
    /// The trait object has no structured `Serialize` form, but `AlertRule`
    /// requires `Debug`, so this emits that text instead of panicking through `todo!()`.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        serializer.serialize_str(&format!("{:?}", self))
    }
}
// Cloning a boxed alert rule is delegated to its `clone_box` method.
impl Clone for Box<dyn AlertRule<Prometheus>> {
fn clone(&self) -> Self {
self.clone_box()
}
}

View File

@ -1,7 +1,11 @@
use std::collections::BTreeMap;
use async_trait::async_trait;
use serde::Serialize;
use serde_yaml::{Mapping, Sequence, Value};
use crate::modules::monitoring::alert_rule::prometheus_alert_rule::AlertManagerRuleGroup;
#[async_trait]
pub trait AlertChannelConfig {
async fn get_config(&self) -> AlertManagerChannelConfig;
@ -38,3 +42,14 @@ pub struct AlertManagerChannelConfig {
pub channel_route: Value,
pub channel_receiver: Value,
}
#[derive(Debug, Clone, Serialize)]
/// Additional Prometheus rules, stored as a map from an entry name to its `AlertGroup`.
pub struct AlertManagerAdditionalPromRules {
/// Map entries; `#[serde(flatten)]` serializes them directly as this struct's own keys
/// instead of nesting them under a `rules` key.
#[serde(flatten)]
pub rules: BTreeMap<String, AlertGroup>,
}
#[derive(Debug, Clone, Serialize)]
/// Value of one additional-rules entry: the rule groups serialized under `groups`.
pub struct AlertGroup {
/// Rule groups contained in this entry.
pub groups: Vec<AlertManagerRuleGroup>,
}

View File

@ -1,2 +1,3 @@
pub mod alert_channel;
pub mod alert_rule;
pub mod kube_prometheus;

View File

@ -0,0 +1,40 @@
use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;
/// Builds the warning alert for a degraded (nonCritical) global storage status.
///
/// The rule matches `globalStorageStatus == 4`, must hold for `5m`, and is
/// labelled `severity: warning`.
pub fn global_storage_status_degraded_non_critical() -> PrometheusAlertRule {
    PrometheusAlertRule::new("GlobalStorageStatusNonCritical", "globalStorageStatus == 4")
        .for_duration("5m")
        .label("severity", "warning")
        // No leading space in the title, so it renders cleanly in notifications.
        .annotation("title", "System storage status is in degraded state")
        .annotation(
            "description",
            "- **System**: {{ $labels.instance }}\n- **Status**: nonCritical\n- **Value**: {{ $value }}\n- **Job**: {{ $labels.job }}",
        )
}
/// Builds the alert for a critical global storage status.
///
/// The rule matches `globalStorageStatus == 5` and must hold for `5m`.
pub fn alert_global_storage_status_critical() -> PrometheusAlertRule {
    PrometheusAlertRule::new(
        "GlobalStorageStatus critical",
        "globalStorageStatus == 5",
    )
    .for_duration("5m")
    // A critical storage status is labelled `critical`, so it is distinguishable from the
    // degraded (nonCritical) alert, which uses `warning`.
    .label("severity", "critical")
    .annotation("title", "System storage status is critical at {{ $labels.instance }}")
    .annotation(
        "description",
        "- **System**: {{ $labels.instance }}\n- **Status**: Critical\n- **Value**: {{ $value }}\n- **Job**: {{ $labels.job }}",
    )
}
/// Builds the alert for a nonRecoverable global storage status.
///
/// The rule matches `globalStorageStatus == 6` and must hold for `5m`.
pub fn alert_global_storage_status_non_recoverable() -> PrometheusAlertRule {
    PrometheusAlertRule::new(
        "GlobalStorageStatus nonRecoverable",
        "globalStorageStatus == 6",
    )
    .for_duration("5m")
    // A nonRecoverable status is at least as severe as a critical one, so it is labelled
    // `critical` rather than sharing `warning` with the degraded alert.
    .label("severity", "critical")
    .annotation("title", "System storage status is nonRecoverable at {{ $labels.instance }}")
    .annotation(
        "description",
        "- **System**: {{ $labels.instance }}\n- **Status**: nonRecoverable\n- **Value**: {{ $value }}\n- **Job**: {{ $labels.job }}",
    )
}

View File

@ -0,0 +1 @@
pub mod dell_server;

View File

@ -0,0 +1 @@
pub mod pvc;

View File

@ -0,0 +1,11 @@
use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;
/// Builds the warning alert for a PVC that is over 95% full and predicted to stay above
/// 95% two days from now.
///
/// The rule must hold for `1m` and is labelled `severity: warning`. It carries `title` and
/// `description` annotations, the same keys the other preconfigured alerts use.
pub fn high_pvc_fill_rate_over_two_days() -> PrometheusAlertRule {
    PrometheusAlertRule::new(
        "PVC Fill Over 95 Percent In 2 Days",
        "(kubelet_volume_stats_used_bytes/kubelet_volume_stats_capacity_bytes) > 0.95 AND predict_linear(kubelet_volume_stats_used_bytes[2d], 2 * 24 * 60 * 60)/kubelet_volume_stats_capacity_bytes > 0.95",
    )
    .for_duration("1m")
    .label("severity", "warning")
    .annotation(
        "title",
        "PVC {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} will fill over 95% in less than 2 days",
    )
    .annotation(
        "description",
        "The PVC {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} is predicted to fill over 95% in less than 2 days.",
    )
}

View File

@ -0,0 +1,2 @@
pub mod infra;
pub mod k8s;

View File

@ -0,0 +1 @@
pub mod alerts;