Merge pull request 'feat: add alert rule support and a Prometheus implementation, plus a few preconfigured BMC alerts for Dell servers used in the monitoring example' (#67) from feat/alert_rules into master
Reviewed-on: https://git.nationtech.io/NationTech/harmony/pulls/67
Commit: 29e74a2712
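For orientation before the diff: a minimal, hedged sketch of how the new pieces are meant to be combined, condensed from the example and module changes below. It is not a file in this PR, and the rule name and PromQL expression are placeholders.

use harmony::modules::monitoring::{
    alert_rule::prometheus_alert_rule::{AlertManagerRuleGroup, PrometheusAlertRule},
    kube_prometheus::helm_prometheus_alert_score::HelmPrometheusAlertingScore,
};

fn custom_alerting_score() -> HelmPrometheusAlertingScore {
    // Illustrative rule: the alert name and expression are placeholders.
    let node_down = PrometheusAlertRule::new("NodeDown", "up{job=\"node-exporter\"} == 0")
        .for_duration("5m")
        .label("severity", "critical")
        .annotation("title", "Node exporter target is down");

    // Each group becomes one entry under additionalPrometheusRulesMap in the
    // generated kube-prometheus Helm values.
    let group = AlertManagerRuleGroup::new("custom-alerts", vec![node_down]);

    HelmPrometheusAlertingScore {
        receivers: vec![],
        rules: vec![Box::new(group)],
    }
}

The returned score can then be registered on a Maestro instance, exactly as the monitoring example below does with its pvc-alerts and dell-server-alerts groups.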
@@ -43,14 +43,14 @@ async fn main() {
    // K8sAnywhereTopology as it is the most automatic one that enables you to easily deploy
    // locally, to development environment from a CI, to staging, and to production with settings
    // that automatically adapt to each environment grade.
    let maestro = Maestro::<K8sAnywhereTopology>::initialize(
    let mut maestro = Maestro::<K8sAnywhereTopology>::initialize(
        Inventory::autoload(),
        K8sAnywhereTopology::from_env(),
    )
    .await
    .unwrap();

    // maestro.register_all(vec![Box::new(lamp_stack)]);
    maestro.register_all(vec![Box::new(lamp_stack)]);
    // Here we bootstrap the CLI, this gives some nice features if you need them
    harmony_cli::init(maestro, None).await.unwrap();
}
@@ -1,9 +1,19 @@
use harmony::{
    inventory::Inventory,
    maestro::Maestro,
    modules::monitoring::{
        alert_channel::discord_alert_channel::DiscordWebhook,
        kube_prometheus::helm_prometheus_alert_score::HelmPrometheusAlertingScore,
    modules::{
        monitoring::{
            alert_channel::discord_alert_channel::DiscordWebhook,
            alert_rule::prometheus_alert_rule::AlertManagerRuleGroup,
            kube_prometheus::helm_prometheus_alert_score::HelmPrometheusAlertingScore,
        },
        prometheus::alerts::{
            infra::dell_server::{
                alert_global_storage_status_critical, alert_global_storage_status_non_recoverable,
                global_storage_status_degraded_non_critical,
            },
            k8s::pvc::high_pvc_fill_rate_over_two_days,
        },
    },
    topology::{K8sAnywhereTopology, Url},
};
@@ -12,10 +22,28 @@ use harmony::{
async fn main() {
    let discord_receiver = DiscordWebhook {
        name: "test-discord".to_string(),
        url: Url::Url(url::Url::parse("discord.doesnt.exist.com").unwrap()),
        url: Url::Url(url::Url::parse("https://discord.doesnt.exist.com").unwrap()),
    };

    let high_pvc_fill_rate_over_two_days_alert = high_pvc_fill_rate_over_two_days();
    let dell_system_storage_degraded = global_storage_status_degraded_non_critical();
    let alert_global_storage_status_critical = alert_global_storage_status_critical();
    let alert_global_storage_status_non_recoverable = alert_global_storage_status_non_recoverable();

    let additional_rules =
        AlertManagerRuleGroup::new("pvc-alerts", vec![high_pvc_fill_rate_over_two_days_alert]);
    let additional_rules2 = AlertManagerRuleGroup::new(
        "dell-server-alerts",
        vec![
            dell_system_storage_degraded,
            alert_global_storage_status_critical,
            alert_global_storage_status_non_recoverable,
        ],
    );

    let alerting_score = HelmPrometheusAlertingScore {
        receivers: vec![Box::new(discord_receiver)],
        rules: vec![Box::new(additional_rules), Box::new(additional_rules2)],
    };
    let mut maestro = Maestro::<K8sAnywhereTopology>::initialize(
        Inventory::autoload(),
@@ -1,10 +1,11 @@
use async_trait::async_trait;
use log::debug;

use crate::{
    data::{Id, Version},
    interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome},
    inventory::Inventory,
    topology::{HelmCommand, Topology, installable::Installable},
    topology::{Topology, installable::Installable},
};

#[async_trait]
@@ -16,6 +17,7 @@ pub trait AlertSender: Send + Sync + std::fmt::Debug {
pub struct AlertingInterpret<S: AlertSender> {
    pub sender: S,
    pub receivers: Vec<Box<dyn AlertReceiver<S>>>,
    pub rules: Vec<Box<dyn AlertRule<S>>>,
}

#[async_trait]
@@ -28,6 +30,10 @@ impl<S: AlertSender + Installable<T>, T: Topology> Interpret<T> for AlertingInte
        for receiver in self.receivers.iter() {
            receiver.install(&self.sender).await?;
        }
        for rule in self.rules.iter() {
            debug!("installing rule: {:#?}", rule);
            rule.install(&self.sender).await?;
        }
        self.sender.ensure_installed(inventory, topology).await?;
        Ok(Outcome::success(format!(
            "successfully installed alert sender {}",
@@ -59,8 +65,9 @@ pub trait AlertReceiver<S: AlertSender>: std::fmt::Debug + Send + Sync {
}

#[async_trait]
pub trait AlertRule<S: AlertSender> {
    async fn install(&self, sender: &S) -> Result<(), InterpretError>;
pub trait AlertRule<S: AlertSender>: std::fmt::Debug + Send + Sync {
    async fn install(&self, sender: &S) -> Result<Outcome, InterpretError>;
    fn clone_box(&self) -> Box<dyn AlertRule<S>>;
}

#[async_trait]
@@ -12,5 +12,6 @@ pub mod load_balancer;
pub mod monitoring;
pub mod okd;
pub mod opnsense;
pub mod prometheus;
pub mod tenant;
pub mod tftp;
harmony/src/modules/monitoring/alert_rule/mod.rs (new file, +1)
@@ -0,0 +1 @@
pub mod prometheus_alert_rule;
@@ -0,0 +1,99 @@
use std::collections::{BTreeMap, HashMap};

use async_trait::async_trait;
use serde::Serialize;

use crate::{
    interpret::{InterpretError, Outcome},
    modules::monitoring::kube_prometheus::{
        prometheus::{Prometheus, PrometheusRule},
        types::{AlertGroup, AlertManagerAdditionalPromRules},
    },
    topology::oberservability::monitoring::AlertRule,
};

#[async_trait]
impl AlertRule<Prometheus> for AlertManagerRuleGroup {
    async fn install(&self, sender: &Prometheus) -> Result<Outcome, InterpretError> {
        sender.install_rule(&self).await
    }
    fn clone_box(&self) -> Box<dyn AlertRule<Prometheus>> {
        Box::new(self.clone())
    }
}

#[async_trait]
impl PrometheusRule for AlertManagerRuleGroup {
    fn name(&self) -> String {
        self.name.clone()
    }
    async fn configure_rule(&self) -> AlertManagerAdditionalPromRules {
        let mut additional_prom_rules = BTreeMap::new();

        additional_prom_rules.insert(
            self.name.clone(),
            AlertGroup {
                groups: vec![self.clone()],
            },
        );
        AlertManagerAdditionalPromRules {
            rules: additional_prom_rules,
        }
    }
}

impl AlertManagerRuleGroup {
    pub fn new(name: &str, rules: Vec<PrometheusAlertRule>) -> AlertManagerRuleGroup {
        AlertManagerRuleGroup {
            name: name.to_string().to_lowercase(),
            rules,
        }
    }
}

#[derive(Debug, Clone, Serialize)]
/// Logical group of alert rules.
/// Evaluates to:
/// name:
///   groups:
///     - name: name
///       rules: PrometheusAlertRule
pub struct AlertManagerRuleGroup {
    pub name: String,
    pub rules: Vec<PrometheusAlertRule>,
}

#[derive(Debug, Clone, Serialize)]
pub struct PrometheusAlertRule {
    pub alert: String,
    pub expr: String,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub r#for: Option<String>,
    pub labels: HashMap<String, String>,
    pub annotations: HashMap<String, String>,
}

impl PrometheusAlertRule {
    pub fn new(alert_name: &str, expr: &str) -> Self {
        Self {
            alert: alert_name.into(),
            expr: expr.into(),
            r#for: Some("1m".into()),
            labels: HashMap::new(),
            annotations: HashMap::new(),
        }
    }
    pub fn for_duration(mut self, duration: &str) -> Self {
        self.r#for = Some(duration.into());
        self
    }
    pub fn label(mut self, key: &str, value: &str) -> Self {
        self.labels.insert(key.into(), value.into());
        self
    }

    pub fn annotation(mut self, key: &str, value: &str) -> Self {
        self.annotations.insert(key.into(), value.into());
        self
    }
}
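As a rough illustration of what these Serialize derives produce: a sketch assuming serde_yaml is available as a dependency; the alert name and expression are placeholders.

use harmony::modules::monitoring::alert_rule::prometheus_alert_rule::{
    AlertManagerRuleGroup, PrometheusAlertRule,
};

fn main() {
    let group = AlertManagerRuleGroup::new(
        "pvc-alerts",
        vec![PrometheusAlertRule::new("ExampleAlert", "up == 0").for_duration("5m")],
    );

    // Prints roughly:
    //   name: pvc-alerts
    //   rules:
    //   - alert: ExampleAlert
    //     expr: up == 0
    //     for: 5m
    //     labels: {}
    //     annotations: {}
    // configure_rule() then nests this group under its own name
    // (pvc-alerts: { groups: [...] }) before the Helm chart score merges it
    // into additionalPrometheusRulesMap.
    println!("{}", serde_yaml::to_string(&group).unwrap());
}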
@@ -1,6 +1,9 @@
use serde::Serialize;

use crate::modules::monitoring::kube_prometheus::types::AlertManagerChannelConfig;
use crate::modules::monitoring::{
    alert_rule::prometheus_alert_rule::AlertManagerRuleGroup,
    kube_prometheus::types::{AlertManagerAdditionalPromRules, AlertManagerChannelConfig},
};

#[derive(Debug, Clone, Serialize)]
pub struct KubePrometheusConfig {
@@ -22,6 +25,7 @@ pub struct KubePrometheusConfig {
    pub kube_state_metrics: bool,
    pub prometheus_operator: bool,
    pub alert_receiver_configs: Vec<AlertManagerChannelConfig>,
    pub alert_rules: Vec<AlertManagerAdditionalPromRules>,
}
impl KubePrometheusConfig {
    pub fn new() -> Self {
@@ -44,6 +48,7 @@ impl KubePrometheusConfig {
            core_dns: false,
            kube_scheduler: false,
            alert_receiver_configs: vec![],
            alert_rules: vec![],
        }
    }
}
@@ -3,6 +3,7 @@ use log::debug;
use non_blank_string_rs::NonBlankString;
use serde_yaml::{Mapping, Value};
use std::{
    collections::BTreeMap,
    str::FromStr,
    sync::{Arc, Mutex},
};
@@ -10,7 +11,8 @@ use std::{
use crate::modules::{
    helm::chart::HelmChartScore,
    monitoring::kube_prometheus::types::{
        AlertManager, AlertManagerConfig, AlertManagerRoute, AlertManagerValues,
        AlertGroup, AlertManager, AlertManagerAdditionalPromRules, AlertManagerConfig,
        AlertManagerRoute, AlertManagerValues,
    },
};
@@ -18,15 +20,13 @@ pub fn kube_prometheus_helm_chart_score(
    config: Arc<Mutex<KubePrometheusConfig>>,
) -> HelmChartScore {
    let config = config.lock().unwrap();
    // TODO: this should be made into a rule with default formatting that can be easily passed as a vec
    // to the overrides or something; leaving the user to deal with formatting here seems bad

    let default_rules = config.default_rules.to_string();
    let windows_monitoring = config.windows_monitoring.to_string();
    let grafana = config.grafana.to_string();
    let kubernetes_service_monitors = config.kubernetes_service_monitors.to_string();
    let kubernetes_api_server = config.kubernetes_api_server.to_string();
    let kubelet = config.kubelet.to_string();
    let alert_manager = config.alert_manager.to_string();
    let kube_controller_manager = config.kube_controller_manager.to_string();
    let core_dns = config.core_dns.to_string();
    let kube_etcd = config.kube_etcd.to_string();
@@ -38,56 +38,6 @@
    let prometheus = config.prometheus.to_string();
    let mut values = format!(
        r#"
additionalPrometheusRulesMap:
  pods-status-alerts:
    groups:
      - name: pods
        rules:
          - alert: "[CRIT] POD not healthy"
            expr: min_over_time(sum by (namespace, pod) (kube_pod_status_phase{{phase=~"Pending|Unknown|Failed"}})[15m:1m]) > 0
            for: 0m
            labels:
              severity: critical
            annotations:
              title: "[CRIT] POD not healthy : {{{{ $labels.pod }}}}"
              description: |
                A POD is in a non-ready state!
                - **Pod**: {{{{ $labels.pod }}}}
                - **Namespace**: {{{{ $labels.namespace }}}}
          - alert: "[CRIT] POD crash looping"
            expr: increase(kube_pod_container_status_restarts_total[5m]) > 3
            for: 0m
            labels:
              severity: critical
            annotations:
              title: "[CRIT] POD crash looping : {{{{ $labels.pod }}}}"
              description: |
                A POD is drowning in a crash loop!
                - **Pod**: {{{{ $labels.pod }}}}
                - **Namespace**: {{{{ $labels.namespace }}}}
                - **Instance**: {{{{ $labels.instance }}}}
  pvc-alerts:
    groups:
      - name: pvc-alerts
        rules:
          - alert: 'PVC Fill Over 95 Percent In 2 Days'
            expr: |
              (
                kubelet_volume_stats_used_bytes
                /
                kubelet_volume_stats_capacity_bytes
              ) > 0.95
              AND
              predict_linear(kubelet_volume_stats_used_bytes[2d], 2 * 24 * 60 * 60)
              /
              kubelet_volume_stats_capacity_bytes
              > 0.95
            for: 1m
            labels:
              severity: warning
            annotations:
              description: The PVC {{{{ $labels.persistentvolumeclaim }}}} in namespace {{{{ $labels.namespace }}}} is predicted to fill over 95% in less than 2 days.
              title: PVC {{{{ $labels.persistentvolumeclaim }}}} in namespace {{{{ $labels.namespace }}}} will fill over 95% in less than 2 days
defaultRules:
  create: {default_rules}
  rules:
@@ -156,6 +106,7 @@ prometheus:
"#,
    );

    // add required null receiver for prometheus alert manager
    let mut null_receiver = Mapping::new();
    null_receiver.insert(
        Value::String("receiver".to_string()),
@@ -167,6 +118,7 @@ prometheus:
    );
    null_receiver.insert(Value::String("continue".to_string()), Value::Bool(true));

    // add alert channels
    let mut alert_manager_channel_config = AlertManagerConfig {
        global: Mapping::new(),
        route: AlertManagerRoute {
@@ -200,7 +152,38 @@ prometheus:
        serde_yaml::to_string(&alert_manager_values).expect("Failed to serialize YAML");
    debug!("serialized alert manager: \n {:#}", alert_manager_yaml);
    values.push_str(&alert_manager_yaml);

    // format alert manager additional rules for helm chart
    let mut merged_rules: BTreeMap<String, AlertGroup> = BTreeMap::new();

    for additional_rule in config.alert_rules.clone() {
        for (key, group) in additional_rule.rules {
            merged_rules.insert(key, group);
        }
    }

    let merged_rules = AlertManagerAdditionalPromRules {
        rules: merged_rules,
    };

    let mut alert_manager_additional_rules = serde_yaml::Mapping::new();
    let rules_value = serde_yaml::to_value(merged_rules).unwrap();

    alert_manager_additional_rules.insert(
        serde_yaml::Value::String("additionalPrometheusRulesMap".to_string()),
        rules_value,
    );

    let alert_manager_additional_rules_yaml =
        serde_yaml::to_string(&alert_manager_additional_rules).expect("Failed to serialize YAML");
    debug!(
        "alert_rules_yaml:\n{:#}",
        alert_manager_additional_rules_yaml
    );

    values.push_str(&alert_manager_additional_rules_yaml);
    debug!("full values.yaml: \n {:#}", values);

    HelmChartScore {
        namespace: Some(NonBlankString::from_str(&config.namespace).unwrap()),
        release_name: NonBlankString::from_str("kube-prometheus").unwrap(),
@@ -2,19 +2,19 @@ use std::sync::{Arc, Mutex};

use serde::Serialize;

use super::{helm::config::KubePrometheusConfig, prometheus::Prometheus};
use crate::{
    score::Score,
    topology::{
        HelmCommand, Topology,
        oberservability::monitoring::{AlertReceiver, AlertingInterpret},
        oberservability::monitoring::{AlertReceiver, AlertRule, AlertingInterpret},
    },
};

use super::{helm::config::KubePrometheusConfig, prometheus::Prometheus};

#[derive(Clone, Debug, Serialize)]
pub struct HelmPrometheusAlertingScore {
    pub receivers: Vec<Box<dyn AlertReceiver<Prometheus>>>,
    pub rules: Vec<Box<dyn AlertRule<Prometheus>>>,
}

impl<T: Topology + HelmCommand> Score<T> for HelmPrometheusAlertingScore {
@@ -24,24 +24,10 @@ impl<T: Topology + HelmCommand> Score<T> for HelmPrometheusAlertingScore {
                config: Arc::new(Mutex::new(KubePrometheusConfig::new())),
            },
            receivers: self.receivers.clone(),
            rules: self.rules.clone(),
        })
    }
    fn name(&self) -> String {
        "HelmPrometheusAlertingScore".to_string()
    }
}

impl Serialize for Box<dyn AlertReceiver<Prometheus>> {
    fn serialize<S>(&self, _serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        todo!()
    }
}

impl Clone for Box<dyn AlertReceiver<Prometheus>> {
    fn clone(&self) -> Self {
        self.clone_box()
    }
}
@@ -2,13 +2,17 @@ use std::sync::{Arc, Mutex};

use async_trait::async_trait;
use log::debug;
use serde::Serialize;

use crate::{
    interpret::{InterpretError, Outcome},
    inventory::Inventory,
    modules::monitoring::alert_rule::prometheus_alert_rule::AlertManagerRuleGroup,
    score,
    topology::{
        HelmCommand, Topology, installable::Installable, oberservability::monitoring::AlertSender,
        HelmCommand, Topology,
        installable::Installable,
        oberservability::monitoring::{AlertReceiver, AlertRule, AlertSender},
    },
};

@@ -18,7 +22,7 @@ use super::{
    helm::{
        config::KubePrometheusConfig, kube_prometheus_helm_chart::kube_prometheus_helm_chart_score,
    },
    types::AlertManagerChannelConfig,
    types::{AlertManagerAdditionalPromRules, AlertManagerChannelConfig},
};

#[async_trait]
@@ -35,7 +39,6 @@ impl<T: Topology + HelmCommand> Installable<T> for Prometheus {
        inventory: &Inventory,
        topology: &T,
    ) -> Result<(), InterpretError> {
        //install_prometheus
        self.install_prometheus(inventory, topology).await?;
        Ok(())
    }
@@ -67,6 +70,20 @@ impl Prometheus {
        )))
    }

    pub async fn install_rule(
        &self,
        prometheus_rule: &AlertManagerRuleGroup,
    ) -> Result<Outcome, InterpretError> {
        let prometheus_rule = prometheus_rule.configure_rule().await;
        let mut config = self.config.lock().unwrap();

        config.alert_rules.push(prometheus_rule.clone());
        Ok(Outcome::success(format!(
            "Successfully installed alert rule: {:#?},",
            prometheus_rule
        )))
    }

    pub async fn install_prometheus<T: Topology + HelmCommand + Send + Sync>(
        &self,
        inventory: &Inventory,
@@ -84,3 +101,39 @@ pub trait PrometheusReceiver: Send + Sync + std::fmt::Debug {
    fn name(&self) -> String;
    async fn configure_receiver(&self) -> AlertManagerChannelConfig;
}

impl Serialize for Box<dyn AlertReceiver<Prometheus>> {
    fn serialize<S>(&self, _serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        todo!()
    }
}

impl Clone for Box<dyn AlertReceiver<Prometheus>> {
    fn clone(&self) -> Self {
        self.clone_box()
    }
}

#[async_trait]
pub trait PrometheusRule: Send + Sync + std::fmt::Debug {
    fn name(&self) -> String;
    async fn configure_rule(&self) -> AlertManagerAdditionalPromRules;
}

impl Serialize for Box<dyn AlertRule<Prometheus>> {
    fn serialize<S>(&self, _serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        todo!()
    }
}

impl Clone for Box<dyn AlertRule<Prometheus>> {
    fn clone(&self) -> Self {
        self.clone_box()
    }
}
@@ -1,7 +1,11 @@
use std::collections::BTreeMap;

use async_trait::async_trait;
use serde::Serialize;
use serde_yaml::{Mapping, Sequence, Value};

use crate::modules::monitoring::alert_rule::prometheus_alert_rule::AlertManagerRuleGroup;

#[async_trait]
pub trait AlertChannelConfig {
    async fn get_config(&self) -> AlertManagerChannelConfig;
@@ -38,3 +42,14 @@ pub struct AlertManagerChannelConfig {
    pub channel_route: Value,
    pub channel_receiver: Value,
}

#[derive(Debug, Clone, Serialize)]
pub struct AlertManagerAdditionalPromRules {
    #[serde(flatten)]
    pub rules: BTreeMap<String, AlertGroup>,
}

#[derive(Debug, Clone, Serialize)]
pub struct AlertGroup {
    pub groups: Vec<AlertManagerRuleGroup>,
}
@@ -1,2 +1,3 @@
pub mod alert_channel;
pub mod alert_rule;
pub mod kube_prometheus;
harmony/src/modules/prometheus/alerts/infra/dell_server.rs (new file, +40)
@@ -0,0 +1,40 @@
use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;

pub fn global_storage_status_degraded_non_critical() -> PrometheusAlertRule {
    PrometheusAlertRule::new("GlobalStorageStatusNonCritical", "globalStorageStatus == 4")
        .for_duration("5m")
        .label("severity", "warning")
        .annotation(
            "description",
            "- **system**: {{ $labels.instance }}\n- **Status**: nonCritical\n- **Value**: {{ $value }}\n- **Job**: {{ $labels.job }}",
        )
        .annotation("title", " System storage status is in degraded state")
}

pub fn alert_global_storage_status_critical() -> PrometheusAlertRule {
    PrometheusAlertRule::new(
        "GlobalStorageStatus critical",
        "globalStorageStatus == 5",
    )
    .for_duration("5m")
    .label("severity", "warning")
    .annotation("title", "System storage status is critical at {{ $labels.instance }}")
    .annotation(
        "description",
        "- **System**: {{ $labels.instance }}\n- **Status**: Critical\n- **Value**: {{ $value }}\n- **Job**: {{ $labels.job }}",
    )
}

pub fn alert_global_storage_status_non_recoverable() -> PrometheusAlertRule {
    PrometheusAlertRule::new(
        "GlobalStorageStatus nonRecoverable",
        "globalStorageStatus == 6",
    )
    .for_duration("5m")
    .label("severity", "warning")
    .annotation("title", "System storage status is nonRecoverable at {{ $labels.instance }}")
    .annotation(
        "description",
        "- **System**: {{ $labels.instance }}\n- **Status**: nonRecoverable\n- **Value**: {{ $value }}\n- **Job**: {{ $labels.job }}",
    )
}
harmony/src/modules/prometheus/alerts/infra/mod.rs (new file, +1)
@@ -0,0 +1 @@
pub mod dell_server;

harmony/src/modules/prometheus/alerts/k8s/mod.rs (new file, +1)
@@ -0,0 +1 @@
pub mod pvc;
harmony/src/modules/prometheus/alerts/k8s/pvc.rs (new file, +11)
@@ -0,0 +1,11 @@
use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;

pub fn high_pvc_fill_rate_over_two_days() -> PrometheusAlertRule {
    PrometheusAlertRule::new(
        "PVC Fill Over 95 Percent In 2 Days",
        "(kubelet_volume_stats_used_bytes/kubelet_volume_stats_capacity_bytes) > 0.95 AND predict_linear(kubelet_volume_stats_used_bytes[2d], 2 * 24 * 60 * 60)/kubelet_volume_stats_capacity_bytes > 0.95",)
    .for_duration("1m")
    .label("severity", "warning")
    .annotation("summary", "The PVC {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} is predicted to fill over 95% in less than 2 days.")
    .annotation("description", "PVC {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} will fill over 95% in less than 2 days",)
}
harmony/src/modules/prometheus/alerts/mod.rs (new file, +2)
@@ -0,0 +1,2 @@
pub mod infra;
pub mod k8s;
harmony/src/modules/prometheus/mod.rs (new file, +1)
@@ -0,0 +1 @@
pub mod alerts;