feat: add alert rules and an impl for Prometheus, plus a few preconfigured BMC alerts for Dell servers used in the monitoring example #67
examples/monitoring (main.rs)
@@ -3,7 +3,18 @@ use harmony::{
     maestro::Maestro,
     modules::monitoring::{
         alert_channel::discord_alert_channel::DiscordWebhook,
-        kube_prometheus::helm_prometheus_alert_score::HelmPrometheusAlertingScore,
+        alert_rule::prometheus_alert_rule::{AlertManagerRuleGroup, PrometheusAlertRule},
+        kube_prometheus::{
+            alerts::{
+                dell_server::{
+                    alert_global_storage_status_critical,
+                    alert_global_storage_status_non_recoverable,
+                    global_storage_status_degraded_non_critical,
+                },
+                pvc::high_pvc_fill_rate_over_two_days,
+            },
+            helm_prometheus_alert_score::HelmPrometheusAlertingScore,
+        },
     },
     topology::{K8sAnywhereTopology, Url},
 };
@@ -12,10 +23,28 @@ use harmony::{
 async fn main() {
     let discord_receiver = DiscordWebhook {
         name: "test-discord".to_string(),
-        url: Url::Url(url::Url::parse("discord.doesnt.exist.com").unwrap()),
+        url: Url::Url(url::Url::parse("https://discord.doesnt.exist.com").unwrap()),
     };

+    let high_pvc_fill_rate_over_two_days_alert = high_pvc_fill_rate_over_two_days();
+    let dell_system_storage_degraded = global_storage_status_degraded_non_critical();
+    let alert_global_storage_status_critical = alert_global_storage_status_critical();
+    let alert_global_storage_status_non_recoverable = alert_global_storage_status_non_recoverable();
+
+    let additional_rules =
+        AlertManagerRuleGroup::new("pvc-alerts", vec![high_pvc_fill_rate_over_two_days_alert]);
+    let additional_rules2 = AlertManagerRuleGroup::new(
+        "dell-server-alerts",
+        vec![
+            dell_system_storage_degraded,
+            alert_global_storage_status_critical,
+            alert_global_storage_status_non_recoverable,
+        ],
+    );
+
     let alerting_score = HelmPrometheusAlertingScore {
         receivers: vec![Box::new(discord_receiver)],
+        rules: vec![Box::new(additional_rules), Box::new(additional_rules2)],
     };
     let mut maestro = Maestro::<K8sAnywhereTopology>::initialize(
         Inventory::autoload(),
harmony/src/topology/oberservability/monitoring.rs
@@ -1,10 +1,11 @@
 use async_trait::async_trait;
+use log::debug;

 use crate::{
     data::{Id, Version},
     interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome},
     inventory::Inventory,
-    topology::{HelmCommand, Topology, installable::Installable},
+    topology::{Topology, installable::Installable},
 };

 #[async_trait]
@@ -16,6 +17,7 @@ pub trait AlertSender: Send + Sync + std::fmt::Debug {
 pub struct AlertingInterpret<S: AlertSender> {
     pub sender: S,
     pub receivers: Vec<Box<dyn AlertReceiver<S>>>,
+    pub rules: Vec<Box<dyn AlertRule<S>>>,
 }

 #[async_trait]
@@ -28,6 +30,10 @@ impl<S: AlertSender + Installable<T>, T: Topology> Interpret<T> for AlertingInte
         for receiver in self.receivers.iter() {
             receiver.install(&self.sender).await?;
         }
+        for rule in self.rules.iter() {
+            debug!("installing rule: {:#?}", rule);
+            rule.install(&self.sender).await?;
+        }
         self.sender.ensure_installed(inventory, topology).await?;
         Ok(Outcome::success(format!(
             "successfully installed alert sender {}",
@@ -59,8 +65,9 @@ pub trait AlertReceiver<S: AlertSender>: std::fmt::Debug + Send + Sync {
 }

 #[async_trait]
-pub trait AlertRule<S: AlertSender> {
-    async fn install(&self, sender: &S) -> Result<(), InterpretError>;
+pub trait AlertRule<S: AlertSender>: std::fmt::Debug + Send + Sync {
+    async fn install(&self, sender: &S) -> Result<Outcome, InterpretError>;
+    fn clone_box(&self) -> Box<dyn AlertRule<S>>;
 }

 #[async_trait]
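To illustrate the expanded contract, a minimal sketch of an implementor (a hypothetical NoopRule, for illustration only; it relies on the imports already in this module):

#[derive(Debug, Clone)]
struct NoopRule;

#[async_trait]
impl<S: AlertSender> AlertRule<S> for NoopRule {
    async fn install(&self, _sender: &S) -> Result<Outcome, InterpretError> {
        // Nothing to configure; report success so AlertingInterpret continues.
        Ok(Outcome::success("noop rule installed".to_string()))
    }
    fn clone_box(&self) -> Box<dyn AlertRule<S>> {
        Box::new(self.clone())
    }
}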
harmony/src/modules/monitoring/alert_rule/mod.rs (new file)
@@ -0,0 +1 @@
+pub mod prometheus_alert_rule;
harmony/src/modules/monitoring/alert_rule/prometheus_alert_rule.rs (new file)
@@ -0,0 +1,99 @@
+use std::collections::{BTreeMap, HashMap};
+
+use async_trait::async_trait;
+use serde::Serialize;
+
+use crate::{
+    interpret::{InterpretError, Outcome},
+    modules::monitoring::kube_prometheus::{
+        prometheus::{Prometheus, PrometheusRule},
+        types::{AlertGroup, AlertManagerAdditionalPromRules},
+    },
+    topology::oberservability::monitoring::AlertRule,
+};
+
+#[async_trait]
+impl AlertRule<Prometheus> for AlertManagerRuleGroup {
+    async fn install(&self, sender: &Prometheus) -> Result<Outcome, InterpretError> {
+        sender.install_rule(self).await
+    }
+    fn clone_box(&self) -> Box<dyn AlertRule<Prometheus>> {
+        Box::new(self.clone())
+    }
+}
+
+#[async_trait]
+impl PrometheusRule for AlertManagerRuleGroup {
+    fn name(&self) -> String {
+        self.name.clone()
+    }
+    async fn configure_rule(&self) -> AlertManagerAdditionalPromRules {
+        let mut additional_prom_rules = BTreeMap::new();
+
+        additional_prom_rules.insert(
+            self.name.clone(),
+            AlertGroup {
+                groups: vec![self.clone()],
+            },
+        );
+        AlertManagerAdditionalPromRules {
+            rules: additional_prom_rules,
+        }
+    }
+}
+
+impl AlertManagerRuleGroup {
+    pub fn new(name: &str, rules: Vec<PrometheusAlertRule>) -> AlertManagerRuleGroup {
+        AlertManagerRuleGroup {
+            name: name.to_string().to_lowercase(),
+            rules,
+        }
+    }
+}
+
+/// Logical group of alert rules.
+/// Serializes to:
+/// name:
+///   groups:
+///     - name: name
+///       rules: <PrometheusAlertRule list>
+#[derive(Debug, Clone, Serialize)]
+pub struct AlertManagerRuleGroup {
+    pub name: String,
+    pub rules: Vec<PrometheusAlertRule>,
+}
+
+#[derive(Debug, Clone, Serialize)]
+pub struct PrometheusAlertRule {
+    pub alert: String,
+    pub expr: String,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub r#for: Option<String>,
+    pub labels: HashMap<String, String>,
+    pub annotations: HashMap<String, String>,
+}
+
+impl PrometheusAlertRule {
+    pub fn new(alert_name: &str, expr: &str) -> Self {
+        Self {
+            alert: alert_name.into(),
+            expr: expr.into(),
+            r#for: Some("1m".into()),
+            labels: HashMap::new(),
+            annotations: HashMap::new(),
+        }
+    }
+    pub fn for_duration(mut self, duration: &str) -> Self {
+        self.r#for = Some(duration.into());
+        self
+    }
+    pub fn label(mut self, key: &str, value: &str) -> Self {
+        self.labels.insert(key.into(), value.into());
+        self
+    }
+
+    pub fn annotation(mut self, key: &str, value: &str) -> Self {
+        self.annotations.insert(key.into(), value.into());
+        self
+    }
+}
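To make the builder flow concrete, a minimal sketch (hypothetical rule and group names, mirroring the monitoring example), with the YAML that configure_rule ultimately contributes shown in the trailing comment:

use crate::modules::monitoring::alert_rule::prometheus_alert_rule::{
    AlertManagerRuleGroup, PrometheusAlertRule,
};

// "InstanceDown", "up == 0" and "example-alerts" are placeholders.
fn example_group() -> AlertManagerRuleGroup {
    let rule = PrometheusAlertRule::new("InstanceDown", "up == 0")
        .for_duration("5m")
        .label("severity", "warning");
    AlertManagerRuleGroup::new("example-alerts", vec![rule])
}

// `example_group().configure_rule().await` nests the group under its own
// name, which serializes to roughly:
//
//   example-alerts:
//     groups:
//       - name: example-alerts
//         rules:
//           - alert: InstanceDown
//             expr: up == 0
//             for: 5m
//             labels:
//               severity: warning
//             annotations: {}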
harmony/src/modules/monitoring/kube_prometheus/alerts/dell_server.rs (new file)
@@ -0,0 +1,40 @@
+use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;
+
+pub fn global_storage_status_degraded_non_critical() -> PrometheusAlertRule {
+    PrometheusAlertRule::new("GlobalStorageStatusNonCritical", "globalStorageStatus == 4")
+        .for_duration("5m")
+        .label("severity", "warning")
+        .annotation(
+            "description",
+            "- **System**: {{ $labels.instance }}\n- **Status**: nonCritical\n- **Value**: {{ $value }}\n- **Job**: {{ $labels.job }}",
+        )
+        .annotation("title", "System storage status is in a degraded state")
+}
+
+pub fn alert_global_storage_status_critical() -> PrometheusAlertRule {
+    PrometheusAlertRule::new(
+        "GlobalStorageStatusCritical",
+        "globalStorageStatus == 5",
+    )
+    .for_duration("5m")
+    .label("severity", "warning")
+    .annotation("title", "System storage status is critical at {{ $labels.instance }}")
+    .annotation(
+        "description",
+        "- **System**: {{ $labels.instance }}\n- **Status**: Critical\n- **Value**: {{ $value }}\n- **Job**: {{ $labels.job }}",
+    )
+}
+
+pub fn alert_global_storage_status_non_recoverable() -> PrometheusAlertRule {
+    PrometheusAlertRule::new(
+        "GlobalStorageStatusNonRecoverable",
+        "globalStorageStatus == 6",
+    )
+    .for_duration("5m")
+    .label("severity", "warning")
+    .annotation("title", "System storage status is nonRecoverable at {{ $labels.instance }}")
+    .annotation(
+        "description",
+        "- **System**: {{ $labels.instance }}\n- **Status**: nonRecoverable\n- **Value**: {{ $value }}\n- **Job**: {{ $labels.job }}",
+    )
+}
johnride commented:
Nothing in this file seems Dell specific?
harmony/src/modules/monitoring/kube_prometheus/alerts/mod.rs (new file)
@@ -0,0 +1,2 @@
+pub mod dell_server;
+pub mod pvc;
harmony/src/modules/monitoring/kube_prometheus/alerts/pvc.rs (new file)
@@ -0,0 +1,11 @@
+use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;
+
+pub fn high_pvc_fill_rate_over_two_days() -> PrometheusAlertRule {
+    PrometheusAlertRule::new(
+        "PVC Fill Over 95 Percent In 2 Days",
+        "(kubelet_volume_stats_used_bytes/kubelet_volume_stats_capacity_bytes) > 0.95 AND predict_linear(kubelet_volume_stats_used_bytes[2d], 2 * 24 * 60 * 60)/kubelet_volume_stats_capacity_bytes > 0.95",
+    )
+    .for_duration("1m")
+    .label("severity", "warning")
+    .annotation("summary", "The PVC {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} is predicted to fill over 95% in less than 2 days.")
+    .annotation("description", "PVC {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} will fill over 95% in less than 2 days")
+}

johnride commented:
For clarity, this is fine-ish to be in the kube_prometheus mod because it is specifically for pvc alerts, but I think alert definitions should be in another module called just prometheus. kube_prometheus is for stuff specific to *deploying* prometheus on k8s. It is very possible to have a prometheus deployed somewhere else (AWS managed or grafana cloud maybe) which scrapes k8s targets and will want this alert. So the logic here would ask for `modules/prometheus/alerts/k8s`, as this is a k8s-specific alert for any prometheus deployment. A sketch of that layout follows below.

wjro commented:
that makes sense
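For concreteness, a sketch of the layout that comment suggests (hypothetical paths, not part of this PR):

// modules/prometheus/alert_rule/...     deployment-agnostic rule types (PrometheusAlertRule, groups)
// modules/prometheus/alerts/k8s/pvc.rs  k8s-specific alerts for any prometheus deployment
// modules/kube_prometheus/...           only the k8s deployment machinery (helm chart score, config)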
harmony/src/modules/monitoring/kube_prometheus/helm/config.rs
@@ -1,6 +1,9 @@
 use serde::Serialize;

-use crate::modules::monitoring::kube_prometheus::types::AlertManagerChannelConfig;
+use crate::modules::monitoring::{
+    alert_rule::prometheus_alert_rule::AlertManagerRuleGroup,
+    kube_prometheus::types::{AlertManagerAdditionalPromRules, AlertManagerChannelConfig},
+};

 #[derive(Debug, Clone, Serialize)]
 pub struct KubePrometheusConfig {
@@ -22,6 +25,7 @@ pub struct KubePrometheusConfig {
     pub kube_state_metrics: bool,
     pub prometheus_operator: bool,
     pub alert_receiver_configs: Vec<AlertManagerChannelConfig>,
+    pub alert_rules: Vec<AlertManagerAdditionalPromRules>,
 }
 impl KubePrometheusConfig {
     pub fn new() -> Self {
@@ -44,6 +48,7 @@ impl KubePrometheusConfig {
             core_dns: false,
             kube_scheduler: false,
             alert_receiver_configs: vec![],
+            alert_rules: vec![],
         }
     }
 }
harmony/src/modules/monitoring/kube_prometheus/helm/kube_prometheus_helm_chart.rs
@@ -3,6 +3,7 @@ use log::debug;
 use non_blank_string_rs::NonBlankString;
 use serde_yaml::{Mapping, Value};
 use std::{
+    collections::BTreeMap,
     str::FromStr,
     sync::{Arc, Mutex},
 };
@@ -10,7 +11,8 @@ use std::{
 use crate::modules::{
     helm::chart::HelmChartScore,
     monitoring::kube_prometheus::types::{
-        AlertManager, AlertManagerConfig, AlertManagerRoute, AlertManagerValues,
+        AlertGroup, AlertManager, AlertManagerAdditionalPromRules, AlertManagerConfig,
+        AlertManagerRoute, AlertManagerValues,
     },
 };

@@ -18,15 +20,13 @@ pub fn kube_prometheus_helm_chart_score(
     config: Arc<Mutex<KubePrometheusConfig>>,
 ) -> HelmChartScore {
     let config = config.lock().unwrap();
-    //TODO this should be make into a rule with default formatting that can be easily passed as a vec
-    //to the overrides or something leaving the user to deal with formatting here seems bad
     let default_rules = config.default_rules.to_string();
     let windows_monitoring = config.windows_monitoring.to_string();
     let grafana = config.grafana.to_string();
     let kubernetes_service_monitors = config.kubernetes_service_monitors.to_string();
     let kubernetes_api_server = config.kubernetes_api_server.to_string();
     let kubelet = config.kubelet.to_string();
     let alert_manager = config.alert_manager.to_string();
     let kube_controller_manager = config.kube_controller_manager.to_string();
     let core_dns = config.core_dns.to_string();
     let kube_etcd = config.kube_etcd.to_string();
@@ -38,56 +38,6 @@ pub fn kube_prometheus_helm_chart_score(
     let prometheus = config.prometheus.to_string();
     let mut values = format!(
         r#"
-additionalPrometheusRulesMap:
-  pods-status-alerts:
-    groups:
-      - name: pods
-        rules:
-          - alert: "[CRIT] POD not healthy"
-            expr: min_over_time(sum by (namespace, pod) (kube_pod_status_phase{{phase=~"Pending|Unknown|Failed"}})[15m:1m]) > 0
-            for: 0m
-            labels:
-              severity: critical
-            annotations:
-              title: "[CRIT] POD not healthy : {{{{ $labels.pod }}}}"
-              description: |
-                A POD is in a non-ready state!
-                - **Pod**: {{{{ $labels.pod }}}}
-                - **Namespace**: {{{{ $labels.namespace }}}}
-          - alert: "[CRIT] POD crash looping"
-            expr: increase(kube_pod_container_status_restarts_total[5m]) > 3
-            for: 0m
-            labels:
-              severity: critical
-            annotations:
-              title: "[CRIT] POD crash looping : {{{{ $labels.pod }}}}"
-              description: |
-                A POD is drowning in a crash loop!
-                - **Pod**: {{{{ $labels.pod }}}}
-                - **Namespace**: {{{{ $labels.namespace }}}}
-                - **Instance**: {{{{ $labels.instance }}}}
-  pvc-alerts:
-    groups:
-      - name: pvc-alerts
-        rules:
-          - alert: 'PVC Fill Over 95 Percent In 2 Days'
-            expr: |
-              (
-                kubelet_volume_stats_used_bytes
-                /
-                kubelet_volume_stats_capacity_bytes
-              ) > 0.95
-              AND
-              predict_linear(kubelet_volume_stats_used_bytes[2d], 2 * 24 * 60 * 60)
-              /
-              kubelet_volume_stats_capacity_bytes
-              > 0.95
-            for: 1m
-            labels:
-              severity: warning
-            annotations:
-              description: The PVC {{{{ $labels.persistentvolumeclaim }}}} in namespace {{{{ $labels.namespace }}}} is predicted to fill over 95% in less than 2 days.
-              title: PVC {{{{ $labels.persistentvolumeclaim }}}} in namespace {{{{ $labels.namespace }}}} will fill over 95% in less than 2 days
 defaultRules:
   create: {default_rules}
   rules:
@@ -156,6 +106,7 @@ prometheus:
 "#,
     );

+    // add required null receiver for prometheus alert manager
     let mut null_receiver = Mapping::new();
     null_receiver.insert(
         Value::String("receiver".to_string()),
@@ -167,6 +118,7 @@ prometheus:
     );
     null_receiver.insert(Value::String("continue".to_string()), Value::Bool(true));

+    //add alert channels
     let mut alert_manager_channel_config = AlertManagerConfig {
         global: Mapping::new(),
         route: AlertManagerRoute {
@@ -200,7 +152,38 @@ prometheus:
         serde_yaml::to_string(&alert_manager_values).expect("Failed to serialize YAML");
     debug!("serialized alert manager: \n {:#}", alert_manager_yaml);
     values.push_str(&alert_manager_yaml);
+
+    //format alert manager additional rules for helm chart
+    let mut merged_rules: BTreeMap<String, AlertGroup> = BTreeMap::new();
+
+    for additional_rule in config.alert_rules.clone() {
+        for (key, group) in additional_rule.rules {
+            merged_rules.insert(key, group);
+        }
+    }
+
+    let merged_rules = AlertManagerAdditionalPromRules {
+        rules: merged_rules,
+    };
+
+    let mut alert_manager_additional_rules = serde_yaml::Mapping::new();
+    let rules_value = serde_yaml::to_value(merged_rules).unwrap();
+
+    alert_manager_additional_rules.insert(
+        serde_yaml::Value::String("additionalPrometheusRulesMap".to_string()),
+        rules_value,
+    );
+
+    let alert_manager_additional_rules_yaml =
+        serde_yaml::to_string(&alert_manager_additional_rules).expect("Failed to serialize YAML");
+    debug!(
+        "alert_rules_yaml:\n{:#}",
+        alert_manager_additional_rules_yaml
+    );
+
+    values.push_str(&alert_manager_additional_rules_yaml);
     debug!("full values.yaml: \n {:#}", values);
+
     HelmChartScore {
         namespace: Some(NonBlankString::from_str(&config.namespace).unwrap()),
         release_name: NonBlankString::from_str("kube-prometheus").unwrap(),
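With the monitoring example's two groups, the block appended to values should come out roughly like this (a sketch; rule bodies elided, keys sorted by the BTreeMap):

// additionalPrometheusRulesMap:
//   dell-server-alerts:
//     groups:
//       - name: dell-server-alerts
//         rules: [...]
//   pvc-alerts:
//     groups:
//       - name: pvc-alerts
//         rules: [...]

Note that merged_rules.insert(key, group) means a later group with the same name silently replaces an earlier one, so group names should stay unique.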
harmony/src/modules/monitoring/kube_prometheus/helm_prometheus_alert_score.rs
@@ -2,19 +2,19 @@ use std::sync::{Arc, Mutex};

 use serde::Serialize;

-use super::{helm::config::KubePrometheusConfig, prometheus::Prometheus};
 use crate::{
     score::Score,
     topology::{
         HelmCommand, Topology,
-        oberservability::monitoring::{AlertReceiver, AlertingInterpret},
+        oberservability::monitoring::{AlertReceiver, AlertRule, AlertingInterpret},
     },
 };

+use super::{helm::config::KubePrometheusConfig, prometheus::Prometheus};
+
 #[derive(Clone, Debug, Serialize)]
 pub struct HelmPrometheusAlertingScore {
     pub receivers: Vec<Box<dyn AlertReceiver<Prometheus>>>,
+    pub rules: Vec<Box<dyn AlertRule<Prometheus>>>,
 }

 impl<T: Topology + HelmCommand> Score<T> for HelmPrometheusAlertingScore {
@@ -24,24 +24,10 @@ impl<T: Topology + HelmCommand> Score<T> for HelmPrometheusAlertingScore {
                 config: Arc::new(Mutex::new(KubePrometheusConfig::new())),
             },
             receivers: self.receivers.clone(),
+            rules: self.rules.clone(),
         })
     }
     fn name(&self) -> String {
         "HelmPrometheusAlertingScore".to_string()
     }
 }
-
-impl Serialize for Box<dyn AlertReceiver<Prometheus>> {
-    fn serialize<S>(&self, _serializer: S) -> Result<S::Ok, S::Error>
-    where
-        S: serde::Serializer,
-    {
-        todo!()
-    }
-}
-
-impl Clone for Box<dyn AlertReceiver<Prometheus>> {
-    fn clone(&self) -> Self {
-        self.clone_box()
-    }
-}
harmony/src/modules/monitoring/kube_prometheus/mod.rs
@@ -1,3 +1,4 @@
+pub mod alerts;
 pub mod helm;
 pub mod helm_prometheus_alert_score;
 pub mod prometheus;
harmony/src/modules/monitoring/kube_prometheus/prometheus.rs
@@ -2,13 +2,17 @@ use std::sync::{Arc, Mutex};

 use async_trait::async_trait;
 use log::debug;
+use serde::Serialize;

 use crate::{
     interpret::{InterpretError, Outcome},
     inventory::Inventory,
+    modules::monitoring::alert_rule::prometheus_alert_rule::AlertManagerRuleGroup,
     score,
     topology::{
-        HelmCommand, Topology, installable::Installable, oberservability::monitoring::AlertSender,
+        HelmCommand, Topology,
+        installable::Installable,
+        oberservability::monitoring::{AlertReceiver, AlertRule, AlertSender},
     },
 };

@@ -18,7 +22,7 @@ use super::{
     helm::{
         config::KubePrometheusConfig, kube_prometheus_helm_chart::kube_prometheus_helm_chart_score,
     },
-    types::AlertManagerChannelConfig,
+    types::{AlertManagerAdditionalPromRules, AlertManagerChannelConfig},
 };

 #[async_trait]
@@ -35,7 +39,6 @@ impl<T: Topology + HelmCommand> Installable<T> for Prometheus {
         inventory: &Inventory,
         topology: &T,
     ) -> Result<(), InterpretError> {
-        //install_prometheus
         self.install_prometheus(inventory, topology).await?;
         Ok(())
     }
@@ -67,6 +70,20 @@ impl Prometheus {
         )))
     }

+    pub async fn install_rule(
+        &self,
+        prometheus_rule: &AlertManagerRuleGroup,
+    ) -> Result<Outcome, InterpretError> {
+        let prometheus_rule = prometheus_rule.configure_rule().await;
+        let mut config = self.config.lock().unwrap();
+
+        config.alert_rules.push(prometheus_rule.clone());
+        Ok(Outcome::success(format!(
+            "Successfully installed alert rule: {:#?}",
+            prometheus_rule
+        )))
+    }
+
     pub async fn install_prometheus<T: Topology + HelmCommand + Send + Sync>(
         &self,
         inventory: &Inventory,
@@ -84,3 +101,39 @@ pub trait PrometheusReceiver: Send + Sync + std::fmt::Debug {
     fn name(&self) -> String;
     async fn configure_receiver(&self) -> AlertManagerChannelConfig;
 }
+
+impl Serialize for Box<dyn AlertReceiver<Prometheus>> {
+    fn serialize<S>(&self, _serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        todo!()
+    }
+}
+
+impl Clone for Box<dyn AlertReceiver<Prometheus>> {
+    fn clone(&self) -> Self {
+        self.clone_box()
+    }
+}
+
+#[async_trait]
+pub trait PrometheusRule: Send + Sync + std::fmt::Debug {
+    fn name(&self) -> String;
+    async fn configure_rule(&self) -> AlertManagerAdditionalPromRules;
+}
+
+impl Serialize for Box<dyn AlertRule<Prometheus>> {
+    fn serialize<S>(&self, _serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        todo!()
+    }
+}
+
+impl Clone for Box<dyn AlertRule<Prometheus>> {
+    fn clone(&self) -> Self {
+        self.clone_box()
+    }
+}
harmony/src/modules/monitoring/kube_prometheus/types.rs
@@ -1,7 +1,11 @@
+use std::collections::BTreeMap;
+
 use async_trait::async_trait;
 use serde::Serialize;
 use serde_yaml::{Mapping, Sequence, Value};

+use crate::modules::monitoring::alert_rule::prometheus_alert_rule::AlertManagerRuleGroup;
+
 #[async_trait]
 pub trait AlertChannelConfig {
     async fn get_config(&self) -> AlertManagerChannelConfig;
@@ -38,3 +42,14 @@ pub struct AlertManagerChannelConfig {
     pub channel_route: Value,
     pub channel_receiver: Value,
 }
+
+#[derive(Debug, Clone, Serialize)]
+pub struct AlertManagerAdditionalPromRules {
+    #[serde(flatten)]
+    pub rules: BTreeMap<String, AlertGroup>,
+}
+
+#[derive(Debug, Clone, Serialize)]
+pub struct AlertGroup {
+    pub groups: Vec<AlertManagerRuleGroup>,
+}
harmony/src/modules/monitoring/mod.rs
@@ -1,2 +1,3 @@
 pub mod alert_channel;
+pub mod alert_rule;
 pub mod kube_prometheus;
Review comment:
Is this really Dell specific?

Reply:
these are from the Dell snmp walk:

dell:
  walk:
    - 1.3.6.1.4.1.674.10892.5.2
    - 1.3.6.1.4.1.674.10892.5.4
    - 1.3.6.1.4.1.674.10892.5.5
  metrics:
    - name: globalSystemStatus
      oid: 1.3.6.1.4.1.674.10892.5.2.1
      type: gauge
      help: This attribute defines the overall rollup status of all components in
        the system being monitored by the remote access card - 1.3.6.1.4.1.674.10892.5.2.1
      enum_values:
        1: other
        2: unknown
        3: ok
        4: nonCritical
        5: critical
        6: nonRecoverable

There are a bunch of other ones as well, but I only included a few for the example. Each server type has a different snmp walk that translates to a different name from the appropriate MIB file. I believe the Dell MIB this is from is DELL-MM-MIB.
https://github.com/librenms/librenms/tree/master/mibs/dell
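Given that enum mapping, a system-level counterpart would follow the same pattern as the storage rules above (a sketch only; it assumes the globalSystemStatus gauge is actually exposed by the snmp_exporter config, and the function name is hypothetical):

use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;

// Sketch: hypothetical system-status rule mirroring the globalStorageStatus
// rules in this PR; value 5 maps to "critical" in the enum above.
pub fn alert_global_system_status_critical() -> PrometheusAlertRule {
    PrometheusAlertRule::new("GlobalSystemStatusCritical", "globalSystemStatus == 5")
        .for_duration("5m")
        .label("severity", "critical")
        .annotation("title", "System status is critical at {{ $labels.instance }}")
}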