Merge pull request 'feat: added alert rule and impl for prometheus as well as a few preconfigured bmc alerts for dell server that are used in the monitoring example' (#67) from feat/alert_rules into master
Reviewed-on: https://git.nationtech.io/NationTech/harmony/pulls/67
This commit is contained in commit 29e74a2712.
@@ -43,14 +43,14 @@ async fn main() {
     // K8sAnywhereTopology as it is the most automatic one that enables you to easily deploy
     // locally, to development environment from a CI, to staging, and to production with settings
     // that automatically adapt to each environment grade.
-    let maestro = Maestro::<K8sAnywhereTopology>::initialize(
+    let mut maestro = Maestro::<K8sAnywhereTopology>::initialize(
         Inventory::autoload(),
         K8sAnywhereTopology::from_env(),
     )
     .await
     .unwrap();
 
-    // maestro.register_all(vec![Box::new(lamp_stack)]);
+    maestro.register_all(vec![Box::new(lamp_stack)]);
     // Here we bootstrap the CLI, this gives some nice features if you need them
     harmony_cli::init(maestro, None).await.unwrap();
 }
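Note that uncommenting `maestro.register_all(...)` is what forces the binding to become `let mut maestro`; registering scores presumably mutates the maestro's internal registry, so `register_all` would take `&mut self`.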
@@ -1,9 +1,19 @@
 use harmony::{
     inventory::Inventory,
     maestro::Maestro,
-    modules::monitoring::{
-        alert_channel::discord_alert_channel::DiscordWebhook,
-        kube_prometheus::helm_prometheus_alert_score::HelmPrometheusAlertingScore,
+    modules::{
+        monitoring::{
+            alert_channel::discord_alert_channel::DiscordWebhook,
+            alert_rule::prometheus_alert_rule::AlertManagerRuleGroup,
+            kube_prometheus::helm_prometheus_alert_score::HelmPrometheusAlertingScore,
+        },
+        prometheus::alerts::{
+            infra::dell_server::{
+                alert_global_storage_status_critical, alert_global_storage_status_non_recoverable,
+                global_storage_status_degraded_non_critical,
+            },
+            k8s::pvc::high_pvc_fill_rate_over_two_days,
+        },
     },
     topology::{K8sAnywhereTopology, Url},
 };
@@ -12,10 +22,28 @@ use harmony::{
 async fn main() {
     let discord_receiver = DiscordWebhook {
         name: "test-discord".to_string(),
-        url: Url::Url(url::Url::parse("discord.doesnt.exist.com").unwrap()),
+        url: Url::Url(url::Url::parse("https://discord.doesnt.exist.com").unwrap()),
     };
 
+    let high_pvc_fill_rate_over_two_days_alert = high_pvc_fill_rate_over_two_days();
+    let dell_system_storage_degraded = global_storage_status_degraded_non_critical();
+    let alert_global_storage_status_critical = alert_global_storage_status_critical();
+    let alert_global_storage_status_non_recoverable = alert_global_storage_status_non_recoverable();
+
+    let additional_rules =
+        AlertManagerRuleGroup::new("pvc-alerts", vec![high_pvc_fill_rate_over_two_days_alert]);
+    let additional_rules2 = AlertManagerRuleGroup::new(
+        "dell-server-alerts",
+        vec![
+            dell_system_storage_degraded,
+            alert_global_storage_status_critical,
+            alert_global_storage_status_non_recoverable,
+        ],
+    );
+
     let alerting_score = HelmPrometheusAlertingScore {
         receivers: vec![Box::new(discord_receiver)],
+        rules: vec![Box::new(additional_rules), Box::new(additional_rules2)],
     };
     let mut maestro = Maestro::<K8sAnywhereTopology>::initialize(
         Inventory::autoload(),
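The added `https://` scheme is not cosmetic: `url::Url::parse` only accepts absolute URLs, so the scheme-less string used before fails to parse and the surrounding `unwrap()` panics at startup. A quick check:

```rust
// Without a scheme, url::Url::parse returns Err(RelativeUrlWithoutBase).
assert!(url::Url::parse("discord.doesnt.exist.com").is_err());
assert!(url::Url::parse("https://discord.doesnt.exist.com").is_ok());
```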
@@ -1,10 +1,11 @@
 use async_trait::async_trait;
+use log::debug;
 
 use crate::{
     data::{Id, Version},
     interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome},
     inventory::Inventory,
-    topology::{HelmCommand, Topology, installable::Installable},
+    topology::{Topology, installable::Installable},
 };
 
 #[async_trait]
@@ -16,6 +17,7 @@ pub trait AlertSender: Send + Sync + std::fmt::Debug {
 pub struct AlertingInterpret<S: AlertSender> {
     pub sender: S,
     pub receivers: Vec<Box<dyn AlertReceiver<S>>>,
+    pub rules: Vec<Box<dyn AlertRule<S>>>,
 }
 
 #[async_trait]
@@ -28,6 +30,10 @@ impl<S: AlertSender + Installable<T>, T: Topology> Interpret<T> for AlertingInte
         for receiver in self.receivers.iter() {
             receiver.install(&self.sender).await?;
         }
+        for rule in self.rules.iter() {
+            debug!("installing rule: {:#?}", rule);
+            rule.install(&self.sender).await?;
+        }
         self.sender.ensure_installed(inventory, topology).await?;
         Ok(Outcome::success(format!(
             "successfully installed alert sender {}",
@@ -59,8 +65,9 @@ pub trait AlertReceiver<S: AlertSender>: std::fmt::Debug + Send + Sync {
 }
 
 #[async_trait]
-pub trait AlertRule<S: AlertSender> {
-    async fn install(&self, sender: &S) -> Result<(), InterpretError>;
+pub trait AlertRule<S: AlertSender>: std::fmt::Debug + Send + Sync {
+    async fn install(&self, sender: &S) -> Result<Outcome, InterpretError>;
+    fn clone_box(&self) -> Box<dyn AlertRule<S>>;
 }
 
 #[async_trait]
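The new `clone_box` method is the standard object-safe cloning workaround: `Clone::clone` returns `Self`, so `Clone` cannot be a supertrait of an object-safe trait, and boxed rules instead clone through a method that returns a new box (the matching `impl Clone for Box<dyn AlertRule<Prometheus>>` appears later in this diff). A minimal standalone sketch of the pattern, using hypothetical names:

```rust
trait Rule: std::fmt::Debug {
    fn clone_box(&self) -> Box<dyn Rule>;
}

impl Clone for Box<dyn Rule> {
    fn clone(&self) -> Self {
        // Delegate to the object-safe clone_box; Clone::clone itself
        // cannot be called through a trait object.
        self.clone_box()
    }
}

#[derive(Debug, Clone)]
struct Threshold(u64);

impl Rule for Threshold {
    fn clone_box(&self) -> Box<dyn Rule> {
        Box::new(self.clone())
    }
}

fn main() {
    let rules: Vec<Box<dyn Rule>> = vec![Box::new(Threshold(5))];
    let copied = rules.clone(); // works because Box<dyn Rule>: Clone
    println!("{:?}", copied);
}
```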
@@ -12,5 +12,6 @@ pub mod load_balancer;
 pub mod monitoring;
 pub mod okd;
 pub mod opnsense;
+pub mod prometheus;
 pub mod tenant;
 pub mod tftp;
harmony/src/modules/monitoring/alert_rule/mod.rs (new file)
@@ -0,0 +1 @@
+pub mod prometheus_alert_rule;
harmony/src/modules/monitoring/alert_rule/prometheus_alert_rule.rs (new file)
@@ -0,0 +1,99 @@
+use std::collections::{BTreeMap, HashMap};
+
+use async_trait::async_trait;
+use serde::Serialize;
+
+use crate::{
+    interpret::{InterpretError, Outcome},
+    modules::monitoring::kube_prometheus::{
+        prometheus::{Prometheus, PrometheusRule},
+        types::{AlertGroup, AlertManagerAdditionalPromRules},
+    },
+    topology::oberservability::monitoring::AlertRule,
+};
+
+#[async_trait]
+impl AlertRule<Prometheus> for AlertManagerRuleGroup {
+    async fn install(&self, sender: &Prometheus) -> Result<Outcome, InterpretError> {
+        sender.install_rule(&self).await
+    }
+    fn clone_box(&self) -> Box<dyn AlertRule<Prometheus>> {
+        Box::new(self.clone())
+    }
+}
+
+#[async_trait]
+impl PrometheusRule for AlertManagerRuleGroup {
+    fn name(&self) -> String {
+        self.name.clone()
+    }
+    async fn configure_rule(&self) -> AlertManagerAdditionalPromRules {
+        let mut additional_prom_rules = BTreeMap::new();
+
+        additional_prom_rules.insert(
+            self.name.clone(),
+            AlertGroup {
+                groups: vec![self.clone()],
+            },
+        );
+        AlertManagerAdditionalPromRules {
+            rules: additional_prom_rules,
+        }
+    }
+}
+
+impl AlertManagerRuleGroup {
+    pub fn new(name: &str, rules: Vec<PrometheusAlertRule>) -> AlertManagerRuleGroup {
+        AlertManagerRuleGroup {
+            name: name.to_string().to_lowercase(),
+            rules,
+        }
+    }
+}
+
+#[derive(Debug, Clone, Serialize)]
+/// logical group of alert rules
+/// evaluates to:
+/// name:
+///   groups:
+///     - name: name
+///       rules: PrometheusAlertRule
+pub struct AlertManagerRuleGroup {
+    pub name: String,
+    pub rules: Vec<PrometheusAlertRule>,
+}
+
+#[derive(Debug, Clone, Serialize)]
+pub struct PrometheusAlertRule {
+    pub alert: String,
+    pub expr: String,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub r#for: Option<String>,
+    pub labels: HashMap<String, String>,
+    pub annotations: HashMap<String, String>,
+}
+
+impl PrometheusAlertRule {
+    pub fn new(alert_name: &str, expr: &str) -> Self {
+        Self {
+            alert: alert_name.into(),
+            expr: expr.into(),
+            r#for: Some("1m".into()),
+            labels: HashMap::new(),
+            annotations: HashMap::new(),
+        }
+    }
+    pub fn for_duration(mut self, duration: &str) -> Self {
+        self.r#for = Some(duration.into());
+        self
+    }
+    pub fn label(mut self, key: &str, value: &str) -> Self {
+        self.labels.insert(key.into(), value.into());
+        self
+    }
+
+    pub fn annotation(mut self, key: &str, value: &str) -> Self {
+        self.annotations.insert(key.into(), value.into());
+        self
+    }
+}
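For orientation, here is how the new builder API chains together (a sketch assuming the `harmony` crate is in scope; `HighNodeLoad` and its expression are invented for illustration):

```rust
use harmony::modules::monitoring::alert_rule::prometheus_alert_rule::{
    AlertManagerRuleGroup, PrometheusAlertRule,
};

fn example_group() -> AlertManagerRuleGroup {
    // Hypothetical rule: fire when a node's 1-minute load average stays above 10.
    let high_load = PrometheusAlertRule::new("HighNodeLoad", "node_load1 > 10")
        .for_duration("5m")
        .label("severity", "warning")
        .annotation("title", "High load on {{ $labels.instance }}");

    // Group names are lowercased by AlertManagerRuleGroup::new.
    AlertManagerRuleGroup::new("Node-Alerts", vec![high_load])
}
```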
@@ -1,6 +1,9 @@
 use serde::Serialize;
 
-use crate::modules::monitoring::kube_prometheus::types::AlertManagerChannelConfig;
+use crate::modules::monitoring::{
+    alert_rule::prometheus_alert_rule::AlertManagerRuleGroup,
+    kube_prometheus::types::{AlertManagerAdditionalPromRules, AlertManagerChannelConfig},
+};
 
 #[derive(Debug, Clone, Serialize)]
 pub struct KubePrometheusConfig {
@@ -22,6 +25,7 @@ pub struct KubePrometheusConfig {
     pub kube_state_metrics: bool,
     pub prometheus_operator: bool,
     pub alert_receiver_configs: Vec<AlertManagerChannelConfig>,
+    pub alert_rules: Vec<AlertManagerAdditionalPromRules>,
 }
 impl KubePrometheusConfig {
     pub fn new() -> Self {
@@ -44,6 +48,7 @@ impl KubePrometheusConfig {
             core_dns: false,
             kube_scheduler: false,
             alert_receiver_configs: vec![],
+            alert_rules: vec![],
         }
     }
 }
@@ -3,6 +3,7 @@ use log::debug;
 use non_blank_string_rs::NonBlankString;
 use serde_yaml::{Mapping, Value};
 use std::{
+    collections::BTreeMap,
     str::FromStr,
     sync::{Arc, Mutex},
 };
@@ -10,7 +11,8 @@ use std::{
 use crate::modules::{
     helm::chart::HelmChartScore,
     monitoring::kube_prometheus::types::{
-        AlertManager, AlertManagerConfig, AlertManagerRoute, AlertManagerValues,
+        AlertGroup, AlertManager, AlertManagerAdditionalPromRules, AlertManagerConfig,
+        AlertManagerRoute, AlertManagerValues,
     },
 };
@@ -18,15 +20,13 @@ pub fn kube_prometheus_helm_chart_score(
     config: Arc<Mutex<KubePrometheusConfig>>,
 ) -> HelmChartScore {
     let config = config.lock().unwrap();
-    //TODO this should be make into a rule with default formatting that can be easily passed as a vec
-    //to the overrides or something leaving the user to deal with formatting here seems bad
     let default_rules = config.default_rules.to_string();
     let windows_monitoring = config.windows_monitoring.to_string();
     let grafana = config.grafana.to_string();
     let kubernetes_service_monitors = config.kubernetes_service_monitors.to_string();
     let kubernetes_api_server = config.kubernetes_api_server.to_string();
     let kubelet = config.kubelet.to_string();
-    let alert_manager = config.alert_manager.to_string();
     let kube_controller_manager = config.kube_controller_manager.to_string();
     let core_dns = config.core_dns.to_string();
     let kube_etcd = config.kube_etcd.to_string();
@@ -38,56 +38,6 @@ pub fn kube_prometheus_helm_chart_score(
     let prometheus = config.prometheus.to_string();
     let mut values = format!(
         r#"
-additionalPrometheusRulesMap:
-  pods-status-alerts:
-    groups:
-      - name: pods
-        rules:
-          - alert: "[CRIT] POD not healthy"
-            expr: min_over_time(sum by (namespace, pod) (kube_pod_status_phase{{phase=~"Pending|Unknown|Failed"}})[15m:1m]) > 0
-            for: 0m
-            labels:
-              severity: critical
-            annotations:
-              title: "[CRIT] POD not healthy : {{{{ $labels.pod }}}}"
-              description: |
-                A POD is in a non-ready state!
-                - **Pod**: {{{{ $labels.pod }}}}
-                - **Namespace**: {{{{ $labels.namespace }}}}
-          - alert: "[CRIT] POD crash looping"
-            expr: increase(kube_pod_container_status_restarts_total[5m]) > 3
-            for: 0m
-            labels:
-              severity: critical
-            annotations:
-              title: "[CRIT] POD crash looping : {{{{ $labels.pod }}}}"
-              description: |
-                A POD is drowning in a crash loop!
-                - **Pod**: {{{{ $labels.pod }}}}
-                - **Namespace**: {{{{ $labels.namespace }}}}
-                - **Instance**: {{{{ $labels.instance }}}}
-  pvc-alerts:
-    groups:
-      - name: pvc-alerts
-        rules:
-          - alert: 'PVC Fill Over 95 Percent In 2 Days'
-            expr: |
-              (
-                kubelet_volume_stats_used_bytes
-                /
-                kubelet_volume_stats_capacity_bytes
-              ) > 0.95
-              AND
-              predict_linear(kubelet_volume_stats_used_bytes[2d], 2 * 24 * 60 * 60)
-              /
-              kubelet_volume_stats_capacity_bytes
-              > 0.95
-            for: 1m
-            labels:
-              severity: warning
-            annotations:
-              description: The PVC {{{{ $labels.persistentvolumeclaim }}}} in namespace {{{{ $labels.namespace }}}} is predicted to fill over 95% in less than 2 days.
-              title: PVC {{{{ $labels.persistentvolumeclaim }}}} in namespace {{{{ $labels.namespace }}}} will fill over 95% in less than 2 days
 defaultRules:
   create: {default_rules}
   rules:
@@ -156,6 +106,7 @@ prometheus:
 "#,
     );
 
+    // add required null receiver for prometheus alert manager
     let mut null_receiver = Mapping::new();
     null_receiver.insert(
         Value::String("receiver".to_string()),
@@ -167,6 +118,7 @@ prometheus:
     );
     null_receiver.insert(Value::String("continue".to_string()), Value::Bool(true));
 
+    //add alert channels
    let mut alert_manager_channel_config = AlertManagerConfig {
         global: Mapping::new(),
         route: AlertManagerRoute {
@@ -200,7 +152,38 @@ prometheus:
         serde_yaml::to_string(&alert_manager_values).expect("Failed to serialize YAML");
     debug!("serialized alert manager: \n {:#}", alert_manager_yaml);
     values.push_str(&alert_manager_yaml);
 
+    //format alert manager additional rules for helm chart
+    let mut merged_rules: BTreeMap<String, AlertGroup> = BTreeMap::new();
+
+    for additional_rule in config.alert_rules.clone() {
+        for (key, group) in additional_rule.rules {
+            merged_rules.insert(key, group);
+        }
+    }
+
+    let merged_rules = AlertManagerAdditionalPromRules {
+        rules: merged_rules,
+    };
+
+    let mut alert_manager_additional_rules = serde_yaml::Mapping::new();
+    let rules_value = serde_yaml::to_value(merged_rules).unwrap();
+
+    alert_manager_additional_rules.insert(
+        serde_yaml::Value::String("additionalPrometheusRulesMap".to_string()),
+        rules_value,
+    );
+
+    let alert_manager_additional_rules_yaml =
+        serde_yaml::to_string(&alert_manager_additional_rules).expect("Failed to serialize YAML");
+    debug!(
+        "alert_rules_yaml:\n{:#}",
+        alert_manager_additional_rules_yaml
+    );
+
+    values.push_str(&alert_manager_additional_rules_yaml);
     debug!("full values.yaml: \n {:#}", values);
 
     HelmChartScore {
         namespace: Some(NonBlankString::from_str(&config.namespace).unwrap()),
         release_name: NonBlankString::from_str("kube-prometheus").unwrap(),
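Since `AlertManagerAdditionalPromRules` flattens its map (`#[serde(flatten)]`) and each value is an `AlertGroup` wrapping the serialized rule group, the fragment appended to the values should look roughly like this for the example's pvc-alerts group (a hand-written sketch, not captured output):

```yaml
additionalPrometheusRulesMap:
  pvc-alerts:
    groups:
      - name: pvc-alerts
        rules:
          - alert: PVC Fill Over 95 Percent In 2 Days
            expr: (kubelet_volume_stats_used_bytes/kubelet_volume_stats_capacity_bytes) > 0.95 AND ...
            for: 1m
            labels:
              severity: warning
            annotations:
              summary: The PVC {{ $labels.persistentvolumeclaim }} ... is predicted to fill over 95% in less than 2 days.
```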
@@ -2,19 +2,19 @@ use std::sync::{Arc, Mutex};
 
 use serde::Serialize;
 
+use super::{helm::config::KubePrometheusConfig, prometheus::Prometheus};
 use crate::{
     score::Score,
     topology::{
         HelmCommand, Topology,
-        oberservability::monitoring::{AlertReceiver, AlertingInterpret},
+        oberservability::monitoring::{AlertReceiver, AlertRule, AlertingInterpret},
     },
 };
 
-use super::{helm::config::KubePrometheusConfig, prometheus::Prometheus};
-
 #[derive(Clone, Debug, Serialize)]
 pub struct HelmPrometheusAlertingScore {
     pub receivers: Vec<Box<dyn AlertReceiver<Prometheus>>>,
+    pub rules: Vec<Box<dyn AlertRule<Prometheus>>>,
 }
 
 impl<T: Topology + HelmCommand> Score<T> for HelmPrometheusAlertingScore {
@@ -24,24 +24,10 @@ impl<T: Topology + HelmCommand> Score<T> for HelmPrometheusAlertingScore {
                 config: Arc::new(Mutex::new(KubePrometheusConfig::new())),
             },
             receivers: self.receivers.clone(),
+            rules: self.rules.clone(),
         })
     }
     fn name(&self) -> String {
         "HelmPrometheusAlertingScore".to_string()
     }
 }
 
-impl Serialize for Box<dyn AlertReceiver<Prometheus>> {
-    fn serialize<S>(&self, _serializer: S) -> Result<S::Ok, S::Error>
-    where
-        S: serde::Serializer,
-    {
-        todo!()
-    }
-}
-
-impl Clone for Box<dyn AlertReceiver<Prometheus>> {
-    fn clone(&self) -> Self {
-        self.clone_box()
-    }
-}
@@ -2,13 +2,17 @@ use std::sync::{Arc, Mutex};
 
 use async_trait::async_trait;
 use log::debug;
+use serde::Serialize;
 
 use crate::{
     interpret::{InterpretError, Outcome},
     inventory::Inventory,
+    modules::monitoring::alert_rule::prometheus_alert_rule::AlertManagerRuleGroup,
     score,
     topology::{
-        HelmCommand, Topology, installable::Installable, oberservability::monitoring::AlertSender,
+        HelmCommand, Topology,
+        installable::Installable,
+        oberservability::monitoring::{AlertReceiver, AlertRule, AlertSender},
     },
 };
@@ -18,7 +22,7 @@ use super::{
     helm::{
         config::KubePrometheusConfig, kube_prometheus_helm_chart::kube_prometheus_helm_chart_score,
     },
-    types::AlertManagerChannelConfig,
+    types::{AlertManagerAdditionalPromRules, AlertManagerChannelConfig},
 };
 
 #[async_trait]
@@ -35,7 +39,6 @@ impl<T: Topology + HelmCommand> Installable<T> for Prometheus {
         inventory: &Inventory,
         topology: &T,
     ) -> Result<(), InterpretError> {
-        //install_prometheus
         self.install_prometheus(inventory, topology).await?;
         Ok(())
     }
@@ -67,6 +70,20 @@ impl Prometheus {
         )))
     }
 
+    pub async fn install_rule(
+        &self,
+        prometheus_rule: &AlertManagerRuleGroup,
+    ) -> Result<Outcome, InterpretError> {
+        let prometheus_rule = prometheus_rule.configure_rule().await;
+        let mut config = self.config.lock().unwrap();
+
+        config.alert_rules.push(prometheus_rule.clone());
+        Ok(Outcome::success(format!(
+            "Successfully installed alert rule: {:#?},",
+            prometheus_rule
+        )))
+    }
+
     pub async fn install_prometheus<T: Topology + HelmCommand + Send + Sync>(
         &self,
         inventory: &Inventory,
@@ -84,3 +101,39 @@ pub trait PrometheusReceiver: Send + Sync + std::fmt::Debug {
     fn name(&self) -> String;
     async fn configure_receiver(&self) -> AlertManagerChannelConfig;
 }
+
+impl Serialize for Box<dyn AlertReceiver<Prometheus>> {
+    fn serialize<S>(&self, _serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        todo!()
+    }
+}
+
+impl Clone for Box<dyn AlertReceiver<Prometheus>> {
+    fn clone(&self) -> Self {
+        self.clone_box()
+    }
+}
+
+#[async_trait]
+pub trait PrometheusRule: Send + Sync + std::fmt::Debug {
+    fn name(&self) -> String;
+    async fn configure_rule(&self) -> AlertManagerAdditionalPromRules;
+}
+
+impl Serialize for Box<dyn AlertRule<Prometheus>> {
+    fn serialize<S>(&self, _serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        todo!()
+    }
+}
+
+impl Clone for Box<dyn AlertRule<Prometheus>> {
+    fn clone(&self) -> Self {
+        self.clone_box()
+    }
+}
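Worth flagging in review: these `Serialize` impls are stubs, so serializing a `HelmPrometheusAlertingScore` (which derives `Serialize` and holds these boxes) will hit `todo!()` and panic at runtime. A non-panicking placeholder is possible, for instance this sketch:

```rust
impl Serialize for Box<dyn AlertRule<Prometheus>> {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        // Sketch: emit the Debug representation (AlertRule requires Debug)
        // rather than panicking.
        serializer.serialize_str(&format!("{:?}", self))
    }
}
```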
@@ -1,7 +1,11 @@
+use std::collections::BTreeMap;
+
 use async_trait::async_trait;
 use serde::Serialize;
 use serde_yaml::{Mapping, Sequence, Value};
 
+use crate::modules::monitoring::alert_rule::prometheus_alert_rule::AlertManagerRuleGroup;
+
 #[async_trait]
 pub trait AlertChannelConfig {
     async fn get_config(&self) -> AlertManagerChannelConfig;
@@ -38,3 +42,14 @@ pub struct AlertManagerChannelConfig {
     pub channel_route: Value,
     pub channel_receiver: Value,
 }
+
+#[derive(Debug, Clone, Serialize)]
+pub struct AlertManagerAdditionalPromRules {
+    #[serde(flatten)]
+    pub rules: BTreeMap<String, AlertGroup>,
+}
+
+#[derive(Debug, Clone, Serialize)]
+pub struct AlertGroup {
+    pub groups: Vec<AlertManagerRuleGroup>,
+}
@@ -1,2 +1,3 @@
 pub mod alert_channel;
+pub mod alert_rule;
 pub mod kube_prometheus;
harmony/src/modules/prometheus/alerts/infra/dell_server.rs (new file)
@@ -0,0 +1,40 @@
+use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;
+
+pub fn global_storage_status_degraded_non_critical() -> PrometheusAlertRule {
+    PrometheusAlertRule::new("GlobalStorageStatusNonCritical", "globalStorageStatus == 4")
+        .for_duration("5m")
+        .label("severity", "warning")
+        .annotation(
+            "description",
+            "- **system**: {{ $labels.instance }}\n- **Status**: nonCritical\n- **Value**: {{ $value }}\n- **Job**: {{ $labels.job }}",
+        )
+        .annotation("title", "System storage status is in degraded state")
+}
+
+pub fn alert_global_storage_status_critical() -> PrometheusAlertRule {
+    PrometheusAlertRule::new(
+        "GlobalStorageStatus critical",
+        "globalStorageStatus == 5",
+    )
+    .for_duration("5m")
+    .label("severity", "warning")
+    .annotation("title", "System storage status is critical at {{ $labels.instance }}")
+    .annotation(
+        "description",
+        "- **System**: {{ $labels.instance }}\n- **Status**: Critical\n- **Value**: {{ $value }}\n- **Job**: {{ $labels.job }}",
+    )
+}
+
+pub fn alert_global_storage_status_non_recoverable() -> PrometheusAlertRule {
+    PrometheusAlertRule::new(
+        "GlobalStorageStatus nonRecoverable",
+        "globalStorageStatus == 6",
+    )
+    .for_duration("5m")
+    .label("severity", "warning")
+    .annotation("title", "System storage status is nonRecoverable at {{ $labels.instance }}")
+    .annotation(
+        "description",
+        "- **System**: {{ $labels.instance }}\n- **Status**: nonRecoverable\n- **Value**: {{ $value }}\n- **Job**: {{ $labels.job }}",
+    )
+}
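`globalStorageStatus` is the BMC-level storage health gauge these rules key on. Judging by the annotations, the numeric values follow the Dell status enum (4 = nonCritical/degraded, 5 = critical, 6 = nonRecoverable); the metric is presumably scraped from the iDRAC BMC via an SNMP exporter, which is an assumption about the deployment, not something this diff shows.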
harmony/src/modules/prometheus/alerts/infra/mod.rs (new file)
@@ -0,0 +1 @@
+pub mod dell_server;
harmony/src/modules/prometheus/alerts/k8s/mod.rs (new file)
@@ -0,0 +1 @@
+pub mod pvc;
harmony/src/modules/prometheus/alerts/k8s/pvc.rs (new file)
@@ -0,0 +1,11 @@
+use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;
+
+pub fn high_pvc_fill_rate_over_two_days() -> PrometheusAlertRule {
+    PrometheusAlertRule::new(
+        "PVC Fill Over 95 Percent In 2 Days",
+        "(kubelet_volume_stats_used_bytes/kubelet_volume_stats_capacity_bytes) > 0.95 AND predict_linear(kubelet_volume_stats_used_bytes[2d], 2 * 24 * 60 * 60)/kubelet_volume_stats_capacity_bytes > 0.95",
+    )
+    .for_duration("1m")
+    .label("severity", "warning")
+    .annotation("summary", "The PVC {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} is predicted to fill over 95% in less than 2 days.")
+    .annotation("description", "PVC {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} will fill over 95% in less than 2 days")
+}
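The expression combines two conditions: the PVC is already more than 95% full, and `predict_linear` extrapolates the last two days of usage 48 hours ahead (2 * 24 * 60 * 60 seconds) to a level that also exceeds 95% of capacity. The alert therefore fires only for nearly-full volumes that are still trending upward, the same logic as the hand-written YAML rule this PR removes from the chart template.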
harmony/src/modules/prometheus/alerts/mod.rs (new file)
@@ -0,0 +1,2 @@
+pub mod infra;
+pub mod k8s;
harmony/src/modules/prometheus/mod.rs (new file)
@@ -0,0 +1 @@
+pub mod alerts;