Compare commits
1 Commits
e2e-tests-
...
feat/clust
| Author | SHA1 | Date | |
|---|---|---|---|
| 18d8ba2210 |
@@ -3,12 +3,10 @@ use harmony::{
|
||||
modules::monitoring::{
|
||||
alert_channel::discord_alert_channel::DiscordReceiver,
|
||||
alert_rule::{
|
||||
alerts::{
|
||||
infra::opnsense::high_http_error_rate, k8s::pvc::high_pvc_fill_rate_over_two_days,
|
||||
},
|
||||
alerts::infra::opnsense::high_http_error_rate,
|
||||
prometheus_alert_rule::AlertManagerRuleGroup,
|
||||
},
|
||||
okd::openshift_cluster_alerting_score::OpenshiftClusterAlertScore,
|
||||
cluster_alerting::ClusterAlertingScore,
|
||||
scrape_target::prometheus_node_exporter::PrometheusNodeExporter,
|
||||
},
|
||||
topology::{
|
||||
@@ -21,22 +19,37 @@ use harmony_macros::{hurl, ip};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() {
|
||||
let platform_matcher = AlertMatcher {
|
||||
label: "prometheus".to_string(),
|
||||
operator: MatchOp::Eq,
|
||||
value: "openshift-monitoring/k8s".to_string(),
|
||||
};
|
||||
let severity = AlertMatcher {
|
||||
label: "severity".to_string(),
|
||||
operator: MatchOp::Eq,
|
||||
value: "critical".to_string(),
|
||||
let critical_receiver = DiscordReceiver {
|
||||
name: "critical-alerts".to_string(),
|
||||
url: hurl!("https://discord.example.com/webhook/critical"),
|
||||
route: AlertRoute {
|
||||
matchers: vec![AlertMatcher {
|
||||
label: "severity".to_string(),
|
||||
operator: MatchOp::Eq,
|
||||
value: "critical".to_string(),
|
||||
}],
|
||||
..AlertRoute::default("critical-alerts".to_string())
|
||||
},
|
||||
};
|
||||
|
||||
let high_http_error_rate = high_http_error_rate();
|
||||
let warning_receiver = DiscordReceiver {
|
||||
name: "warning-alerts".to_string(),
|
||||
url: hurl!("https://discord.example.com/webhook/warning"),
|
||||
route: AlertRoute {
|
||||
matchers: vec![AlertMatcher {
|
||||
label: "severity".to_string(),
|
||||
operator: MatchOp::Eq,
|
||||
value: "warning".to_string(),
|
||||
}],
|
||||
repeat_interval: Some("30m".to_string()),
|
||||
..AlertRoute::default("warning-alerts".to_string())
|
||||
},
|
||||
};
|
||||
|
||||
let additional_rules = AlertManagerRuleGroup::new("test-rule", vec![high_http_error_rate]);
|
||||
let additional_rules =
|
||||
AlertManagerRuleGroup::new("infra-alerts", vec![high_http_error_rate()]);
|
||||
|
||||
let scrape_target = PrometheusNodeExporter {
|
||||
let firewall_scraper = PrometheusNodeExporter {
|
||||
job_name: "firewall".to_string(),
|
||||
metrics_path: "/metrics".to_string(),
|
||||
listen_address: ip!("192.168.1.1"),
|
||||
@@ -44,22 +57,16 @@ async fn main() {
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let alerting_score = ClusterAlertingScore::new()
|
||||
.critical_receiver(Box::new(critical_receiver))
|
||||
.warning_receiver(Box::new(warning_receiver))
|
||||
.additional_rule(Box::new(additional_rules))
|
||||
.scrape_target(Box::new(firewall_scraper));
|
||||
|
||||
harmony_cli::run(
|
||||
Inventory::autoload(),
|
||||
K8sAnywhereTopology::from_env(),
|
||||
vec![Box::new(OpenshiftClusterAlertScore {
|
||||
receivers: vec![Box::new(DiscordReceiver {
|
||||
name: "crit-wills-discord-channel-example".to_string(),
|
||||
url: hurl!("https://test.io"),
|
||||
route: AlertRoute {
|
||||
matchers: vec![severity],
|
||||
..AlertRoute::default("crit-wills-discord-channel-example".to_string())
|
||||
},
|
||||
})],
|
||||
sender: harmony::modules::monitoring::okd::OpenshiftClusterAlertSender,
|
||||
rules: vec![Box::new(additional_rules)],
|
||||
scrape_targets: Some(vec![Box::new(scrape_target)]),
|
||||
})],
|
||||
vec![Box::new(alerting_score)],
|
||||
None,
|
||||
)
|
||||
.await
|
||||
|
||||
@@ -0,0 +1,194 @@
|
||||
use serde::Serialize;
|
||||
|
||||
use crate::{
|
||||
interpret::Interpret,
|
||||
modules::monitoring::{
|
||||
alert_rule::{
|
||||
alerts::k8s::{
|
||||
deployment::alert_deployment_unavailable, memory_usage::alert_high_cpu_usage,
|
||||
memory_usage::alert_high_memory_usage, pod::alert_container_restarting,
|
||||
pod::alert_pod_not_ready, pod::pod_failed, pvc::high_pvc_fill_rate_over_two_days,
|
||||
},
|
||||
prometheus_alert_rule::AlertManagerRuleGroup,
|
||||
},
|
||||
okd::OpenshiftClusterAlertSender,
|
||||
},
|
||||
score::Score,
|
||||
topology::{
|
||||
monitoring::{
|
||||
AlertReceiver, AlertRoute, AlertRule, AlertingInterpret, MatchOp, Observability,
|
||||
ScrapeTarget,
|
||||
},
|
||||
Topology,
|
||||
},
|
||||
};
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ClusterAlertingScore {
|
||||
pub critical_alerts_receiver: Option<Box<dyn AlertReceiver<OpenshiftClusterAlertSender>>>,
|
||||
pub warning_alerts_receiver: Option<Box<dyn AlertReceiver<OpenshiftClusterAlertSender>>>,
|
||||
pub additional_rules: Vec<Box<dyn AlertRule<OpenshiftClusterAlertSender>>>,
|
||||
pub scrape_targets: Option<Vec<Box<dyn ScrapeTarget<OpenshiftClusterAlertSender>>>>,
|
||||
pub include_default_rules: bool,
|
||||
}
|
||||
|
||||
impl ClusterAlertingScore {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
critical_alerts_receiver: None,
|
||||
warning_alerts_receiver: None,
|
||||
additional_rules: vec![],
|
||||
scrape_targets: None,
|
||||
include_default_rules: true,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn critical_receiver(
|
||||
mut self,
|
||||
receiver: Box<dyn AlertReceiver<OpenshiftClusterAlertSender>>,
|
||||
) -> Self {
|
||||
self.critical_alerts_receiver = Some(receiver);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn warning_receiver(
|
||||
mut self,
|
||||
receiver: Box<dyn AlertReceiver<OpenshiftClusterAlertSender>>,
|
||||
) -> Self {
|
||||
self.warning_alerts_receiver = Some(receiver);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn additional_rule(
|
||||
mut self,
|
||||
rule: Box<dyn AlertRule<OpenshiftClusterAlertSender>>,
|
||||
) -> Self {
|
||||
self.additional_rules.push(rule);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn scrape_target(
|
||||
mut self,
|
||||
target: Box<dyn ScrapeTarget<OpenshiftClusterAlertSender>>,
|
||||
) -> Self {
|
||||
self.scrape_targets
|
||||
.get_or_insert_with(Vec::new)
|
||||
.push(target);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn with_default_rules(mut self, include: bool) -> Self {
|
||||
self.include_default_rules = include;
|
||||
self
|
||||
}
|
||||
|
||||
fn build_default_rules(&self) -> Vec<Box<dyn AlertRule<OpenshiftClusterAlertSender>>> {
|
||||
if !self.include_default_rules {
|
||||
return vec![];
|
||||
}
|
||||
|
||||
let critical_rules =
|
||||
AlertManagerRuleGroup::new("cluster-critical-alerts", vec![pod_failed()]);
|
||||
|
||||
let warning_rules = AlertManagerRuleGroup::new(
|
||||
"cluster-warning-alerts",
|
||||
vec![
|
||||
alert_deployment_unavailable(),
|
||||
alert_container_restarting(),
|
||||
alert_pod_not_ready(),
|
||||
alert_high_memory_usage(),
|
||||
alert_high_cpu_usage(),
|
||||
high_pvc_fill_rate_over_two_days(),
|
||||
],
|
||||
);
|
||||
|
||||
vec![Box::new(critical_rules), Box::new(warning_rules)]
|
||||
}
|
||||
|
||||
fn build_receivers(&self) -> Vec<Box<dyn AlertReceiver<OpenshiftClusterAlertSender>>> {
|
||||
let mut receivers = vec![];
|
||||
|
||||
if let Some(ref critical_receiver) = self.critical_alerts_receiver {
|
||||
receivers.push(critical_receiver.clone());
|
||||
}
|
||||
|
||||
if let Some(ref warning_receiver) = self.warning_alerts_receiver {
|
||||
receivers.push(warning_receiver.clone());
|
||||
}
|
||||
|
||||
receivers
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for ClusterAlertingScore {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Topology + Observability<OpenshiftClusterAlertSender>> Score<T> for ClusterAlertingScore {
|
||||
fn name(&self) -> String {
|
||||
"ClusterAlertingScore".to_string()
|
||||
}
|
||||
|
||||
fn create_interpret(&self) -> Box<dyn Interpret<T>> {
|
||||
let mut all_rules = self.build_default_rules();
|
||||
all_rules.extend(self.additional_rules.clone());
|
||||
|
||||
let receivers = self.build_receivers();
|
||||
|
||||
Box::new(AlertingInterpret {
|
||||
sender: OpenshiftClusterAlertSender,
|
||||
receivers,
|
||||
rules: all_rules,
|
||||
scrape_targets: self.scrape_targets.clone(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl Serialize for ClusterAlertingScore {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
serde_json::json!({
|
||||
"name": "ClusterAlertingScore",
|
||||
"include_default_rules": self.include_default_rules,
|
||||
"has_critical_receiver": self.critical_alerts_receiver.is_some(),
|
||||
"has_warning_receiver": self.warning_alerts_receiver.is_some(),
|
||||
"additional_rules_count": self.additional_rules.len(),
|
||||
"scrape_targets_count": self.scrape_targets.as_ref().map(|t| t.len()).unwrap_or(0),
|
||||
})
|
||||
.serialize(serializer)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn critical_route() -> AlertRoute {
|
||||
AlertRoute {
|
||||
receiver: "critical".to_string(),
|
||||
matchers: vec![crate::topology::monitoring::AlertMatcher {
|
||||
label: "severity".to_string(),
|
||||
operator: MatchOp::Eq,
|
||||
value: "critical".to_string(),
|
||||
}],
|
||||
group_by: vec![],
|
||||
repeat_interval: Some("5m".to_string()),
|
||||
continue_matching: false,
|
||||
children: vec![],
|
||||
}
|
||||
}
|
||||
|
||||
pub fn warning_route() -> AlertRoute {
|
||||
AlertRoute {
|
||||
receiver: "warning".to_string(),
|
||||
matchers: vec![crate::topology::monitoring::AlertMatcher {
|
||||
label: "severity".to_string(),
|
||||
operator: MatchOp::Eq,
|
||||
value: "warning".to_string(),
|
||||
}],
|
||||
group_by: vec![],
|
||||
repeat_interval: Some("30m".to_string()),
|
||||
continue_matching: false,
|
||||
children: vec![],
|
||||
}
|
||||
}
|
||||
3
harmony/src/modules/monitoring/cluster_alerting/mod.rs
Normal file
3
harmony/src/modules/monitoring/cluster_alerting/mod.rs
Normal file
@@ -0,0 +1,3 @@
|
||||
mod cluster_alerting_score;
|
||||
|
||||
pub use cluster_alerting_score::{critical_route, warning_route, ClusterAlertingScore};
|
||||
@@ -1,6 +1,7 @@
|
||||
pub mod alert_channel;
|
||||
pub mod alert_rule;
|
||||
pub mod application_monitoring;
|
||||
pub mod cluster_alerting;
|
||||
pub mod grafana;
|
||||
pub mod kube_prometheus;
|
||||
pub mod ntfy;
|
||||
|
||||
Reference in New Issue
Block a user