Compare commits

..

1 Commit

4 changed files with 234 additions and 29 deletions

View File

@@ -3,12 +3,10 @@ use harmony::{
modules::monitoring::{
alert_channel::discord_alert_channel::DiscordReceiver,
alert_rule::{
alerts::{
infra::opnsense::high_http_error_rate, k8s::pvc::high_pvc_fill_rate_over_two_days,
},
alerts::infra::opnsense::high_http_error_rate,
prometheus_alert_rule::AlertManagerRuleGroup,
},
okd::openshift_cluster_alerting_score::OpenshiftClusterAlertScore,
cluster_alerting::ClusterAlertingScore,
scrape_target::prometheus_node_exporter::PrometheusNodeExporter,
},
topology::{
@@ -21,22 +19,37 @@ use harmony_macros::{hurl, ip};
#[tokio::main]
async fn main() {
let platform_matcher = AlertMatcher {
label: "prometheus".to_string(),
operator: MatchOp::Eq,
value: "openshift-monitoring/k8s".to_string(),
};
let severity = AlertMatcher {
label: "severity".to_string(),
operator: MatchOp::Eq,
value: "critical".to_string(),
let critical_receiver = DiscordReceiver {
name: "critical-alerts".to_string(),
url: hurl!("https://discord.example.com/webhook/critical"),
route: AlertRoute {
matchers: vec![AlertMatcher {
label: "severity".to_string(),
operator: MatchOp::Eq,
value: "critical".to_string(),
}],
..AlertRoute::default("critical-alerts".to_string())
},
};
let high_http_error_rate = high_http_error_rate();
let warning_receiver = DiscordReceiver {
name: "warning-alerts".to_string(),
url: hurl!("https://discord.example.com/webhook/warning"),
route: AlertRoute {
matchers: vec![AlertMatcher {
label: "severity".to_string(),
operator: MatchOp::Eq,
value: "warning".to_string(),
}],
repeat_interval: Some("30m".to_string()),
..AlertRoute::default("warning-alerts".to_string())
},
};
let additional_rules = AlertManagerRuleGroup::new("test-rule", vec![high_http_error_rate]);
let additional_rules =
AlertManagerRuleGroup::new("infra-alerts", vec![high_http_error_rate()]);
let scrape_target = PrometheusNodeExporter {
let firewall_scraper = PrometheusNodeExporter {
job_name: "firewall".to_string(),
metrics_path: "/metrics".to_string(),
listen_address: ip!("192.168.1.1"),
@@ -44,22 +57,16 @@ async fn main() {
..Default::default()
};
let alerting_score = ClusterAlertingScore::new()
.critical_receiver(Box::new(critical_receiver))
.warning_receiver(Box::new(warning_receiver))
.additional_rule(Box::new(additional_rules))
.scrape_target(Box::new(firewall_scraper));
harmony_cli::run(
Inventory::autoload(),
K8sAnywhereTopology::from_env(),
vec![Box::new(OpenshiftClusterAlertScore {
receivers: vec![Box::new(DiscordReceiver {
name: "crit-wills-discord-channel-example".to_string(),
url: hurl!("https://test.io"),
route: AlertRoute {
matchers: vec![severity],
..AlertRoute::default("crit-wills-discord-channel-example".to_string())
},
})],
sender: harmony::modules::monitoring::okd::OpenshiftClusterAlertSender,
rules: vec![Box::new(additional_rules)],
scrape_targets: Some(vec![Box::new(scrape_target)]),
})],
vec![Box::new(alerting_score)],
None,
)
.await

View File

@@ -0,0 +1,194 @@
use serde::Serialize;
use crate::{
interpret::Interpret,
modules::monitoring::{
alert_rule::{
alerts::k8s::{
deployment::alert_deployment_unavailable, memory_usage::alert_high_cpu_usage,
memory_usage::alert_high_memory_usage, pod::alert_container_restarting,
pod::alert_pod_not_ready, pod::pod_failed, pvc::high_pvc_fill_rate_over_two_days,
},
prometheus_alert_rule::AlertManagerRuleGroup,
},
okd::OpenshiftClusterAlertSender,
},
score::Score,
topology::{
monitoring::{
AlertReceiver, AlertRoute, AlertRule, AlertingInterpret, MatchOp, Observability,
ScrapeTarget,
},
Topology,
},
};
/// Builder-style score describing cluster-level alerting: severity-specific
/// receivers, a set of built-in default rules plus caller-supplied rules, and
/// optional Prometheus scrape targets, all dispatched through
/// `OpenshiftClusterAlertSender`.
#[derive(Debug, Clone)]
pub struct ClusterAlertingScore {
    /// Receiver for critical-severity alerts; `None` means not configured.
    pub critical_alerts_receiver: Option<Box<dyn AlertReceiver<OpenshiftClusterAlertSender>>>,
    /// Receiver for warning-severity alerts; `None` means not configured.
    pub warning_alerts_receiver: Option<Box<dyn AlertReceiver<OpenshiftClusterAlertSender>>>,
    /// Extra rules appended after the default rule groups (see `build_default_rules`).
    pub additional_rules: Vec<Box<dyn AlertRule<OpenshiftClusterAlertSender>>>,
    /// Optional scrape targets; `None` and `Some(vec![])` are treated alike downstream.
    pub scrape_targets: Option<Vec<Box<dyn ScrapeTarget<OpenshiftClusterAlertSender>>>>,
    /// When true (the default), the built-in critical/warning rule groups are included.
    pub include_default_rules: bool,
}
impl ClusterAlertingScore {
    /// Creates an empty score: no receivers, no extra rules, no scrape
    /// targets, with the built-in default rules enabled.
    pub fn new() -> Self {
        Self {
            critical_alerts_receiver: None,
            warning_alerts_receiver: None,
            additional_rules: Vec::new(),
            scrape_targets: None,
            include_default_rules: true,
        }
    }

    /// Sets the receiver that handles critical-severity alerts.
    pub fn critical_receiver(
        self,
        receiver: Box<dyn AlertReceiver<OpenshiftClusterAlertSender>>,
    ) -> Self {
        Self {
            critical_alerts_receiver: Some(receiver),
            ..self
        }
    }

    /// Sets the receiver that handles warning-severity alerts.
    pub fn warning_receiver(
        self,
        receiver: Box<dyn AlertReceiver<OpenshiftClusterAlertSender>>,
    ) -> Self {
        Self {
            warning_alerts_receiver: Some(receiver),
            ..self
        }
    }

    /// Appends one rule to the list applied after the default rule groups.
    pub fn additional_rule(
        mut self,
        rule: Box<dyn AlertRule<OpenshiftClusterAlertSender>>,
    ) -> Self {
        self.additional_rules.push(rule);
        self
    }

    /// Registers a scrape target, lazily creating the target list on first use.
    pub fn scrape_target(
        mut self,
        target: Box<dyn ScrapeTarget<OpenshiftClusterAlertSender>>,
    ) -> Self {
        match self.scrape_targets {
            Some(ref mut targets) => targets.push(target),
            None => self.scrape_targets = Some(vec![target]),
        }
        self
    }

    /// Enables or disables the built-in default rule groups.
    pub fn with_default_rules(mut self, include: bool) -> Self {
        self.include_default_rules = include;
        self
    }

    /// Returns the built-in rule groups (one critical, one warning), or an
    /// empty list when `include_default_rules` is false.
    fn build_default_rules(&self) -> Vec<Box<dyn AlertRule<OpenshiftClusterAlertSender>>> {
        if !self.include_default_rules {
            return Vec::new();
        }
        let warning_group = AlertManagerRuleGroup::new(
            "cluster-warning-alerts",
            vec![
                alert_deployment_unavailable(),
                alert_container_restarting(),
                alert_pod_not_ready(),
                alert_high_memory_usage(),
                alert_high_cpu_usage(),
                high_pvc_fill_rate_over_two_days(),
            ],
        );
        let critical_group =
            AlertManagerRuleGroup::new("cluster-critical-alerts", vec![pod_failed()]);
        // Critical group first, matching the severity ordering of the receivers.
        vec![Box::new(critical_group), Box::new(warning_group)]
    }

    /// Collects the configured receivers, critical first, skipping any that
    /// are unset.
    fn build_receivers(&self) -> Vec<Box<dyn AlertReceiver<OpenshiftClusterAlertSender>>> {
        self.critical_alerts_receiver
            .iter()
            .chain(self.warning_alerts_receiver.iter())
            .cloned()
            .collect()
    }
}
impl Default for ClusterAlertingScore {
fn default() -> Self {
Self::new()
}
}
impl<T: Topology + Observability<OpenshiftClusterAlertSender>> Score<T> for ClusterAlertingScore {
    /// Stable identifier used to report this score.
    fn name(&self) -> String {
        String::from("ClusterAlertingScore")
    }

    /// Assembles the alerting interpret: default rule groups (when enabled)
    /// followed by any additional rules, plus the configured receivers and
    /// scrape targets.
    fn create_interpret(&self) -> Box<dyn Interpret<T>> {
        let rules: Vec<_> = self
            .build_default_rules()
            .into_iter()
            .chain(self.additional_rules.clone())
            .collect();
        Box::new(AlertingInterpret {
            sender: OpenshiftClusterAlertSender,
            receivers: self.build_receivers(),
            rules,
            scrape_targets: self.scrape_targets.clone(),
        })
    }
}
impl Serialize for ClusterAlertingScore {
    /// Serializes a summary view of the score. The boxed trait objects are
    /// not themselves serializable, so only presence flags and counts are
    /// emitted.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        let scrape_target_count = self
            .scrape_targets
            .as_ref()
            .map_or(0, |targets| targets.len());
        let summary = serde_json::json!({
            "name": "ClusterAlertingScore",
            "include_default_rules": self.include_default_rules,
            "has_critical_receiver": self.critical_alerts_receiver.is_some(),
            "has_warning_receiver": self.warning_alerts_receiver.is_some(),
            "additional_rules_count": self.additional_rules.len(),
            "scrape_targets_count": scrape_target_count,
        });
        summary.serialize(serializer)
    }
}
/// Builds an `AlertRoute` that matches alerts whose `severity` label equals
/// `severity`, delivered to the receiver of the same name, re-notifying at
/// `repeat_interval`. Shared by [`critical_route`] and [`warning_route`],
/// which previously duplicated this construction verbatim.
fn severity_route(severity: &str, repeat_interval: &str) -> AlertRoute {
    AlertRoute {
        // Receiver name deliberately mirrors the severity value.
        receiver: severity.to_string(),
        matchers: vec![crate::topology::monitoring::AlertMatcher {
            label: "severity".to_string(),
            operator: MatchOp::Eq,
            value: severity.to_string(),
        }],
        group_by: vec![],
        repeat_interval: Some(repeat_interval.to_string()),
        continue_matching: false,
        children: vec![],
    }
}

/// Route for `severity="critical"` alerts → the "critical" receiver,
/// re-notifying every 5 minutes.
pub fn critical_route() -> AlertRoute {
    severity_route("critical", "5m")
}

/// Route for `severity="warning"` alerts → the "warning" receiver,
/// re-notifying every 30 minutes.
pub fn warning_route() -> AlertRoute {
    severity_route("warning", "30m")
}

View File

@@ -0,0 +1,3 @@
//! Cluster-level alerting: the `ClusterAlertingScore` builder and its
//! severity-based route helpers.
mod cluster_alerting_score;
pub use cluster_alerting_score::{critical_route, warning_route, ClusterAlertingScore};

View File

@@ -1,6 +1,7 @@
// Monitoring submodules; `cluster_alerting` hosts the cluster-wide
// alerting score added alongside the existing channel/rule modules.
pub mod alert_channel;
pub mod alert_rule;
pub mod application_monitoring;
pub mod cluster_alerting;
pub mod grafana;
pub mod kube_prometheus;
pub mod ntfy;