Merge pull request 'feat:added Slack notifications support' (#38) from feat/slack-notifs into master

Reviewed-on: https://git.nationtech.io/NationTech/harmony/pulls/38
Reviewed-by: johnride <jg@nationtech.io>
This commit is contained in:
wjro 2025-05-22 20:04:51 +00:00
commit 9c51040f3b
4 changed files with 167 additions and 108 deletions

View File

@ -1,8 +1,7 @@
use email_address::EmailAddress;
use serde::Serialize; use serde::Serialize;
use url::Url;
use super::monitoring_alerting::WebhookServiceType; use super::monitoring_alerting::AlertChannel;
#[derive(Debug, Clone, Serialize)] #[derive(Debug, Clone, Serialize)]
pub struct KubePrometheusConfig { pub struct KubePrometheusConfig {
@ -23,9 +22,7 @@ pub struct KubePrometheusConfig {
pub kube_proxy: bool, pub kube_proxy: bool,
pub kube_state_metrics: bool, pub kube_state_metrics: bool,
pub prometheus_operator: bool, pub prometheus_operator: bool,
pub webhook_url: Option<Url>, pub alert_channel: Vec<AlertChannel>,
pub webhook_service_type: Option<WebhookServiceType>,
pub discord_alert_manager_release_name: String,
} }
impl KubePrometheusConfig { impl KubePrometheusConfig {
pub fn new() -> Self { pub fn new() -> Self {
@ -34,8 +31,7 @@ impl KubePrometheusConfig {
default_rules: true, default_rules: true,
windows_monitoring: false, windows_monitoring: false,
alert_manager: true, alert_manager: true,
webhook_service_type: None, alert_channel: Vec::new(),
webhook_url: None,
grafana: true, grafana: true,
node_exporter: false, node_exporter: false,
prometheus: true, prometheus: true,
@ -49,7 +45,6 @@ impl KubePrometheusConfig {
prometheus_operator: true, prometheus_operator: true,
core_dns: false, core_dns: false,
kube_scheduler: false, kube_scheduler: false,
discord_alert_manager_release_name: "discord-alert-manager".into(),
} }
} }
} }

View File

@ -4,28 +4,29 @@ use non_blank_string_rs::NonBlankString;
use crate::modules::helm::chart::HelmChartScore; use crate::modules::helm::chart::HelmChartScore;
use super::config::KubePrometheusConfig; use super::{config::KubePrometheusConfig, monitoring_alerting::AlertChannel};
pub fn discord_alert_manager_score(config: &KubePrometheusConfig) -> HelmChartScore { fn get_discord_alert_manager_score(config: &KubePrometheusConfig) -> Option<HelmChartScore> {
let url = if let Some(url) = &config.webhook_url { let (url, name) = config.alert_channel.iter().find_map(|channel| {
url.to_string() if let AlertChannel::Discord { webhook_url, name } = channel {
Some((webhook_url, name))
} else { } else {
"None".to_string() None
}; }
})?;
let values = format!( let values = format!(
r#" r#"
environment: environment:
- name: "DISCORD_WEBHOOK" - name: "DISCORD_WEBHOOK"
value: "{url}" value: "{url}"
"#, "#,
); );
HelmChartScore { Some(HelmChartScore {
namespace: Some(NonBlankString::from_str(&config.namespace).unwrap()), namespace: Some(NonBlankString::from_str(&config.namespace).unwrap()),
release_name: NonBlankString::from_str(&config.discord_alert_manager_release_name).unwrap(), release_name: NonBlankString::from_str(&name).unwrap(),
chart_name: NonBlankString::from_str("oci://hub.nationtech.io/nt/alertmanager-discord") chart_name: NonBlankString::from_str("oci://hub.nationtech.io/library/alertmanager-discord")
.unwrap(), .unwrap(),
chart_version: None, chart_version: None,
values_overrides: None, values_overrides: None,
@ -33,5 +34,13 @@ environment:
create_namespace: true, create_namespace: true,
install_only: true, install_only: true,
repository: None, repository: None,
})
}
pub fn discord_alert_manager_score(config: &KubePrometheusConfig) -> HelmChartScore {
if let Some(chart) = get_discord_alert_manager_score(config) {
chart
} else {
panic!("Expected discord alert manager helm chart");
} }
} }

View File

@ -1,6 +1,8 @@
use super::{config::KubePrometheusConfig, monitoring_alerting::WebhookServiceType}; use super::{config::KubePrometheusConfig, monitoring_alerting::AlertChannel};
use log::info;
use non_blank_string_rs::NonBlankString; use non_blank_string_rs::NonBlankString;
use std::{collections::HashMap, str::FromStr}; use std::{collections::HashMap, str::FromStr};
use url::Url;
use crate::modules::helm::chart::HelmChartScore; use crate::modules::helm::chart::HelmChartScore;
@ -10,14 +12,6 @@ pub fn kube_prometheus_helm_chart_score(config: &KubePrometheusConfig) -> HelmCh
let default_rules = config.default_rules.to_string(); let default_rules = config.default_rules.to_string();
let windows_monitoring = config.windows_monitoring.to_string(); let windows_monitoring = config.windows_monitoring.to_string();
let alert_manager = config.alert_manager.to_string(); let alert_manager = config.alert_manager.to_string();
let webhook_service_type = if let Some(service) = &config.webhook_service_type {
match service {
WebhookServiceType::Discord => "Discord".to_string(),
WebhookServiceType::Slack => "Slack".to_string(),
}
} else {
"None".to_string()
};
let grafana = config.grafana.to_string(); let grafana = config.grafana.to_string();
let kubernetes_service_monitors = config.kubernetes_service_monitors.to_string(); let kubernetes_service_monitors = config.kubernetes_service_monitors.to_string();
let kubernetes_api_server = config.kubernetes_api_server.to_string(); let kubernetes_api_server = config.kubernetes_api_server.to_string();
@ -31,8 +25,7 @@ pub fn kube_prometheus_helm_chart_score(config: &KubePrometheusConfig) -> HelmCh
let node_exporter = config.node_exporter.to_string(); let node_exporter = config.node_exporter.to_string();
let prometheus_operator = config.prometheus_operator.to_string(); let prometheus_operator = config.prometheus_operator.to_string();
let prometheus = config.prometheus.to_string(); let prometheus = config.prometheus.to_string();
let discord_alert_manager_release_name = config.discord_alert_manager_release_name.to_string(); let mut values = format!(
let values = format!(
r#" r#"
additionalPrometheusRulesMap: additionalPrometheusRulesMap:
pods-status-alerts: pods-status-alerts:
@ -45,23 +38,23 @@ additionalPrometheusRulesMap:
labels: labels:
severity: critical severity: critical
annotations: annotations:
title: "[CRIT] POD not healthy : {{ $labels.pod }}" title: "[CRIT] POD not healthy : {{{{ $labels.pod }}}}"
description: | description: |
A POD is in a non-ready state! A POD is in a non-ready state!
- **Pod**: {{ $labels.pod }} - **Pod**: {{{{ $labels.pod }}}}
- **Namespace**: {{ $labels.namespace }} - **Namespace**: {{{{ $labels.namespace }}}}
- alert: "[CRIT] POD crash looping" - alert: "[CRIT] POD crash looping"
expr: increase(kube_pod_container_status_restarts_total[5m]) > 3 expr: increase(kube_pod_container_status_restarts_total[5m]) > 3
for: 0m for: 0m
labels: labels:
severity: critical severity: critical
annotations: annotations:
title: "[CRIT] POD crash looping : {{ $labels.pod }}" title: "[CRIT] POD crash looping : {{{{ $labels.pod }}}}"
description: | description: |
A POD is drowning in a crash loop! A POD is drowning in a crash loop!
- **Pod**: {{ $labels.pod }} - **Pod**: {{{{ $labels.pod }}}}
- **Namespace**: {{ $labels.namespace }} - **Namespace**: {{{{ $labels.namespace }}}}
- **Instance**: {{ $labels.instance }} - **Instance**: {{{{ $labels.instance }}}}
pvc-alerts: pvc-alerts:
groups: groups:
- name: pvc-alerts - name: pvc-alerts
@ -82,8 +75,8 @@ additionalPrometheusRulesMap:
labels: labels:
severity: warning severity: warning
annotations: annotations:
description: The PVC {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} is predicted to fill over 95% in less than 2 days. description: The PVC {{{{ $labels.persistentvolumeclaim }}}} in namespace {{{{ $labels.namespace }}}} is predicted to fill over 95% in less than 2 days.
title: PVC {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} will fill over 95% in less than 2 days title: PVC {{{{ $labels.persistentvolumeclaim }}}} in namespace {{{{ $labels.namespace }}}} will fill over 95% in less than 2 days
defaultRules: defaultRules:
create: {default_rules} create: {default_rules}
rules: rules:
@ -123,26 +116,6 @@ defaultRules:
windows: true windows: true
windowsMonitoring: windowsMonitoring:
enabled: {windows_monitoring} enabled: {windows_monitoring}
alertmanager:
enabled: {alert_manager}
config:
route:
group_by: ['job']
group_wait: 30s
group_interval: 5m
repeat_interval: 12h
receiver: '{webhook_service_type}'
routes:
- receiver: 'null'
matchers:
- alertname="Watchdog"
continue: false
receivers:
- name: 'null'
- name: '{webhook_service_type}'
webhook_configs:
- url: 'http://{discord_alert_manager_release_name}-alertmanager-discord:9094'
send_resolved: true
grafana: grafana:
enabled: {grafana} enabled: {grafana}
kubernetesServiceMonitors: kubernetesServiceMonitors:
@ -172,6 +145,66 @@ prometheus:
"#, "#,
); );
let alertmanager_config = alert_manager_yaml_builder(&config);
values.push_str(&alertmanager_config);
// Builds the `alertmanager:` section of the Helm values as a YAML string.
//
// Walks every entry of `config.alert_channel`, asks the per-channel builder for a
// (receiver, route) pair of YAML fragments and concatenates them. The fragments are then
// spliced into a fixed template together with `config.alert_manager` (the `enabled` flag).
//
// Panics: an `AlertChannel::Smpt` entry reaches `todo!()`.
//
// NOTE(review): the fragments are inserted textually, so their leading whitespace has to
// line up with the `routes:` / `receivers:` lists in the template below — verify.
fn alert_manager_yaml_builder(config: &KubePrometheusConfig) -> String {
// Accumulators for the YAML fragments contributed by each channel.
let mut receivers = String::new();
let mut routes = String::new();
let mut global_configs = String::new();
let alert_manager = config.alert_manager;
for alert_channel in &config.alert_channel {
match alert_channel {
// Discord: only the `name` is needed here; the webhook URL is not used in this function.
AlertChannel::Discord { name, .. } => {
let (receiver, route) = discord_alert_builder(name);
info!("discord receiver: {} \nroute: {}", receiver, route);
receivers.push_str(&receiver);
routes.push_str(&route);
}
AlertChannel::Slack {
slack_channel,
webhook_url,
} => {
let (receiver, route) = slack_alert_builder(slack_channel);
info!("slack receiver: {} \nroute: {}", receiver, route);
receivers.push_str(&receiver);
routes.push_str(&route);
// The Slack webhook is emitted as a `global:` / `slack_api_url` entry rather than on the
// receiver itself.
// NOTE(review): this block is appended once per Slack channel, so two or more Slack
// channels produce several `global:` keys in the output — presumably only one URL can
// take effect; confirm the intended behavior.
let global_config = format!(
r#"
global:
slack_api_url: {webhook_url}"#
);
global_configs.push_str(&global_config);
}
// Not implemented: panics if an Smpt channel is configured.
AlertChannel::Smpt { .. } => todo!(),
}
}
info!("after alert receiver: {}", receivers);
info!("after alert routes: {}", routes);
// Final template: `global_configs` is placed right after `config:`, the collected routes go
// under `routes:` and the collected receivers follow the built-in 'null' receiver.
let alertmanager_config = format!(
r#"
alertmanager:
enabled: {alert_manager}
config: {global_configs}
route:
group_by: ['job']
group_wait: 30s
group_interval: 5m
repeat_interval: 12h
routes:
{routes}
receivers:
- name: 'null'
{receivers}"#
);
// NOTE(review): this logs the complete config, including any Slack webhook URL, at info
// level — confirm that is acceptable.
info!("alert manager config: {}", alertmanager_config);
alertmanager_config
}
HelmChartScore { HelmChartScore {
namespace: Some(NonBlankString::from_str(&config.namespace).unwrap()), namespace: Some(NonBlankString::from_str(&config.namespace).unwrap()),
release_name: NonBlankString::from_str("kube-prometheus").unwrap(), release_name: NonBlankString::from_str("kube-prometheus").unwrap(),
@ -187,3 +220,43 @@ prometheus:
repository: None, repository: None,
} }
} }
/// Builds the Alertmanager YAML fragments for one Discord alert channel.
///
/// `release_name` is used twice: as the suffix of the receiver name
/// (`Discord-<release_name>`) and as the prefix of the webhook host
/// (`<release_name>-alertmanager-discord`, port 9094) — presumably the Helm release name
/// of the alertmanager-discord chart; confirm against the caller.
///
/// Returns `(receiver, route)`:
/// * `receiver` — a `webhook_configs` receiver entry with `send_resolved: true`;
/// * `route` — a route to that receiver matching every alert except `Watchdog`, with
///   `continue: true`.
///
/// Both fragments begin with a newline so several of them can be appended back to back.
///
/// Takes `&str` rather than `&String`; existing callers passing `&String` still compile
/// through deref coercion.
///
/// NOTE(review): the fragments are emitted exactly as written here; their leading
/// whitespace must match the nesting of the list they are spliced into — verify.
fn discord_alert_builder(release_name: &str) -> (String, String) {
    let discord_receiver_name = format!("Discord-{release_name}");

    let receiver = format!(
        r#"
- name: '{discord_receiver_name}'
webhook_configs:
- url: 'http://{release_name}-alertmanager-discord:9094'
send_resolved: true"#,
    );

    let route = format!(
        r#"
- receiver: '{discord_receiver_name}'
matchers:
- alertname!=Watchdog
continue: true"#,
    );

    (receiver, route)
}
/// Builds the Alertmanager YAML fragments for one Slack alert channel.
///
/// `slack_channel` is used as the `channel` of the `slack_configs` entry and as the suffix
/// of the receiver name (`Slack-<slack_channel>`).
///
/// Returns `(receiver, route)`:
/// * `receiver` — a `slack_configs` receiver entry with `send_resolved: true` whose title
///   and text are the Go-template expressions `{{ .CommonAnnotations.title }}` and
///   `{{ .CommonAnnotations.description }}`;
/// * `route` — a route to that receiver matching every alert except `Watchdog`, with
///   `continue: true`.
///
/// Both fragments begin with a newline so several of them can be appended back to back.
/// The Slack webhook URL is not part of these fragments.
///
/// Takes `&str` rather than `&String`; existing callers passing `&String` still compile
/// through deref coercion.
///
/// NOTE(review): the fragments are emitted exactly as written here; their leading
/// whitespace must match the nesting of the list they are spliced into — verify.
fn slack_alert_builder(slack_channel: &str) -> (String, String) {
    let slack_receiver_name = format!("Slack-{slack_channel}");

    // `{{{{` / `}}}}` are escaped braces: `format!` turns them into the literal `{{` / `}}`
    // that the template expressions need.
    let receiver = format!(
        r#"
- name: '{slack_receiver_name}'
slack_configs:
- channel: '{slack_channel}'
send_resolved: true
title: '{{{{ .CommonAnnotations.title }}}}'
text: '{{{{ .CommonAnnotations.description }}}}'"#,
    );

    let route = format!(
        r#"
- receiver: '{slack_receiver_name}'
matchers:
- alertname!=Watchdog
continue: true"#,
    );

    (receiver, route)
}

View File

@ -20,9 +20,13 @@ use super::{
#[derive(Debug, Clone, Serialize)] #[derive(Debug, Clone, Serialize)]
pub enum AlertChannel { pub enum AlertChannel {
WebHookUrl { Discord {
url: Url, name: String,
webhook_service_type: WebhookServiceType, webhook_url: Url,
},
Slack {
slack_channel: String,
webhook_url: Url,
}, },
//TODO test and implement in helm chart //TODO test and implement in helm chart
//currently does not work //currently does not work
@ -32,46 +36,19 @@ pub enum AlertChannel {
}, },
} }
#[derive(Debug, Clone, Serialize)]
pub enum WebhookServiceType {
Discord,
//TODO test slack notifications
Slack,
}
#[derive(Debug, Clone, Serialize)] #[derive(Debug, Clone, Serialize)]
pub struct MonitoringAlertingStackScore { pub struct MonitoringAlertingStackScore {
pub alert_channel: Option<AlertChannel>, pub alert_channel: Vec<AlertChannel>,
pub namespace: Option<String>, pub namespace: Option<String>,
} }
impl MonitoringAlertingStackScore { impl MonitoringAlertingStackScore {
pub fn new() -> Self { pub fn new() -> Self {
Self { Self {
alert_channel: None, alert_channel: Vec::new(),
namespace: None, namespace: None,
} }
} }
fn set_alert_channel(&self, config: &mut KubePrometheusConfig) {
if let Some(alert_channel) = &self.alert_channel {
match alert_channel {
AlertChannel::WebHookUrl {
url,
webhook_service_type,
} => {
config.webhook_url = Some(url.clone());
config.webhook_service_type = Some(webhook_service_type.clone());
}
AlertChannel::Smpt {
//TODO setup smpt alerts
email_address,
service_name,
} => {
todo!()
}
}
}
}
} }
impl<T: Topology + HelmCommand> Score<T> for MonitoringAlertingStackScore { impl<T: Topology + HelmCommand> Score<T> for MonitoringAlertingStackScore {
@ -93,10 +70,10 @@ struct MonitoringAlertingStackInterpret {
impl MonitoringAlertingStackInterpret { impl MonitoringAlertingStackInterpret {
async fn build_kube_prometheus_helm_chart_config(&self) -> KubePrometheusConfig { async fn build_kube_prometheus_helm_chart_config(&self) -> KubePrometheusConfig {
let mut config = KubePrometheusConfig::new(); let mut config = KubePrometheusConfig::new();
self.score.set_alert_channel(&mut config);
if let Some(ns) = &self.score.namespace { if let Some(ns) = &self.score.namespace {
config.namespace = ns.clone(); config.namespace = ns.clone();
} }
config.alert_channel = self.score.alert_channel.clone();
config config
} }
@ -119,26 +96,30 @@ impl MonitoringAlertingStackInterpret {
topology: &T, topology: &T,
config: &KubePrometheusConfig, config: &KubePrometheusConfig,
) -> Result<Outcome, InterpretError> { ) -> Result<Outcome, InterpretError> {
match &self.score.alert_channel { let mut outcomes = vec![];
Some(AlertChannel::WebHookUrl {
webhook_service_type, for channel in &self.score.alert_channel {
.. let outcome = match channel {
}) => match webhook_service_type { AlertChannel::Discord { .. } => {
WebhookServiceType::Discord => {
discord_alert_manager_score(config) discord_alert_manager_score(config)
.create_interpret() .create_interpret()
.execute(inventory, topology) .execute(inventory, topology)
.await .await
} }
WebhookServiceType::Slack => Ok(Outcome::success( AlertChannel::Slack { .. } => Ok(Outcome::success(
"No extra configs for slack alerting".to_string(), "No extra configs for slack alerting".to_string(),
)), )),
}, AlertChannel::Smpt { .. } => {
Some(AlertChannel::Smpt { .. }) => {
todo!() todo!()
} }
None => Ok(Outcome::success("No alert channel configured".to_string())), };
outcomes.push(outcome);
} }
for result in outcomes {
result?;
}
Ok(Outcome::success("All alert channels deployed".to_string()))
} }
} }
@ -155,7 +136,8 @@ impl<T: Topology + HelmCommand> Interpret<T> for MonitoringAlertingStackInterpre
self.deploy_kube_prometheus_helm_chart_score(inventory, topology, &config) self.deploy_kube_prometheus_helm_chart_score(inventory, topology, &config)
.await?; .await?;
info!("Installing alert channel service"); info!("Installing alert channel service");
self.deploy_alert_channel_service(inventory, topology, &config).await?; self.deploy_alert_channel_service(inventory, topology, &config)
.await?;
Ok(Outcome::success(format!( Ok(Outcome::success(format!(
"succesfully deployed monitoring and alerting stack" "succesfully deployed monitoring and alerting stack"
))) )))