Merge pull request 'feat:added Slack notifications support' (#38) from feat/slack-notifs into master
Reviewed-on: https://git.nationtech.io/NationTech/harmony/pulls/38 Reviewed-by: johnride <jg@nationtech.io>
This commit is contained in:
commit
9c51040f3b
@ -1,8 +1,7 @@
|
||||
use email_address::EmailAddress;
|
||||
use serde::Serialize;
|
||||
use url::Url;
|
||||
|
||||
use super::monitoring_alerting::WebhookServiceType;
|
||||
use super::monitoring_alerting::AlertChannel;
|
||||
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub struct KubePrometheusConfig {
|
||||
@ -23,9 +22,7 @@ pub struct KubePrometheusConfig {
|
||||
pub kube_proxy: bool,
|
||||
pub kube_state_metrics: bool,
|
||||
pub prometheus_operator: bool,
|
||||
pub webhook_url: Option<Url>,
|
||||
pub webhook_service_type: Option<WebhookServiceType>,
|
||||
pub discord_alert_manager_release_name: String,
|
||||
pub alert_channel: Vec<AlertChannel>,
|
||||
}
|
||||
impl KubePrometheusConfig {
|
||||
pub fn new() -> Self {
|
||||
@ -34,8 +31,7 @@ impl KubePrometheusConfig {
|
||||
default_rules: true,
|
||||
windows_monitoring: false,
|
||||
alert_manager: true,
|
||||
webhook_service_type: None,
|
||||
webhook_url: None,
|
||||
alert_channel: Vec::new(),
|
||||
grafana: true,
|
||||
node_exporter: false,
|
||||
prometheus: true,
|
||||
@ -49,7 +45,6 @@ impl KubePrometheusConfig {
|
||||
prometheus_operator: true,
|
||||
core_dns: false,
|
||||
kube_scheduler: false,
|
||||
discord_alert_manager_release_name: "discord-alert-manager".into(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -4,28 +4,29 @@ use non_blank_string_rs::NonBlankString;
|
||||
|
||||
use crate::modules::helm::chart::HelmChartScore;
|
||||
|
||||
use super::config::KubePrometheusConfig;
|
||||
use super::{config::KubePrometheusConfig, monitoring_alerting::AlertChannel};
|
||||
|
||||
pub fn discord_alert_manager_score(config: &KubePrometheusConfig) -> HelmChartScore {
|
||||
let url = if let Some(url) = &config.webhook_url {
|
||||
url.to_string()
|
||||
} else {
|
||||
"None".to_string()
|
||||
};
|
||||
fn get_discord_alert_manager_score(config: &KubePrometheusConfig) -> Option<HelmChartScore> {
|
||||
let (url, name) = config.alert_channel.iter().find_map(|channel| {
|
||||
if let AlertChannel::Discord { webhook_url, name } = channel {
|
||||
Some((webhook_url, name))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})?;
|
||||
|
||||
let values = format!(
|
||||
r#"
|
||||
|
||||
r#"
|
||||
environment:
|
||||
- name: "DISCORD_WEBHOOK"
|
||||
value: "{url}"
|
||||
"#,
|
||||
"#,
|
||||
);
|
||||
|
||||
HelmChartScore {
|
||||
Some(HelmChartScore {
|
||||
namespace: Some(NonBlankString::from_str(&config.namespace).unwrap()),
|
||||
release_name: NonBlankString::from_str(&config.discord_alert_manager_release_name).unwrap(),
|
||||
chart_name: NonBlankString::from_str("oci://hub.nationtech.io/nt/alertmanager-discord")
|
||||
release_name: NonBlankString::from_str(&name).unwrap(),
|
||||
chart_name: NonBlankString::from_str("oci://hub.nationtech.io/library/alertmanager-discord")
|
||||
.unwrap(),
|
||||
chart_version: None,
|
||||
values_overrides: None,
|
||||
@ -33,5 +34,13 @@ environment:
|
||||
create_namespace: true,
|
||||
install_only: true,
|
||||
repository: None,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn discord_alert_manager_score(config: &KubePrometheusConfig) -> HelmChartScore {
|
||||
if let Some(chart) = get_discord_alert_manager_score(config) {
|
||||
chart
|
||||
} else {
|
||||
panic!("Expected discord alert manager helm chart");
|
||||
}
|
||||
}
|
||||
|
@ -1,6 +1,8 @@
|
||||
use super::{config::KubePrometheusConfig, monitoring_alerting::WebhookServiceType};
|
||||
use super::{config::KubePrometheusConfig, monitoring_alerting::AlertChannel};
|
||||
use log::info;
|
||||
use non_blank_string_rs::NonBlankString;
|
||||
use std::{collections::HashMap, str::FromStr};
|
||||
use url::Url;
|
||||
|
||||
use crate::modules::helm::chart::HelmChartScore;
|
||||
|
||||
@ -10,14 +12,6 @@ pub fn kube_prometheus_helm_chart_score(config: &KubePrometheusConfig) -> HelmCh
|
||||
let default_rules = config.default_rules.to_string();
|
||||
let windows_monitoring = config.windows_monitoring.to_string();
|
||||
let alert_manager = config.alert_manager.to_string();
|
||||
let webhook_service_type = if let Some(service) = &config.webhook_service_type {
|
||||
match service {
|
||||
WebhookServiceType::Discord => "Discord".to_string(),
|
||||
WebhookServiceType::Slack => "Slack".to_string(),
|
||||
}
|
||||
} else {
|
||||
"None".to_string()
|
||||
};
|
||||
let grafana = config.grafana.to_string();
|
||||
let kubernetes_service_monitors = config.kubernetes_service_monitors.to_string();
|
||||
let kubernetes_api_server = config.kubernetes_api_server.to_string();
|
||||
@ -31,8 +25,7 @@ pub fn kube_prometheus_helm_chart_score(config: &KubePrometheusConfig) -> HelmCh
|
||||
let node_exporter = config.node_exporter.to_string();
|
||||
let prometheus_operator = config.prometheus_operator.to_string();
|
||||
let prometheus = config.prometheus.to_string();
|
||||
let discord_alert_manager_release_name = config.discord_alert_manager_release_name.to_string();
|
||||
let values = format!(
|
||||
let mut values = format!(
|
||||
r#"
|
||||
additionalPrometheusRulesMap:
|
||||
pods-status-alerts:
|
||||
@ -45,23 +38,23 @@ additionalPrometheusRulesMap:
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
title: "[CRIT] POD not healthy : {{ $labels.pod }}"
|
||||
title: "[CRIT] POD not healthy : {{{{ $labels.pod }}}}"
|
||||
description: |
|
||||
A POD is in a non-ready state!
|
||||
- **Pod**: {{ $labels.pod }}
|
||||
- **Namespace**: {{ $labels.namespace }}
|
||||
- **Pod**: {{{{ $labels.pod }}}}
|
||||
- **Namespace**: {{{{ $labels.namespace }}}}
|
||||
- alert: "[CRIT] POD crash looping"
|
||||
expr: increase(kube_pod_container_status_restarts_total[5m]) > 3
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
title: "[CRIT] POD crash looping : {{ $labels.pod }}"
|
||||
title: "[CRIT] POD crash looping : {{{{ $labels.pod }}}}"
|
||||
description: |
|
||||
A POD is drowning in a crash loop!
|
||||
- **Pod**: {{ $labels.pod }}
|
||||
- **Namespace**: {{ $labels.namespace }}
|
||||
- **Instance**: {{ $labels.instance }}
|
||||
- **Pod**: {{{{ $labels.pod }}}}
|
||||
- **Namespace**: {{{{ $labels.namespace }}}}
|
||||
- **Instance**: {{{{ $labels.instance }}}}
|
||||
pvc-alerts:
|
||||
groups:
|
||||
- name: pvc-alerts
|
||||
@ -82,8 +75,8 @@ additionalPrometheusRulesMap:
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: The PVC {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} is predicted to fill over 95% in less than 2 days.
|
||||
title: PVC {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} will fill over 95% in less than 2 days
|
||||
description: The PVC {{{{ $labels.persistentvolumeclaim }}}} in namespace {{{{ $labels.namespace }}}} is predicted to fill over 95% in less than 2 days.
|
||||
title: PVC {{{{ $labels.persistentvolumeclaim }}}} in namespace {{{{ $labels.namespace }}}} will fill over 95% in less than 2 days
|
||||
defaultRules:
|
||||
create: {default_rules}
|
||||
rules:
|
||||
@ -123,26 +116,6 @@ defaultRules:
|
||||
windows: true
|
||||
windowsMonitoring:
|
||||
enabled: {windows_monitoring}
|
||||
alertmanager:
|
||||
enabled: {alert_manager}
|
||||
config:
|
||||
route:
|
||||
group_by: ['job']
|
||||
group_wait: 30s
|
||||
group_interval: 5m
|
||||
repeat_interval: 12h
|
||||
receiver: '{webhook_service_type}'
|
||||
routes:
|
||||
- receiver: 'null'
|
||||
matchers:
|
||||
- alertname="Watchdog"
|
||||
continue: false
|
||||
receivers:
|
||||
- name: 'null'
|
||||
- name: '{webhook_service_type}'
|
||||
webhook_configs:
|
||||
- url: 'http://{discord_alert_manager_release_name}-alertmanager-discord:9094'
|
||||
send_resolved: true
|
||||
grafana:
|
||||
enabled: {grafana}
|
||||
kubernetesServiceMonitors:
|
||||
@ -172,6 +145,66 @@ prometheus:
|
||||
"#,
|
||||
);
|
||||
|
||||
let alertmanager_config = alert_manager_yaml_builder(&config);
|
||||
values.push_str(&alertmanager_config);
|
||||
|
||||
/// Assembles the `alertmanager:` section of the Helm values as a YAML string.
///
/// Walks `config.alert_channel` and, per channel:
/// - `AlertChannel::Discord`: appends the receiver/route fragments returned by
///   `discord_alert_builder(name)`.
/// - `AlertChannel::Slack`: appends the fragments returned by
///   `slack_alert_builder(slack_channel)` and appends a `global:` fragment
///   carrying `slack_api_url: {webhook_url}`.
/// - `AlertChannel::Smpt`: hits `todo!()`, i.e. panics.
///
/// The accumulated fragments are interpolated into a fixed template together
/// with `config.alert_manager` (the `enabled:` value), and the resulting
/// string is returned.
///
/// NOTE(review): every Slack channel appends its own `global:` fragment, so
/// two or more Slack channels yield repeated `global:` keys — confirm this is
/// intended.
/// NOTE(review): the final `info!` logs the whole config, which embeds the
/// Slack webhook URL — confirm that logging it is acceptable.
fn alert_manager_yaml_builder(config: &KubePrometheusConfig) -> String {
|
||||
let mut receivers = String::new();
|
||||
let mut routes = String::new();
|
||||
let mut global_configs = String::new();
|
||||
let alert_manager = config.alert_manager;
|
||||
for alert_channel in &config.alert_channel {
|
||||
match alert_channel {
|
||||
AlertChannel::Discord { name, .. } => {
|
||||
let (receiver, route) = discord_alert_builder(name);
|
||||
info!("discord receiver: {} \nroute: {}", receiver, route);
|
||||
receivers.push_str(&receiver);
|
||||
routes.push_str(&route);
|
||||
}
|
||||
AlertChannel::Slack {
|
||||
slack_channel,
|
||||
|
||||
webhook_url,
|
||||
} => {
|
||||
let (receiver, route) = slack_alert_builder(slack_channel);
|
||||
info!("slack receiver: {} \nroute: {}", receiver, route);
|
||||
receivers.push_str(&receiver);
|
||||
|
||||
routes.push_str(&route);
|
||||
// Each Slack channel contributes its own `global:` fragment holding its webhook URL.
let global_config = format!(
|
||||
r#"
|
||||
global:
|
||||
slack_api_url: {webhook_url}"#
|
||||
);
|
||||
|
||||
global_configs.push_str(&global_config);
|
||||
}
|
||||
// SMTP channels are not implemented yet; reaching this arm panics via `todo!()`.
AlertChannel::Smpt { .. } => todo!(),
|
||||
}
|
||||
}
|
||||
info!("after alert receiver: {}", receivers);
|
||||
info!("after alert routes: {}", routes);
|
||||
|
||||
// Splice the accumulated fragments into the fixed `alertmanager:` template.
let alertmanager_config = format!(
|
||||
r#"
|
||||
alertmanager:
|
||||
enabled: {alert_manager}
|
||||
config: {global_configs}
|
||||
route:
|
||||
group_by: ['job']
|
||||
group_wait: 30s
|
||||
group_interval: 5m
|
||||
repeat_interval: 12h
|
||||
routes:
|
||||
{routes}
|
||||
receivers:
|
||||
- name: 'null'
|
||||
{receivers}"#
|
||||
);
|
||||
|
||||
info!("alert manager config: {}", alertmanager_config);
|
||||
alertmanager_config
|
||||
}
|
||||
|
||||
HelmChartScore {
|
||||
namespace: Some(NonBlankString::from_str(&config.namespace).unwrap()),
|
||||
release_name: NonBlankString::from_str("kube-prometheus").unwrap(),
|
||||
@ -187,3 +220,43 @@ prometheus:
|
||||
repository: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Builds the Alertmanager receiver and route YAML fragments for one Discord channel.
///
/// `release_name` is interpolated into the receiver name
/// (`Discord-<release_name>`) and into the webhook URL
/// `http://<release_name>-alertmanager-discord:9094`.
/// NOTE(review): presumably this must equal the Helm release name of the
/// alertmanager-discord chart so the service URL resolves — confirm against
/// the chart.
///
/// Returns `(receiver, route)`. Both fragments begin with a newline. The
/// receiver sets `send_resolved: true`; the route targets the same receiver
/// name with the matcher `alertname!=Watchdog` and `continue: true`.
fn discord_alert_builder(release_name: &String) -> (String, String) {
|
||||
// Shared by the receiver definition and the route that points at it.
let discord_receiver_name = format!("Discord-{}", release_name);
|
||||
let receiver = format!(
|
||||
r#"
|
||||
- name: '{discord_receiver_name}'
|
||||
webhook_configs:
|
||||
- url: 'http://{release_name}-alertmanager-discord:9094'
|
||||
send_resolved: true"#,
|
||||
);
|
||||
let route = format!(
|
||||
r#"
|
||||
- receiver: '{discord_receiver_name}'
|
||||
matchers:
|
||||
- alertname!=Watchdog
|
||||
continue: true"#,
|
||||
);
|
||||
(receiver, route)
|
||||
}
|
||||
|
||||
/// Builds the Alertmanager receiver and route YAML fragments for one Slack channel.
///
/// `slack_channel` is interpolated into the receiver name
/// (`Slack-<slack_channel>`) and into the `channel:` field of the
/// `slack_configs` entry.
///
/// The quadrupled braces in the `title`/`text` lines are `format!` escapes:
/// the emitted text contains literal `{{ .CommonAnnotations.title }}` and
/// `{{ .CommonAnnotations.description }}` placeholders.
///
/// Returns `(receiver, route)`. Both fragments begin with a newline. The
/// receiver sets `send_resolved: true`; the route targets the same receiver
/// name with the matcher `alertname!=Watchdog` and `continue: true`.
fn slack_alert_builder(slack_channel: &String) -> (String, String) {
|
||||
// Shared by the receiver definition and the route that points at it.
let slack_receiver_name = format!("Slack-{}", slack_channel);
|
||||
let receiver = format!(
|
||||
r#"
|
||||
- name: '{slack_receiver_name}'
|
||||
slack_configs:
|
||||
- channel: '{slack_channel}'
|
||||
send_resolved: true
|
||||
title: '{{{{ .CommonAnnotations.title }}}}'
|
||||
text: '{{{{ .CommonAnnotations.description }}}}'"#,
|
||||
);
|
||||
let route = format!(
|
||||
r#"
|
||||
- receiver: '{slack_receiver_name}'
|
||||
matchers:
|
||||
- alertname!=Watchdog
|
||||
continue: true"#,
|
||||
);
|
||||
(receiver, route)
|
||||
}
|
||||
|
@ -20,9 +20,13 @@ use super::{
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub enum AlertChannel {
|
||||
WebHookUrl {
|
||||
url: Url,
|
||||
webhook_service_type: WebhookServiceType,
|
||||
Discord {
|
||||
name: String,
|
||||
webhook_url: Url,
|
||||
},
|
||||
Slack {
|
||||
slack_channel: String,
|
||||
webhook_url: Url,
|
||||
},
|
||||
//TODO test and implement in helm chart
|
||||
//currently does not work
|
||||
@ -32,46 +36,19 @@ pub enum AlertChannel {
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub enum WebhookServiceType {
|
||||
Discord,
|
||||
//TODO test slack notifications
|
||||
Slack,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub struct MonitoringAlertingStackScore {
|
||||
pub alert_channel: Option<AlertChannel>,
|
||||
pub alert_channel: Vec<AlertChannel>,
|
||||
pub namespace: Option<String>,
|
||||
}
|
||||
|
||||
impl MonitoringAlertingStackScore {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
alert_channel: None,
|
||||
alert_channel: Vec::new(),
|
||||
namespace: None,
|
||||
}
|
||||
}
|
||||
fn set_alert_channel(&self, config: &mut KubePrometheusConfig) {
|
||||
if let Some(alert_channel) = &self.alert_channel {
|
||||
match alert_channel {
|
||||
AlertChannel::WebHookUrl {
|
||||
url,
|
||||
webhook_service_type,
|
||||
} => {
|
||||
config.webhook_url = Some(url.clone());
|
||||
config.webhook_service_type = Some(webhook_service_type.clone());
|
||||
}
|
||||
AlertChannel::Smpt {
|
||||
//TODO setup smpt alerts
|
||||
email_address,
|
||||
service_name,
|
||||
} => {
|
||||
todo!()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Topology + HelmCommand> Score<T> for MonitoringAlertingStackScore {
|
||||
@ -93,10 +70,10 @@ struct MonitoringAlertingStackInterpret {
|
||||
impl MonitoringAlertingStackInterpret {
|
||||
async fn build_kube_prometheus_helm_chart_config(&self) -> KubePrometheusConfig {
|
||||
let mut config = KubePrometheusConfig::new();
|
||||
self.score.set_alert_channel(&mut config);
|
||||
if let Some(ns) = &self.score.namespace {
|
||||
config.namespace = ns.clone();
|
||||
}
|
||||
config.alert_channel = self.score.alert_channel.clone();
|
||||
config
|
||||
}
|
||||
|
||||
@ -119,26 +96,30 @@ impl MonitoringAlertingStackInterpret {
|
||||
topology: &T,
|
||||
config: &KubePrometheusConfig,
|
||||
) -> Result<Outcome, InterpretError> {
|
||||
match &self.score.alert_channel {
|
||||
Some(AlertChannel::WebHookUrl {
|
||||
webhook_service_type,
|
||||
..
|
||||
}) => match webhook_service_type {
|
||||
WebhookServiceType::Discord => {
|
||||
let mut outcomes = vec![];
|
||||
|
||||
for channel in &self.score.alert_channel {
|
||||
let outcome = match channel {
|
||||
AlertChannel::Discord { .. } => {
|
||||
discord_alert_manager_score(config)
|
||||
.create_interpret()
|
||||
.execute(inventory, topology)
|
||||
.await
|
||||
}
|
||||
WebhookServiceType::Slack => Ok(Outcome::success(
|
||||
AlertChannel::Slack { .. } => Ok(Outcome::success(
|
||||
"No extra configs for slack alerting".to_string(),
|
||||
)),
|
||||
},
|
||||
Some(AlertChannel::Smpt { .. }) => {
|
||||
todo!()
|
||||
}
|
||||
None => Ok(Outcome::success("No alert channel configured".to_string())),
|
||||
AlertChannel::Smpt { .. } => {
|
||||
todo!()
|
||||
}
|
||||
};
|
||||
outcomes.push(outcome);
|
||||
}
|
||||
for result in outcomes {
|
||||
result?;
|
||||
}
|
||||
|
||||
Ok(Outcome::success("All alert channels deployed".to_string()))
|
||||
}
|
||||
}
|
||||
|
||||
@ -155,7 +136,8 @@ impl<T: Topology + HelmCommand> Interpret<T> for MonitoringAlertingStackInterpre
|
||||
self.deploy_kube_prometheus_helm_chart_score(inventory, topology, &config)
|
||||
.await?;
|
||||
info!("Installing alert channel service");
|
||||
self.deploy_alert_channel_service(inventory, topology, &config).await?;
|
||||
self.deploy_alert_channel_service(inventory, topology, &config)
|
||||
.await?;
|
||||
Ok(Outcome::success(format!(
|
||||
"succesfully deployed monitoring and alerting stack"
|
||||
)))
|
||||
|
Loading…
Reference in New Issue
Block a user