Compare commits

..

No commits in common. "7fc2b1ebfe3774faa1a318a5ac2cf0b1c3dec983" and "f7d3da3ac9688371be619c4a094d5f57b45dd908" have entirely different histories.

6 changed files with 89 additions and 335 deletions

View File

@ -3,10 +3,10 @@ use harmony::{
inventory::Inventory, inventory::Inventory,
maestro::Maestro, maestro::Maestro,
modules::{ modules::{
{
lamp::{LAMPConfig, LAMPScore}, lamp::{LAMPConfig, LAMPScore},
monitoring::monitoring_alerting::{ },
AlertChannel, MonitoringAlertingStackScore, WebhookServiceType, monitoring::monitoring_alerting::MonitoringAlertingStackScore,
},
}, },
topology::{K8sAnywhereTopology, Url}, topology::{K8sAnywhereTopology, Url},
}; };
@ -45,15 +45,8 @@ async fn main() {
.await .await
.unwrap(); .unwrap();
let url = url::Url::parse("https://discord.com/api/webhooks/dummy_channel/dummy_token")
.expect("invalid URL");
let mut monitoring_stack_score = MonitoringAlertingStackScore::new(); let mut monitoring_stack_score = MonitoringAlertingStackScore::new();
monitoring_stack_score.namespace = Some(lamp_stack.config.namespace.clone()); monitoring_stack_score.namespace = Some(lamp_stack.config.namespace.clone());
monitoring_stack_score.alert_channel = Some(AlertChannel::WebHookUrl {
url: url,
webhook_service_type: WebhookServiceType::Discord,
});
maestro.register_all(vec![Box::new(lamp_stack), Box::new(monitoring_stack_score)]); maestro.register_all(vec![Box::new(lamp_stack), Box::new(monitoring_stack_score)]);
// Here we bootstrap the CLI, this gives some nice features if you need them // Here we bootstrap the CLI, this gives some nice features if you need them

View File

@ -2,54 +2,46 @@ use email_address::EmailAddress;
use serde::Serialize; use serde::Serialize;
use url::Url; use url::Url;
use super::monitoring_alerting::WebhookServiceType;
#[derive(Debug, Clone, Serialize)] #[derive(Debug, Clone, Serialize)]
pub struct KubePrometheusConfig { pub struct KubePrometheusConfig {
pub namespace: String, pub namespace: String,
pub default_rules: bool,
pub windows_monitoring: bool,
pub alert_manager: bool,
pub node_exporter: bool, pub node_exporter: bool,
pub alert_manager: bool,
pub prometheus: bool, pub prometheus: bool,
pub grafana: bool, pub grafana: bool,
pub windows_monitoring: bool,
pub kubernetes_service_monitors: bool, pub kubernetes_service_monitors: bool,
pub kubernetes_api_server: bool,
pub kubelet: bool, pub kubelet: bool,
pub kube_controller_manager: bool, pub kube_controller_manager: bool,
pub core_dns: bool,
pub kube_etcd: bool, pub kube_etcd: bool,
pub kube_scheduler: bool,
pub kube_proxy: bool, pub kube_proxy: bool,
pub kube_state_metrics: bool, pub kube_state_metrics: bool,
pub prometheus_operator: bool, pub prometheus_operator: bool,
pub webhook_url: Option<Url>, pub webhook_url: Option<Url>,
pub webhook_service_type: Option<WebhookServiceType>, pub webhook_service_name: Option<String>,
pub discord_alert_manager_release_name: String, pub smpt_email_address: Option<EmailAddress>,
pub smtp_service_name: Option<String>,
} }
impl KubePrometheusConfig { impl KubePrometheusConfig {
pub fn new() -> Self { pub fn new() -> Self {
Self { Self {
namespace: "monitoring".into(), namespace: "monitoring".into(),
default_rules: true,
windows_monitoring: false,
alert_manager: true,
webhook_service_type: None,
webhook_url: None,
grafana: true,
node_exporter: false, node_exporter: false,
alert_manager: false,
prometheus: true, prometheus: true,
grafana: true,
windows_monitoring: false,
kubernetes_service_monitors: true, kubernetes_service_monitors: true,
kubernetes_api_server: false, kubelet: true,
kubelet: false, kube_controller_manager: true,
kube_controller_manager: false, kube_etcd: true,
kube_etcd: false, kube_proxy: true,
kube_proxy: false,
kube_state_metrics: true, kube_state_metrics: true,
prometheus_operator: true, prometheus_operator: true,
core_dns: false, webhook_url: None,
kube_scheduler: false, webhook_service_name: None,
discord_alert_manager_release_name: "discord-alert-manager".into(), smpt_email_address: None,
smtp_service_name: None,
} }
} }
} }

View File

@ -1,37 +0,0 @@
use std::str::FromStr;
use non_blank_string_rs::NonBlankString;
use crate::modules::helm::chart::HelmChartScore;
use super::config::KubePrometheusConfig;
/// Builds the [`HelmChartScore`] that installs the `alertmanager-discord` chart.
///
/// The chart is installed into `config.namespace` under the release name
/// `config.discord_alert_manager_release_name`, and its values set a
/// `DISCORD_WEBHOOK` environment entry from `config.webhook_url`.
///
/// # Panics
///
/// Panics if `NonBlankString::from_str` rejects the namespace or the release
/// name (presumably when either is blank -- confirm against `NonBlankString`).
pub fn discord_alert_manager_score(config: &KubePrometheusConfig) -> HelmChartScore {
// When no webhook URL is configured, the literal text "None" is substituted
// into the values below rather than omitting the entry.
let url = if let Some(url) = &config.webhook_url {
url.to_string()
} else {
"None".to_string()
};
// Helm values document; `{url}` is interpolated by `format!`.
// NOTE(review): the YAML nesting inside this raw string is not visible in
// this view -- confirm `value` is indented under the `- name` list entry.
let values = format!(
r#"
environment:
- name: "DISCORD_WEBHOOK"
value: "{url}"
"#,
);
HelmChartScore {
namespace: Some(NonBlankString::from_str(&config.namespace).unwrap()),
release_name: NonBlankString::from_str(&config.discord_alert_manager_release_name).unwrap(),
chart_name: NonBlankString::from_str("oci://hub.nationtech.io/nt/alertmanager-discord")
.unwrap(),
// No chart version is requested here.
chart_version: None,
// All configuration is passed through `values_yaml`; no per-key overrides.
values_overrides: None,
values_yaml: Some(values.to_string()),
create_namespace: true,
install_only: true,
repository: None,
}
}

View File

@ -1,4 +1,4 @@
use super::{config::KubePrometheusConfig, monitoring_alerting::WebhookServiceType}; use super::config::KubePrometheusConfig;
use non_blank_string_rs::NonBlankString; use non_blank_string_rs::NonBlankString;
use std::{collections::HashMap, str::FromStr}; use std::{collections::HashMap, str::FromStr};
@ -7,61 +7,8 @@ use crate::modules::helm::chart::HelmChartScore;
pub fn kube_prometheus_helm_chart_score(config: &KubePrometheusConfig) -> HelmChartScore { pub fn kube_prometheus_helm_chart_score(config: &KubePrometheusConfig) -> HelmChartScore {
//TODO this should be make into a rule with default formatting that can be easily passed as a vec //TODO this should be make into a rule with default formatting that can be easily passed as a vec
//to the overrides or something leaving the user to deal with formatting here seems bad //to the overrides or something leaving the user to deal with formatting here seems bad
let default_rules = config.default_rules.to_string(); let values = r#"
let windows_monitoring = config.windows_monitoring.to_string();
let alert_manager = config.alert_manager.to_string();
let webhook_service_type = if let Some(service) = &config.webhook_service_type {
match service {
WebhookServiceType::Discord => "Discord".to_string(),
WebhookServiceType::Slack => "Slack".to_string(),
}
} else {
"None".to_string()
};
let grafana = config.grafana.to_string();
let kubernetes_service_monitors = config.kubernetes_service_monitors.to_string();
let kubernetes_api_server = config.kubernetes_api_server.to_string();
let kubelet = config.kubelet.to_string();
let kube_controller_manager = config.kube_controller_manager.to_string();
let core_dns = config.core_dns.to_string();
let kube_etcd = config.kube_etcd.to_string();
let kube_scheduler = config.kube_scheduler.to_string();
let kube_proxy = config.kube_proxy.to_string();
let kube_state_metrics = config.kube_state_metrics.to_string();
let node_exporter = config.node_exporter.to_string();
let prometheus_operator = config.prometheus_operator.to_string();
let prometheus = config.prometheus.to_string();
let discord_alert_manager_release_name = config.discord_alert_manager_release_name.to_string();
let values = format!(
r#"
additionalPrometheusRulesMap: additionalPrometheusRulesMap:
pods-status-alerts:
groups:
- name: pods
rules:
- alert: "[CRIT] POD not healthy"
expr: min_over_time(sum by (namespace, pod) (kube_pod_status_phase{{phase=~"Pending|Unknown|Failed"}})[15m:1m]) > 0
for: 0m
labels:
severity: critical
annotations:
title: "[CRIT] POD not healthy : {{ $labels.pod }}"
description: |
A POD is in a non-ready state!
- **Pod**: {{ $labels.pod }}
- **Namespace**: {{ $labels.namespace }}
- alert: "[CRIT] POD crash looping"
expr: increase(kube_pod_container_status_restarts_total[5m]) > 3
for: 0m
labels:
severity: critical
annotations:
title: "[CRIT] POD crash looping : {{ $labels.pod }}"
description: |
A POD is drowning in a crash loop!
- **Pod**: {{ $labels.pod }}
- **Namespace**: {{ $labels.namespace }}
- **Instance**: {{ $labels.instance }}
pvc-alerts: pvc-alerts:
groups: groups:
- name: pvc-alerts - name: pvc-alerts
@ -84,103 +31,49 @@ additionalPrometheusRulesMap:
annotations: annotations:
description: The PVC {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} is predicted to fill over 95% in less than 2 days. description: The PVC {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} is predicted to fill over 95% in less than 2 days.
title: PVC {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} will fill over 95% in less than 2 days title: PVC {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} will fill over 95% in less than 2 days
defaultRules: "#;
create: {default_rules} let mut values_overrides: HashMap<NonBlankString, String> = HashMap::new();
rules:
alertmanager: true macro_rules! insert_flag {
etcd: true ($key:expr, $val:expr) => {
configReloaders: true values_overrides.insert(NonBlankString::from_str($key).unwrap(), $val.to_string());
general: true };
k8sContainerCpuUsageSecondsTotal: true }
k8sContainerMemoryCache: true
k8sContainerMemoryRss: true insert_flag!("nodeExporter.enabled", config.node_exporter);
k8sContainerMemorySwap: true insert_flag!("windowsMonitoring.enabled", config.windows_monitoring);
k8sContainerResource: true insert_flag!("grafana.enabled", config.grafana);
k8sContainerMemoryWorkingSetBytes: true insert_flag!("alertmanager.enabled", config.alert_manager);
k8sPodOwner: true insert_flag!("prometheus.enabled", config.prometheus);
kubeApiserverAvailability: true insert_flag!(
kubeApiserverBurnrate: true "kubernetes_service_monitors.enabled",
kubeApiserverHistogram: true config.kubernetes_service_monitors
kubeApiserverSlos: true
kubeControllerManager: true
kubelet: true
kubeProxy: true
kubePrometheusGeneral: true
kubePrometheusNodeRecording: true
kubernetesApps: true
kubernetesResources: true
kubernetesStorage: true
kubernetesSystem: true
kubeSchedulerAlerting: true
kubeSchedulerRecording: true
kubeStateMetrics: true
network: true
node: true
nodeExporterAlerting: true
nodeExporterRecording: true
prometheus: true
prometheusOperator: true
windows: true
windowsMonitoring:
enabled: {windows_monitoring}
alertmanager:
enabled: {alert_manager}
config:
route:
group_by: ['job']
group_wait: 30s
group_interval: 5m
repeat_interval: 12h
receiver: '{webhook_service_type}'
routes:
- receiver: 'null'
matchers:
- alertname="Watchdog"
continue: false
receivers:
- name: 'null'
- name: '{webhook_service_type}'
webhook_configs:
- url: 'http://{discord_alert_manager_release_name}-alertmanager-discord:9094'
send_resolved: true
grafana:
enabled: {grafana}
kubernetesServiceMonitors:
enabled: {kubernetes_service_monitors}
kubeApiServer:
enabled: {kubernetes_api_server}
kubelet:
enabled: {kubelet}
kubeControllerManager:
enabled: {kube_controller_manager}
coreDns:
enabled: {core_dns}
kubeEtcd:
enabled: {kube_etcd}
kubeScheduler:
enabled: {kube_scheduler}
kubeProxy:
enabled: {kube_proxy}
kubeStateMetrics:
enabled: {kube_state_metrics}
nodeExporter:
enabled: {node_exporter}
prometheusOperator:
enabled: {prometheus_operator}
prometheus:
enabled: {prometheus}
"#,
); );
insert_flag!("kubelet.enabled", config.kubelet);
insert_flag!(
"kubeControllerManager.enabled",
config.kube_controller_manager
);
insert_flag!("kubeProxy.enabled", config.kube_proxy);
insert_flag!("kubeEtcd.enabled", config.kube_etcd);
insert_flag!("kubeStateMetrics.enabled", config.kube_state_metrics);
insert_flag!("prometheusOperator.enabled", config.prometheus_operator);
if let (Some(url), Some(name)) = (&config.webhook_url, &config.webhook_service_name) {
insert_flag!("alertmanager.config.receivers.webhook_configs.url", url.as_str());
insert_flag!("alertmanager.config.receivers.name", name.as_str());
}
HelmChartScore { HelmChartScore {
namespace: Some(NonBlankString::from_str(&config.namespace).unwrap()), namespace: Some(NonBlankString::from_str(&config.namespace).unwrap()),
release_name: NonBlankString::from_str("kube-prometheus").unwrap(), release_name: NonBlankString::from_str("kube-prometheus").unwrap(),
chart_name: NonBlankString::from_str( chart_name: NonBlankString::from_str(
"oci://ghcr.io/prometheus-community/charts/kube-prometheus-stack", "oci://ghcr.io/prometheus-community/charts/kube-prometheus-stack", //use kube prometheus chart which includes grafana, prometheus, alert
//manager, etc
) )
.unwrap(), .unwrap(),
chart_version: None, chart_version: None,
values_overrides: None, values_overrides: Some(values_overrides),
values_yaml: Some(values.to_string()), values_yaml: Some(values.to_string()),
create_namespace: true, create_namespace: true,
install_only: true, install_only: true,

View File

@ -1,4 +1,3 @@
mod kube_prometheus; mod kube_prometheus;
pub mod monitoring_alerting; pub mod monitoring_alerting;
mod discord_alert_manager;
mod config; mod config;

View File

@ -1,31 +1,22 @@
use async_trait::async_trait;
use email_address::EmailAddress; use email_address::EmailAddress;
use log::info;
use serde::Serialize; use serde::Serialize;
use url::Url; use url::Url;
use crate::{ use crate::{
data::{Id, Version}, interpret::Interpret,
interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome},
inventory::Inventory,
score::Score, score::Score,
topology::{HelmCommand, Topology}, topology::{HelmCommand, Topology},
}; };
use super::{ use super::{config::KubePrometheusConfig, kube_prometheus::kube_prometheus_helm_chart_score};
config::KubePrometheusConfig, discord_alert_manager::discord_alert_manager_score,
kube_prometheus::kube_prometheus_helm_chart_score,
};
#[derive(Debug, Clone, Serialize)] #[derive(Debug, Clone, Serialize)]
pub enum AlertChannel { pub enum AlertChannel {
WebHookUrl { WebHookUrl {
url: Url, url: Url,
webhook_service_type: WebhookServiceType, service_name: String,
}, },
//TODO test and implement in helm chart
//currently does not work
Smpt { Smpt {
email_address: EmailAddress, email_address: EmailAddress,
service_name: String, service_name: String,
@ -33,15 +24,15 @@ pub enum AlertChannel {
} }
#[derive(Debug, Clone, Serialize)] #[derive(Debug, Clone, Serialize)]
pub enum WebhookServiceType { pub enum Stack {
Discord, KubePrometheusStack,
//TODO test slack notifications OtherStack,
Slack,
} }
#[derive(Debug, Clone, Serialize)] #[derive(Debug, Clone, Serialize)]
pub struct MonitoringAlertingStackScore { pub struct MonitoringAlertingStackScore {
pub alert_channel: Option<AlertChannel>, pub alert_channel: Option<AlertChannel>,
pub monitoring_stack: Stack,
pub namespace: Option<String>, pub namespace: Option<String>,
} }
@ -49,131 +40,54 @@ impl MonitoringAlertingStackScore {
pub fn new() -> Self { pub fn new() -> Self {
Self { Self {
alert_channel: None, alert_channel: None,
monitoring_stack: Stack::KubePrometheusStack,
namespace: None, namespace: None,
} }
} }
fn set_alert_channel(&self, config: &mut KubePrometheusConfig) { fn match_alert_channel(&self, config: &mut KubePrometheusConfig) {
if let Some(alert_channel) = &self.alert_channel { if let Some(alert_channel) = &self.alert_channel {
match alert_channel { match alert_channel {
AlertChannel::WebHookUrl { //opt1
url, AlertChannel::WebHookUrl { url, service_name } => {
webhook_service_type,
} => {
config.webhook_url = Some(url.clone()); config.webhook_url = Some(url.clone());
config.webhook_service_type = Some(webhook_service_type.clone()); config.webhook_service_name = Some(service_name.clone());
} }
//opt2
AlertChannel::Smpt { AlertChannel::Smpt {
//TODO setup smpt alerts
email_address, email_address,
service_name, service_name,
} => { } => {
todo!() config.smpt_email_address = Some(email_address.clone());
config.smtp_service_name = Some(service_name.clone());
} }
} }
} }
} }
fn build_kube_prometheus_helm_chart_config(&self) -> KubePrometheusConfig {
let mut config = KubePrometheusConfig::new();
self.match_alert_channel(&mut config);
if let Some(ns) = &self.namespace {
config.namespace = ns.clone();
}
config
}
} }
impl<T: Topology + HelmCommand> Score<T> for MonitoringAlertingStackScore { impl<T: Topology + HelmCommand> Score<T> for MonitoringAlertingStackScore {
fn create_interpret(&self) -> Box<dyn Interpret<T>> { fn create_interpret(&self) -> Box<dyn Interpret<T>> {
Box::new(MonitoringAlertingStackInterpret { match &self.monitoring_stack {
score: self.clone(), Stack::KubePrometheusStack => {
}) let config = self.build_kube_prometheus_helm_chart_config();
let helm_chart = kube_prometheus_helm_chart_score(&config);
helm_chart.create_interpret()
}
Stack::OtherStack => {
todo!()
}
}
} }
fn name(&self) -> String { fn name(&self) -> String {
format!("MonitoringAlertingStackScore") format!("MonitoringAlertingStackScore")
} }
} }
/// Interpret for [`MonitoringAlertingStackScore`]; it carries the score whose
/// alert channel and namespace drive the deployment.
#[derive(Debug, Clone, Serialize)]
struct MonitoringAlertingStackInterpret {
// The score this interpret deploys (its `alert_channel` and `namespace` are read).
score: MonitoringAlertingStackScore,
}
impl MonitoringAlertingStackInterpret {
async fn build_kube_prometheus_helm_chart_config(&self) -> KubePrometheusConfig {
let mut config = KubePrometheusConfig::new();
self.score.set_alert_channel(&mut config);
if let Some(ns) = &self.score.namespace {
config.namespace = ns.clone();
}
config
}
async fn deploy_kube_prometheus_helm_chart_score<T: Topology + HelmCommand>(
&self,
inventory: &Inventory,
topology: &T,
config: &KubePrometheusConfig,
) -> Result<Outcome, InterpretError> {
let helm_chart = kube_prometheus_helm_chart_score(config);
helm_chart
.create_interpret()
.execute(inventory, topology)
.await
}
async fn deploy_alert_channel_service<T: Topology + HelmCommand>(
&self,
inventory: &Inventory,
topology: &T,
config: &KubePrometheusConfig,
) -> Result<Outcome, InterpretError> {
match &self.score.alert_channel {
Some(AlertChannel::WebHookUrl {
webhook_service_type,
..
}) => match webhook_service_type {
WebhookServiceType::Discord => {
discord_alert_manager_score(config)
.create_interpret()
.execute(inventory, topology)
.await
}
WebhookServiceType::Slack => Ok(Outcome::success(
"No extra configs for slack alerting".to_string(),
)),
},
Some(AlertChannel::Smpt { .. }) => {
todo!()
}
None => Ok(Outcome::success("No alert channel configured".to_string())),
}
}
}
#[async_trait]
impl<T: Topology + HelmCommand> Interpret<T> for MonitoringAlertingStackInterpret {
    /// Deploys the monitoring stack in two steps: the kube-prometheus chart
    /// first, then the service backing the configured alert channel.
    ///
    /// # Errors
    ///
    /// Stops at, and returns, the first error reported by either step.
    async fn execute(
        &self,
        inventory: &Inventory,
        topology: &T,
    ) -> Result<Outcome, InterpretError> {
        let chart_config = self.build_kube_prometheus_helm_chart_config().await;
        info!("Built kube prometheus config");

        info!("Installing kube prometheus chart");
        let _chart_outcome = self
            .deploy_kube_prometheus_helm_chart_score(inventory, topology, &chart_config)
            .await?;

        info!("Installing alert channel service");
        let _channel_outcome = self
            .deploy_alert_channel_service(inventory, topology, &chart_config)
            .await?;

        let message = "succesfully deployed monitoring and alerting stack".to_string();
        Ok(Outcome::success(message))
    }

    /// Not implemented yet; calling this panics.
    fn get_name(&self) -> InterpretName {
        todo!()
    }

    /// Not implemented yet; calling this panics.
    fn get_version(&self) -> Version {
        todo!()
    }

    /// Not implemented yet; calling this panics.
    fn get_status(&self) -> InterpretStatus {
        todo!()
    }

    /// Not implemented yet; calling this panics.
    fn get_children(&self) -> Vec<Id> {
        todo!()
    }
}