forked from NationTech/harmony
		
wip: added alert receivers and alert rules that are built and merged into the values YAML before deploying Prometheus, and added a few dashboards to Grafana. Still trying to fix the prometheus-server ClusterRole/Role/ServiceAccount so that it can discover targets and Kubernetes objects in a namespaced release where it has no ClusterRole access.
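
For context on the generated configuration: a minimal sketch, assuming the serde renames on the RbacConfig, KubeStateMetricsConfig and ServerConfig structs added below and a hypothetical application namespace my-app, of the namespaced-RBAC fragment of values.yaml that prometheus_helm_chart_score now serializes (not verified chart output):

rbac:
  create: false                # chart-level cluster RBAC disabled
kube-state-metrics:
  enabled: true
  rbac:
    useClusterRole: false      # KsmRbacConfig { use_cluster_role: false }
  prometheus:
    monitor:
      enabled: true
server:
  namespaces:
    - my-app                   # hypothetical namespace taken from HelmPrometheusConfig
  useExistingClusterRoleName: false
prometheus-node-exporter:
  enabled: false
prometheus-pushgateway:
  enabled: false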
This commit is contained in:
parent 31661aaaf1
commit 5c628b37b7
@@ -3,9 +3,18 @@ use std::{path::PathBuf, sync::Arc};
 use harmony::{
     inventory::Inventory,
     maestro::Maestro,
-    modules::application::{
+    modules::{
+        application::{
             ApplicationScore, RustWebFramework, RustWebapp,
-        features::{ContinuousDelivery, Monitoring},
+            features::{ContinuousDelivery, PrometheusMonitoring},
+        },
+        monitoring::{
+            alert_channel::discord_alert_channel::DiscordWebhook,
+            alert_rule::prometheus_alert_rule::AlertManagerRuleGroup,
+        },
+        prometheus::alerts::k8s::{
+            pod::pod_in_failed_state, pvc::high_pvc_fill_rate_over_two_days,
+        },
     },
     topology::{K8sAnywhereTopology, Url},
 };
@@ -20,12 +29,25 @@ async fn main() {
         framework: Some(RustWebFramework::Leptos),
     });
 
+    let pod_failed = pod_in_failed_state();
+    let pod_failed_2 = pod_in_failed_state();
+    let pod_failed_3 = pod_in_failed_state();
+
+    let additional_rules = AlertManagerRuleGroup::new("pod-alerts", vec![pod_failed]);
+    let additional_rules_2 = AlertManagerRuleGroup::new("pod-alerts-2", vec![pod_failed_2, pod_failed_3]);
     let app = ApplicationScore {
         features: vec![
-            Box::new(ContinuousDelivery {
+            //Box::new(ContinuousDelivery {
+            //    application: application.clone(),
+            //}),
+            Box::new(PrometheusMonitoring {
                 application: application.clone(),
+                alert_receivers: vec![Box::new(DiscordWebhook {
+                    name: "dummy-discord".to_string(),
+                    url: Url::Url(url::Url::parse("https://discord.doesnt.exist.com").unwrap()),
+                })],
+                alert_rules: vec![Box::new(additional_rules), Box::new(additional_rules_2)],
             }),
-            Box::new(Monitoring {}),
             // TODO add monitoring, backups, multisite ha, etc
         ],
         application,

@@ -1,3 +1,5 @@
+use std::any::Any;
+
 use async_trait::async_trait;
 use log::debug;
 
@@ -9,7 +11,7 @@ use crate::{
 };
 
 #[async_trait]
-pub trait AlertSender: Send + Sync + std::fmt::Debug {
+pub trait AlertSender: Any + Send + Sync + std::fmt::Debug {
     fn name(&self) -> String;
 }
 

@@ -9,32 +9,36 @@ use crate::{
         application::{Application, ApplicationFeature},
         monitoring::{
             application_monitoring::k8s_application_monitoring_score::ApplicationPrometheusMonitoringScore,
-            kube_prometheus::types::{NamespaceSelector, ServiceMonitor},
+            kube_prometheus::types::{NamespaceSelector, ServiceMonitor}, prometheus::prometheus::Prometheus,
         },
     },
     score::Score,
-    topology::{HelmCommand, Topology, tenant::TenantManager},
+    topology::{oberservability::monitoring::{AlertReceiver, AlertRule, AlertSender}, tenant::TenantManager, HelmCommand, K8sclient, Topology},
 };
 
 #[derive(Debug, Clone)]
-pub struct Monitoring {
+pub struct PrometheusMonitoring {
     pub application: Arc<dyn Application>,
+    pub alert_receivers: Vec<Box<dyn AlertReceiver<Prometheus>>>,
+    pub alert_rules: Vec<Box<dyn AlertRule<Prometheus>>>,
 }
 
 #[async_trait]
-impl<T: Topology + HelmCommand + 'static + TenantManager> ApplicationFeature<T> for Monitoring {
+impl<T: Topology + HelmCommand + 'static + TenantManager> ApplicationFeature<T> for PrometheusMonitoring {
     async fn ensure_installed(&self, topology: &T) -> Result<(), String> {
         info!("Ensuring monitoring is available for application");
         let ns = self.application.name();
         let mut service_monitor = ServiceMonitor::default();
+        service_monitor.name = ns.clone();
+        service_monitor.namespace = ns.clone();
         service_monitor.namespace_selector = Some(NamespaceSelector {
             any: true,
             match_names: vec![ns.clone()],
         });
         let alerting_score = ApplicationPrometheusMonitoringScore {
             namespace: ns,
-            receivers: vec![],
-            rules: vec![],
+            receivers: self.alert_receivers.clone(),
+            rules: self.alert_rules.clone(),
             service_monitors: vec![service_monitor],
         };
 

@@ -220,7 +220,6 @@ impl<T: Topology + HelmCommand> Interpret<T> for HelmChartInterpret {
             yaml_path,
             Some(&helm_options),
         );
-
         let status = match res {
             Ok(status) => status,
             Err(err) => return Err(InterpretError::new(err.to_string())),

@@ -1,3 +1,5 @@
+use std::any::Any;
+
 use async_trait::async_trait;
 use serde::Serialize;
 use serde_yaml::{Mapping, Value};
@@ -11,7 +13,10 @@ use crate::{
         },
         prometheus::prometheus::{Prometheus, PrometheusReceiver},
     },
-    topology::{Url, oberservability::monitoring::AlertReceiver},
+    topology::{
+        Url,
+        oberservability::monitoring::{AlertReceiver, AlertSender},
+    },
 };
 
 #[derive(Debug, Clone, Serialize)]

@@ -6,13 +6,11 @@ use serde::Serialize;
 use crate::{
     modules::monitoring::{
         kube_prometheus::types::ServiceMonitor,
-        prometheus::{prometheus::Prometheus, prometheus_config::PrometheusConfig},
+        prometheus::{prometheus::Prometheus, prometheus_config::HelmPrometheusConfig},
     },
     score::Score,
     topology::{
-        HelmCommand, Topology,
-        oberservability::monitoring::{AlertReceiver, AlertRule, AlertingInterpret},
-        tenant::TenantManager,
+        oberservability::monitoring::{AlertReceiver, AlertRule, AlertingInterpret}, tenant::TenantManager, HelmCommand, K8sclient, Topology
     },
 };
 
@@ -26,7 +24,7 @@ pub struct ApplicationPrometheusMonitoringScore {
 
 impl<T: Topology + HelmCommand + TenantManager> Score<T> for ApplicationPrometheusMonitoringScore {
     fn create_interpret(&self) -> Box<dyn crate::interpret::Interpret<T>> {
-        let config = Arc::new(Mutex::new(PrometheusConfig::new()));
+        let config = Arc::new(Mutex::new(HelmPrometheusConfig::new()));
         config
             .try_lock()
             .expect("couldn't lock config")

@@ -1,6 +1,5 @@
-use non_blank_string_rs::NonBlankString;
 use std::str::FromStr;
-
+use non_blank_string_rs::NonBlankString;
 use crate::modules::helm::chart::HelmChartScore;
 
 pub fn grafana_helm_chart_score(ns: &str) -> HelmChartScore {
@@ -8,6 +7,7 @@ pub fn grafana_helm_chart_score(ns: &str) -> HelmChartScore {
         r#"
 rbac:
   namespaced: true
+
 datasources:
   datasources.yaml:
     apiVersion: 1
@@ -17,26 +17,36 @@ datasources:
       access: proxy
       url: http://prometheus-server.{ns}.svc.cluster.local
       isDefault: true
-downloadDashboards:
-  dashboards:
-    - url: https://raw.githubusercontent.com/grafana/grafana/main/devenv/dev-dashboards/node-exporter-full_rev1.json
-      file: node-exporter-full.json
 
-    - url: https://grafana.com/api/dashboards/7685/revisions/1/download
-      file: kubernetes-pvs-usage.json
+dashboardProviders:
+  dashboardproviders.yaml:
+    apiVersion: 1
+    providers:
+    - name: 'default'
+      orgId: 1
+      folder: ''
+      type: file
+      disableDeletion: false
+      updateIntervalSeconds: 10
+      allowUiUpdates: true
+      editable: true
+      options:
+        path: /var/lib/grafana/dashboards/default
 
-    # Namespace resource usage vs quotas
-    - url: https://grafana.com/api/dashboards/17044/revisions/1/download
-      file: namespace-resources-vs-quotas.json
-
-    # Kubernetes namespace resources (CPU, RAM, network)
-    - url: https://grafana.com/api/dashboards/9809/revisions/1/download
-      file: kubernetes-namespace-resources.json
-
-    # Top 10 namespaces by memory usage
-    - url: https://grafana.com/api/dashboards/10678/revisions/1/download
-      file: top10-namespace-memory.json
-        "#
+dashboards:
+  default:
+    compute-usage:
+      url: https://grafana.com/api/dashboards/315/revisions/1/download
+    pod-health:
+      url: https://grafana.com/api/dashboards/15758/revisions/1/download
+    namespace-resources:
+      url: https://grafana.com/api/dashboards/9809/revisions/1/download
+    namespace-resources-vs-quotas:
+      url: https://grafana.com/api/dashboards/17044/revisions/1/download
+    persistent-volume-usage:
+      url: https://grafana.com/api/dashboards/7685/revisions/1/download
+"#,
+        ns = ns
     );
 
@@ -45,9 +55,10 @@ downloadDashboards:
         chart_name: NonBlankString::from_str("oci://ghcr.io/grafana/helm-charts/grafana").unwrap(),
         chart_version: None,
         values_overrides: None,
-        values_yaml: Some(values.to_string()),
+        values_yaml: Some(values),
         create_namespace: true,
         install_only: false,
         repository: None,
     }
 }
+

@@ -211,6 +211,8 @@ pub struct Selector {
 pub struct ServiceMonitor {
     pub name: String,
 
+    pub namespace: String,
+
     // # Additional labels to set used for the ServiceMonitorSelector. Together with standard labels from the chart
     pub additional_labels: Option<HashMap<String, String>>,
 
@@ -261,6 +263,7 @@ impl Default for ServiceMonitor {
     fn default() -> Self {
         Self {
             name: Default::default(),
+            namespace: Default::default(),
             additional_labels: Default::default(),
             job_label: Default::default(),
             target_labels: Default::default(),

@@ -1 +1,2 @@
 pub mod prometheus_helm;
+pub mod types;

@@ -1,56 +1,145 @@
+use std::collections::BTreeMap;
 use std::str::FromStr;
 use std::sync::{Arc, Mutex};
 
+use log::debug;
 use non_blank_string_rs::NonBlankString;
+use serde_yaml::{Mapping, Value};
 
-use crate::modules::{
-    helm::chart::HelmChartScore, monitoring::prometheus::prometheus_config::PrometheusConfig,
+use crate::modules::helm::chart::HelmChartScore;
+use crate::modules::monitoring::kube_prometheus::types::{
+    AlertGroup, AlertManager, AlertManagerConfig, AlertManagerRoute, AlertManagerSpec,
+    ConfigReloader, Limits, Requests, Resources,
 };
+use crate::modules::monitoring::prometheus::helm::types::{
+    AlertFile, EnabledConfig, KsmRbacConfig, KubeStateMetricsConfig, LabelSelector, Monitor,
+    Prometheus, PrometheusHelmValues, RbacConfig, ServerConfig, ServerRbacConfig,
+};
+use crate::modules::monitoring::prometheus::prometheus_config::HelmPrometheusConfig;
 
-pub fn prometheus_helm_chart_score(config: Arc<Mutex<PrometheusConfig>>) -> HelmChartScore {
+pub fn prometheus_helm_chart_score(config: Arc<Mutex<HelmPrometheusConfig>>) -> HelmChartScore {
     let config = config.lock().unwrap();
     let ns = config.namespace.clone().unwrap();
-    let values = format!(
-        r#"
-rbac:
-  create: false
-kube-state-metrics:
-  enabled: false
-prometheus-node-exporter:
-  enabled: false
-prometheus-pushgateway:
-  enabled: false
 
-server:
-  releaseNamespace: true
-  clusterRole: false
-  clusterRoleBinding: false
-  rbac:
-    create: true
-    namespaced: true
+    let rbac_config = RbacConfig { create: false };
 
-  serverFiles:
-    prometheus.yml:
-      scrape_configs:
-        - job_name: 'prometheus'
-          static_configs:
-            - targets: ['localhost:9090']
-  serviceMonitorNamespaceSelector:
-    matchLabels:
-      kubernetes.io/metadata.name: {ns}
-  podMonitorNamespaceSelector:
-    matchLabels:
-      kubernetes.io/metadata.name: {ns}
+    let ksm_config = KubeStateMetricsConfig {
+        enabled: true,
+        rbac: KsmRbacConfig {
+            use_cluster_role: false,
+        },
+        prometheus: Prometheus {
+            monitor: Monitor { enabled: true },
+        },
+    };
 
-alertmanager:
-  enabled: true
-  rbac:
-    create: true
-    namespaced: true
-"#
+    let mut selector_labels = BTreeMap::new();
+    selector_labels.insert("kubernetes.io/metadata.name".to_string(), ns.clone());
+    let mut kube_state_metrics_labels = BTreeMap::new();
+    kube_state_metrics_labels.insert(
+        "app.kubernetes.io/name".to_string(),
+        "kube-state-metrics".to_string(),
     );
+    let selector = LabelSelector {
+        match_labels: selector_labels,
+    };
+
+    let server_config = ServerConfig {
+        namespaces: vec![ns.clone()],
+        use_existing_cluster_role_name: false,
+    };
+
+    let mut null_receiver = Mapping::new();
+    null_receiver.insert(
+        Value::String("receiver".to_string()),
+        Value::String("default-receiver".to_string()),
+    );
+    null_receiver.insert(
+        Value::String("matchers".to_string()),
+        Value::Sequence(vec![Value::String("alertname!=Watchdog".to_string())]),
+    );
+    null_receiver.insert(Value::String("continue".to_string()), Value::Bool(true));
+
+    let mut alert_manager_channel_config = AlertManagerConfig {
+        global: Mapping::new(),
+        route: AlertManagerRoute {
+            routes: vec![Value::Mapping(null_receiver)],
+        },
+        receivers: vec![serde_yaml::from_str("name: 'default-receiver'").unwrap()],
+    };
+    for receiver in config.alert_receiver_configs.iter() {
+        if let Some(global) = receiver.channel_global_config.clone() {
+            alert_manager_channel_config
+                .global
+                .insert(global.0, global.1);
+        }
+        alert_manager_channel_config
+            .route
+            .routes
+            .push(receiver.channel_route.clone());
+        alert_manager_channel_config
+            .receivers
+            .push(receiver.channel_receiver.clone());
+    }
+    let alert_manager_values = AlertManager {
+        enabled: config.alert_manager,
+        config: alert_manager_channel_config,
+        alertmanager_spec: AlertManagerSpec {
+            resources: Resources {
+                limits: Limits {
+                    memory: "100Mi".to_string(),
+                    cpu: "100m".to_string(),
+                },
+                requests: Requests {
+                    memory: "100Mi".to_string(),
+                    cpu: "100m".to_string(),
+                },
+            },
+        },
+        init_config_reloader: ConfigReloader {
+            resources: Resources {
+                limits: Limits {
+                    memory: "100Mi".to_string(),
+                    cpu: "100m".to_string(),
+                },
+                requests: Requests {
+                    memory: "100Mi".to_string(),
+                    cpu: "100m".to_string(),
+                },
+            },
+        },
+    };
+
+    let mut result: BTreeMap<String, AlertFile> = BTreeMap::new();
+    for rule in config.alert_rules.clone().iter() {
+        for (name, group) in &rule.rules {
+            result
+                .entry("alerting_rules.yml".to_string())
+                .and_modify(|e| e.groups.extend(group.groups.clone()))
+                .or_insert(AlertFile {
+                    groups: group.groups.clone(),
+                });
+        }
+    }
+
+    let final_values = PrometheusHelmValues {
+        rbac: rbac_config,
+        kube_state_metrics: ksm_config,
+        server: server_config,
+        alertmanager: alert_manager_values,
+        server_files: result,
+        additional_service_monitors: config.additional_service_monitors.clone(),
+        prometheus_node_exporter: EnabledConfig { enabled: false },
+        prometheus_pushgateway: EnabledConfig { enabled: false },
+    };
+
+    let values_yaml =
+        serde_yaml::to_string(&final_values).expect("Failed to serialize final Helm values");
+
+    debug!("full values.yaml: \n{}", values_yaml);
+
     HelmChartScore {
-        namespace: Some(NonBlankString::from_str(&config.namespace.clone().unwrap()).unwrap()),
+        namespace: Some(NonBlankString::from_str(&ns).unwrap()),
         release_name: NonBlankString::from_str("prometheus").unwrap(),
         chart_name: NonBlankString::from_str(
             "oci://ghcr.io/prometheus-community/charts/prometheus",
@@ -58,9 +147,9 @@ alertmanager:
         .unwrap(),
         chart_version: None,
         values_overrides: None,
-        values_yaml: Some(values.to_string()),
+        values_yaml: Some(values_yaml),
         create_namespace: true,
-        install_only: false,
+        install_only: true,
         repository: None,
     }
 }

harmony/src/modules/monitoring/prometheus/helm/types.rs (new file)
@@ -0,0 +1,94 @@
+use std::collections::BTreeMap;
+
+use serde::Serialize;
+
+use crate::modules::monitoring::{alert_rule::prometheus_alert_rule::AlertManagerRuleGroup, kube_prometheus::types::{
+    AlertGroup, AlertManager, AlertManagerAdditionalPromRules, AlertManagerValues, ServiceMonitor
+}};
+
+#[derive(Debug, Clone, Serialize)]
+pub struct RuleFilesConfig {
+    #[serde(rename = "ruleFiles")]
+    pub files: BTreeMap<String, AlertGroup>,
+}
+
+#[derive(Serialize, Debug)]
+#[serde(rename_all = "camelCase")]
+pub struct PrometheusHelmValues {
+    pub rbac: RbacConfig,
+    #[serde(rename = "kube-state-metrics")]
+    pub kube_state_metrics: KubeStateMetricsConfig,
+    pub server: ServerConfig,
+    pub alertmanager: AlertManager, // You already have this
+    #[serde(rename = "serverFiles")]
+    pub server_files: BTreeMap<String, AlertFile>, // You already have this
+    pub additional_service_monitors: Vec<ServiceMonitor>, // You already have this
+    #[serde(rename = "prometheus-node-exporter")]
+    pub prometheus_node_exporter: EnabledConfig,
+    #[serde(rename = "prometheus-pushgateway")]
+    pub prometheus_pushgateway: EnabledConfig,
+}
+
+#[derive(Serialize, Debug, Clone)]
+pub struct AlertFile {
+    pub groups: Vec<AlertManagerRuleGroup>,
+}
+
+#[derive(Serialize, Debug)]
+#[serde(rename_all = "camelCase")]
+pub struct RbacConfig {
+    pub create: bool,
+}
+
+#[derive(Serialize, Debug)]
+#[serde(rename_all = "camelCase")]
+pub struct KubeStateMetricsConfig {
+    pub enabled: bool,
+    pub rbac: KsmRbacConfig,
+    pub prometheus: Prometheus,
+}
+
+#[derive(Serialize, Debug)]
+#[serde(rename_all = "camelCase")]
+pub struct Prometheus {
+    pub monitor: Monitor,
+}
+
+#[derive(Serialize, Debug)]
+#[serde(rename_all = "camelCase")]
+pub struct Monitor {
+    pub enabled: bool,
+}
+
+#[derive(Serialize, Debug)]
+#[serde(rename_all = "camelCase")]
+pub struct KsmRbacConfig {
+    pub use_cluster_role: bool,
+}
+
+#[derive(Serialize, Debug)]
+#[serde(rename_all = "camelCase")]
+pub struct ServerConfig {
+    pub namespaces: Vec<String>,
+    pub use_existing_cluster_role_name: bool,
+}
+
+#[derive(Serialize, Debug)]
+#[serde(rename_all = "camelCase")]
+pub struct ServerRbacConfig {
+    pub create: bool,
+    pub use_cluster_role: bool,
+    pub namespaced: bool,
+}
+
+#[derive(Serialize, Debug, Clone)]
+#[serde(rename_all = "camelCase")]
+pub struct LabelSelector {
+    #[serde(rename = "matchLabels")]
+    pub match_labels: BTreeMap<String, String>,
+}
+
+#[derive(Serialize, Debug)]
+pub struct EnabledConfig {
+    pub enabled: bool,
+}
@@ -14,7 +14,7 @@ use crate::{
     },
     score::Score,
     topology::{
-        HelmCommand, Topology,
+        HelmCommand, K8sclient, Topology,
         installable::Installable,
         oberservability::monitoring::{AlertReceiver, AlertRule, AlertSender},
         tenant::TenantManager,
@@ -22,12 +22,12 @@
 };
 
 use super::{
-    helm::prometheus_helm::prometheus_helm_chart_score, prometheus_config::PrometheusConfig,
+    helm::prometheus_helm::prometheus_helm_chart_score, prometheus_config::HelmPrometheusConfig,
 };
 
 #[derive(Debug)]
 pub struct Prometheus {
-    pub config: Arc<Mutex<PrometheusConfig>>,
+    pub config: Arc<Mutex<HelmPrometheusConfig>>,
 }
 
 #[async_trait]
@@ -40,7 +40,7 @@ impl AlertSender for Prometheus {
 impl Prometheus {
     pub fn new() -> Self {
         Self {
-            config: Arc::new(Mutex::new(PrometheusConfig::new())),
+            config: Arc::new(Mutex::new(HelmPrometheusConfig::new())),
         }
     }
     pub async fn configure_with_topology<T: TenantManager>(&self, topology: &T) {

@@ -3,9 +3,8 @@ use crate::modules::monitoring::kube_prometheus::types::{
 };
 
 #[derive(Debug)]
-pub struct PrometheusConfig {
+pub struct HelmPrometheusConfig {
     pub namespace: Option<String>,
-    pub default_rules: bool,
     pub alert_manager: bool,
     pub node_exporter: bool,
     pub kube_state_metrics: bool,
@@ -16,11 +15,10 @@ pub struct PrometheusConfig {
     pub additional_service_monitors: Vec<ServiceMonitor>,
 }
 
-impl PrometheusConfig {
+impl HelmPrometheusConfig {
     pub fn new() -> Self {
         Self {
             namespace: None,
-            default_rules: true,
             alert_manager: true,
             node_exporter: false,
             kube_state_metrics: false,

@@ -1 +1,2 @@
 pub mod pvc;
+pub mod pod;

harmony/src/modules/prometheus/alerts/k8s/pod.rs (new file)
@@ -0,0 +1,38 @@
+use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;
+
+pub fn pod_in_failed_state() -> PrometheusAlertRule {
+    PrometheusAlertRule::new(
+        "PodInFailedState",
+        // This expression checks for any pod where the status phase is 'Failed' and the value is 1 (true).
+        "kube_pod_status_phase{phase=\"Failed\"} == 1",
+    )
+    .for_duration("1m") // Fire if the pod is in this state for 1 minute.
+    .label("severity", "critical") // A failed pod is a critical issue.
+    .annotation(
+        "summary",
+        "Pod {{ $labels.pod }} in namespace {{ $labels.namespace }} has failed.",
+    )
+    .annotation(
+        "description",
+        "The pod {{ $labels.pod }} in namespace {{ $labels.namespace }} has entered the 'Failed' state. This is a terminal error and the pod will not be automatically restarted. Please check the pod logs to diagnose the issue.",
+    )
+}
+
+pub fn pod_restarting_frequently() -> PrometheusAlertRule {
+    PrometheusAlertRule::new(
+        "PodRestartingFrequently",
+        // This expression calculates the increase in the restart count over the last 30 minutes.
+        // Alert if a container has restarted more than 5 times.
+        "increase(kube_pod_container_status_restarts_total[30m]) > 5",
+    )
+    .for_duration("15m") // The condition must persist for 15 minutes to avoid alerts for minor flaps.
+    .label("severity", "critical") // A crash-looping pod is effectively down.
+    .annotation(
+        "summary",
+        "Container {{ $labels.container }} in pod {{ $labels.pod }} is restarting frequently.",
+    )
+    .annotation(
+        "description",
+        "The container '{{ $labels.container }}' in pod '{{ $labels.pod }}' (namespace '{{ $labels.namespace }}') has restarted more than 5 times in the last 30 minutes. The pod is likely in a CrashLoopBackOff state.",
+    )
+}
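
As a rough sketch of the end result, assuming AlertManagerRuleGroup serializes to a standard Prometheus rule group, the "pod-alerts" group built from pod_in_failed_state() in the example main.rs would land in serverFiles under alerting_rules.yml roughly as (assumed rendering, not verified output):

groups:
  - name: pod-alerts
    rules:
      - alert: PodInFailedState
        expr: kube_pod_status_phase{phase="Failed"} == 1
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Pod {{ $labels.pod }} in namespace {{ $labels.namespace }} has failed."
          # the description annotation is rendered the same way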