From 114219385f384a320c15553d8197b9546f2a0bca Mon Sep 17 00:00:00 2001 From: Willem Date: Wed, 16 Jul 2025 15:56:00 -0400 Subject: [PATCH] wip: added impl for prometheuses, alertmanagers, prometheusrules, servicemonitors, and some default rules that are deployed for application monitoring. Working on implementing grafana crds via the grafana operator. Still need to link prometheus rules and alert managers in prometheus; testing shows that prometheus isn't detecting them automatically. --- examples/monitoring/src/main.rs | 4 +- examples/monitoring_with_tenant/src/main.rs | 4 +- examples/rust/src/main.rs | 6 +- .../application/features/monitoring.rs | 55 ++++-- .../kube_prometheus/alert_manager_config.rs | 2 - .../kube_prometheus/crd/crd_alertmanagers.rs | 53 ++++++ .../kube_prometheus/crd/crd_default_rules.rs | 38 ++++ .../kube_prometheus/crd/crd_grafana.rs | 170 +++++++++++++++++ .../crd/crd_prometheus_rules.rs | 54 ++++++ .../kube_prometheus/crd/crd_prometheuses.rs | 78 ++++++++ .../kube_prometheus/crd/grafana_operator.rs | 22 +++ .../monitoring/kube_prometheus/crd/mod.rs | 7 + .../monitoring/kube_prometheus/crd/role.rs | 62 +++++++ .../helm_prometheus_application_alerting.rs | 172 +++++++++++++++++- .../modules/monitoring/kube_prometheus/mod.rs | 2 + .../kube_prometheus/service_monitor.rs | 89 +++++++++ .../monitoring/kube_prometheus/types.rs | 127 +++++++------ 17 files changed, 856 insertions(+), 89 deletions(-) create mode 100644 harmony/src/modules/monitoring/kube_prometheus/crd/crd_alertmanagers.rs create mode 100644 harmony/src/modules/monitoring/kube_prometheus/crd/crd_default_rules.rs create mode 100644 harmony/src/modules/monitoring/kube_prometheus/crd/crd_grafana.rs create mode 100644 harmony/src/modules/monitoring/kube_prometheus/crd/crd_prometheus_rules.rs create mode 100644 harmony/src/modules/monitoring/kube_prometheus/crd/crd_prometheuses.rs create mode 100644 harmony/src/modules/monitoring/kube_prometheus/crd/grafana_operator.rs create mode 100644 
harmony/src/modules/monitoring/kube_prometheus/crd/mod.rs create mode 100644 harmony/src/modules/monitoring/kube_prometheus/crd/role.rs create mode 100644 harmony/src/modules/monitoring/kube_prometheus/service_monitor.rs diff --git a/examples/monitoring/src/main.rs b/examples/monitoring/src/main.rs index 989b1ec..6d3a5b0 100644 --- a/examples/monitoring/src/main.rs +++ b/examples/monitoring/src/main.rs @@ -51,8 +51,8 @@ async fn main() { let service_monitor_endpoint = ServiceMonitorEndpoint { port: Some("80".to_string()), - path: "/metrics".to_string(), - scheme: HTTPScheme::HTTP, + path: Some("/metrics".to_string()), + scheme: Some(HTTPScheme::HTTP), ..Default::default() }; diff --git a/examples/monitoring_with_tenant/src/main.rs b/examples/monitoring_with_tenant/src/main.rs index ec80542..035d071 100644 --- a/examples/monitoring_with_tenant/src/main.rs +++ b/examples/monitoring_with_tenant/src/main.rs @@ -54,8 +54,8 @@ async fn main() { let service_monitor_endpoint = ServiceMonitorEndpoint { port: Some("80".to_string()), - path: "/metrics".to_string(), - scheme: HTTPScheme::HTTP, + path: Some("/metrics".to_string()), + scheme: Some(HTTPScheme::HTTP), ..Default::default() }; diff --git a/examples/rust/src/main.rs b/examples/rust/src/main.rs index f39fb9f..d2ff2ff 100644 --- a/examples/rust/src/main.rs +++ b/examples/rust/src/main.rs @@ -6,7 +6,7 @@ use harmony::{ modules::{ application::{ ApplicationScore, RustWebFramework, RustWebapp, - features::{ContinuousDelivery, PrometheusApplicationMonitoring}, + features::{ContinuousDelivery, Monitoring}, }, monitoring::alert_channel::{ discord_alert_channel::DiscordWebhook, webhook_receiver::WebhookReceiver, @@ -46,9 +46,11 @@ async fn main() { Box::new(ContinuousDelivery { application: application.clone(), }), // TODO add monitoring, backups, multisite ha, etc - Box::new(PrometheusApplicationMonitoring { + Box::new(Monitoring { application: application.clone(), alert_receiver: vec![Box::new(discord_receiver), 
Box::new(webhook_receiver)], + service_monitors: vec![], + alert_rules: vec![], }), // TODO add backups, multisite ha, etc ], diff --git a/harmony/src/modules/application/features/monitoring.rs b/harmony/src/modules/application/features/monitoring.rs index cef26e3..ffd7783 100644 --- a/harmony/src/modules/application/features/monitoring.rs +++ b/harmony/src/modules/application/features/monitoring.rs @@ -1,9 +1,13 @@ use std::sync::Arc; -use async_trait::async_trait; -use base64::{Engine as _, engine::general_purpose}; -use log::{debug, info}; - +use crate::modules::monitoring::kube_prometheus::crd::crd_default_rules::{ + build_rule_container_restarting, build_rule_pod_failed, +}; +use crate::modules::monitoring::kube_prometheus::crd::crd_prometheus_rules::RuleGroup; +use crate::modules::monitoring::kube_prometheus::service_monitor::{ + ServiceMonitor, ServiceMonitorSpec, +}; +use crate::modules::monitoring::kube_prometheus::types::{Selector, ServiceMonitorEndpoint}; use crate::{ inventory::Inventory, modules::{ @@ -13,7 +17,7 @@ use crate::{ kube_prometheus::{ alert_manager_config::{CRDAlertManager, CRDAlertManagerReceiver}, helm_prometheus_application_alerting::HelmPrometheusApplicationAlertingScore, - types::{NamespaceSelector, ServiceMonitor}, + types::{NamespaceSelector, ServiceMonitor as KubePrometheusServiceMonitor}, }, ntfy::ntfy::NtfyScore, }, @@ -24,23 +28,31 @@ use crate::{ tenant::TenantManager, }, }; +use async_trait::async_trait; +use base64::{Engine as _, engine::general_purpose}; +use kube::api::ObjectMeta; +use log::{debug, info}; #[derive(Debug, Clone)] -pub struct PrometheusApplicationMonitoring { +pub struct Monitoring { pub application: Arc, pub alert_receiver: Vec>, + pub service_monitors: Vec, + pub alert_rules: Vec, } #[async_trait] impl - ApplicationFeature for PrometheusApplicationMonitoring + ApplicationFeature for Monitoring { async fn ensure_installed(&self, topology: &T) -> Result<(), String> { info!("Ensuring monitoring is available 
for application"); - + let namespace = self.application.name().clone(); let mut alerting_score = HelmPrometheusApplicationAlertingScore { - namespace: self.application.name().clone(), + namespace: namespace.clone(), receivers: self.alert_receiver.clone(), + service_monitors: self.service_monitors.clone(), + prometheus_rules: self.alert_rules.clone(), }; let ntfy = NtfyScore { // namespace: topology @@ -91,14 +103,27 @@ impl, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub alertmanager_config_namespace_selector: Option, + + /// Optional pod template metadata (annotations, labels) + #[serde(default, skip_serializing_if = "Option::is_none")] + pub pod_metadata: Option, + + /// Optional topology spread settings + #[serde(default, skip_serializing_if = "Option::is_none")] + pub version: Option, +} + +impl Default for AlertmanagerSpec { + fn default() -> Self { + AlertmanagerSpec { + replicas: 1, + + // Match all AlertmanagerConfigs in the same namespace + alertmanager_config_namespace_selector: None, + + // Empty selector matches all AlertmanagerConfigs in that namespace + alertmanager_config_selector: Some(LabelSelector::default()), + + pod_metadata: None, + version: None, + } + } +} diff --git a/harmony/src/modules/monitoring/kube_prometheus/crd/crd_default_rules.rs b/harmony/src/modules/monitoring/kube_prometheus/crd/crd_default_rules.rs new file mode 100644 index 0000000..1feb5ed --- /dev/null +++ b/harmony/src/modules/monitoring/kube_prometheus/crd/crd_default_rules.rs @@ -0,0 +1,38 @@ +use std::collections::BTreeMap; + +use super::crd_prometheus_rules::Rule; + +pub fn build_rule_container_restarting() -> Rule { + Rule { + alert: Some("ContainerRestarting".into()), + expr: Some("increase(kube_pod_container_status_restarts_total[5m]) > 3".into()), + for_: Some("5m".into()), + labels: Some(BTreeMap::from([("severity".into(), "warning".into())])), + annotations: Some(BTreeMap::from([ + ( + "summary".into(), + "Container is restarting 
frequently".into(), + ), + ( + "description".into(), + "Container in this namespace is restarting more than 3 times in 5 minutes.".into(), + ), + ])), + } +} + +pub fn build_rule_pod_failed() -> Rule { + Rule { + alert: Some("PodFailed".into()), + expr: Some("kube_pod_status_phase{phase=\"Failed\"} > 0".into()), + for_: Some("0m".into()), + labels: Some(BTreeMap::from([("severity".into(), "critical".into())])), + annotations: Some(BTreeMap::from([ + ("summary".into(), "A pod has failed".into()), + ( + "description".into(), + "One or more pods are in Failed phase.".into(), + ), + ])), + } +} diff --git a/harmony/src/modules/monitoring/kube_prometheus/crd/crd_grafana.rs b/harmony/src/modules/monitoring/kube_prometheus/crd/crd_grafana.rs new file mode 100644 index 0000000..18074b8 --- /dev/null +++ b/harmony/src/modules/monitoring/kube_prometheus/crd/crd_grafana.rs @@ -0,0 +1,170 @@ +use std::collections::BTreeMap; + +use kube::CustomResource; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; + +#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)] +#[kube( + group = "grafana.integreatly.org", + version = "v1beta1", + kind = "Grafana", + plural = "grafanas", + namespaced +)] +#[serde(rename_all = "camelCase")] +pub struct GrafanaSpec { + #[serde(default, skip_serializing_if = "Option::is_none")] + pub config: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub admin_user: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub admin_password: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub ingress: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub persistence: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub resources: Option, +} + +#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct GrafanaConfig { + #[serde(default, skip_serializing_if = 
"Option::is_none")] + pub log: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub security: Option, +} + +#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct GrafanaLogConfig { + #[serde(default, skip_serializing_if = "Option::is_none")] + pub mode: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub level: Option, +} + +#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct GrafanaSecurityConfig { + #[serde(default, skip_serializing_if = "Option::is_none")] + pub admin_user: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub admin_password: Option, +} + +#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct GrafanaIngress { + #[serde(default, skip_serializing_if = "Option::is_none")] + pub enabled: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub hosts: Option>, +} + +#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct GrafanaPersistence { + #[serde(default, skip_serializing_if = "Option::is_none")] + pub enabled: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub storage_class_name: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub size: Option, +} + +// ------------------------------------------------------------------------------------------------ + +#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)] +#[kube( + group = "grafana.integreatly.org", + version = "v1beta1", + kind = "GrafanaDashboard", + plural = "grafanadashboards", + namespaced +)] +#[serde(rename_all = "camelCase")] +pub struct GrafanaDashboardSpec { + #[serde(default, skip_serializing_if = "Option::is_none")] + pub resync_period: Option, + + pub instance_selector: LabelSelector, + + pub json: String, 
+} + +// ------------------------------------------------------------------------------------------------ + +#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)] +#[kube( + group = "grafana.integreatly.org", + version = "v1beta1", + kind = "GrafanaDatasource", + plural = "grafanadatasources", + namespaced +)] +#[serde(rename_all = "camelCase")] +pub struct GrafanaDatasourceSpec { + pub instance_selector: LabelSelector, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub allow_cross_namespace_import: Option, + + pub datasource: GrafanaDatasourceConfig, +} + +#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct GrafanaDatasourceConfig { + pub access: String, + pub database: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub json_data: Option>, + pub name: String, + pub r#type: String, + pub url: String, +} + +// ------------------------------------------------------------------------------------------------ + +#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct LabelSelector { + #[serde(default, skip_serializing_if = "BTreeMap::is_empty")] + pub match_labels: BTreeMap, + + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub match_expressions: Vec, +} + +#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct LabelSelectorRequirement { + pub key: String, + pub operator: String, + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub values: Vec, +} + +#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema, Default)] +#[serde(rename_all = "camelCase")] +pub struct ResourceRequirements { + #[serde(default, skip_serializing_if = "BTreeMap::is_empty")] + pub limits: BTreeMap, + + #[serde(default, skip_serializing_if = "BTreeMap::is_empty")] + pub requests: BTreeMap, +} diff --git 
a/harmony/src/modules/monitoring/kube_prometheus/crd/crd_prometheus_rules.rs b/harmony/src/modules/monitoring/kube_prometheus/crd/crd_prometheus_rules.rs new file mode 100644 index 0000000..49bee3b --- /dev/null +++ b/harmony/src/modules/monitoring/kube_prometheus/crd/crd_prometheus_rules.rs @@ -0,0 +1,54 @@ +use kube::CustomResource; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; + +use super::crd_default_rules::{build_rule_container_restarting, build_rule_pod_failed}; + +#[derive(CustomResource, Debug, Serialize, Deserialize, Clone, JsonSchema)] +#[kube( + group = "monitoring.coreos.com", + version = "v1", + kind = "PrometheusRule", + plural = "prometheusrules", + namespaced +)] +#[serde(rename_all = "camelCase")] +pub struct PrometheusRuleSpec { + pub groups: Vec, +} + +#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] +pub struct RuleGroup { + pub name: String, + pub rules: Vec, +} + +#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct Rule { + #[serde(default, skip_serializing_if = "Option::is_none")] + pub alert: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub expr: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub for_: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub labels: Option>, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub annotations: Option>, +} + +impl PrometheusRuleSpec { + pub fn with_default_rules() -> Self { + PrometheusRuleSpec { + groups: vec![RuleGroup { + name: "default.rules".into(), + rules: vec![build_rule_container_restarting(), build_rule_pod_failed()], + }], + } + } +} diff --git a/harmony/src/modules/monitoring/kube_prometheus/crd/crd_prometheuses.rs b/harmony/src/modules/monitoring/kube_prometheus/crd/crd_prometheuses.rs new file mode 100644 index 0000000..0b9101f --- /dev/null +++ 
b/harmony/src/modules/monitoring/kube_prometheus/crd/crd_prometheuses.rs @@ -0,0 +1,78 @@ +use std::collections::BTreeMap; + +use kube::CustomResource; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; + +use crate::modules::monitoring::kube_prometheus::types::Operator; + +#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)] +#[kube( + group = "monitoring.coreos.com", + version = "v1", + kind = "Prometheus", + plural = "prometheuses", + namespaced +)] +#[serde(rename_all = "camelCase")] +pub struct PrometheusSpec { + pub service_account_name: String, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub service_monitor_namespace_selector: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub service_monitor_selector: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub service_discovery_role: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub pod_monitor_selector: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub rule_selector: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub rule_namespace_selector: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, Default)] +#[serde(rename_all = "camelCase")] +pub struct LabelSelector { + #[serde(default, skip_serializing_if = "BTreeMap::is_empty")] + pub match_labels: BTreeMap, + + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub match_expressions: Vec, +} + +#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct LabelSelectorRequirement { + pub key: String, + pub operator: Operator, + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub values: Vec, +} + +impl Default for PrometheusSpec { + fn default() -> Self { + PrometheusSpec { + service_account_name: "prometheus".into(), + + // null means "only my namespace" + service_monitor_namespace_selector: 
None, + + // empty selector means match all ServiceMonitors in that namespace + service_monitor_selector: Some(LabelSelector::default()), + + service_discovery_role: Some("Endpoints".into()), + + pod_monitor_selector: None, + + rule_selector: None, + + rule_namespace_selector: Some(LabelSelector::default()), + } + } +} diff --git a/harmony/src/modules/monitoring/kube_prometheus/crd/grafana_operator.rs b/harmony/src/modules/monitoring/kube_prometheus/crd/grafana_operator.rs new file mode 100644 index 0000000..00864cd --- /dev/null +++ b/harmony/src/modules/monitoring/kube_prometheus/crd/grafana_operator.rs @@ -0,0 +1,22 @@ +use std::str::FromStr; + +use non_blank_string_rs::NonBlankString; + +use crate::modules::helm::chart::HelmChartScore; + +pub fn grafana_operator_helm_chart_score(ns: String) -> HelmChartScore { + HelmChartScore { + namespace: Some(NonBlankString::from_str(&ns).unwrap()), + release_name: NonBlankString::from_str("kube-prometheus").unwrap(), + chart_name: NonBlankString::from_str( + "grafana-operator oci://ghcr.io/grafana/helm-charts/grafana-operator", + ) + .unwrap(), + chart_version: None, + values_overrides: None, + values_yaml: None, + create_namespace: true, + install_only: true, + repository: None, + } +} diff --git a/harmony/src/modules/monitoring/kube_prometheus/crd/mod.rs b/harmony/src/modules/monitoring/kube_prometheus/crd/mod.rs new file mode 100644 index 0000000..ca8a12d --- /dev/null +++ b/harmony/src/modules/monitoring/kube_prometheus/crd/mod.rs @@ -0,0 +1,7 @@ +pub mod crd_alertmanagers; +pub mod crd_default_rules; +pub mod crd_grafana; +pub mod crd_prometheus_rules; +pub mod crd_prometheuses; +pub mod grafana_operator; +pub mod role; diff --git a/harmony/src/modules/monitoring/kube_prometheus/crd/role.rs b/harmony/src/modules/monitoring/kube_prometheus/crd/role.rs new file mode 100644 index 0000000..9add9a9 --- /dev/null +++ b/harmony/src/modules/monitoring/kube_prometheus/crd/role.rs @@ -0,0 +1,62 @@ +use k8s_openapi::api::{ + 
core::v1::ServiceAccount, + rbac::v1::{PolicyRule, Role, RoleBinding, RoleRef, Subject}, +}; +use kube::api::ObjectMeta; + +pub fn build_prom_role(role_name: String, namespace: String) -> Role { + Role { + metadata: ObjectMeta { + name: Some(role_name), + namespace: Some(namespace), + ..Default::default() + }, + rules: Some(vec![PolicyRule { + api_groups: Some(vec!["".into()]), // core API group + resources: Some(vec!["services".into(), "endpoints".into(), "pods".into()]), + verbs: vec!["get".into(), "list".into(), "watch".into()], + ..Default::default() + }]), + } +} + +pub fn build_prom_rolebinding( + role_name: String, + namespace: String, + service_account_name: String, +) -> RoleBinding { + RoleBinding { + metadata: ObjectMeta { + name: Some(format!("{}-rolebinding", role_name)), + namespace: Some(namespace.clone()), + ..Default::default() + }, + role_ref: RoleRef { + api_group: "rbac.authorization.k8s.io".into(), + kind: "Role".into(), + name: role_name, + }, + subjects: Some(vec![Subject { + kind: "ServiceAccount".into(), + name: service_account_name, + namespace: Some(namespace.clone()), + ..Default::default() + }]), + } +} + +pub fn build_prom_service_account( + service_account_name: String, + namespace: String, +) -> ServiceAccount { + ServiceAccount { + automount_service_account_token: None, + image_pull_secrets: None, + metadata: ObjectMeta { + name: Some(service_account_name), + namespace: Some(namespace), + ..Default::default() + }, + secrets: None, + } +} diff --git a/harmony/src/modules/monitoring/kube_prometheus/helm_prometheus_application_alerting.rs b/harmony/src/modules/monitoring/kube_prometheus/helm_prometheus_application_alerting.rs index bbc59d7..1765a64 100644 --- a/harmony/src/modules/monitoring/kube_prometheus/helm_prometheus_application_alerting.rs +++ b/harmony/src/modules/monitoring/kube_prometheus/helm_prometheus_application_alerting.rs @@ -1,20 +1,28 @@ +use std::sync::Arc; + use async_trait::async_trait; use kube::{Api, 
api::ObjectMeta}; -use log::debug; +use log::{debug, info}; use serde::Serialize; use crate::{ data::{Id, Version}, interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome}, inventory::Inventory, + modules::monitoring::kube_prometheus::crd::{ + crd_alertmanagers::{Alertmanager, AlertmanagerSpec}, + crd_prometheuses::{Prometheus, PrometheusSpec}, + role::{build_prom_role, build_prom_rolebinding, build_prom_service_account}, + }, score::Score, - topology::{K8sclient, Topology, oberservability::monitoring::AlertReceiver}, + topology::{K8sclient, Topology, k8s::K8sClient, oberservability::monitoring::AlertReceiver}, }; use super::{ alert_manager_config::{ AlertmanagerConfig, AlertmanagerConfigSpec, CRDAlertManager, CRDAlertManagerReceiver, }, + crd::crd_prometheus_rules::{PrometheusRule, PrometheusRuleSpec, RuleGroup}, prometheus::KubePrometheus, }; @@ -22,6 +30,8 @@ use super::{ pub struct HelmPrometheusApplicationAlertingScore { pub namespace: String, pub receivers: Vec>, + pub service_monitors: Vec, + pub prometheus_rules: Vec, } impl Score for HelmPrometheusApplicationAlertingScore { @@ -29,6 +39,8 @@ impl Score for HelmPrometheusApplicationAlertingScor Box::new(HelmPrometheusApplicationAlertingInterpret { namespace: self.namespace.clone(), receivers: self.receivers.clone(), + service_monitors: self.service_monitors.clone(), + prometheus_rules: self.prometheus_rules.clone(), }) } @@ -41,6 +53,8 @@ impl Score for HelmPrometheusApplicationAlertingScor pub struct HelmPrometheusApplicationAlertingInterpret { pub namespace: String, pub receivers: Vec>, + pub service_monitors: Vec, + pub prometheus_rules: Vec, } #[async_trait] @@ -51,6 +65,8 @@ impl Interpret for HelmPrometheusApplicationAlerting topology: &T, ) -> Result { let client = topology.k8s_client().await.unwrap(); + self.install_prometheus(&client).await?; + self.install_alert_manager(&client).await?; for receiver in self.receivers.iter() { let alertmanager_config: AlertmanagerConfig = 
receiver .configure_receiver(&client, self.namespace.clone()) @@ -64,6 +80,15 @@ impl Interpret for HelmPrometheusApplicationAlerting InterpretError::new(format!("failed to install receiver: {}", err)) })?; } + self.install_rules(self.prometheus_rules.clone(), client.clone()) + .await + .map_err(|err| InterpretError::new(format!("failed to install rules: {}", err)))?; + + debug!("\n\n\n monitors: {:#?}", self.service_monitors.clone()); + for monitor in self.service_monitors.iter() { + self.install_monitor(monitor.clone(), client.clone()) + .await?; + } Ok(Outcome::success(format!("deployed alert channels"))) } @@ -83,3 +108,146 @@ impl Interpret for HelmPrometheusApplicationAlerting todo!() } } + +impl HelmPrometheusApplicationAlertingInterpret { + async fn install_prometheus(&self, client: &Arc) -> Result { + debug!( + "installing crd-prometheuses in namespace {}", + self.namespace.clone() + ); + debug!("building role/rolebinding/serviceaccount for crd-prometheus"); + let rolename = format!("{}-prom", self.namespace.clone()); + let sa_name = format!("{}-prom-sa", self.namespace.clone()); + let role = build_prom_role(rolename.clone(), self.namespace.clone()); + let rolebinding = + build_prom_rolebinding(rolename.clone(), self.namespace.clone(), sa_name.clone()); + let sa = build_prom_service_account(sa_name.clone(), self.namespace.clone()); + let mut prom_spec = PrometheusSpec::default(); + prom_spec.service_account_name = sa_name.clone(); + let prom = Prometheus { + metadata: ObjectMeta { + name: Some(self.namespace.clone()), + labels: Some(std::collections::BTreeMap::from([( + "alertmanagerConfig".to_string(), + "enabled".to_string(), + )])), + namespace: Some(self.namespace.clone()), + ..Default::default() + }, + spec: prom_spec, + }; + client + .apply(&role, Some(&self.namespace.clone())) + .await + .map_err(|e| InterpretError::new(e.to_string()))?; + info!( + "installed prometheus role: {:#?} in ns {:#?}", + role.metadata.name.unwrap(), + 
role.metadata.namespace.unwrap() + ); + client + .apply(&rolebinding, Some(&self.namespace.clone())) + .await + .map_err(|e| InterpretError::new(e.to_string()))?; + info!( + "installed prometheus rolebinding: {:#?} in ns {:#?}", + rolebinding.metadata.name.unwrap(), + rolebinding.metadata.namespace.unwrap() + ); + client + .apply(&sa, Some(&self.namespace.clone())) + .await + .map_err(|e| InterpretError::new(e.to_string()))?; + info!( + "installed prometheus service account: {:#?} in ns {:#?}", + sa.metadata.name.unwrap(), + sa.metadata.namespace.unwrap() + ); + client + .apply(&prom, Some(&self.namespace.clone())) + .await + .map_err(|e| InterpretError::new(e.to_string()))?; + info!( + "installed prometheus: {:#?} in ns {:#?}", + &prom.metadata.name.clone().unwrap(), + &prom.metadata.namespace.clone().unwrap() + ); + + Ok(Outcome::success(format!( + "successfully deployed crd-prometheus {:#?}", + prom + ))) + } + + async fn install_alert_manager( + &self, + client: &Arc, + ) -> Result { + let am = Alertmanager { + metadata: ObjectMeta { + name: Some(self.namespace.clone()), + labels: Some(std::collections::BTreeMap::from([( + "alertmanagerConfig".to_string(), + "enabled".to_string(), + )])), + namespace: Some(self.namespace.clone()), + ..Default::default() + }, + spec: AlertmanagerSpec::default(), + }; + client + .apply(&am, Some(&self.namespace.clone())) + .await + .map_err(|e| InterpretError::new(e.to_string()))?; + Ok(Outcome::success(format!( + "successfully deployed service monitor {:#?}", + am.metadata.name + ))) + } + + async fn install_monitor( + &self, + monitor: super::service_monitor::ServiceMonitor, + client: Arc, + ) -> Result { + debug!("service monitor: \n{:#?}", monitor.clone()); + let namespace = self.namespace.clone(); + client + .apply(&monitor, Some(&namespace)) + .await + .map_err(|e| InterpretError::new(e.to_string()))?; + Ok(Outcome::success(format!( + "successfully deployed service monitor {:#?}", + monitor.metadata.name + ))) + } + + async 
fn install_rules( + &self, + rules: Vec, + client: Arc, + ) -> Result { + let prom_rule_spec = PrometheusRuleSpec { groups: rules }; + + let prom_rules = PrometheusRule { + metadata: ObjectMeta { + name: Some(self.namespace.clone()), + labels: Some(std::collections::BTreeMap::from([( + "alertmanagerConfig".to_string(), + "enabled".to_string(), + )])), + namespace: Some(self.namespace.clone()), + ..Default::default() + }, + spec: prom_rule_spec, + }; + client + .apply(&prom_rules, Some(&self.namespace)) + .await + .map_err(|e| InterpretError::new(e.to_string()))?; + Ok(Outcome::success(format!( + "successfully deployed service monitor {:#?}", + prom_rules.metadata.name + ))) + } +} diff --git a/harmony/src/modules/monitoring/kube_prometheus/mod.rs b/harmony/src/modules/monitoring/kube_prometheus/mod.rs index 4843509..5fc0fea 100644 --- a/harmony/src/modules/monitoring/kube_prometheus/mod.rs +++ b/harmony/src/modules/monitoring/kube_prometheus/mod.rs @@ -1,6 +1,8 @@ pub mod alert_manager_config; +pub mod crd; pub mod helm; pub mod helm_prometheus_alert_score; pub mod helm_prometheus_application_alerting; pub mod prometheus; +pub mod service_monitor; pub mod types; diff --git a/harmony/src/modules/monitoring/kube_prometheus/service_monitor.rs b/harmony/src/modules/monitoring/kube_prometheus/service_monitor.rs new file mode 100644 index 0000000..049e32d --- /dev/null +++ b/harmony/src/modules/monitoring/kube_prometheus/service_monitor.rs @@ -0,0 +1,89 @@ +use std::collections::{BTreeMap, HashMap}; + +use kube::{CustomResource, Resource, api::ObjectMeta}; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; + +use crate::interpret::InterpretError; + +use super::types::{ + HTTPScheme, MatchExpression, NamespaceSelector, Operator, Selector, + ServiceMonitor as KubeServiceMonitor, ServiceMonitorEndpoint, +}; + +/// This is the top-level struct for the ServiceMonitor Custom Resource. 
+/// The `#[derive(CustomResource)]` macro handles all the boilerplate for you, +/// including the `impl Resource`. +#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)] +#[kube( + group = "monitoring.coreos.com", + version = "v1", + kind = "ServiceMonitor", + plural = "servicemonitors", + namespaced +)] +#[serde(rename_all = "camelCase")] +pub struct ServiceMonitorSpec { + /// A label selector to select services to monitor. + pub selector: Selector, + + /// A list of endpoints on the selected services to be monitored. + pub endpoints: Vec, + + /// Selector to select which namespaces the Kubernetes Endpoints objects + /// are discovered from. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub namespace_selector: Option, + + /// The label to use to retrieve the job name from. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub job_label: Option, + + /// Pod-based target labels to transfer from the Kubernetes Pod onto the target. + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub pod_target_labels: Vec, + + /// TargetLabels transfers labels on the Kubernetes Service object to the target. 
+ #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub target_labels: Vec, +} + +impl Default for ServiceMonitorSpec { + fn default() -> Self { + let mut labels = HashMap::new(); + Self { + selector: Selector { + match_labels: { labels }, + match_expressions: vec![MatchExpression { + key: "app.kubernetes.io/name".into(), + operator: Operator::Exists, + values: vec![], + }], + }, + endpoints: vec![ServiceMonitorEndpoint { + port: Some("http".to_string()), + path: Some("/metrics".into()), + interval: Some("30s".into()), + scheme: Some(HTTPScheme::HTTP), + ..Default::default() + }], + namespace_selector: None, // only the same namespace + job_label: Some("app".into()), + pod_target_labels: vec![], + target_labels: vec![], + } + } +} + +impl From for ServiceMonitorSpec { + fn from(value: KubeServiceMonitor) -> Self { + Self { + selector: value.selector, + endpoints: value.endpoints, + namespace_selector: value.namespace_selector, + job_label: value.job_label, + pod_target_labels: value.pod_target_labels, + target_labels: value.target_labels, + } + } +} diff --git a/harmony/src/modules/monitoring/kube_prometheus/types.rs b/harmony/src/modules/monitoring/kube_prometheus/types.rs index c9209dc..2423e5d 100644 --- a/harmony/src/modules/monitoring/kube_prometheus/types.rs +++ b/harmony/src/modules/monitoring/kube_prometheus/types.rs @@ -1,7 +1,8 @@ use std::collections::{BTreeMap, HashMap}; use async_trait::async_trait; -use serde::Serialize; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; use serde_yaml::{Mapping, Sequence, Value}; use crate::modules::monitoring::alert_rule::prometheus_alert_rule::AlertManagerRuleGroup; @@ -94,7 +95,7 @@ pub struct AlertGroup { pub groups: Vec, } -#[derive(Debug, Clone, Serialize)] +#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] pub enum HTTPScheme { #[serde(rename = "http")] HTTP, @@ -102,7 +103,7 @@ pub enum HTTPScheme { HTTPS, } -#[derive(Debug, Clone, Serialize)] +#[derive(Debug, Clone, 
Serialize, Deserialize, JsonSchema)] pub enum Operator { In, NotIn, @@ -147,70 +148,79 @@ pub struct ServiceMonitorTLSConfig { pub server_name: Option, } -#[derive(Debug, Clone, Serialize)] +#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema, Default)] #[serde(rename_all = "camelCase")] pub struct ServiceMonitorEndpoint { - // ## Name of the endpoint's service port - // ## Mutually exclusive with targetPort + /// Name of the service port this endpoint refers to. pub port: Option, - // ## Name or number of the endpoint's target port - // ## Mutually exclusive with port - pub target_port: Option, - - // ## File containing bearer token to be used when scraping targets - // ## - pub bearer_token_file: Option, - - // ## Interval at which metrics should be scraped - // ## + /// Interval at which metrics should be scraped. + #[serde(default, skip_serializing_if = "Option::is_none")] pub interval: Option, - // ## HTTP path to scrape for metrics - // ## - pub path: String, + /// The HTTP path to scrape for metrics. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub path: Option, - // ## HTTP scheme to use for scraping - // ## - pub scheme: HTTPScheme, + /// HTTP scheme to use for scraping. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub scheme: Option, - // ## TLS configuration to use when scraping the endpoint - // ## - pub tls_config: Option, + /// Relabelings to apply to samples before scraping. + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub relabelings: Vec, - // ## MetricRelabelConfigs to apply to samples after scraping, but before ingestion. 
- // ## ref: https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api-reference/api.md#relabelconfig - // ## - // # - action: keep - // # regex: 'kube_(daemonset|deployment|pod|namespace|node|statefulset).+' - // # sourceLabels: [__name__] - pub metric_relabelings: Vec, - - // ## RelabelConfigs to apply to samples before scraping - // ## ref: https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api-reference/api.md#relabelconfig - // ## - // # - sourceLabels: [__meta_kubernetes_pod_node_name] - // # separator: ; - // # regex: ^(.*)$ - // # targetLabel: nodename - // # replacement: $1 - // # action: replace - pub relabelings: Vec, + /// MetricRelabelings to apply to samples after scraping, but before ingestion. + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub metric_relabelings: Vec, } -#[derive(Debug, Clone, Serialize)] +#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct RelabelConfig { + /// The action to perform based on the regex matching. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub action: Option, + + /// A list of labels from which to extract values. + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub source_labels: Vec, + + /// Separator to be used when concatenating source_labels. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub separator: Option, + + /// The label to which the resulting value is written. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub target_label: Option, + + /// A regular expression to match against the concatenated source label values. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub regex: Option, + + /// The replacement value to use. 
+ #[serde(default, skip_serializing_if = "Option::is_none")] + pub replacement: Option, +} + +#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)] #[serde(rename_all = "camelCase")] pub struct MatchExpression { pub key: String, - pub operator: Operator, + pub operator: Operator, // "In", "NotIn", "Exists", "DoesNotExist" + #[serde(default, skip_serializing_if = "Vec::is_empty")] pub values: Vec, } -#[derive(Debug, Clone, Serialize)] +#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema, Default)] #[serde(rename_all = "camelCase")] pub struct Selector { - // # label selector for services + /// A map of key-value pairs to match. + #[serde(default, skip_serializing_if = "HashMap::is_empty")] pub match_labels: HashMap, + + /// A list of label selector requirements. + #[serde(default, skip_serializing_if = "Vec::is_empty")] pub match_expressions: Vec, } @@ -258,10 +268,15 @@ pub struct ServiceMonitor { pub fallback_scrape_protocol: Option, } -#[derive(Debug, Serialize, Clone)] +#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)] #[serde(rename_all = "camelCase")] pub struct NamespaceSelector { + /// Select all namespaces. + #[serde(default, skip_serializing_if = "std::ops::Not::not")] pub any: bool, + + /// List of namespace names to select from. + #[serde(default, skip_serializing_if = "Vec::is_empty")] pub match_names: Vec, } @@ -283,19 +298,3 @@ impl Default for ServiceMonitor { } } } - -impl Default for ServiceMonitorEndpoint { - fn default() -> Self { - Self { - port: Some("80".to_string()), - target_port: Default::default(), - bearer_token_file: Default::default(), - interval: Default::default(), - path: "/metrics".to_string(), - scheme: HTTPScheme::HTTP, - tls_config: Default::default(), - metric_relabelings: Default::default(), - relabelings: Default::default(), - } - } -}