feat: add default Prometheus rules and Grafana dashboard for application monitoring
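Replaces the Helm-based HelmPrometheusApplicationAlertingScore with a CRD-based CRDApplicationAlertingScore, and swaps the two inline default rules for build_default_application_rules(), which aggregates pod-failed, container-restart, pod-not-ready, service-down, deployment-unavailable and PVC fill-rate alerts built from shared PrometheusAlertRule definitions. The hard-coded UP-status dashboard JSON moves into build_default_dashboard(), a per-namespace tenant overview dashboard; Grafana resources now get namespace-prefixed names, and the affected chart's values enable its metrics endpoint.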

This commit is contained in:
Willem 2025-07-22 13:26:03 -04:00
parent 1d8b503bd2
commit b9e208f4cf
14 changed files with 400 additions and 105 deletions

View File

@ -1,10 +1,8 @@
use std::sync::Arc;
use crate::modules::monitoring::application_monitoring::helm_prometheus_application_alerting::HelmPrometheusApplicationAlertingScore;
use crate::modules::monitoring::application_monitoring::crd_application_monitoring_alerting::CRDApplicationAlertingScore;
use crate::modules::monitoring::kube_prometheus::crd::crd_alertmanager_config::CRDAlertManagerReceiver;
use crate::modules::monitoring::kube_prometheus::crd::crd_default_rules::{
build_rule_container_restarting, build_rule_pod_failed,
};
use crate::modules::monitoring::kube_prometheus::crd::crd_default_rules::build_default_application_rules;
use crate::modules::monitoring::kube_prometheus::crd::crd_prometheus_rules::RuleGroup;
use crate::modules::monitoring::kube_prometheus::crd::service_monitor::{
ServiceMonitor, ServiceMonitorSpec,
@ -39,7 +37,7 @@ impl<T: Topology + HelmCommand + 'static + TenantManager + K8sclient + std::fmt:
async fn ensure_installed(&self, topology: &T) -> Result<(), String> {
info!("Ensuring monitoring is available for application");
let namespace = self.application.name().clone();
let mut alerting_score = HelmPrometheusApplicationAlertingScore {
let mut alerting_score = CRDApplicationAlertingScore {
namespace: namespace.clone(),
receivers: self.alert_receiver.clone(),
service_monitors: self.service_monitors.clone(),
@ -92,9 +90,7 @@ impl<T: Topology + HelmCommand + 'static + TenantManager + K8sclient + std::fmt:
alerting_score.receivers.push(Box::new(ntfy_receiver));
//TODO add service monitors to PrometheusApplicationMonitoring which can be
//deployed for the namespace using prometheus crd-servicemonitors
let mut service_monitor = ServiceMonitor {
let service_monitor = ServiceMonitor {
metadata: ObjectMeta {
name: Some(self.application.name().clone()),
labels: Some(std::collections::BTreeMap::from([
@ -110,22 +106,12 @@ impl<T: Topology + HelmCommand + 'static + TenantManager + K8sclient + std::fmt:
},
spec: ServiceMonitorSpec::default(),
};
let service_mon_endpoint = ServiceMonitorEndpoint {
port: Some("http".into()),
interval: Some("30s".into()),
path: Some("/metrics".into()),
scheme: None,
relabelings: vec![],
metric_relabelings: vec![],
};
service_monitor.spec.endpoints.push(service_mon_endpoint);
alerting_score.service_monitors.push(service_monitor);
let rules_group = RuleGroup {
name: format!("{}-rules", self.application.name().clone()),
rules: vec![build_rule_container_restarting(), build_rule_pod_failed()],
rules: build_default_application_rules(),
};
alerting_score.prometheus_rules.push(rules_group);

View File

@ -19,6 +19,7 @@ use crate::modules::monitoring::kube_prometheus::crd::crd_grafana::{
use crate::modules::monitoring::kube_prometheus::crd::crd_prometheus_rules::{
PrometheusRule, PrometheusRuleSpec, RuleGroup,
};
use crate::modules::monitoring::kube_prometheus::crd::grafana_default_dashboard::build_default_dashboard;
use crate::modules::monitoring::kube_prometheus::crd::service_monitor::ServiceMonitor;
use crate::topology::{K8sclient, Topology, k8s::K8sClient};
use crate::{
@ -37,16 +38,16 @@ use crate::{
};
#[derive(Clone, Debug, Serialize)]
pub struct HelmPrometheusApplicationAlertingScore {
pub struct CRDApplicationAlertingScore {
pub namespace: String,
pub receivers: Vec<Box<dyn CRDAlertManagerReceiver>>,
pub service_monitors: Vec<ServiceMonitor>,
pub prometheus_rules: Vec<RuleGroup>,
}
impl<T: Topology + K8sclient> Score<T> for HelmPrometheusApplicationAlertingScore {
impl<T: Topology + K8sclient> Score<T> for CRDApplicationAlertingScore {
fn create_interpret(&self) -> Box<dyn crate::interpret::Interpret<T>> {
Box::new(HelmPrometheusApplicationAlertingInterpret {
Box::new(CRDApplicationAlertingInterpret {
namespace: self.namespace.clone(),
receivers: self.receivers.clone(),
service_monitors: self.service_monitors.clone(),
@ -55,12 +56,12 @@ impl<T: Topology + K8sclient> Score<T> for HelmPrometheusApplicationAlertingScor
}
fn name(&self) -> String {
"HelmPrometheusApplicationAlertingScore".into()
"CRDApplicationAlertingScore".into()
}
}
#[derive(Clone, Debug)]
pub struct HelmPrometheusApplicationAlertingInterpret {
pub struct CRDApplicationAlertingInterpret {
pub namespace: String,
pub receivers: Vec<Box<dyn CRDAlertManagerReceiver>>,
pub service_monitors: Vec<ServiceMonitor>,
@ -68,7 +69,7 @@ pub struct HelmPrometheusApplicationAlertingInterpret {
}
#[async_trait]
impl<T: Topology + K8sclient> Interpret<T> for HelmPrometheusApplicationAlertingInterpret {
impl<T: Topology + K8sclient> Interpret<T> for CRDApplicationAlertingInterpret {
async fn execute(
&self,
_inventory: &Inventory,
@ -85,7 +86,7 @@ impl<T: Topology + K8sclient> Interpret<T> for HelmPrometheusApplicationAlerting
self.install_monitors(self.service_monitors.clone(), &client)
.await?;
Ok(Outcome::success(format!(
"deployed application monitoring composants channels"
"deployed application monitoring composants"
)))
}
@ -106,7 +107,7 @@ impl<T: Topology + K8sclient> Interpret<T> for HelmPrometheusApplicationAlerting
}
}
impl HelmPrometheusApplicationAlertingInterpret {
impl CRDApplicationAlertingInterpret {
async fn crd_exists(&self, crd: &str) -> bool {
let output = Command::new("kubectl")
.args(["get", "crd", crd])
@ -428,41 +429,11 @@ impl HelmPrometheusApplicationAlertingInterpret {
json_data.insert("timeInterval".to_string(), "5s".to_string());
let namespace = self.namespace.clone();
let json = format!(
r#"{{
"title": "UP Status Dashboard",
"timezone": "browser",
"panels": [
{{
"type": "table",
"title": "Service UP Status",
"gridPos": {{ "x": 0, "y": 0, "w": 24, "h": 10 }},
"targets": [
{{
"expr": "up{{namespace=\"{namespace}\"}}",
"format": "table",
"refId": "A"
}}
],
"options": {{
"showHeader": true
}},
"fieldConfig": {{
"defaults": {{
"custom": {{}}
}},
"overrides": []
}}
}}
],
"schemaVersion": 30,
"version": 1
}}"#
);
let json = build_default_dashboard(&namespace);
let graf_data_source = GrafanaDatasource {
metadata: ObjectMeta {
name: Some(self.namespace.clone()),
name: Some(format!("grafana-datasource-{}", self.namespace.clone())),
namespace: Some(self.namespace.clone()),
..Default::default()
},
@ -491,7 +462,7 @@ impl HelmPrometheusApplicationAlertingInterpret {
let graf_dashboard = GrafanaDashboard {
metadata: ObjectMeta {
name: Some(self.namespace.clone()),
name: Some(format!("grafana-dashboard-{}", self.namespace.clone())),
namespace: Some(self.namespace.clone()),
..Default::default()
},
@ -509,7 +480,7 @@ impl HelmPrometheusApplicationAlertingInterpret {
let grafana = Grafana {
metadata: ObjectMeta {
name: Some(self.namespace.clone()),
name: Some(format!("grafana-{}", self.namespace.clone())),
namespace: Some(self.namespace.clone()),
labels: Some(label.clone()),
..Default::default()

View File

@ -1,2 +1,2 @@
pub mod helm_prometheus_application_alerting;
pub mod crd_application_monitoring_alerting;
pub mod k8s_application_monitoring_score;

View File

@ -1,38 +1,30 @@
use std::collections::BTreeMap;
use crate::modules::{
monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule,
prometheus::alerts::k8s::{
deployment::alert_deployment_unavailable,
pod::{alert_container_restarting, alert_pod_not_ready, pod_failed},
pvc::high_pvc_fill_rate_over_two_days,
service::alert_service_down,
},
};
use super::crd_prometheus_rules::Rule;
pub fn build_rule_container_restarting() -> Rule {
Rule {
alert: Some("ContainerRestarting".into()),
expr: Some("increase(kube_pod_container_status_restarts_total[5m]) > 3".into()),
for_: Some("5m".into()),
labels: Some(BTreeMap::from([("severity".into(), "warning".into())])),
annotations: Some(BTreeMap::from([
(
"summary".into(),
"Container is restarting frequently".into(),
),
(
"description".into(),
"Container in this namespace is restarting more than 3 times in 5 minutes.".into(),
),
])),
}
}
pub fn build_rule_pod_failed() -> Rule {
Rule {
alert: Some("PodFailed".into()),
expr: Some("kube_pod_status_phase{phase=\"Failed\"} > 0".into()),
for_: Some("0m".into()),
labels: Some(BTreeMap::from([("severity".into(), "critical".into())])),
annotations: Some(BTreeMap::from([
("summary".into(), "A pod has failed".into()),
(
"description".into(),
"One or more pods are in Failed phase.".into(),
),
])),
}
pub fn build_default_application_rules() -> Vec<Rule> {
let pod_failed: Rule = pod_failed().into();
let container_restarting: Rule = alert_container_restarting().into();
let pod_not_ready: Rule = alert_pod_not_ready().into();
let service_down: Rule = alert_service_down().into();
let deployment_unavailable: Rule = alert_deployment_unavailable().into();
let high_pvc_fill_rate: Rule = high_pvc_fill_rate_over_two_days().into();
vec![
pod_failed,
container_restarting,
pod_not_ready,
service_down,
deployment_unavailable,
high_pvc_fill_rate,
]
}
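For reference, a minimal usage sketch of the aggregated defaults, mirroring the call site in the application score above (`app_name` stands in for the application's name):

// Wrap the shared defaults in a namespaced rule group and hand them to the
// CRD alerting score (see the interpret change earlier in this commit).
let rules_group = RuleGroup {
    name: format!("{}-rules", app_name), // `app_name` assumed to be the application's name
    rules: build_default_application_rules(),
};
alerting_score.prometheus_rules.push(rules_group);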

View File

@ -4,8 +4,6 @@ use kube::CustomResource;
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use crate::modules::monitoring::kube_prometheus::types::Operator;
use super::crd_prometheuses::LabelSelector;
#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)]

View File

@ -1,8 +1,12 @@
use std::collections::BTreeMap;
use kube::CustomResource;
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use super::crd_default_rules::{build_rule_container_restarting, build_rule_pod_failed};
use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;
use super::crd_default_rules::build_default_application_rules;
#[derive(CustomResource, Debug, Serialize, Deserialize, Clone, JsonSchema)]
#[kube(
@ -42,13 +46,14 @@ pub struct Rule {
pub annotations: Option<std::collections::BTreeMap<String, String>>,
}
impl PrometheusRuleSpec {
pub fn with_default_rules() -> Self {
PrometheusRuleSpec {
groups: vec![RuleGroup {
name: "default.rules".into(),
rules: vec![build_rule_container_restarting(), build_rule_pod_failed()],
}],
impl From<PrometheusAlertRule> for Rule {
fn from(value: PrometheusAlertRule) -> Self {
Rule {
alert: Some(value.alert),
expr: Some(value.expr),
for_: value.r#for,
labels: Some(value.labels.into_iter().collect::<BTreeMap<_, _>>()),
annotations: Some(value.annotations.into_iter().collect::<BTreeMap<_, _>>()),
}
}
}
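As an illustration, any of the shared PrometheusAlertRule builders can now feed the CRD Rule type directly; the values below follow from alert_service_down() as defined later in this commit, and the assertions are only for demonstration:

// Illustrative only: convert a shared alert-rule builder into the CRD `Rule`.
let rule: Rule = alert_service_down().into();
assert_eq!(rule.alert.as_deref(), Some("ServiceDown"));
assert_eq!(rule.for_.as_deref(), Some("1m"));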

View File

@ -0,0 +1,203 @@
pub fn build_default_dashboard(namespace: &str) -> String {
let dashboard = format!(
r#"{{
"annotations": {{
"list": []
}},
"editable": true,
"gnetId": null,
"graphTooltip": 0,
"id": null,
"iteration": 171105,
"panels": [
{{
"datasource": "$datasource",
"fieldConfig": {{
"defaults": {{
"unit": "short"
}},
"overrides": []
}},
"gridPos": {{
"h": 6,
"w": 6,
"x": 0,
"y": 0
}},
"id": 1,
"options": {{
"reduceOptions": {{
"calcs": ["lastNotNull"],
"fields": "",
"values": false
}}
}},
"pluginVersion": "9.0.0",
"targets": [
{{
"expr": "sum(kube_pod_status_phase{{namespace=\"{namespace}\", phase=\"Running\"}})",
"legendFormat": "",
"refId": "A"
}}
],
"title": "Pods in Namespace",
"type": "stat"
}},
{{
"datasource": "$datasource",
"fieldConfig": {{
"defaults": {{
"unit": "short"
}},
"overrides": []
}},
"gridPos": {{
"h": 6,
"w": 6,
"x": 6,
"y": 0
}},
"id": 2,
"options": {{
"reduceOptions": {{
"calcs": ["lastNotNull"],
"fields": "",
"values": false
}}
}},
"pluginVersion": "9.0.0",
"targets": [
{{
"expr": "sum(kube_pod_status_phase{{phase=\"Failed\", namespace=\"{namespace}\"}})",
"legendFormat": "",
"refId": "A"
}}
],
"title": "Pods in Failed State",
"type": "stat"
}},
{{
"datasource": "$datasource",
"fieldConfig": {{
"defaults": {{
"unit": "percentunit"
}},
"overrides": []
}},
"gridPos": {{
"h": 6,
"w": 12,
"x": 0,
"y": 6
}},
"id": 3,
"options": {{
"reduceOptions": {{
"calcs": ["lastNotNull"],
"fields": "",
"values": false
}}
}},
"pluginVersion": "9.0.0",
"targets": [
{{
"expr": "sum(kube_deployment_status_replicas_available{{namespace=\"{namespace}\"}}) / sum(kube_deployment_spec_replicas{{namespace=\"{namespace}\"}})",
"legendFormat": "",
"refId": "A"
}}
],
"title": "Deployment Health (Available / Desired)",
"type": "stat"
}},
{{
"datasource": "$datasource",
"fieldConfig": {{
"defaults": {{
"unit": "short"
}},
"overrides": []
}},
"gridPos": {{
"h": 6,
"w": 12,
"x": 0,
"y": 12
}},
"id": 4,
"options": {{
"reduceOptions": {{
"calcs": ["lastNotNull"],
"fields": "",
"values": false
}}
}},
"pluginVersion": "9.0.0",
"targets": [
{{
"expr": "sum by(pod) (rate(kube_pod_container_status_restarts_total{{namespace=\"{namespace}\"}}[5m]))",
"legendFormat": "{{{{pod}}}}",
"refId": "A"
}}
],
"title": "Container Restarts (per pod)",
"type": "timeseries"
}},
{{
"datasource": "$datasource",
"fieldConfig": {{
"defaults": {{
"unit": "short"
}},
"overrides": []
}},
"gridPos": {{
"h": 6,
"w": 12,
"x": 0,
"y": 18
}},
"id": 5,
"options": {{
"reduceOptions": {{
"calcs": ["lastNotNull"],
"fields": "",
"values": false
}}
}},
"pluginVersion": "9.0.0",
"targets": [
{{
"expr": "sum(ALERTS{{alertstate=\"firing\", namespace=\"{namespace}\"}}) or vector(0)",
"legendFormat": "",
"refId": "A"
}}
],
"title": "Firing Alerts in Namespace",
"type": "stat"
}}
],
"schemaVersion": 36,
"templating": {{
"list": [
{{
"name": "datasource",
"type": "datasource",
"pluginId": "prometheus",
"label": "Prometheus",
"query": "prometheus",
"refresh": 1,
"hide": 0,
"current": {{
"selected": true,
"text": "Prometheus",
"value": "Prometheus"
}}
}}
]
}},
"title": "Tenant Namespace Overview",
"version": 1
}}"#
);
dashboard
}
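A small sketch of how the generator behaves; the interpret above passes the resulting JSON to the GrafanaDashboard resource, and the string is templated per namespace ("my-app" is just an example value):

// Illustrative only: the dashboard JSON is rendered per tenant namespace.
let json = build_default_dashboard("my-app");
assert!(json.contains(r#"namespace=\"my-app\""#));
assert!(json.contains("\"title\": \"Tenant Namespace Overview\""));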

View File

@ -4,6 +4,7 @@ pub mod crd_default_rules;
pub mod crd_grafana;
pub mod crd_prometheus_rules;
pub mod crd_prometheuses;
pub mod grafana_default_dashboard;
pub mod grafana_operator;
pub mod prometheus_operator;
pub mod role;

View File

@ -58,6 +58,7 @@ config:
# web-root: "disable"
enable-signup: false
enable-login: "true"
enable-metrics: "true"
persistence:
enabled: true

View File

@ -0,0 +1,23 @@
use std::collections::HashMap;
use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;
pub fn alert_deployment_unavailable() -> PrometheusAlertRule {
PrometheusAlertRule {
alert: "DeploymentUnavailable".into(),
expr: "kube_deployment_status_replicas_unavailable > 0".into(),
r#for: Some("2m".into()),
labels: HashMap::from([("severity".into(), "warning".into())]),
annotations: HashMap::from([
(
"summary".into(),
"Deployment has unavailable replicas".into(),
),
(
"description".into(),
"A deployment in this namespace has unavailable replicas for over 2 minutes."
.into(),
),
]),
}
}

View File

@ -0,0 +1,37 @@
use std::collections::HashMap;
use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;
pub fn alert_high_memory_usage() -> PrometheusAlertRule {
PrometheusAlertRule {
alert: "HighMemoryUsage".into(),
expr: "container_memory_working_set_bytes{container!=\"\",namespace!=\"\"} > 500000000"
.into(),
r#for: Some("2m".into()),
labels: HashMap::from([("severity".into(), "warning".into())]),
annotations: HashMap::from([
("summary".into(), "Pod is using high memory".into()),
(
"description".into(),
"A pod is consuming more than 500Mi of memory.".into(),
),
]),
}
}
pub fn alert_high_cpu_usage() -> PrometheusAlertRule {
PrometheusAlertRule {
alert: "HighCPUUsage".into(),
expr: "rate(container_cpu_usage_seconds_total{container!=\"\",namespace!=\"\"}[1m]) > 0.9"
.into(),
r#for: Some("1m".into()),
labels: HashMap::from([("severity".into(), "warning".into())]),
annotations: HashMap::from([
("summary".into(), "Pod is using high CPU".into()),
(
"description".into(),
"A pod is using more than 90% of a core over 1 minute.".into(),
),
]),
}
}

View File

@ -1 +1,5 @@
pub mod deployment;
pub mod memory_usage;
pub mod pod;
pub mod pvc;
pub mod service;

View File

@ -0,0 +1,55 @@
use std::collections::HashMap;
use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;
pub fn pod_failed() -> PrometheusAlertRule {
PrometheusAlertRule {
alert: "PodFailed".into(),
expr: "kube_pod_status_phase{phase=\"Failed\"} > 2".into(),
r#for: Some("2m".into()),
labels: HashMap::from([("severity".into(), "critical".into())]),
annotations: HashMap::from([
("summary".into(), "A pod has failed".into()),
(
"description".into(),
"One or more pods are in Failed phase.".into(),
),
]),
}
}
pub fn alert_container_restarting() -> PrometheusAlertRule {
PrometheusAlertRule {
alert: "ContainerRestarting".into(),
expr: "increase(kube_pod_container_status_restarts_total[5m]) > 3".into(),
r#for: Some("5m".into()),
labels: HashMap::from([("severity".into(), "warning".into())]),
annotations: HashMap::from([
(
"summary".into(),
"Container is restarting frequently".into(),
),
(
"description".into(),
"A container in this namespace has restarted more than 3 times in 5 minutes."
.into(),
),
]),
}
}
pub fn alert_pod_not_ready() -> PrometheusAlertRule {
PrometheusAlertRule {
alert: "PodNotReady".into(),
expr: "kube_pod_status_ready{condition=\"true\"} == 0".into(),
r#for: Some("2m".into()),
labels: HashMap::from([("severity".into(), "warning".into())]),
annotations: HashMap::from([
("summary".into(), "Pod is not ready".into()),
(
"description".into(),
"A pod in the namespace is not reporting Ready status.".into(),
),
]),
}
}

View File

@ -0,0 +1,19 @@
use std::collections::HashMap;
use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;
pub fn alert_service_down() -> PrometheusAlertRule {
PrometheusAlertRule {
alert: "ServiceDown".into(),
expr: "up == 0".into(),
r#for: Some("1m".into()),
labels: HashMap::from([("severity".into(), "critical".into())]),
annotations: HashMap::from([
("summary".into(), "Service is down".into()),
(
"description".into(),
"A target service in the namespace is not responding to Prometheus scrapes.".into(),
),
]),
}
}