feat: added default prometheus rules and grafana dashboard for application monitoring
All checks were successful: Run Check Script / check (pull_request)
This commit is contained in: parent 1d8b503bd2, commit b9e208f4cf
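This change replaces the Helm-based application alerting score with a CRD-based one and wires a default rule set and a default Grafana dashboard into each application namespace. As a rough sketch of how the renamed score is assembled (field names and types follow the diff below; the namespace value and the empty receiver/monitor lists are placeholders, not values from this commit):

```rust
// Illustrative only: assemble the CRD-based alerting score with the new default rules.
// "example-app" and the empty vectors are placeholders.
let score = CRDApplicationAlertingScore {
    namespace: "example-app".to_string(),
    receivers: vec![],          // Vec<Box<dyn CRDAlertManagerReceiver>>
    service_monitors: vec![],   // Vec<ServiceMonitor>
    prometheus_rules: vec![RuleGroup {
        name: "example-app-rules".to_string(),
        rules: build_default_application_rules(),
    }],
};
```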
@@ -1,10 +1,8 @@
 use std::sync::Arc;
 
-use crate::modules::monitoring::application_monitoring::helm_prometheus_application_alerting::HelmPrometheusApplicationAlertingScore;
+use crate::modules::monitoring::application_monitoring::crd_application_monitoring_alerting::CRDApplicationAlertingScore;
 use crate::modules::monitoring::kube_prometheus::crd::crd_alertmanager_config::CRDAlertManagerReceiver;
-use crate::modules::monitoring::kube_prometheus::crd::crd_default_rules::{
-    build_rule_container_restarting, build_rule_pod_failed,
-};
+use crate::modules::monitoring::kube_prometheus::crd::crd_default_rules::build_default_application_rules;
 use crate::modules::monitoring::kube_prometheus::crd::crd_prometheus_rules::RuleGroup;
 use crate::modules::monitoring::kube_prometheus::crd::service_monitor::{
     ServiceMonitor, ServiceMonitorSpec,
@@ -39,7 +37,7 @@ impl<T: Topology + HelmCommand + 'static + TenantManager + K8sclient + std::fmt:
     async fn ensure_installed(&self, topology: &T) -> Result<(), String> {
         info!("Ensuring monitoring is available for application");
         let namespace = self.application.name().clone();
-        let mut alerting_score = HelmPrometheusApplicationAlertingScore {
+        let mut alerting_score = CRDApplicationAlertingScore {
             namespace: namespace.clone(),
             receivers: self.alert_receiver.clone(),
             service_monitors: self.service_monitors.clone(),
@@ -92,9 +90,7 @@ impl<T: Topology + HelmCommand + 'static + TenantManager + K8sclient + std::fmt:
 
         alerting_score.receivers.push(Box::new(ntfy_receiver));
 
-        //TODO add service monitors to PrometheusApplicationMonitoring which can be
-        //deployed for the namespace using prometheus crd-servicemonitors
-        let mut service_monitor = ServiceMonitor {
+        let service_monitor = ServiceMonitor {
             metadata: ObjectMeta {
                 name: Some(self.application.name().clone()),
                 labels: Some(std::collections::BTreeMap::from([
@@ -110,22 +106,12 @@ impl<T: Topology + HelmCommand + 'static + TenantManager + K8sclient + std::fmt:
             },
             spec: ServiceMonitorSpec::default(),
         };
-        let service_mon_endpoint = ServiceMonitorEndpoint {
-            port: Some("http".into()),
-            interval: Some("30s".into()),
-            path: Some("/metrics".into()),
-            scheme: None,
-            relabelings: vec![],
-            metric_relabelings: vec![],
-        };
-
-        service_monitor.spec.endpoints.push(service_mon_endpoint);
 
         alerting_score.service_monitors.push(service_monitor);
 
         let rules_group = RuleGroup {
             name: format!("{}-rules", self.application.name().clone()),
-            rules: vec![build_rule_container_restarting(), build_rule_pod_failed()],
+            rules: build_default_application_rules(),
         };
 
         alerting_score.prometheus_rules.push(rules_group);
@@ -19,6 +19,7 @@ use crate::modules::monitoring::kube_prometheus::crd::crd_grafana::{
 use crate::modules::monitoring::kube_prometheus::crd::crd_prometheus_rules::{
     PrometheusRule, PrometheusRuleSpec, RuleGroup,
 };
+use crate::modules::monitoring::kube_prometheus::crd::grafana_default_dashboard::build_default_dashboard;
 use crate::modules::monitoring::kube_prometheus::crd::service_monitor::ServiceMonitor;
 use crate::topology::{K8sclient, Topology, k8s::K8sClient};
 use crate::{
@@ -37,16 +38,16 @@ use crate::{
 };
 
 #[derive(Clone, Debug, Serialize)]
-pub struct HelmPrometheusApplicationAlertingScore {
+pub struct CRDApplicationAlertingScore {
     pub namespace: String,
     pub receivers: Vec<Box<dyn CRDAlertManagerReceiver>>,
     pub service_monitors: Vec<ServiceMonitor>,
     pub prometheus_rules: Vec<RuleGroup>,
 }
 
-impl<T: Topology + K8sclient> Score<T> for HelmPrometheusApplicationAlertingScore {
+impl<T: Topology + K8sclient> Score<T> for CRDApplicationAlertingScore {
     fn create_interpret(&self) -> Box<dyn crate::interpret::Interpret<T>> {
-        Box::new(HelmPrometheusApplicationAlertingInterpret {
+        Box::new(CRDApplicationAlertingInterpret {
             namespace: self.namespace.clone(),
             receivers: self.receivers.clone(),
             service_monitors: self.service_monitors.clone(),
@@ -55,12 +56,12 @@ impl<T: Topology + K8sclient> Score<T> for HelmPrometheusApplicationAlertingScor
     }
 
     fn name(&self) -> String {
-        "HelmPrometheusApplicationAlertingScore".into()
+        "CRDApplicationAlertingScore".into()
     }
 }
 
 #[derive(Clone, Debug)]
-pub struct HelmPrometheusApplicationAlertingInterpret {
+pub struct CRDApplicationAlertingInterpret {
     pub namespace: String,
     pub receivers: Vec<Box<dyn CRDAlertManagerReceiver>>,
     pub service_monitors: Vec<ServiceMonitor>,
@@ -68,7 +69,7 @@ pub struct HelmPrometheusApplicationAlertingInterpret {
 }
 
 #[async_trait]
-impl<T: Topology + K8sclient> Interpret<T> for HelmPrometheusApplicationAlertingInterpret {
+impl<T: Topology + K8sclient> Interpret<T> for CRDApplicationAlertingInterpret {
     async fn execute(
         &self,
         _inventory: &Inventory,
@@ -85,7 +86,7 @@ impl<T: Topology + K8sclient> Interpret<T> for HelmPrometheusApplicationAlerting
         self.install_monitors(self.service_monitors.clone(), &client)
             .await?;
         Ok(Outcome::success(format!(
-            "deployed application monitoring composants channels"
+            "deployed application monitoring composants"
        )))
     }
 
@@ -106,7 +107,7 @@ impl<T: Topology + K8sclient> Interpret<T> for HelmPrometheusApplicationAlerting
     }
 }
 
-impl HelmPrometheusApplicationAlertingInterpret {
+impl CRDApplicationAlertingInterpret {
     async fn crd_exists(&self, crd: &str) -> bool {
         let output = Command::new("kubectl")
             .args(["get", "crd", crd])
@@ -428,41 +429,11 @@ impl HelmPrometheusApplicationAlertingInterpret {
         json_data.insert("timeInterval".to_string(), "5s".to_string());
         let namespace = self.namespace.clone();
 
-        let json = format!(
-            r#"{{
-  "title": "UP Status Dashboard",
-  "timezone": "browser",
-  "panels": [
-    {{
-      "type": "table",
-      "title": "Service UP Status",
-      "gridPos": {{ "x": 0, "y": 0, "w": 24, "h": 10 }},
-      "targets": [
-        {{
-          "expr": "up{{namespace=\"{namespace}\"}}",
-          "format": "table",
-          "refId": "A"
-        }}
-      ],
-      "options": {{
-        "showHeader": true
-      }},
-      "fieldConfig": {{
-        "defaults": {{
-          "custom": {{}}
-        }},
-        "overrides": []
-      }}
-    }}
-  ],
-  "schemaVersion": 30,
-  "version": 1
-}}"#
-        );
+        let json = build_default_dashboard(&namespace);
 
         let graf_data_source = GrafanaDatasource {
             metadata: ObjectMeta {
-                name: Some(self.namespace.clone()),
+                name: Some(format!("grafana-datasource-{}", self.namespace.clone())),
                 namespace: Some(self.namespace.clone()),
                 ..Default::default()
             },
@@ -491,7 +462,7 @@ impl HelmPrometheusApplicationAlertingInterpret {
 
         let graf_dashboard = GrafanaDashboard {
             metadata: ObjectMeta {
-                name: Some(self.namespace.clone()),
+                name: Some(format!("grafana-dashboard-{}", self.namespace.clone())),
                 namespace: Some(self.namespace.clone()),
                 ..Default::default()
             },
@@ -509,7 +480,7 @@ impl HelmPrometheusApplicationAlertingInterpret {
 
         let grafana = Grafana {
             metadata: ObjectMeta {
-                name: Some(self.namespace.clone()),
+                name: Some(format!("grafana-{}", self.namespace.clone())),
                 namespace: Some(self.namespace.clone()),
                 labels: Some(label.clone()),
                 ..Default::default()
@@ -1,2 +1,2 @@
-pub mod helm_prometheus_application_alerting;
+pub mod crd_application_monitoring_alerting;
 pub mod k8s_application_monitoring_score;
@@ -1,38 +1,30 @@
 use std::collections::BTreeMap;
 
+use crate::modules::{
+    monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule,
+    prometheus::alerts::k8s::{
+        deployment::alert_deployment_unavailable,
+        pod::{alert_container_restarting, alert_pod_not_ready, pod_failed},
+        pvc::high_pvc_fill_rate_over_two_days,
+        service::alert_service_down,
+    },
+};
+
 use super::crd_prometheus_rules::Rule;
 
-pub fn build_rule_container_restarting() -> Rule {
-    Rule {
-        alert: Some("ContainerRestarting".into()),
-        expr: Some("increase(kube_pod_container_status_restarts_total[5m]) > 3".into()),
-        for_: Some("5m".into()),
-        labels: Some(BTreeMap::from([("severity".into(), "warning".into())])),
-        annotations: Some(BTreeMap::from([
-            (
-                "summary".into(),
-                "Container is restarting frequently".into(),
-            ),
-            (
-                "description".into(),
-                "Container in this namespace is restarting more than 3 times in 5 minutes.".into(),
-            ),
-        ])),
-    }
-}
-
-pub fn build_rule_pod_failed() -> Rule {
-    Rule {
-        alert: Some("PodFailed".into()),
-        expr: Some("kube_pod_status_phase{phase=\"Failed\"} > 0".into()),
-        for_: Some("0m".into()),
-        labels: Some(BTreeMap::from([("severity".into(), "critical".into())])),
-        annotations: Some(BTreeMap::from([
-            ("summary".into(), "A pod has failed".into()),
-            (
-                "description".into(),
-                "One or more pods are in Failed phase.".into(),
-            ),
-        ])),
-    }
+pub fn build_default_application_rules() -> Vec<Rule> {
+    let pod_failed: Rule = pod_failed().into();
+    let container_restarting: Rule = alert_container_restarting().into();
+    let pod_not_ready: Rule = alert_pod_not_ready().into();
+    let service_down: Rule = alert_service_down().into();
+    let deployment_unavailable: Rule = alert_deployment_unavailable().into();
+    let high_pvc_fill_rate: Rule = high_pvc_fill_rate_over_two_days().into();
+    vec![
+        pod_failed,
+        container_restarting,
+        pod_not_ready,
+        service_down,
+        deployment_unavailable,
+        high_pvc_fill_rate,
+    ]
 }
@@ -4,8 +4,6 @@ use kube::CustomResource;
 use schemars::JsonSchema;
 use serde::{Deserialize, Serialize};
 
-use crate::modules::monitoring::kube_prometheus::types::Operator;
-
 use super::crd_prometheuses::LabelSelector;
 
 #[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)]
@@ -1,8 +1,12 @@
+use std::collections::BTreeMap;
+
 use kube::CustomResource;
 use schemars::JsonSchema;
 use serde::{Deserialize, Serialize};
 
-use super::crd_default_rules::{build_rule_container_restarting, build_rule_pod_failed};
+use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;
+
+use super::crd_default_rules::build_default_application_rules;
 
 #[derive(CustomResource, Debug, Serialize, Deserialize, Clone, JsonSchema)]
 #[kube(
@@ -42,13 +46,14 @@ pub struct Rule {
     pub annotations: Option<std::collections::BTreeMap<String, String>>,
 }
 
-impl PrometheusRuleSpec {
-    pub fn with_default_rules() -> Self {
-        PrometheusRuleSpec {
-            groups: vec![RuleGroup {
-                name: "default.rules".into(),
-                rules: vec![build_rule_container_restarting(), build_rule_pod_failed()],
-            }],
+impl From<PrometheusAlertRule> for Rule {
+    fn from(value: PrometheusAlertRule) -> Self {
+        Rule {
+            alert: Some(value.alert),
+            expr: Some(value.expr),
+            for_: value.r#for,
+            labels: Some(value.labels.into_iter().collect::<BTreeMap<_, _>>()),
+            annotations: Some(value.annotations.into_iter().collect::<BTreeMap<_, _>>()),
         }
     }
 }
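The new `From<PrometheusAlertRule> for Rule` impl above is what lets the shared alert definitions be reused as CRD rules. A minimal sketch of the conversion, assuming the `pod_failed` helper added later in this diff:

```rust
// Sketch: convert a shared PrometheusAlertRule into a CRD Rule via the new From impl.
let rule: Rule = pod_failed().into();
assert_eq!(rule.alert.as_deref(), Some("PodFailed"));
assert_eq!(rule.for_.as_deref(), Some("2m"));
```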
@@ -0,0 +1,203 @@
+pub fn build_default_dashboard(namespace: &str) -> String {
+    let dashboard = format!(
+        r#"{{
+  "annotations": {{
+    "list": []
+  }},
+  "editable": true,
+  "gnetId": null,
+  "graphTooltip": 0,
+  "id": null,
+  "iteration": 171105,
+  "panels": [
+    {{
+      "datasource": "$datasource",
+      "fieldConfig": {{
+        "defaults": {{
+          "unit": "short"
+        }},
+        "overrides": []
+      }},
+      "gridPos": {{
+        "h": 6,
+        "w": 6,
+        "x": 0,
+        "y": 0
+      }},
+      "id": 1,
+      "options": {{
+        "reduceOptions": {{
+          "calcs": ["lastNotNull"],
+          "fields": "",
+          "values": false
+        }}
+      }},
+      "pluginVersion": "9.0.0",
+      "targets": [
+        {{
+          "expr": "sum(kube_pod_status_phase{{namespace=\"{namespace}\", phase=\"Running\"}})",
+          "legendFormat": "",
+          "refId": "A"
+        }}
+      ],
+      "title": "Pods in Namespace",
+      "type": "stat"
+    }},
+    {{
+      "datasource": "$datasource",
+      "fieldConfig": {{
+        "defaults": {{
+          "unit": "short"
+        }},
+        "overrides": []
+      }},
+      "gridPos": {{
+        "h": 6,
+        "w": 6,
+        "x": 6,
+        "y": 0
+      }},
+      "id": 2,
+      "options": {{
+        "reduceOptions": {{
+          "calcs": ["lastNotNull"],
+          "fields": "",
+          "values": false
+        }}
+      }},
+      "pluginVersion": "9.0.0",
+      "targets": [
+        {{
+          "expr": "sum(kube_pod_status_phase{{phase=\"Failed\", namespace=\"{namespace}\"}})",
+          "legendFormat": "",
+          "refId": "A"
+        }}
+      ],
+      "title": "Pods in Failed State",
+      "type": "stat"
+    }},
+    {{
+      "datasource": "$datasource",
+      "fieldConfig": {{
+        "defaults": {{
+          "unit": "percentunit"
+        }},
+        "overrides": []
+      }},
+      "gridPos": {{
+        "h": 6,
+        "w": 12,
+        "x": 0,
+        "y": 6
+      }},
+      "id": 3,
+      "options": {{
+        "reduceOptions": {{
+          "calcs": ["lastNotNull"],
+          "fields": "",
+          "values": false
+        }}
+      }},
+      "pluginVersion": "9.0.0",
+      "targets": [
+        {{
+          "expr": "sum(kube_deployment_status_replicas_available{{namespace=\"{namespace}\"}}) / sum(kube_deployment_spec_replicas{{namespace=\"{namespace}\"}})",
+          "legendFormat": "",
+          "refId": "A"
+        }}
+      ],
+      "title": "Deployment Health (Available / Desired)",
+      "type": "stat"
+    }},
+    {{
+      "datasource": "$datasource",
+      "fieldConfig": {{
+        "defaults": {{
+          "unit": "short"
+        }},
+        "overrides": []
+      }},
+      "gridPos": {{
+        "h": 6,
+        "w": 12,
+        "x": 0,
+        "y": 12
+      }},
+      "id": 4,
+      "options": {{
+        "reduceOptions": {{
+          "calcs": ["lastNotNull"],
+          "fields": "",
+          "values": false
+        }}
+      }},
+      "pluginVersion": "9.0.0",
+      "targets": [
+        {{
+          "expr": "sum by(pod) (rate(kube_pod_container_status_restarts_total{{namespace=\"{namespace}\"}}[5m]))",
+          "legendFormat": "{{{{pod}}}}",
+          "refId": "A"
+        }}
+      ],
+      "title": "Container Restarts (per pod)",
+      "type": "timeseries"
+    }},
+    {{
+      "datasource": "$datasource",
+      "fieldConfig": {{
+        "defaults": {{
+          "unit": "short"
+        }},
+        "overrides": []
+      }},
+      "gridPos": {{
+        "h": 6,
+        "w": 12,
+        "x": 0,
+        "y": 18
+      }},
+      "id": 5,
+      "options": {{
+        "reduceOptions": {{
+          "calcs": ["lastNotNull"],
+          "fields": "",
+          "values": false
+        }}
+      }},
+      "pluginVersion": "9.0.0",
+      "targets": [
+        {{
+          "expr": "sum(ALERTS{{alertstate=\"firing\", namespace=\"{namespace}\"}}) or vector(0)",
+          "legendFormat": "",
+          "refId": "A"
+        }}
+      ],
+      "title": "Firing Alerts in Namespace",
+      "type": "stat"
+    }}
+  ],
+  "schemaVersion": 36,
+  "templating": {{
+    "list": [
+      {{
+        "name": "datasource",
+        "type": "datasource",
+        "pluginId": "prometheus",
+        "label": "Prometheus",
+        "query": "prometheus",
+        "refresh": 1,
+        "hide": 0,
+        "current": {{
+          "selected": true,
+          "text": "Prometheus",
+          "value": "Prometheus"
+        }}
+      }}
+    ]
+  }},
+  "title": "Tenant Namespace Overview",
+  "version": 1
+}}"#
+    );
+    dashboard
+}
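For reference, `build_default_dashboard` returns the dashboard as a JSON string for a single namespace. A quick way to sanity-check the output in a test; `serde_json` as a dev-dependency is an assumption here, not part of this commit:

```rust
// Sketch of a possible unit test; serde_json is assumed to be available.
let json = build_default_dashboard("example-app");
let parsed: serde_json::Value =
    serde_json::from_str(&json).expect("dashboard should be valid JSON");
assert_eq!(parsed["title"], "Tenant Namespace Overview");
assert_eq!(parsed["panels"].as_array().map(|p| p.len()), Some(5));
```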
@@ -4,6 +4,7 @@ pub mod crd_default_rules;
 pub mod crd_grafana;
 pub mod crd_prometheus_rules;
 pub mod crd_prometheuses;
+pub mod grafana_default_dashboard;
 pub mod grafana_operator;
 pub mod prometheus_operator;
 pub mod role;
@@ -58,6 +58,7 @@ config:
   # web-root: "disable"
   enable-signup: false
   enable-login: "true"
+  enable-metrics: "true"
 
 persistence:
   enabled: true
harmony/src/modules/prometheus/alerts/k8s/deployment.rs (new file)
@@ -0,0 +1,23 @@
+use std::collections::HashMap;
+
+use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;
+
+pub fn alert_deployment_unavailable() -> PrometheusAlertRule {
+    PrometheusAlertRule {
+        alert: "DeploymentUnavailable".into(),
+        expr: "kube_deployment_status_replicas_unavailable > 0".into(),
+        r#for: Some("2m".into()),
+        labels: HashMap::from([("severity".into(), "warning".into())]),
+        annotations: HashMap::from([
+            (
+                "summary".into(),
+                "Deployment has unavailable replicas".into(),
+            ),
+            (
+                "description".into(),
+                "A deployment in this namespace has unavailable replicas for over 2 minutes."
+                    .into(),
+            ),
+        ]),
+    }
+}
harmony/src/modules/prometheus/alerts/k8s/memory_usage.rs (new file)
@@ -0,0 +1,37 @@
+use std::collections::HashMap;
+
+use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;
+
+pub fn alert_high_memory_usage() -> PrometheusAlertRule {
+    PrometheusAlertRule {
+        alert: "HighMemoryUsage".into(),
+        expr: "container_memory_working_set_bytes{container!=\"\",namespace!=\"\"} > 500000000"
+            .into(),
+        r#for: Some("2m".into()),
+        labels: HashMap::from([("severity".into(), "warning".into())]),
+        annotations: HashMap::from([
+            ("summary".into(), "Pod is using high memory".into()),
+            (
+                "description".into(),
+                "A pod is consuming more than 500Mi of memory.".into(),
+            ),
+        ]),
+    }
+}
+
+pub fn alert_high_cpu_usage() -> PrometheusAlertRule {
+    PrometheusAlertRule {
+        alert: "HighCPUUsage".into(),
+        expr: "rate(container_cpu_usage_seconds_total{container!=\"\",namespace!=\"\"}[1m]) > 0.9"
+            .into(),
+        r#for: Some("1m".into()),
+        labels: HashMap::from([("severity".into(), "warning".into())]),
+        annotations: HashMap::from([
+            ("summary".into(), "Pod is using high CPU".into()),
+            (
+                "description".into(),
+                "A pod is using more than 90% of a core over 1 minute.".into(),
+            ),
+        ]),
+    }
+}
@@ -1 +1,5 @@
+pub mod deployment;
+pub mod memory_usage;
+pub mod pod;
 pub mod pvc;
+pub mod service;
harmony/src/modules/prometheus/alerts/k8s/pod.rs (new file)
@@ -0,0 +1,55 @@
+use std::collections::HashMap;
+
+use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;
+
+pub fn pod_failed() -> PrometheusAlertRule {
+    PrometheusAlertRule {
+        alert: "PodFailed".into(),
+        expr: "kube_pod_status_phase{phase=\"Failed\"} > 2".into(),
+        r#for: Some("2m".into()),
+        labels: HashMap::from([("severity".into(), "critical".into())]),
+        annotations: HashMap::from([
+            ("summary".into(), "A pod has failed".into()),
+            (
+                "description".into(),
+                "One or more pods are in Failed phase.".into(),
+            ),
+        ]),
+    }
+}
+
+pub fn alert_container_restarting() -> PrometheusAlertRule {
+    PrometheusAlertRule {
+        alert: "ContainerRestarting".into(),
+        expr: "increase(kube_pod_container_status_restarts_total[5m]) > 3".into(),
+        r#for: Some("5m".into()),
+        labels: HashMap::from([("severity".into(), "warning".into())]),
+        annotations: HashMap::from([
+            (
+                "summary".into(),
+                "Container is restarting frequently".into(),
+            ),
+            (
+                "description".into(),
+                "A container in this namespace has restarted more than 3 times in 5 minutes."
+                    .into(),
+            ),
+        ]),
+    }
+}
+
+pub fn alert_pod_not_ready() -> PrometheusAlertRule {
+    PrometheusAlertRule {
+        alert: "PodNotReady".into(),
+        expr: "kube_pod_status_ready{condition=\"true\"} == 0".into(),
+        r#for: Some("2m".into()),
+        labels: HashMap::from([("severity".into(), "warning".into())]),
+        annotations: HashMap::from([
+            ("summary".into(), "Pod is not ready".into()),
+            (
+                "description".into(),
+                "A pod in the namespace is not reporting Ready status.".into(),
+            ),
+        ]),
+    }
+}
harmony/src/modules/prometheus/alerts/k8s/service.rs (new file)
@@ -0,0 +1,19 @@
+use std::collections::HashMap;
+
+use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;
+
+pub fn alert_service_down() -> PrometheusAlertRule {
+    PrometheusAlertRule {
+        alert: "ServiceDown".into(),
+        expr: "up == 0".into(),
+        r#for: Some("1m".into()),
+        labels: HashMap::from([("severity".into(), "critical".into())]),
+        annotations: HashMap::from([
+            ("summary".into(), "Service is down".into()),
+            (
+                "description".into(),
+                "A target service in the namespace is not responding to Prometheus scrapes.".into(),
+            ),
+        ]),
+    }
+}