feat: added default prometheus rules and grafana dashboard for application monitoring
All checks were successful
Run Check Script / check (pull_request) Successful in 32s
parent 1d8b503bd2 · commit b9e208f4cf
@@ -1,10 +1,8 @@
 use std::sync::Arc;
 
-use crate::modules::monitoring::application_monitoring::helm_prometheus_application_alerting::HelmPrometheusApplicationAlertingScore;
+use crate::modules::monitoring::application_monitoring::crd_application_monitoring_alerting::CRDApplicationAlertingScore;
 use crate::modules::monitoring::kube_prometheus::crd::crd_alertmanager_config::CRDAlertManagerReceiver;
-use crate::modules::monitoring::kube_prometheus::crd::crd_default_rules::{
-    build_rule_container_restarting, build_rule_pod_failed,
-};
+use crate::modules::monitoring::kube_prometheus::crd::crd_default_rules::build_default_application_rules;
 use crate::modules::monitoring::kube_prometheus::crd::crd_prometheus_rules::RuleGroup;
 use crate::modules::monitoring::kube_prometheus::crd::service_monitor::{
     ServiceMonitor, ServiceMonitorSpec,
@@ -39,7 +37,7 @@ impl<T: Topology + HelmCommand + 'static + TenantManager + K8sclient + std::fmt:
     async fn ensure_installed(&self, topology: &T) -> Result<(), String> {
         info!("Ensuring monitoring is available for application");
         let namespace = self.application.name().clone();
-        let mut alerting_score = HelmPrometheusApplicationAlertingScore {
+        let mut alerting_score = CRDApplicationAlertingScore {
             namespace: namespace.clone(),
             receivers: self.alert_receiver.clone(),
             service_monitors: self.service_monitors.clone(),
@@ -92,9 +90,7 @@ impl<T: Topology + HelmCommand + 'static + TenantManager + K8sclient + std::fmt:
 
         alerting_score.receivers.push(Box::new(ntfy_receiver));
 
-        //TODO add service monitors to PrometheusApplicationMonitoring which can be
-        //deployed for the namespace using prometheus crd-servicemonitors
-        let mut service_monitor = ServiceMonitor {
+        let service_monitor = ServiceMonitor {
             metadata: ObjectMeta {
                 name: Some(self.application.name().clone()),
                 labels: Some(std::collections::BTreeMap::from([
@@ -110,22 +106,12 @@ impl<T: Topology + HelmCommand + 'static + TenantManager + K8sclient + std::fmt:
             },
             spec: ServiceMonitorSpec::default(),
         };
-        let service_mon_endpoint = ServiceMonitorEndpoint {
-            port: Some("http".into()),
-            interval: Some("30s".into()),
-            path: Some("/metrics".into()),
-            scheme: None,
-            relabelings: vec![],
-            metric_relabelings: vec![],
-        };
-
-        service_monitor.spec.endpoints.push(service_mon_endpoint);
 
         alerting_score.service_monitors.push(service_monitor);
 
         let rules_group = RuleGroup {
             name: format!("{}-rules", self.application.name().clone()),
-            rules: vec![build_rule_container_restarting(), build_rule_pod_failed()],
+            rules: build_default_application_rules(),
         };
 
         alerting_score.prometheus_rules.push(rules_group);
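Note: the hunks above also drop the explicit ServiceMonitorEndpoint, so the monitor is now created with a bare `ServiceMonitorSpec::default()` and `service_monitor` no longer needs to be `mut`. A caller that still wants the old scrape target could re-attach it; this is a sketch assuming the same `ServiceMonitorEndpoint` type that the old code used, not part of this commit:

    // Sketch: re-attaching the scrape endpoint removed above.
    let mut service_monitor = ServiceMonitor {
        metadata: ObjectMeta {
            name: Some(self.application.name().clone()),
            ..Default::default()
        },
        spec: ServiceMonitorSpec::default(),
    };
    service_monitor.spec.endpoints.push(ServiceMonitorEndpoint {
        port: Some("http".into()),    // scrape the service's "http" port
        interval: Some("30s".into()), // every 30 seconds
        path: Some("/metrics".into()),
        scheme: None,
        relabelings: vec![],
        metric_relabelings: vec![],
    });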
@@ -19,6 +19,7 @@ use crate::modules::monitoring::kube_prometheus::crd::crd_grafana::{
 use crate::modules::monitoring::kube_prometheus::crd::crd_prometheus_rules::{
     PrometheusRule, PrometheusRuleSpec, RuleGroup,
 };
+use crate::modules::monitoring::kube_prometheus::crd::grafana_default_dashboard::build_default_dashboard;
 use crate::modules::monitoring::kube_prometheus::crd::service_monitor::ServiceMonitor;
 use crate::topology::{K8sclient, Topology, k8s::K8sClient};
 use crate::{
@@ -37,16 +38,16 @@ use crate::{
 };
 
 #[derive(Clone, Debug, Serialize)]
-pub struct HelmPrometheusApplicationAlertingScore {
+pub struct CRDApplicationAlertingScore {
     pub namespace: String,
     pub receivers: Vec<Box<dyn CRDAlertManagerReceiver>>,
     pub service_monitors: Vec<ServiceMonitor>,
     pub prometheus_rules: Vec<RuleGroup>,
 }
 
-impl<T: Topology + K8sclient> Score<T> for HelmPrometheusApplicationAlertingScore {
+impl<T: Topology + K8sclient> Score<T> for CRDApplicationAlertingScore {
     fn create_interpret(&self) -> Box<dyn crate::interpret::Interpret<T>> {
-        Box::new(HelmPrometheusApplicationAlertingInterpret {
+        Box::new(CRDApplicationAlertingInterpret {
             namespace: self.namespace.clone(),
             receivers: self.receivers.clone(),
             service_monitors: self.service_monitors.clone(),
@@ -55,12 +56,12 @@ impl<T: Topology + K8sclient> Score<T> for HelmPrometheusApplicationAlertingScore {
     }
 
     fn name(&self) -> String {
-        "HelmPrometheusApplicationAlertingScore".into()
+        "CRDApplicationAlertingScore".into()
     }
 }
 
 #[derive(Clone, Debug)]
-pub struct HelmPrometheusApplicationAlertingInterpret {
+pub struct CRDApplicationAlertingInterpret {
     pub namespace: String,
     pub receivers: Vec<Box<dyn CRDAlertManagerReceiver>>,
     pub service_monitors: Vec<ServiceMonitor>,
@@ -68,7 +69,7 @@ pub struct HelmPrometheusApplicationAlertingInterpret {
 }
 
 #[async_trait]
-impl<T: Topology + K8sclient> Interpret<T> for HelmPrometheusApplicationAlertingInterpret {
+impl<T: Topology + K8sclient> Interpret<T> for CRDApplicationAlertingInterpret {
     async fn execute(
         &self,
         _inventory: &Inventory,
@@ -85,7 +86,7 @@ impl<T: Topology + K8sclient> Interpret<T> for HelmPrometheusApplicationAlertingInterpret {
         self.install_monitors(self.service_monitors.clone(), &client)
             .await?;
         Ok(Outcome::success(format!(
-            "deployed application monitoring composants channels"
+            "deployed application monitoring composants"
         )))
     }
 
@@ -106,7 +107,7 @@ impl<T: Topology + K8sclient> Interpret<T> for HelmPrometheusApplicationAlertingInterpret {
     }
 }
 
-impl HelmPrometheusApplicationAlertingInterpret {
+impl CRDApplicationAlertingInterpret {
     async fn crd_exists(&self, crd: &str) -> bool {
         let output = Command::new("kubectl")
             .args(["get", "crd", crd])
@@ -428,41 +429,11 @@ impl HelmPrometheusApplicationAlertingInterpret {
         json_data.insert("timeInterval".to_string(), "5s".to_string());
         let namespace = self.namespace.clone();
 
-        let json = format!(
-            r#"{{
-    "title": "UP Status Dashboard",
-    "timezone": "browser",
-    "panels": [
-        {{
-            "type": "table",
-            "title": "Service UP Status",
-            "gridPos": {{ "x": 0, "y": 0, "w": 24, "h": 10 }},
-            "targets": [
-                {{
-                    "expr": "up{{namespace=\"{namespace}\"}}",
-                    "format": "table",
-                    "refId": "A"
-                }}
-            ],
-            "options": {{
-                "showHeader": true
-            }},
-            "fieldConfig": {{
-                "defaults": {{
-                    "custom": {{}}
-                }},
-                "overrides": []
-            }}
-        }}
-    ],
-    "schemaVersion": 30,
-    "version": 1
-}}"#
-        );
+        let json = build_default_dashboard(&namespace);
 
         let graf_data_source = GrafanaDatasource {
             metadata: ObjectMeta {
-                name: Some(self.namespace.clone()),
+                name: Some(format!("grafana-datasource-{}", self.namespace.clone())),
                 namespace: Some(self.namespace.clone()),
                 ..Default::default()
             },
@@ -491,7 +462,7 @@ impl HelmPrometheusApplicationAlertingInterpret {
 
         let graf_dashboard = GrafanaDashboard {
             metadata: ObjectMeta {
-                name: Some(self.namespace.clone()),
+                name: Some(format!("grafana-dashboard-{}", self.namespace.clone())),
                 namespace: Some(self.namespace.clone()),
                 ..Default::default()
             },
@@ -509,7 +480,7 @@ impl HelmPrometheusApplicationAlertingInterpret {
 
         let grafana = Grafana {
             metadata: ObjectMeta {
-                name: Some(self.namespace.clone()),
+                name: Some(format!("grafana-{}", self.namespace.clone())),
                 namespace: Some(self.namespace.clone()),
                 labels: Some(label.clone()),
                 ..Default::default()
@@ -1,2 +1,2 @@
-pub mod helm_prometheus_application_alerting;
+pub mod crd_application_monitoring_alerting;
 pub mod k8s_application_monitoring_score;
@@ -1,38 +1,30 @@
 use std::collections::BTreeMap;
 
+use crate::modules::{
+    monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule,
+    prometheus::alerts::k8s::{
+        deployment::alert_deployment_unavailable,
+        pod::{alert_container_restarting, alert_pod_not_ready, pod_failed},
+        pvc::high_pvc_fill_rate_over_two_days,
+        service::alert_service_down,
+    },
+};
+
 use super::crd_prometheus_rules::Rule;
 
-pub fn build_rule_container_restarting() -> Rule {
-    Rule {
-        alert: Some("ContainerRestarting".into()),
-        expr: Some("increase(kube_pod_container_status_restarts_total[5m]) > 3".into()),
-        for_: Some("5m".into()),
-        labels: Some(BTreeMap::from([("severity".into(), "warning".into())])),
-        annotations: Some(BTreeMap::from([
-            (
-                "summary".into(),
-                "Container is restarting frequently".into(),
-            ),
-            (
-                "description".into(),
-                "Container in this namespace is restarting more than 3 times in 5 minutes.".into(),
-            ),
-        ])),
-    }
-}
-
-pub fn build_rule_pod_failed() -> Rule {
-    Rule {
-        alert: Some("PodFailed".into()),
-        expr: Some("kube_pod_status_phase{phase=\"Failed\"} > 0".into()),
-        for_: Some("0m".into()),
-        labels: Some(BTreeMap::from([("severity".into(), "critical".into())])),
-        annotations: Some(BTreeMap::from([
-            ("summary".into(), "A pod has failed".into()),
-            (
-                "description".into(),
-                "One or more pods are in Failed phase.".into(),
-            ),
-        ])),
-    }
+pub fn build_default_application_rules() -> Vec<Rule> {
+    let pod_failed: Rule = pod_failed().into();
+    let container_restarting: Rule = alert_container_restarting().into();
+    let pod_not_ready: Rule = alert_pod_not_ready().into();
+    let service_down: Rule = alert_service_down().into();
+    let deployment_unavailable: Rule = alert_deployment_unavailable().into();
+    let high_pvc_fill_rate: Rule = high_pvc_fill_rate_over_two_days().into();
+    vec![
+        pod_failed,
+        container_restarting,
+        pod_not_ready,
+        service_down,
+        deployment_unavailable,
+        high_pvc_fill_rate,
+    ]
 }
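Note: the `.into()` calls above compile through the `From<PrometheusAlertRule> for Rule` impl added later in this commit. An equivalent, more compact form of the builder, as a sketch using only constructors that appear in this diff:

    // Sketch: same result as build_default_application_rules(), collected
    // through the From<PrometheusAlertRule> for Rule impl added below.
    pub fn build_default_application_rules() -> Vec<Rule> {
        [
            pod_failed(),
            alert_container_restarting(),
            alert_pod_not_ready(),
            alert_service_down(),
            alert_deployment_unavailable(),
            high_pvc_fill_rate_over_two_days(),
        ]
        .into_iter()
        .map(Rule::from)
        .collect()
    }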
@@ -4,8 +4,6 @@ use kube::CustomResource;
 use schemars::JsonSchema;
 use serde::{Deserialize, Serialize};
 
-use crate::modules::monitoring::kube_prometheus::types::Operator;
-
 use super::crd_prometheuses::LabelSelector;
 
 #[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)]
@@ -1,8 +1,12 @@
+use std::collections::BTreeMap;
+
 use kube::CustomResource;
 use schemars::JsonSchema;
 use serde::{Deserialize, Serialize};
 
-use super::crd_default_rules::{build_rule_container_restarting, build_rule_pod_failed};
+use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;
 
+use super::crd_default_rules::build_default_application_rules;
+
 #[derive(CustomResource, Debug, Serialize, Deserialize, Clone, JsonSchema)]
 #[kube(
@@ -42,13 +46,14 @@ pub struct Rule {
     pub annotations: Option<std::collections::BTreeMap<String, String>>,
 }
 
-impl PrometheusRuleSpec {
-    pub fn with_default_rules() -> Self {
-        PrometheusRuleSpec {
-            groups: vec![RuleGroup {
-                name: "default.rules".into(),
-                rules: vec![build_rule_container_restarting(), build_rule_pod_failed()],
-            }],
+impl From<PrometheusAlertRule> for Rule {
+    fn from(value: PrometheusAlertRule) -> Self {
+        Rule {
+            alert: Some(value.alert),
+            expr: Some(value.expr),
+            for_: value.r#for,
+            labels: Some(value.labels.into_iter().collect::<BTreeMap<_, _>>()),
+            annotations: Some(value.annotations.into_iter().collect::<BTreeMap<_, _>>()),
         }
     }
 }
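Note: the `From` impl above, together with the constructors added below, pins down the shape of `PrometheusAlertRule`. Its actual definition lives in `alert_rule::prometheus_alert_rule` and is not part of this diff; this is a reconstruction, not the real source:

    // Reconstructed shape of PrometheusAlertRule, implied by the From impl
    // and by the alert constructors in this commit.
    use std::collections::HashMap;

    pub struct PrometheusAlertRule {
        pub alert: String,                        // maps to Rule::alert
        pub expr: String,                         // maps to Rule::expr
        pub r#for: Option<String>,                // maps to Rule::for_
        pub labels: HashMap<String, String>,      // collected into a BTreeMap
        pub annotations: HashMap<String, String>, // collected into a BTreeMap
    }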
@@ -0,0 +1,203 @@
+pub fn build_default_dashboard(namespace: &str) -> String {
+    let dashboard = format!(
+        r#"{{
+  "annotations": {{
+    "list": []
+  }},
+  "editable": true,
+  "gnetId": null,
+  "graphTooltip": 0,
+  "id": null,
+  "iteration": 171105,
+  "panels": [
+    {{
+      "datasource": "$datasource",
+      "fieldConfig": {{
+        "defaults": {{
+          "unit": "short"
+        }},
+        "overrides": []
+      }},
+      "gridPos": {{
+        "h": 6,
+        "w": 6,
+        "x": 0,
+        "y": 0
+      }},
+      "id": 1,
+      "options": {{
+        "reduceOptions": {{
+          "calcs": ["lastNotNull"],
+          "fields": "",
+          "values": false
+        }}
+      }},
+      "pluginVersion": "9.0.0",
+      "targets": [
+        {{
+          "expr": "sum(kube_pod_status_phase{{namespace=\"{namespace}\", phase=\"Running\"}})",
+          "legendFormat": "",
+          "refId": "A"
+        }}
+      ],
+      "title": "Pods in Namespace",
+      "type": "stat"
+    }},
+    {{
+      "datasource": "$datasource",
+      "fieldConfig": {{
+        "defaults": {{
+          "unit": "short"
+        }},
+        "overrides": []
+      }},
+      "gridPos": {{
+        "h": 6,
+        "w": 6,
+        "x": 6,
+        "y": 0
+      }},
+      "id": 2,
+      "options": {{
+        "reduceOptions": {{
+          "calcs": ["lastNotNull"],
+          "fields": "",
+          "values": false
+        }}
+      }},
+      "pluginVersion": "9.0.0",
+      "targets": [
+        {{
+          "expr": "sum(kube_pod_status_phase{{phase=\"Failed\", namespace=\"{namespace}\"}})",
+          "legendFormat": "",
+          "refId": "A"
+        }}
+      ],
+      "title": "Pods in Failed State",
+      "type": "stat"
+    }},
+    {{
+      "datasource": "$datasource",
+      "fieldConfig": {{
+        "defaults": {{
+          "unit": "percentunit"
+        }},
+        "overrides": []
+      }},
+      "gridPos": {{
+        "h": 6,
+        "w": 12,
+        "x": 0,
+        "y": 6
+      }},
+      "id": 3,
+      "options": {{
+        "reduceOptions": {{
+          "calcs": ["lastNotNull"],
+          "fields": "",
+          "values": false
+        }}
+      }},
+      "pluginVersion": "9.0.0",
+      "targets": [
+        {{
+          "expr": "sum(kube_deployment_status_replicas_available{{namespace=\"{namespace}\"}}) / sum(kube_deployment_spec_replicas{{namespace=\"{namespace}\"}})",
+          "legendFormat": "",
+          "refId": "A"
+        }}
+      ],
+      "title": "Deployment Health (Available / Desired)",
+      "type": "stat"
+    }},
+    {{
+      "datasource": "$datasource",
+      "fieldConfig": {{
+        "defaults": {{
+          "unit": "short"
+        }},
+        "overrides": []
+      }},
+      "gridPos": {{
+        "h": 6,
+        "w": 12,
+        "x": 0,
+        "y": 12
+      }},
+      "id": 4,
+      "options": {{
+        "reduceOptions": {{
+          "calcs": ["lastNotNull"],
+          "fields": "",
+          "values": false
+        }}
+      }},
+      "pluginVersion": "9.0.0",
+      "targets": [
+        {{
+          "expr": "sum by(pod) (rate(kube_pod_container_status_restarts_total{{namespace=\"{namespace}\"}}[5m]))",
+          "legendFormat": "{{{{pod}}}}",
+          "refId": "A"
+        }}
+      ],
+      "title": "Container Restarts (per pod)",
+      "type": "timeseries"
+    }},
+    {{
+      "datasource": "$datasource",
+      "fieldConfig": {{
+        "defaults": {{
+          "unit": "short"
+        }},
+        "overrides": []
+      }},
+      "gridPos": {{
+        "h": 6,
+        "w": 12,
+        "x": 0,
+        "y": 18
+      }},
+      "id": 5,
+      "options": {{
+        "reduceOptions": {{
+          "calcs": ["lastNotNull"],
+          "fields": "",
+          "values": false
+        }}
+      }},
+      "pluginVersion": "9.0.0",
+      "targets": [
+        {{
+          "expr": "sum(ALERTS{{alertstate=\"firing\", namespace=\"{namespace}\"}}) or vector(0)",
+          "legendFormat": "",
+          "refId": "A"
+        }}
+      ],
+      "title": "Firing Alerts in Namespace",
+      "type": "stat"
+    }}
+  ],
+  "schemaVersion": 36,
+  "templating": {{
+    "list": [
+      {{
+        "name": "datasource",
+        "type": "datasource",
+        "pluginId": "prometheus",
+        "label": "Prometheus",
+        "query": "prometheus",
+        "refresh": 1,
+        "hide": 0,
+        "current": {{
+          "selected": true,
+          "text": "Prometheus",
+          "value": "Prometheus"
+        }}
+      }}
+    ]
+  }},
+  "title": "Tenant Namespace Overview",
+  "version": 1
+}}"#
+    );
+    dashboard
+}
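Note: since the dashboard is assembled with `format!` and escaped braces, one stray `{{` or `}}` silently yields invalid JSON. A hypothetical smoke test (not in this commit; assumes `serde_json` is available as a dev-dependency) would catch that early:

    #[cfg(test)]
    mod tests {
        use super::build_default_dashboard;

        #[test]
        fn default_dashboard_is_valid_json() {
            // Round-trip the generated string through serde_json to catch
            // escaping mistakes in the format! template.
            let json = build_default_dashboard("some-namespace");
            serde_json::from_str::<serde_json::Value>(&json)
                .expect("generated dashboard should be valid JSON");
        }
    }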
@@ -4,6 +4,7 @@ pub mod crd_default_rules;
 pub mod crd_grafana;
 pub mod crd_prometheus_rules;
 pub mod crd_prometheuses;
+pub mod grafana_default_dashboard;
 pub mod grafana_operator;
 pub mod prometheus_operator;
 pub mod role;
@@ -58,6 +58,7 @@ config:
   # web-root: "disable"
   enable-signup: false
   enable-login: "true"
+  enable-metrics: "true"
 
 persistence:
   enabled: true
harmony/src/modules/prometheus/alerts/k8s/deployment.rs (new file, +23)
@@ -0,0 +1,23 @@
+use std::collections::HashMap;
+
+use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;
+
+pub fn alert_deployment_unavailable() -> PrometheusAlertRule {
+    PrometheusAlertRule {
+        alert: "DeploymentUnavailable".into(),
+        expr: "kube_deployment_status_replicas_unavailable > 0".into(),
+        r#for: Some("2m".into()),
+        labels: HashMap::from([("severity".into(), "warning".into())]),
+        annotations: HashMap::from([
+            (
+                "summary".into(),
+                "Deployment has unavailable replicas".into(),
+            ),
+            (
+                "description".into(),
+                "A deployment in this namespace has unavailable replicas for over 2 minutes."
+                    .into(),
+            ),
+        ]),
+    }
+}
harmony/src/modules/prometheus/alerts/k8s/memory_usage.rs (new file, +37)
@@ -0,0 +1,37 @@
+use std::collections::HashMap;
+
+use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;
+
+pub fn alert_high_memory_usage() -> PrometheusAlertRule {
+    PrometheusAlertRule {
+        alert: "HighMemoryUsage".into(),
+        expr: "container_memory_working_set_bytes{container!=\"\",namespace!=\"\"} > 500000000"
+            .into(),
+        r#for: Some("2m".into()),
+        labels: HashMap::from([("severity".into(), "warning".into())]),
+        annotations: HashMap::from([
+            ("summary".into(), "Pod is using high memory".into()),
+            (
+                "description".into(),
+                "A pod is consuming more than 500Mi of memory.".into(),
+            ),
+        ]),
+    }
+}
+
+pub fn alert_high_cpu_usage() -> PrometheusAlertRule {
+    PrometheusAlertRule {
+        alert: "HighCPUUsage".into(),
+        expr: "rate(container_cpu_usage_seconds_total{container!=\"\",namespace!=\"\"}[1m]) > 0.9"
+            .into(),
+        r#for: Some("1m".into()),
+        labels: HashMap::from([("severity".into(), "warning".into())]),
+        annotations: HashMap::from([
+            ("summary".into(), "Pod is using high CPU".into()),
+            (
+                "description".into(),
+                "A pod is using more than 90% of a core over 1 minute.".into(),
+            ),
+        ]),
+    }
+}
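Note: `alert_high_memory_usage` and `alert_high_cpu_usage` are defined here but not included in `build_default_application_rules()`. A caller could opt in; a sketch using only types from this diff:

    // Sketch: extending the defaults with the memory/CPU alerts (these are
    // not wired up anywhere in this commit).
    let mut rules = build_default_application_rules();
    rules.push(alert_high_memory_usage().into());
    rules.push(alert_high_cpu_usage().into());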
@@ -1 +1,5 @@
+pub mod deployment;
+pub mod memory_usage;
+pub mod pod;
 pub mod pvc;
+pub mod service;
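Note: `pvc` is the only pre-existing module in this list; its `high_pvc_fill_rate_over_two_days()` constructor is imported by `crd_default_rules` but not shown in this diff. Following the pattern of the new files, it presumably looks something like the hypothetical sketch below; the real expression and timings live in pvc.rs and may differ:

    // Hypothetical sketch of the existing PVC rule (not part of this diff).
    pub fn high_pvc_fill_rate_over_two_days() -> PrometheusAlertRule {
        PrometheusAlertRule {
            alert: "HighPVCFillRate".into(),
            // predict_linear: is the volume projected to run out of space
            // within the next two days?
            expr: "predict_linear(kubelet_volume_stats_available_bytes[6h], 2 * 24 * 3600) < 0"
                .into(),
            r#for: Some("1h".into()),
            labels: HashMap::from([("severity".into(), "warning".into())]),
            annotations: HashMap::from([
                ("summary".into(), "PVC is filling up quickly".into()),
                (
                    "description".into(),
                    "A persistent volume claim is projected to fill within two days.".into(),
                ),
            ]),
        }
    }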
harmony/src/modules/prometheus/alerts/k8s/pod.rs (new file, +55)
@@ -0,0 +1,55 @@
+use std::collections::HashMap;
+
+use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;
+
+pub fn pod_failed() -> PrometheusAlertRule {
+    PrometheusAlertRule {
+        alert: "PodFailed".into(),
+        expr: "kube_pod_status_phase{phase=\"Failed\"} > 2".into(),
+        r#for: Some("2m".into()),
+        labels: HashMap::from([("severity".into(), "critical".into())]),
+        annotations: HashMap::from([
+            ("summary".into(), "A pod has failed".into()),
+            (
+                "description".into(),
+                "One or more pods are in Failed phase.".into(),
+            ),
+        ]),
+    }
+}
+
+pub fn alert_container_restarting() -> PrometheusAlertRule {
+    PrometheusAlertRule {
+        alert: "ContainerRestarting".into(),
+        expr: "increase(kube_pod_container_status_restarts_total[5m]) > 3".into(),
+        r#for: Some("5m".into()),
+        labels: HashMap::from([("severity".into(), "warning".into())]),
+        annotations: HashMap::from([
+            (
+                "summary".into(),
+                "Container is restarting frequently".into(),
+            ),
+            (
+                "description".into(),
+                "A container in this namespace has restarted more than 3 times in 5 minutes."
+                    .into(),
+            ),
+        ]),
+    }
+}
+
+pub fn alert_pod_not_ready() -> PrometheusAlertRule {
+    PrometheusAlertRule {
+        alert: "PodNotReady".into(),
+        expr: "kube_pod_status_ready{condition=\"true\"} == 0".into(),
+        r#for: Some("2m".into()),
+        labels: HashMap::from([("severity".into(), "warning".into())]),
+        annotations: HashMap::from([
+            ("summary".into(), "Pod is not ready".into()),
+            (
+                "description".into(),
+                "A pod in the namespace is not reporting Ready status.".into(),
+            ),
+        ]),
+    }
+}
harmony/src/modules/prometheus/alerts/k8s/service.rs (new file, +19)
@@ -0,0 +1,19 @@
+use std::collections::HashMap;
+
+use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;
+
+pub fn alert_service_down() -> PrometheusAlertRule {
+    PrometheusAlertRule {
+        alert: "ServiceDown".into(),
+        expr: "up == 0".into(),
+        r#for: Some("1m".into()),
+        labels: HashMap::from([("severity".into(), "critical".into())]),
+        annotations: HashMap::from([
+            ("summary".into(), "Service is down".into()),
+            (
+                "description".into(),
+                "A target service in the namespace is not responding to Prometheus scrapes.".into(),
+            ),
+        ]),
+    }
+}