feat(monitoring): Ceph alerts integrated with OKD's native alerting stack #265
11
Cargo.lock
generated
11
Cargo.lock
generated
@@ -2873,6 +2873,17 @@ dependencies = [
|
|||||||
"url",
|
"url",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "example-okd-ceph-alerts"
|
||||||
|
version = "0.1.0"
|
||||||
|
dependencies = [
|
||||||
|
"harmony",
|
||||||
|
"harmony_cli",
|
||||||
|
"harmony_types",
|
||||||
|
"log",
|
||||||
|
"tokio",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "example-okd-cluster-alerts"
|
name = "example-okd-cluster-alerts"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
|
|||||||
14
examples/okd_ceph_alerts/Cargo.toml
Normal file
14
examples/okd_ceph_alerts/Cargo.toml
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
[package]
|
||||||
|
name = "example-okd-ceph-alerts"
|
||||||
|
edition = "2024"
|
||||||
|
version.workspace = true
|
||||||
|
readme.workspace = true
|
||||||
|
license.workspace = true
|
||||||
|
publish = false
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
harmony = { path = "../../harmony" }
|
||||||
|
harmony_cli = { path = "../../harmony_cli" }
|
||||||
|
harmony_types = { path = "../../harmony_types" }
|
||||||
|
tokio = { workspace = true }
|
||||||
|
log = { workspace = true }
|
||||||
4
examples/okd_ceph_alerts/env.sh
Normal file
4
examples/okd_ceph_alerts/env.sh
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
export HARMONY_SECRET_NAMESPACE=okd_ceph_alerts_example
|
||||||
|
export HARMONY_SECRET_STORE=file
|
||||||
|
export HARMONY_DATABASE_URL=sqlite://harmony_okd_ceph_alerts_example.sqlite
|
||||||
|
export RUST_LOG=harmony=debug
|
||||||
28
examples/okd_ceph_alerts/src/main.rs
Normal file
28
examples/okd_ceph_alerts/src/main.rs
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
use harmony::{
|
||||||
|
inventory::Inventory,
|
||||||
|
modules::monitoring::{
|
||||||
|
ceph_alerts::ceph_alert_rule_groups, okd::cluster_alert_rules::OpenshiftPrometheusRuleScore,
|
||||||
|
},
|
||||||
|
topology::K8sAnywhereTopology,
|
||||||
|
};
|
||||||
|
|
||||||
|
#[tokio::main]
|
||||||
|
async fn main() {
|
||||||
|
harmony_cli::cli_logger::init();
|
||||||
|
|
||||||
|
let ceph_rules = OpenshiftPrometheusRuleScore {
|
||||||
|
namespace: "rook-ceph".to_string(),
|
||||||
|
name: "ceph-alerts".to_string(),
|
||||||
|
rule_groups: ceph_alert_rule_groups(),
|
||||||
|
labels: None,
|
||||||
|
};
|
||||||
|
|
||||||
|
harmony_cli::run(
|
||||||
|
Inventory::autoload(),
|
||||||
|
K8sAnywhereTopology::from_env(),
|
||||||
|
vec![Box::new(ceph_rules)],
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
}
|
||||||
167
harmony/src/modules/monitoring/ceph_alerts.rs
Normal file
167
harmony/src/modules/monitoring/ceph_alerts.rs
Normal file
@@ -0,0 +1,167 @@
|
|||||||
|
use std::collections::BTreeMap;
|
||||||
|
|
||||||
|
use crate::modules::monitoring::kube_prometheus::crd::crd_prometheus_rules::{Rule, RuleGroup};
|
||||||
|
|
||||||
|
pub fn ceph_alert_rule_groups() -> Vec<RuleGroup> {
|
||||||
|
vec![
|
||||||
|
RuleGroup {
|
||||||
|
name: "ceph-cluster-health".to_string(),
|
||||||
|
rules: vec![
|
||||||
|
alert(
|
||||||
|
"CephHealthWarn",
|
||||||
|
"max(ceph_health_status) == 1",
|
||||||
|
Some("15m"),
|
||||||
|
"warning",
|
||||||
|
"Ceph cluster health is WARN",
|
||||||
|
"Ceph reports HEALTH_WARN for more than 15 minutes. Run `ceph -s` or check the Ceph dashboard to see active health checks.",
|
||||||
|
),
|
||||||
|
alert(
|
||||||
|
"CephHealthErr",
|
||||||
|
"max(ceph_health_status) == 2",
|
||||||
|
Some("5m"),
|
||||||
|
"critical",
|
||||||
|
"Ceph cluster health is ERR",
|
||||||
|
"Ceph reports HEALTH_ERR for more than 5 minutes. Immediate investigation required.",
|
||||||
|
),
|
||||||
|
alert(
|
||||||
|
"CephMonDown",
|
||||||
|
"count(max by (ceph_daemon) (ceph_mon_quorum_status == 0)) > 0",
|
||||||
|
Some("5m"),
|
||||||
|
"critical",
|
||||||
|
"Ceph monitor is out of quorum",
|
||||||
|
"One or more Ceph monitors are not in quorum. Quorum loss risks cluster availability.",
|
||||||
|
),
|
||||||
|
alert(
|
||||||
|
"CephMgrAbsent",
|
||||||
|
"sum(max by (ceph_daemon) (ceph_mgr_status)) < 1",
|
||||||
|
Some("5m"),
|
||||||
|
"critical",
|
||||||
|
"No active Ceph manager",
|
||||||
|
"No Ceph manager daemon is currently active. Dashboards and orchestration will be unavailable.",
|
||||||
|
),
|
||||||
|
],
|
||||||
|
},
|
||||||
|
RuleGroup {
|
||||||
|
name: "ceph-osd".to_string(),
|
||||||
|
rules: vec![
|
||||||
|
alert(
|
||||||
|
"CephOSDDown",
|
||||||
|
"count(max by (ceph_daemon) (ceph_osd_up == 0)) > 0",
|
||||||
|
Some("5m"),
|
||||||
|
"warning",
|
||||||
|
"One or more Ceph OSDs are down",
|
||||||
|
"At least one OSD daemon is reporting down for 5 minutes. Data redundancy may be reduced.",
|
||||||
|
),
|
||||||
|
alert(
|
||||||
|
"CephOSDNearFull",
|
||||||
|
"max by (ceph_daemon) (100 * ceph_osd_stat_bytes_used / ceph_osd_stat_bytes) > 80",
|
||||||
|
Some("15m"),
|
||||||
|
"warning",
|
||||||
|
"Ceph OSD is near full",
|
||||||
|
"OSD {{ $labels.ceph_daemon }} is above 80% utilization. Rebalance or add capacity.",
|
||||||
|
),
|
||||||
|
alert(
|
||||||
|
"CephOSDFull",
|
||||||
|
"max by (ceph_daemon) (100 * ceph_osd_stat_bytes_used / ceph_osd_stat_bytes) > 90",
|
||||||
|
Some("5m"),
|
||||||
|
"critical",
|
||||||
|
"Ceph OSD is critically full",
|
||||||
|
"OSD {{ $labels.ceph_daemon }} is above 90% utilization. Writes may block. Act immediately.",
|
||||||
|
),
|
||||||
|
],
|
||||||
|
},
|
||||||
|
RuleGroup {
|
||||||
|
name: "ceph-capacity".to_string(),
|
||||||
|
rules: vec![
|
||||||
|
alert(
|
||||||
|
"CephClusterNearFull",
|
||||||
|
"100 * max(ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes) / max(ceph_cluster_total_bytes) > 75",
|
||||||
|
Some("15m"),
|
||||||
|
"warning",
|
||||||
|
"Ceph cluster is near full",
|
||||||
|
"Cluster raw utilization is above 75% for 15 minutes.",
|
||||||
|
),
|
||||||
|
alert(
|
||||||
|
"CephClusterCriticallyFull",
|
||||||
|
"100 * max(ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes) / max(ceph_cluster_total_bytes) > 85",
|
||||||
|
Some("5m"),
|
||||||
|
"critical",
|
||||||
|
"Ceph cluster is critically full",
|
||||||
|
"Cluster raw utilization is above 85%. Imminent risk of write unavailability.",
|
||||||
|
),
|
||||||
|
alert(
|
||||||
|
"CephPoolNearFull",
|
||||||
|
"100 * max by (pool_id) (ceph_pool_bytes_used) / (max by (pool_id) (ceph_pool_bytes_used) + max by (pool_id) (ceph_pool_max_avail)) > 80",
|
||||||
|
Some("15m"),
|
||||||
|
"warning",
|
||||||
|
"Ceph pool is near full",
|
||||||
|
"Pool (pool_id {{ $labels.pool_id }}) is above 80% usage.",
|
||||||
|
),
|
||||||
|
alert(
|
||||||
|
"CephDaysUntilFull",
|
||||||
|
"(max(ceph_cluster_total_bytes) - max(ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes)) / clamp_min(deriv(max(ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes)[7d:1h]), 1) / 86400 < 30",
|
||||||
|
Some("1h"),
|
||||||
|
"warning",
|
||||||
|
"Ceph cluster predicted to fill within 30 days",
|
||||||
|
"Based on the 7-day usage trend, the cluster will reach capacity in less than 30 days.",
|
||||||
|
),
|
||||||
|
],
|
||||||
|
},
|
||||||
|
RuleGroup {
|
||||||
|
name: "ceph-placement-groups".to_string(),
|
||||||
|
rules: vec![
|
||||||
|
alert(
|
||||||
|
"CephPGsNotActiveClean",
|
||||||
|
"max(ceph_pg_total) - max(ceph_pg_clean) > 0",
|
||||||
|
Some("15m"),
|
||||||
|
"warning",
|
||||||
|
"Some placement groups are not active+clean",
|
||||||
|
"{{ $value }} PGs have been in a non-clean state for more than 15 minutes.",
|
||||||
|
),
|
||||||
|
alert(
|
||||||
|
"CephSlowOps",
|
||||||
|
"max(ceph_healthcheck_slow_ops) > 0",
|
||||||
|
Some("5m"),
|
||||||
|
"warning",
|
||||||
|
"Ceph reports slow ops",
|
||||||
|
"Ceph has {{ $value }} slow operations outstanding for more than 5 minutes.",
|
||||||
|
),
|
||||||
|
],
|
||||||
|
},
|
||||||
|
RuleGroup {
|
||||||
|
name: "ceph-nodes".to_string(),
|
||||||
|
rules: vec![alert(
|
||||||
|
"CephNodeRootDiskUsage",
|
||||||
|
"100 * (1 - (max by (instance, mountpoint) (node_filesystem_avail_bytes{mountpoint=~\"/|/var\",fstype!~\"tmpfs|overlay|squashfs|ramfs\"}) / max by (instance, mountpoint) (node_filesystem_size_bytes{mountpoint=~\"/|/var\",fstype!~\"tmpfs|overlay|squashfs|ramfs\"}))) > 85",
|
||||||
|
Some("10m"),
|
||||||
|
"warning",
|
||||||
|
"Ceph node root/var disk above 85%",
|
||||||
|
"Node {{ $labels.instance }} mountpoint {{ $labels.mountpoint }} is above 85% disk usage. OSDs on this node may be at risk.",
|
||||||
|
)],
|
||||||
|
},
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
||||||
|
fn alert(
|
||||||
|
name: &str,
|
||||||
|
expr: &str,
|
||||||
|
for_: Option<&str>,
|
||||||
|
severity: &str,
|
||||||
|
summary: &str,
|
||||||
|
description: &str,
|
||||||
|
) -> Rule {
|
||||||
|
let mut labels = BTreeMap::new();
|
||||||
|
labels.insert("severity".to_string(), severity.to_string());
|
||||||
|
|
||||||
|
let mut annotations = BTreeMap::new();
|
||||||
|
annotations.insert("summary".to_string(), summary.to_string());
|
||||||
|
annotations.insert("description".to_string(), description.to_string());
|
||||||
|
|
||||||
|
Rule {
|
||||||
|
alert: Some(name.to_string()),
|
||||||
|
expr: Some(expr.to_string()),
|
||||||
|
for_: for_.map(|s| s.to_string()),
|
||||||
|
labels: Some(labels),
|
||||||
|
annotations: Some(annotations),
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,674 @@
|
|||||||
|
{
|
||||||
|
"title": "Ceph Cluster",
|
||||||
|
"uid": "ceph-cluster",
|
||||||
|
"schemaVersion": 36,
|
||||||
|
"version": 1,
|
||||||
|
"refresh": "30s",
|
||||||
|
"time": { "from": "now-1h", "to": "now" },
|
||||||
|
|
||||||
|
"templating": {
|
||||||
|
"list": [
|
||||||
|
{
|
||||||
|
"name": "pool",
|
||||||
|
"type": "query",
|
||||||
|
"label": "Pool",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"query": { "query": "label_values(ceph_pool_metadata, name)", "refId": "Pool" },
|
||||||
|
"definition": "label_values(ceph_pool_metadata, name)",
|
||||||
|
"multi": true,
|
||||||
|
"includeAll": true,
|
||||||
|
"current": { "text": "All", "value": "$__all", "selected": false },
|
||||||
|
"refresh": 1,
|
||||||
|
"sort": 1
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "osd",
|
||||||
|
"type": "query",
|
||||||
|
"label": "OSD",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"query": { "query": "label_values(ceph_osd_metadata, ceph_daemon)", "refId": "OSD" },
|
||||||
|
"definition": "label_values(ceph_osd_metadata, ceph_daemon)",
|
||||||
|
"multi": true,
|
||||||
|
"includeAll": true,
|
||||||
|
"current": { "text": "All", "value": "$__all", "selected": false },
|
||||||
|
"refresh": 1,
|
||||||
|
"sort": 1
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
|
||||||
|
"panels": [
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "row", "id": 1, "title": "Cluster Status", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "stat", "id": 2, "title": "Health",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "max(ceph_health_status)", "refId": "A" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 1 },
|
||||||
|
{ "color": "red", "value": 2 }
|
||||||
|
]},
|
||||||
|
"mappings": [{
|
||||||
|
"type": "value",
|
||||||
|
"options": {
|
||||||
|
"0": { "text": "HEALTH_OK", "index": 0 },
|
||||||
|
"1": { "text": "HEALTH_WARN", "index": 1 },
|
||||||
|
"2": { "text": "HEALTH_ERR", "index": 2 }
|
||||||
|
}
|
||||||
|
}]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||||
|
"colorMode": "background", "graphMode": "none", "textMode": "value"
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 5, "w": 4, "x": 0, "y": 1 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "stat", "id": 3, "title": "Mon Quorum",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "count(max by (ceph_daemon) (ceph_mon_quorum_status == 1)) or vector(0)", "refId": "A", "legendFormat": "In Quorum" },
|
||||||
|
{ "expr": "count(max by (ceph_daemon) (ceph_mon_metadata)) or vector(0)", "refId": "B", "legendFormat": "Total" }
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||||
|
"colorMode": "background", "graphMode": "none", "textMode": "auto", "orientation": "horizontal"
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 5, "w": 4, "x": 4, "y": 1 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "stat", "id": 4, "title": "MGR Active",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "sum(max by (ceph_daemon) (ceph_mgr_status)) or vector(0)", "refId": "A" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "red", "value": null },
|
||||||
|
{ "color": "green", "value": 1 }
|
||||||
|
]}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||||
|
"colorMode": "background", "graphMode": "none", "textMode": "auto"
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 5, "w": 3, "x": 8, "y": 1 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "stat", "id": 5, "title": "OSDs Up / In / Total",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "sum(max by (ceph_daemon) (ceph_osd_up)) or vector(0)", "refId": "A", "legendFormat": "Up" },
|
||||||
|
{ "expr": "sum(max by (ceph_daemon) (ceph_osd_in)) or vector(0)", "refId": "B", "legendFormat": "In" },
|
||||||
|
{ "expr": "count(max by (ceph_daemon) (ceph_osd_metadata)) or vector(0)", "refId": "C", "legendFormat": "Total" }
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||||
|
"colorMode": "background", "graphMode": "none", "textMode": "auto", "orientation": "horizontal"
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 5, "w": 5, "x": 11, "y": 1 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "stat", "id": 6, "title": "Pools",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "count(max by (pool_id) (ceph_pool_metadata)) or vector(0)", "refId": "A" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||||
|
"colorMode": "background", "graphMode": "none", "textMode": "auto"
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 5, "w": 3, "x": 16, "y": 1 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "stat", "id": 7, "title": "PGs Active+Clean / Total",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "max(ceph_pg_clean) or vector(0)", "refId": "A", "legendFormat": "Active+Clean" },
|
||||||
|
{ "expr": "max(ceph_pg_total) or vector(0)", "refId": "B", "legendFormat": "Total" }
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||||
|
"colorMode": "background", "graphMode": "none", "textMode": "auto", "orientation": "horizontal"
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 5, "w": 5, "x": 19, "y": 1 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "row", "id": 8, "title": "Capacity", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 6 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "gauge", "id": 9, "title": "Cluster Used (%)",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "100 * max(ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes) / max(ceph_cluster_total_bytes)",
|
||||||
|
"refId": "A"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent", "min": 0, "max": 100,
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 70 },
|
||||||
|
{ "color": "red", "value": 85 }
|
||||||
|
]}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||||
|
"showThresholdLabels": true, "showThresholdMarkers": true
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 5, "x": 0, "y": 7 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "stat", "id": 10, "title": "Total / Used / Available",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "max(ceph_cluster_total_bytes)", "refId": "A", "legendFormat": "Total" },
|
||||||
|
{ "expr": "max(ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes)", "refId": "B", "legendFormat": "Used" },
|
||||||
|
{ "expr": "max(ceph_cluster_total_bytes) - max(ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes)", "refId": "C", "legendFormat": "Available" }
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "bytes",
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||||
|
"colorMode": "value", "graphMode": "none", "textMode": "auto", "orientation": "vertical"
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 4, "x": 5, "y": 7 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "timeseries", "id": 11, "title": "Capacity Over Time",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "max(ceph_cluster_total_bytes)", "refId": "A", "legendFormat": "Total" },
|
||||||
|
{ "expr": "max(ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes)", "refId": "B", "legendFormat": "Used" }
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "bytes",
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 8 }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 11, "x": 9, "y": 7 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "stat", "id": 12, "title": "Days Until Full (predicted, 7d trend)",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "(max(ceph_cluster_total_bytes) - max(ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes))\n/\nclamp_min(deriv(max(ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes)[7d:1h]), 1)\n/ 86400",
|
||||||
|
"refId": "A"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "d",
|
||||||
|
"decimals": 1,
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "red", "value": null },
|
||||||
|
{ "color": "yellow", "value": 14 },
|
||||||
|
{ "color": "green", "value": 60 }
|
||||||
|
]}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||||
|
"colorMode": "background", "graphMode": "none", "textMode": "auto"
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 4, "x": 20, "y": 7 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "bargauge", "id": 13, "title": "Pool Used (%)",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "(\n 100 * max by (pool_id) (ceph_pool_bytes_used)\n /\n (max by (pool_id) (ceph_pool_bytes_used) + max by (pool_id) (ceph_pool_max_avail))\n)\n* on(pool_id) group_left(name) max by (pool_id, name) (ceph_pool_metadata{name=~\"$pool\"})",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{name}}",
|
||||||
|
"instant": true
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent", "min": 0, "max": 100,
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 70 },
|
||||||
|
{ "color": "red", "value": 85 }
|
||||||
|
]}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"orientation": "horizontal",
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||||
|
"displayMode": "gradient",
|
||||||
|
"showUnfilled": true,
|
||||||
|
"valueMode": "color",
|
||||||
|
"sortBy": "Value",
|
||||||
|
"sortOrder": "desc"
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 15 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "bargauge", "id": 14, "title": "OSD Utilization (%)",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "100 * max by (ceph_daemon) (ceph_osd_stat_bytes_used{ceph_daemon=~\"$osd\"}) / max by (ceph_daemon) (ceph_osd_stat_bytes{ceph_daemon=~\"$osd\"})",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{ceph_daemon}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent", "min": 0, "max": 100,
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 70 },
|
||||||
|
{ "color": "red", "value": 85 }
|
||||||
|
]}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"orientation": "horizontal",
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||||
|
"displayMode": "gradient",
|
||||||
|
"showUnfilled": true
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 15 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "row", "id": 15, "title": "Performance", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 23 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "timeseries", "id": 16, "title": "Cluster IOPS (Read / Write)",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "sum(max by (pool_id) (rate(ceph_pool_rd[5m])))", "refId": "A", "legendFormat": "Read" },
|
||||||
|
{ "expr": "sum(max by (pool_id) (rate(ceph_pool_wr[5m])))", "refId": "B", "legendFormat": "Write" }
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "ops",
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 8 }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "timeseries", "id": 17, "title": "Cluster Throughput (Read / Write)",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "sum(max by (pool_id) (rate(ceph_pool_rd_bytes[5m])))", "refId": "A", "legendFormat": "Read" },
|
||||||
|
{ "expr": "sum(max by (pool_id) (rate(ceph_pool_wr_bytes[5m])))", "refId": "B", "legendFormat": "Write" }
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "Bps",
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 8 }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "timeseries", "id": 18, "title": "Client Op Latency (Avg)",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum(rate(ceph_osd_op_r_latency_sum[5m])) / clamp_min(sum(rate(ceph_osd_op_r_latency_count[5m])), 1)",
|
||||||
|
"refId": "A", "legendFormat": "Read"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum(rate(ceph_osd_op_w_latency_sum[5m])) / clamp_min(sum(rate(ceph_osd_op_w_latency_count[5m])), 1)",
|
||||||
|
"refId": "B", "legendFormat": "Write"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "s",
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 8 }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 32 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "timeseries", "id": 19, "title": "Recovery Throughput",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "sum(rate(ceph_osd_recovery_bytes[5m])) or vector(0)", "refId": "A", "legendFormat": "Recovery B/s" },
|
||||||
|
{ "expr": "sum(rate(ceph_osd_recovery_ops[5m])) or vector(0)", "refId": "B", "legendFormat": "Recovery ops/s" }
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 8 }
|
||||||
|
},
|
||||||
|
"overrides": [
|
||||||
|
{ "matcher": { "id": "byName", "options": "Recovery B/s" }, "properties": [{ "id": "unit", "value": "Bps" }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "Recovery ops/s" }, "properties": [{ "id": "unit", "value": "ops" }] }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 32 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "row", "id": 20, "title": "Placement Group Health", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 40 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "timeseries", "id": 21, "title": "PG States Over Time",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "max(ceph_pg_clean)", "refId": "A", "legendFormat": "clean" },
|
||||||
|
{ "expr": "max(ceph_pg_active)", "refId": "B", "legendFormat": "active" },
|
||||||
|
{ "expr": "max(ceph_pg_degraded)", "refId": "C", "legendFormat": "degraded" },
|
||||||
|
{ "expr": "max(ceph_pg_undersized)", "refId": "D", "legendFormat": "undersized" },
|
||||||
|
{ "expr": "max(ceph_pg_peering)", "refId": "E", "legendFormat": "peering" },
|
||||||
|
{ "expr": "max(ceph_pg_recovering)", "refId": "F", "legendFormat": "recovering" },
|
||||||
|
{ "expr": "max(ceph_pg_backfilling)", "refId": "G", "legendFormat": "backfilling" },
|
||||||
|
{ "expr": "max(ceph_pg_remapped)", "refId": "H", "legendFormat": "remapped" },
|
||||||
|
{ "expr": "max(ceph_pg_inconsistent)", "refId": "I", "legendFormat": "inconsistent" },
|
||||||
|
{ "expr": "max(ceph_pg_stale)", "refId": "J", "legendFormat": "stale" },
|
||||||
|
{ "expr": "max(ceph_pg_unknown)", "refId": "K", "legendFormat": "unknown" }
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short",
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 0 }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"legend": {
|
||||||
|
"displayMode": "table",
|
||||||
|
"placement": "right",
|
||||||
|
"calcs": ["max", "lastNotNull"],
|
||||||
|
"showLegend": true,
|
||||||
|
"sortBy": "Max",
|
||||||
|
"sortDesc": true
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 16, "x": 0, "y": 41 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "stat", "id": 22, "title": "Slow Ops",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "max(ceph_healthcheck_slow_ops) or vector(0)", "refId": "A" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 1 },
|
||||||
|
{ "color": "red", "value": 10 }
|
||||||
|
]}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||||
|
"colorMode": "background", "graphMode": "area", "textMode": "auto"
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 4, "w": 8, "x": 16, "y": 41 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "stat", "id": 23, "title": "Misplaced / Degraded Objects",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "max(ceph_num_objects_misplaced) or vector(0)", "refId": "A", "legendFormat": "Misplaced" },
|
||||||
|
{ "expr": "max(ceph_num_objects_degraded) or vector(0)", "refId": "B", "legendFormat": "Degraded" }
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 1 }
|
||||||
|
]}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||||
|
"colorMode": "background", "graphMode": "none", "textMode": "auto", "orientation": "horizontal"
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 4, "w": 8, "x": 16, "y": 45 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "row", "id": 24, "title": "OSD Detail", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 49 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "table", "id": 25, "title": "OSDs",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "max by (ceph_daemon) (ceph_osd_up{ceph_daemon=~\"$osd\"})", "refId": "A", "legendFormat": "Up", "format": "table", "instant": true },
|
||||||
|
{ "expr": "max by (ceph_daemon) (ceph_osd_in{ceph_daemon=~\"$osd\"})", "refId": "B", "legendFormat": "In", "format": "table", "instant": true },
|
||||||
|
{ "expr": "100 * max by (ceph_daemon) (ceph_osd_stat_bytes_used{ceph_daemon=~\"$osd\"}) / max by (ceph_daemon) (ceph_osd_stat_bytes{ceph_daemon=~\"$osd\"})", "refId": "C", "format": "table", "instant": true },
|
||||||
|
{ "expr": "max by (ceph_daemon) (ceph_osd_numpg{ceph_daemon=~\"$osd\"})", "refId": "D", "format": "table", "instant": true },
|
||||||
|
{ "expr": "max by (ceph_daemon) (ceph_osd_apply_latency_ms{ceph_daemon=~\"$osd\"})", "refId": "E", "format": "table", "instant": true },
|
||||||
|
{ "expr": "max by (ceph_daemon) (ceph_osd_commit_latency_ms{ceph_daemon=~\"$osd\"})", "refId": "F", "format": "table", "instant": true }
|
||||||
|
],
|
||||||
|
"transformations": [
|
||||||
|
{ "id": "merge" },
|
||||||
|
{
|
||||||
|
"id": "organize",
|
||||||
|
"options": {
|
||||||
|
"excludeByName": { "Time": true, "__name__": true, "job": true, "instance": true, "endpoint": true },
|
||||||
|
"renameByName": {
|
||||||
|
"ceph_daemon": "OSD",
|
||||||
|
"Value #A": "Up",
|
||||||
|
"Value #B": "In",
|
||||||
|
"Value #C": "Util %",
|
||||||
|
"Value #D": "PGs",
|
||||||
|
"Value #E": "Apply Latency",
|
||||||
|
"Value #F": "Commit Latency"
|
||||||
|
},
|
||||||
|
"indexByName": {
|
||||||
|
"OSD": 0, "Up": 1, "In": 2, "Util %": 3, "PGs": 4, "Apply Latency": 5, "Commit Latency": 6
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {},
|
||||||
|
"overrides": [
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Util %" },
|
||||||
|
"properties": [
|
||||||
|
{ "id": "unit", "value": "percent" },
|
||||||
|
{ "id": "decimals", "value": 1 },
|
||||||
|
{ "id": "custom.cellOptions", "value": { "type": "color-background", "mode": "gradient" } },
|
||||||
|
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 70 },
|
||||||
|
{ "color": "red", "value": 85 }
|
||||||
|
]}}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{ "matcher": { "id": "byName", "options": "Apply Latency" }, "properties": [{ "id": "unit", "value": "ms" }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "Commit Latency" }, "properties": [{ "id": "unit", "value": "ms" }] },
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byRegexp", "options": "Up|In" },
|
||||||
|
"properties": [
|
||||||
|
{ "id": "mappings", "value": [{ "type": "value", "options": { "0": { "text": "✗", "index": 0 }, "1": { "text": "✓", "index": 1 }}}] },
|
||||||
|
{ "id": "custom.cellOptions", "value": { "type": "color-text" } },
|
||||||
|
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "red", "value": null },
|
||||||
|
{ "color": "green", "value": 1 }
|
||||||
|
]}}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 10, "w": 16, "x": 0, "y": 50 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "timeseries", "id": 26, "title": "OSD Apply + Commit Latency",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "max by (ceph_daemon) (ceph_osd_apply_latency_ms{ceph_daemon=~\"$osd\"})", "refId": "A", "legendFormat": "{{ceph_daemon}} apply" },
|
||||||
|
{ "expr": "max by (ceph_daemon) (ceph_osd_commit_latency_ms{ceph_daemon=~\"$osd\"})", "refId": "B", "legendFormat": "{{ceph_daemon}} commit" }
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "ms",
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 1, "fillOpacity": 0 }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 10, "w": 8, "x": 16, "y": 50 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "row", "id": 27, "title": "Pool Detail", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 60 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "table", "id": 28, "title": "Pools",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "max by (pool_id, name) (ceph_pool_metadata{name=~\"$pool\"})", "refId": "A", "format": "table", "instant": true },
|
||||||
|
{ "expr": "max by (pool_id) (ceph_pool_objects)", "refId": "B", "format": "table", "instant": true },
|
||||||
|
{ "expr": "max by (pool_id) (ceph_pool_bytes_used)", "refId": "C", "format": "table", "instant": true },
|
||||||
|
{ "expr": "max by (pool_id) (ceph_pool_max_avail)", "refId": "D", "format": "table", "instant": true },
|
||||||
|
{ "expr": "100 * max by (pool_id) (ceph_pool_bytes_used) / (max by (pool_id) (ceph_pool_bytes_used) + max by (pool_id) (ceph_pool_max_avail))", "refId": "E", "format": "table", "instant": true }
|
||||||
|
],
|
||||||
|
"transformations": [
|
||||||
|
{ "id": "merge" },
|
||||||
|
{
|
||||||
|
"id": "organize",
|
||||||
|
"options": {
|
||||||
|
"excludeByName": { "Time": true, "__name__": true, "job": true, "instance": true, "endpoint": true, "Value #A": true },
|
||||||
|
"renameByName": {
|
||||||
|
"pool_id": "ID",
|
||||||
|
"name": "Pool",
|
||||||
|
"Value #B": "Objects",
|
||||||
|
"Value #C": "Used",
|
||||||
|
"Value #D": "Available",
|
||||||
|
"Value #E": "Used %"
|
||||||
|
},
|
||||||
|
"indexByName": { "ID": 0, "Pool": 1, "Objects": 2, "Used": 3, "Available": 4, "Used %": 5 }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {},
|
||||||
|
"overrides": [
|
||||||
|
{ "matcher": { "id": "byName", "options": "Used" }, "properties": [{ "id": "unit", "value": "bytes" }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "Available" }, "properties": [{ "id": "unit", "value": "bytes" }] },
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Used %" },
|
||||||
|
"properties": [
|
||||||
|
{ "id": "unit", "value": "percent" },
|
||||||
|
{ "id": "decimals", "value": 1 },
|
||||||
|
{ "id": "custom.cellOptions", "value": { "type": "color-background", "mode": "gradient" } },
|
||||||
|
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 70 },
|
||||||
|
{ "color": "red", "value": 85 }
|
||||||
|
]}}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 10, "w": 14, "x": 0, "y": 61 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "timeseries", "id": 29, "title": "Pool IOPS (Read / Write) — filtered by $pool",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "max by (pool_id) (rate(ceph_pool_rd[5m]))\n* on(pool_id) group_left(name) max by (pool_id, name) (ceph_pool_metadata{name=~\"$pool\"})",
|
||||||
|
"refId": "A", "legendFormat": "Read — {{name}}"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "max by (pool_id) (rate(ceph_pool_wr[5m]))\n* on(pool_id) group_left(name) max by (pool_id, name) (ceph_pool_metadata{name=~\"$pool\"})",
|
||||||
|
"refId": "B", "legendFormat": "Write — {{name}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "ops",
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 8 }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"legend": {
|
||||||
|
"displayMode": "table",
|
||||||
|
"placement": "right",
|
||||||
|
"calcs": ["max", "lastNotNull"],
|
||||||
|
"showLegend": true,
|
||||||
|
"sortBy": "Max",
|
||||||
|
"sortDesc": true
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 10, "w": 10, "x": 14, "y": 61 }
|
||||||
|
}
|
||||||
|
|
||||||
|
]
|
||||||
|
}
|
||||||
@@ -368,7 +368,7 @@
|
|||||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "100 * (1 - (sum(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"})))",
|
"expr": "100 * (1 - (\n sum(node_filesystem_avail_bytes{mountpoint=\"/var\",fstype!~\"tmpfs|overlay|squashfs|ramfs\"} or node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay|squashfs|ramfs\"})\n /\n sum(node_filesystem_size_bytes{mountpoint=\"/var\",fstype!~\"tmpfs|overlay|squashfs|ramfs\"} or node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay|squashfs|ramfs\"})\n))",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "Disk"
|
"legendFormat": "Disk"
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -440,7 +440,7 @@
|
|||||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "100 * (1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"}))",
|
"expr": "100 * (1 - (\n max by (instance, mountpoint) (node_filesystem_avail_bytes{mountpoint=~\"/|/var\",fstype!~\"tmpfs|overlay|squashfs|ramfs\"})\n /\n max by (instance, mountpoint) (node_filesystem_size_bytes{mountpoint=~\"/|/var\",fstype!~\"tmpfs|overlay|squashfs|ramfs\"})\n))",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{instance}}"
|
"legendFormat": "{{instance}}"
|
||||||
}
|
}
|
||||||
@@ -467,7 +467,7 @@
|
|||||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "100 * (1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"}))",
|
"expr": "100 * (1 - (\n max by (instance, mountpoint) (node_filesystem_avail_bytes{mountpoint=~\"/|/var\",fstype!~\"tmpfs|overlay|squashfs|ramfs\"})\n /\n max by (instance, mountpoint) (node_filesystem_size_bytes{mountpoint=~\"/|/var\",fstype!~\"tmpfs|overlay|squashfs|ramfs\"})\n))",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{instance}}"
|
"legendFormat": "{{instance}}"
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"title": "Storage Health",
|
"title": "Persistent Storage",
|
||||||
"uid": "storage-health",
|
"uid": "persistent-storage",
|
||||||
"schemaVersion": 36,
|
"schemaVersion": 36,
|
||||||
"version": 1,
|
"version": 1,
|
||||||
"refresh": "30s",
|
"refresh": "30s",
|
||||||
@@ -21,25 +21,17 @@
|
|||||||
"title": "Bound PVCs",
|
"title": "Bound PVCs",
|
||||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{ "expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Bound\"}) or vector(0)", "refId": "A" }
|
||||||
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Bound\"}) or vector(0)",
|
|
||||||
"refId": "A"
|
|
||||||
}
|
|
||||||
],
|
],
|
||||||
"fieldConfig": {
|
"fieldConfig": {
|
||||||
"defaults": {
|
"defaults": {
|
||||||
"color": { "mode": "thresholds" },
|
"color": { "mode": "thresholds" },
|
||||||
"thresholds": {
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] }
|
||||||
"mode": "absolute",
|
|
||||||
"steps": [{ "color": "green", "value": null }]
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"options": {
|
"options": {
|
||||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||||
"colorMode": "background",
|
"colorMode": "background", "graphMode": "none", "textMode": "auto"
|
||||||
"graphMode": "none",
|
|
||||||
"textMode": "auto"
|
|
||||||
},
|
},
|
||||||
"gridPos": { "h": 5, "w": 4, "x": 0, "y": 1 }
|
"gridPos": { "h": 5, "w": 4, "x": 0, "y": 1 }
|
||||||
},
|
},
|
||||||
@@ -50,28 +42,19 @@
|
|||||||
"title": "Pending PVCs",
|
"title": "Pending PVCs",
|
||||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{ "expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Pending\"}) or vector(0)", "refId": "A" }
|
||||||
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Pending\"}) or vector(0)",
|
|
||||||
"refId": "A"
|
|
||||||
}
|
|
||||||
],
|
],
|
||||||
"fieldConfig": {
|
"fieldConfig": {
|
||||||
"defaults": {
|
"defaults": {
|
||||||
"color": { "mode": "thresholds" },
|
"color": { "mode": "thresholds" },
|
||||||
"thresholds": {
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
"mode": "absolute",
|
{ "color": "green", "value": null }, { "color": "yellow", "value": 1 }
|
||||||
"steps": [
|
]}
|
||||||
{ "color": "green", "value": null },
|
|
||||||
{ "color": "yellow", "value": 1 }
|
|
||||||
]
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"options": {
|
"options": {
|
||||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||||
"colorMode": "background",
|
"colorMode": "background", "graphMode": "none", "textMode": "auto"
|
||||||
"graphMode": "none",
|
|
||||||
"textMode": "auto"
|
|
||||||
},
|
},
|
||||||
"gridPos": { "h": 5, "w": 4, "x": 4, "y": 1 }
|
"gridPos": { "h": 5, "w": 4, "x": 4, "y": 1 }
|
||||||
},
|
},
|
||||||
@@ -82,28 +65,19 @@
|
|||||||
"title": "Lost PVCs",
|
"title": "Lost PVCs",
|
||||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{ "expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Lost\"}) or vector(0)", "refId": "A" }
|
||||||
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Lost\"}) or vector(0)",
|
|
||||||
"refId": "A"
|
|
||||||
}
|
|
||||||
],
|
],
|
||||||
"fieldConfig": {
|
"fieldConfig": {
|
||||||
"defaults": {
|
"defaults": {
|
||||||
"color": { "mode": "thresholds" },
|
"color": { "mode": "thresholds" },
|
||||||
"thresholds": {
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
"mode": "absolute",
|
{ "color": "green", "value": null }, { "color": "red", "value": 1 }
|
||||||
"steps": [
|
]}
|
||||||
{ "color": "green", "value": null },
|
|
||||||
{ "color": "red", "value": 1 }
|
|
||||||
]
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"options": {
|
"options": {
|
||||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||||
"colorMode": "background",
|
"colorMode": "background", "graphMode": "none", "textMode": "auto"
|
||||||
"graphMode": "none",
|
|
||||||
"textMode": "auto"
|
|
||||||
},
|
},
|
||||||
"gridPos": { "h": 5, "w": 4, "x": 8, "y": 1 }
|
"gridPos": { "h": 5, "w": 4, "x": 8, "y": 1 }
|
||||||
},
|
},
|
||||||
@@ -114,201 +88,57 @@
|
|||||||
"title": "Bound PVs / Available PVs",
|
"title": "Bound PVs / Available PVs",
|
||||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{ "expr": "sum(kube_persistentvolume_status_phase{phase=\"Bound\"}) or vector(0)", "refId": "A", "legendFormat": "Bound" },
|
||||||
"expr": "sum(kube_persistentvolume_status_phase{phase=\"Bound\"}) or vector(0)",
|
{ "expr": "sum(kube_persistentvolume_status_phase{phase=\"Available\"}) or vector(0)", "refId": "B", "legendFormat": "Available" }
|
||||||
"refId": "A",
|
|
||||||
"legendFormat": "Bound"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"expr": "sum(kube_persistentvolume_status_phase{phase=\"Available\"}) or vector(0)",
|
|
||||||
"refId": "B",
|
|
||||||
"legendFormat": "Available"
|
|
||||||
}
|
|
||||||
],
|
],
|
||||||
"fieldConfig": {
|
"fieldConfig": {
|
||||||
"defaults": {
|
"defaults": {
|
||||||
"color": { "mode": "thresholds" },
|
"color": { "mode": "thresholds" },
|
||||||
"thresholds": {
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] }
|
||||||
"mode": "absolute",
|
|
||||||
"steps": [{ "color": "blue", "value": null }]
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"options": {
|
"options": {
|
||||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||||
"colorMode": "background",
|
"colorMode": "background", "graphMode": "none", "textMode": "auto"
|
||||||
"graphMode": "none",
|
|
||||||
"textMode": "auto"
|
|
||||||
},
|
},
|
||||||
"gridPos": { "h": 5, "w": 4, "x": 12, "y": 1 }
|
"gridPos": { "h": 5, "w": 6, "x": 12, "y": 1 }
|
||||||
},
|
},
|
||||||
|
|
||||||
{
|
{
|
||||||
"type": "stat",
|
"type": "piechart",
|
||||||
"id": 6,
|
"id": 6,
|
||||||
"title": "Ceph Cluster Health",
|
"title": "PVC Phase Distribution",
|
||||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{ "expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Bound\"}) or vector(0)", "refId": "A", "legendFormat": "Bound" },
|
||||||
"expr": "ceph_health_status",
|
{ "expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Pending\"}) or vector(0)", "refId": "B", "legendFormat": "Pending" },
|
||||||
"refId": "A"
|
{ "expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Lost\"}) or vector(0)", "refId": "C", "legendFormat": "Lost" }
|
||||||
}
|
|
||||||
],
|
],
|
||||||
"fieldConfig": {
|
"fieldConfig": { "defaults": { "color": { "mode": "palette-classic" } } },
|
||||||
"defaults": {
|
|
||||||
"color": { "mode": "thresholds" },
|
|
||||||
"thresholds": {
|
|
||||||
"mode": "absolute",
|
|
||||||
"steps": [
|
|
||||||
{ "color": "green", "value": null },
|
|
||||||
{ "color": "yellow", "value": 1 },
|
|
||||||
{ "color": "red", "value": 2 }
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"mappings": [
|
|
||||||
{
|
|
||||||
"type": "value",
|
|
||||||
"options": {
|
|
||||||
"0": { "text": "HEALTH_OK", "index": 0 },
|
|
||||||
"1": { "text": "HEALTH_WARN", "index": 1 },
|
|
||||||
"2": { "text": "HEALTH_ERR", "index": 2 }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"options": {
|
"options": {
|
||||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||||
"colorMode": "background",
|
"pieType": "pie",
|
||||||
"graphMode": "none",
|
"legend": { "displayMode": "table", "placement": "right", "values": ["value", "percent"] }
|
||||||
"textMode": "value"
|
|
||||||
},
|
},
|
||||||
"gridPos": { "h": 5, "w": 4, "x": 16, "y": 1 }
|
"gridPos": { "h": 5, "w": 6, "x": 18, "y": 1 }
|
||||||
},
|
|
||||||
|
|
||||||
{
|
|
||||||
"type": "stat",
|
|
||||||
"id": 7,
|
|
||||||
"title": "OSDs Up / Total",
|
|
||||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"expr": "sum(ceph_osd_up) or vector(0)",
|
|
||||||
"refId": "A",
|
|
||||||
"legendFormat": "Up"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"expr": "count(ceph_osd_metadata) or vector(0)",
|
|
||||||
"refId": "B",
|
|
||||||
"legendFormat": "Total"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"fieldConfig": {
|
|
||||||
"defaults": {
|
|
||||||
"color": { "mode": "thresholds" },
|
|
||||||
"thresholds": {
|
|
||||||
"mode": "absolute",
|
|
||||||
"steps": [{ "color": "green", "value": null }]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"options": {
|
|
||||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
|
||||||
"colorMode": "background",
|
|
||||||
"graphMode": "none",
|
|
||||||
"textMode": "auto"
|
|
||||||
},
|
|
||||||
"gridPos": { "h": 5, "w": 4, "x": 20, "y": 1 }
|
|
||||||
},
|
},
|
||||||
|
|
||||||
{
|
{
|
||||||
"type": "row",
|
"type": "row",
|
||||||
"id": 8,
|
"id": 7,
|
||||||
"title": "Cluster Capacity",
|
"title": "Capacity by Storage Class",
|
||||||
"collapsed": false,
|
"collapsed": false,
|
||||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 6 }
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 6 }
|
||||||
},
|
},
|
||||||
|
|
||||||
{
|
|
||||||
"type": "gauge",
|
|
||||||
"id": 9,
|
|
||||||
"title": "Ceph Cluster Used (%)",
|
|
||||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"expr": "100 * (ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes) / ceph_cluster_total_bytes",
|
|
||||||
"refId": "A"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"fieldConfig": {
|
|
||||||
"defaults": {
|
|
||||||
"unit": "percent",
|
|
||||||
"min": 0,
|
|
||||||
"max": 100,
|
|
||||||
"color": { "mode": "thresholds" },
|
|
||||||
"thresholds": {
|
|
||||||
"mode": "absolute",
|
|
||||||
"steps": [
|
|
||||||
{ "color": "green", "value": null },
|
|
||||||
{ "color": "yellow", "value": 70 },
|
|
||||||
{ "color": "red", "value": 85 }
|
|
||||||
]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"options": {
|
|
||||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
|
||||||
"showThresholdLabels": true,
|
|
||||||
"showThresholdMarkers": true
|
|
||||||
},
|
|
||||||
"gridPos": { "h": 8, "w": 5, "x": 0, "y": 7 }
|
|
||||||
},
|
|
||||||
|
|
||||||
{
|
|
||||||
"type": "stat",
|
|
||||||
"id": 10,
|
|
||||||
"title": "Ceph Capacity — Total / Available",
|
|
||||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"expr": "ceph_cluster_total_bytes",
|
|
||||||
"refId": "A",
|
|
||||||
"legendFormat": "Total"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"expr": "ceph_cluster_total_bytes - (ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes)",
|
|
||||||
"refId": "B",
|
|
||||||
"legendFormat": "Available"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"fieldConfig": {
|
|
||||||
"defaults": {
|
|
||||||
"unit": "bytes",
|
|
||||||
"color": { "mode": "thresholds" },
|
|
||||||
"thresholds": {
|
|
||||||
"mode": "absolute",
|
|
||||||
"steps": [{ "color": "blue", "value": null }]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"options": {
|
|
||||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
|
||||||
"colorMode": "value",
|
|
||||||
"graphMode": "none",
|
|
||||||
"textMode": "auto",
|
|
||||||
"orientation": "vertical"
|
|
||||||
},
|
|
||||||
"gridPos": { "h": 8, "w": 4, "x": 5, "y": 7 }
|
|
||||||
},
|
|
||||||
|
|
||||||
{
|
{
|
||||||
"type": "bargauge",
|
"type": "bargauge",
|
||||||
"id": 11,
|
"id": 8,
|
||||||
"title": "PV Allocated Capacity by Storage Class (Bound)",
|
"title": "PV Allocated Capacity by Storage Class (Bound)",
|
||||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sum by (storageclass) (\n kube_persistentvolume_capacity_bytes\n * on(persistentvolume) group_left(storageclass)\n kube_persistentvolume_status_phase{phase=\"Bound\"}\n)",
|
"expr": "sum by (storageclass) (\n kube_persistentvolume_capacity_bytes\n * on(persistentvolume) group_left() (kube_persistentvolume_status_phase{phase=\"Bound\"} == 1)\n * on(persistentvolume) group_left(storageclass) kube_persistentvolume_info\n)",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{storageclass}}"
|
"legendFormat": "{{storageclass}}"
|
||||||
}
|
}
|
||||||
@@ -316,11 +146,7 @@
|
|||||||
"fieldConfig": {
|
"fieldConfig": {
|
||||||
"defaults": {
|
"defaults": {
|
||||||
"unit": "bytes",
|
"unit": "bytes",
|
||||||
"color": { "mode": "palette-classic" },
|
"color": { "mode": "palette-classic" }
|
||||||
"thresholds": {
|
|
||||||
"mode": "absolute",
|
|
||||||
"steps": [{ "color": "blue", "value": null }]
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"options": {
|
"options": {
|
||||||
@@ -329,267 +155,214 @@
|
|||||||
"displayMode": "gradient",
|
"displayMode": "gradient",
|
||||||
"showUnfilled": true
|
"showUnfilled": true
|
||||||
},
|
},
|
||||||
"gridPos": { "h": 8, "w": 7, "x": 9, "y": 7 }
|
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 7 }
|
||||||
},
|
},
|
||||||
|
|
||||||
{
|
{
|
||||||
"type": "piechart",
|
"type": "bargauge",
|
||||||
"id": 12,
|
"id": 9,
|
||||||
"title": "PVC Phase Distribution",
|
"title": "PVC Count by Storage Class",
|
||||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Bound\"}) or vector(0)",
|
"expr": "count by (storageclass) (kube_persistentvolumeclaim_info{storageclass!=\"\"})",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "Bound"
|
"legendFormat": "{{storageclass}}"
|
||||||
},
|
|
||||||
{
|
|
||||||
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Pending\"}) or vector(0)",
|
|
||||||
"refId": "B",
|
|
||||||
"legendFormat": "Pending"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Lost\"}) or vector(0)",
|
|
||||||
"refId": "C",
|
|
||||||
"legendFormat": "Lost"
|
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"fieldConfig": {
|
"fieldConfig": {
|
||||||
"defaults": { "color": { "mode": "palette-classic" } }
|
"defaults": {
|
||||||
|
"unit": "short",
|
||||||
|
"color": { "mode": "palette-classic" }
|
||||||
|
}
|
||||||
},
|
},
|
||||||
"options": {
|
"options": {
|
||||||
|
"orientation": "horizontal",
|
||||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||||
"pieType": "pie",
|
"displayMode": "gradient",
|
||||||
"legend": {
|
"showUnfilled": true
|
||||||
"displayMode": "table",
|
},
|
||||||
"placement": "right",
|
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 7 }
|
||||||
"values": ["value", "percent"]
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "table",
|
||||||
|
"id": 10,
|
||||||
|
"title": "Storage Classes Summary",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "count by (storageclass) (kube_persistentvolume_info)",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "PVs",
|
||||||
|
"format": "table",
|
||||||
|
"instant": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum by (storageclass) (kube_persistentvolume_capacity_bytes * on(persistentvolume) group_left(storageclass) kube_persistentvolume_info)",
|
||||||
|
"refId": "B",
|
||||||
|
"legendFormat": "Capacity",
|
||||||
|
"format": "table",
|
||||||
|
"instant": true
|
||||||
}
|
}
|
||||||
|
],
|
||||||
|
"transformations": [
|
||||||
|
{ "id": "merge" },
|
||||||
|
{
|
||||||
|
"id": "organize",
|
||||||
|
"options": {
|
||||||
|
"excludeByName": { "Time": true, "__name__": true, "job": true, "instance": true },
|
||||||
|
"renameByName": { "storageclass": "StorageClass", "Value #A": "PV Count", "Value #B": "Total Capacity" }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {},
|
||||||
|
"overrides": [
|
||||||
|
{ "matcher": { "id": "byName", "options": "Total Capacity" }, "properties": [{ "id": "unit", "value": "bytes" }] }
|
||||||
|
]
|
||||||
},
|
},
|
||||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 7 }
|
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 7 }
|
||||||
},
|
},
|
||||||
|
|
||||||
{
|
{
|
||||||
"type": "row",
|
"type": "row",
|
||||||
"id": 13,
|
"id": 11,
|
||||||
"title": "Ceph Performance",
|
"title": "PVC Usage (kubelet volume stats)",
|
||||||
"collapsed": false,
|
"collapsed": false,
|
||||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 15 }
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 15 }
|
||||||
},
|
},
|
||||||
|
|
||||||
{
|
{
|
||||||
"type": "timeseries",
|
"type": "table",
|
||||||
"id": 14,
|
"id": 12,
|
||||||
"title": "Ceph Pool IOPS (Read / Write)",
|
"title": "Top 20 PVCs by % Used",
|
||||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "rate(ceph_pool_rd[5m])",
|
"expr": "topk(20,\n 100 * max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_used_bytes)\n /\n max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_capacity_bytes)\n)",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "Read — pool {{pool_id}}"
|
"format": "table",
|
||||||
},
|
"instant": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"transformations": [
|
||||||
{
|
{
|
||||||
"expr": "rate(ceph_pool_wr[5m])",
|
"id": "organize",
|
||||||
"refId": "B",
|
"options": {
|
||||||
"legendFormat": "Write — pool {{pool_id}}"
|
"excludeByName": { "Time": true, "__name__": true, "job": true, "instance": true, "endpoint": true },
|
||||||
|
"renameByName": {
|
||||||
|
"namespace": "Namespace",
|
||||||
|
"persistentvolumeclaim": "PVC",
|
||||||
|
"Value": "Used %"
|
||||||
|
},
|
||||||
|
"indexByName": { "Namespace": 0, "PVC": 1, "Used %": 2 }
|
||||||
|
}
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"fieldConfig": {
|
"fieldConfig": {
|
||||||
"defaults": {
|
"defaults": {},
|
||||||
"unit": "ops",
|
"overrides": [
|
||||||
"color": { "mode": "palette-classic" },
|
|
||||||
"custom": { "lineWidth": 2, "fillOpacity": 8 }
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 }
|
|
||||||
},
|
|
||||||
|
|
||||||
{
|
{
|
||||||
"type": "timeseries",
|
"matcher": { "id": "byName", "options": "Used %" },
|
||||||
"id": 15,
|
"properties": [
|
||||||
"title": "Ceph Pool Throughput (Read / Write)",
|
{ "id": "unit", "value": "percent" },
|
||||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
{ "id": "decimals", "value": 1 },
|
||||||
"targets": [
|
|
||||||
{
|
{
|
||||||
"expr": "rate(ceph_pool_rd_bytes[5m])",
|
"id": "custom.cellOptions",
|
||||||
"refId": "A",
|
"value": { "type": "color-background", "mode": "gradient" }
|
||||||
"legendFormat": "Read — pool {{pool_id}}"
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"expr": "rate(ceph_pool_wr_bytes[5m])",
|
"id": "thresholds",
|
||||||
"refId": "B",
|
"value": {
|
||||||
"legendFormat": "Write — pool {{pool_id}}"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"fieldConfig": {
|
|
||||||
"defaults": {
|
|
||||||
"unit": "Bps",
|
|
||||||
"color": { "mode": "palette-classic" },
|
|
||||||
"custom": { "lineWidth": 2, "fillOpacity": 8 }
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 }
|
|
||||||
},
|
|
||||||
|
|
||||||
{
|
|
||||||
"type": "row",
|
|
||||||
"id": 16,
|
|
||||||
"title": "Ceph OSD & Pool Details",
|
|
||||||
"collapsed": false,
|
|
||||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 24 }
|
|
||||||
},
|
|
||||||
|
|
||||||
{
|
|
||||||
"type": "timeseries",
|
|
||||||
"id": 17,
|
|
||||||
"title": "Ceph Pool Space Used (%)",
|
|
||||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"expr": "100 * ceph_pool_bytes_used / (ceph_pool_bytes_used + ceph_pool_max_avail)",
|
|
||||||
"refId": "A",
|
|
||||||
"legendFormat": "Pool {{pool_id}}"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"fieldConfig": {
|
|
||||||
"defaults": {
|
|
||||||
"unit": "percent",
|
|
||||||
"min": 0,
|
|
||||||
"max": 100,
|
|
||||||
"color": { "mode": "palette-classic" },
|
|
||||||
"thresholds": {
|
|
||||||
"mode": "absolute",
|
"mode": "absolute",
|
||||||
"steps": [
|
"steps": [
|
||||||
{ "color": "green", "value": null },
|
{ "color": "green", "value": null },
|
||||||
{ "color": "yellow", "value": 70 },
|
{ "color": "yellow", "value": 70 },
|
||||||
{ "color": "red", "value": 85 }
|
{ "color": "red", "value": 85 }
|
||||||
]
|
]
|
||||||
},
|
|
||||||
"custom": { "lineWidth": 2, "fillOpacity": 10 }
|
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
},
|
},
|
||||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 25 }
|
"gridPos": { "h": 10, "w": 12, "x": 0, "y": 16 }
|
||||||
},
|
},
|
||||||
|
|
||||||
{
|
{
|
||||||
"type": "bargauge",
|
"type": "bargauge",
|
||||||
"id": 18,
|
"id": 13,
|
||||||
"title": "OSD Status per Daemon (green = Up, red = Down)",
|
"title": "Top 20 PVCs by Used Bytes",
|
||||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "ceph_osd_up",
|
"expr": "topk(20, max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_used_bytes))",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{ceph_daemon}}"
|
"legendFormat": "{{namespace}} / {{persistentvolumeclaim}}",
|
||||||
|
"instant": true
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"fieldConfig": {
|
"fieldConfig": {
|
||||||
"defaults": {
|
"defaults": {
|
||||||
"min": 0,
|
"unit": "bytes",
|
||||||
"max": 1,
|
"color": { "mode": "palette-classic" }
|
||||||
"color": { "mode": "thresholds" },
|
|
||||||
"thresholds": {
|
|
||||||
"mode": "absolute",
|
|
||||||
"steps": [
|
|
||||||
{ "color": "red", "value": null },
|
|
||||||
{ "color": "green", "value": 1 }
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"mappings": [
|
|
||||||
{
|
|
||||||
"type": "value",
|
|
||||||
"options": {
|
|
||||||
"0": { "text": "DOWN", "index": 0 },
|
|
||||||
"1": { "text": "UP", "index": 1 }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"options": {
|
|
||||||
"orientation": "horizontal",
|
|
||||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
|
||||||
"displayMode": "basic",
|
|
||||||
"showUnfilled": true
|
|
||||||
},
|
|
||||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 25 }
|
|
||||||
},
|
|
||||||
|
|
||||||
{
|
|
||||||
"type": "row",
|
|
||||||
"id": 19,
|
|
||||||
"title": "Node Disk Usage",
|
|
||||||
"collapsed": false,
|
|
||||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 33 }
|
|
||||||
},
|
|
||||||
|
|
||||||
{
|
|
||||||
"type": "timeseries",
|
|
||||||
"id": 20,
|
|
||||||
"title": "Node Root Disk Usage Over Time (%)",
|
|
||||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} * 100)",
|
|
||||||
"refId": "A",
|
|
||||||
"legendFormat": "{{instance}}"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"fieldConfig": {
|
|
||||||
"defaults": {
|
|
||||||
"unit": "percent",
|
|
||||||
"min": 0,
|
|
||||||
"max": 100,
|
|
||||||
"color": { "mode": "palette-classic" },
|
|
||||||
"thresholds": {
|
|
||||||
"mode": "absolute",
|
|
||||||
"steps": [
|
|
||||||
{ "color": "green", "value": null },
|
|
||||||
{ "color": "yellow", "value": 70 },
|
|
||||||
{ "color": "red", "value": 85 }
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"custom": { "lineWidth": 2, "fillOpacity": 10 }
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 34 }
|
|
||||||
},
|
|
||||||
|
|
||||||
{
|
|
||||||
"type": "bargauge",
|
|
||||||
"id": 21,
|
|
||||||
"title": "Current Disk Usage — All Nodes & Mountpoints",
|
|
||||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"expr": "100 - (node_filesystem_avail_bytes{fstype!~\"tmpfs|overlay|squashfs\"} / node_filesystem_size_bytes{fstype!~\"tmpfs|overlay|squashfs\"} * 100)",
|
|
||||||
"refId": "A",
|
|
||||||
"legendFormat": "{{instance}} — {{mountpoint}}"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"fieldConfig": {
|
|
||||||
"defaults": {
|
|
||||||
"unit": "percent",
|
|
||||||
"min": 0,
|
|
||||||
"max": 100,
|
|
||||||
"color": { "mode": "thresholds" },
|
|
||||||
"thresholds": {
|
|
||||||
"mode": "absolute",
|
|
||||||
"steps": [
|
|
||||||
{ "color": "green", "value": null },
|
|
||||||
{ "color": "yellow", "value": 70 },
|
|
||||||
{ "color": "red", "value": 85 }
|
|
||||||
]
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"options": {
|
"options": {
|
||||||
"orientation": "horizontal",
|
"orientation": "horizontal",
|
||||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||||
"displayMode": "gradient",
|
"displayMode": "gradient",
|
||||||
"showUnfilled": true
|
"showUnfilled": true,
|
||||||
|
"valueMode": "color",
|
||||||
|
"sortBy": "Value",
|
||||||
|
"sortOrder": "desc"
|
||||||
},
|
},
|
||||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 34 }
|
"gridPos": { "h": 10, "w": 12, "x": 12, "y": 16 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "timeseries",
|
||||||
|
"id": 14,
|
||||||
|
"title": "Top 5 PVCs Usage Over Time (%)",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "topk(5,\n 100 * max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_used_bytes)\n /\n max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_capacity_bytes)\n)",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{namespace}} / {{persistentvolumeclaim}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent", "min": 0, "max": 100,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 8 }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 26 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "timeseries",
|
||||||
|
"id": 15,
|
||||||
|
"title": "PVC Inode Usage (%) — Top 20",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "topk(20,\n 100 * max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_inodes_used)\n /\n max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_inodes)\n)",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{namespace}} / {{persistentvolumeclaim}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent", "min": 0, "max": 100,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 1, "fillOpacity": 5 }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 34 }
|
||||||
}
|
}
|
||||||
|
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -101,7 +101,7 @@ impl<T: Topology + K8sclient> Interpret<T> for ClusterDashboardsInterpret {
|
|||||||
|
|
||||||
Ok(Outcome::success(format!(
|
Ok(Outcome::success(format!(
|
||||||
"Cluster dashboards resources in namespace '{}' with {} dashboards successfully created",
|
"Cluster dashboards resources in namespace '{}' with {} dashboards successfully created",
|
||||||
self.namespace, 8
|
self.namespace, 9
|
||||||
)))
|
)))
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -494,7 +494,11 @@ impl ClusterDashboardsInterpret {
|
|||||||
include_str!("dashboards/workloads-health.json"),
|
include_str!("dashboards/workloads-health.json"),
|
||||||
),
|
),
|
||||||
("okd-networking", include_str!("dashboards/networking.json")),
|
("okd-networking", include_str!("dashboards/networking.json")),
|
||||||
("storage-health", include_str!("dashboards/storage.json")),
|
(
|
||||||
|
"persistent-storage",
|
||||||
|
include_str!("dashboards/storage.json"),
|
||||||
|
),
|
||||||
|
("ceph-cluster", include_str!("dashboards/ceph.json")),
|
||||||
("okd-etcd", include_str!("dashboards/etcd.json")),
|
("okd-etcd", include_str!("dashboards/etcd.json")),
|
||||||
(
|
(
|
||||||
"okd-control-plane",
|
"okd-control-plane",
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
pub mod alert_channel;
|
pub mod alert_channel;
|
||||||
pub mod alert_rule;
|
pub mod alert_rule;
|
||||||
pub mod application_monitoring;
|
pub mod application_monitoring;
|
||||||
|
pub mod ceph_alerts;
|
||||||
pub mod cluster_dashboards;
|
pub mod cluster_dashboards;
|
||||||
pub mod grafana;
|
pub mod grafana;
|
||||||
pub mod kube_prometheus;
|
pub mod kube_prometheus;
|
||||||
|
|||||||
114
harmony/src/modules/monitoring/okd/cluster_alert_rules.rs
Normal file
114
harmony/src/modules/monitoring/okd/cluster_alert_rules.rs
Normal file
@@ -0,0 +1,114 @@
|
|||||||
|
use std::collections::BTreeMap;
|
||||||
|
|
||||||
|
use async_trait::async_trait;
|
||||||
|
use harmony_types::id::Id;
|
||||||
|
use kube::api::ObjectMeta;
|
||||||
|
use serde::Serialize;
|
||||||
|
|
||||||
|
use crate::{
|
||||||
|
data::Version,
|
||||||
|
interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome},
|
||||||
|
inventory::Inventory,
|
||||||
|
modules::monitoring::kube_prometheus::crd::crd_prometheus_rules::{
|
||||||
|
PrometheusRule, PrometheusRuleSpec, RuleGroup,
|
||||||
|
},
|
||||||
|
score::Score,
|
||||||
|
topology::{K8sclient, Topology},
|
||||||
|
};
|
||||||
|
|
||||||
|
#[derive(Clone, Debug, Serialize)]
|
||||||
|
pub struct OpenshiftPrometheusRuleScore {
|
||||||
|
pub namespace: String,
|
||||||
|
pub name: String,
|
||||||
|
pub rule_groups: Vec<RuleGroup>,
|
||||||
|
pub labels: Option<BTreeMap<String, String>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T: Topology + K8sclient> Score<T> for OpenshiftPrometheusRuleScore {
|
||||||
|
fn name(&self) -> String {
|
||||||
|
format!(
|
||||||
|
"OpenshiftPrometheusRuleScore({}/{})",
|
||||||
|
self.namespace, self.name
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn create_interpret(&self) -> Box<dyn Interpret<T>> {
|
||||||
|
Box::new(OpenshiftPrometheusRuleInterpret {
|
||||||
|
namespace: self.namespace.clone(),
|
||||||
|
name: self.name.clone(),
|
||||||
|
rule_groups: self.rule_groups.clone(),
|
||||||
|
labels: self.labels.clone(),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
pub struct OpenshiftPrometheusRuleInterpret {
|
||||||
|
namespace: String,
|
||||||
|
name: String,
|
||||||
|
rule_groups: Vec<RuleGroup>,
|
||||||
|
labels: Option<BTreeMap<String, String>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[async_trait]
|
||||||
|
impl<T: Topology + K8sclient> Interpret<T> for OpenshiftPrometheusRuleInterpret {
|
||||||
|
async fn execute(
|
||||||
|
&self,
|
||||||
|
_inventory: &Inventory,
|
||||||
|
topology: &T,
|
||||||
|
) -> Result<Outcome, InterpretError> {
|
||||||
|
let labels = self.labels.clone().unwrap_or_else(default_rule_labels);
|
||||||
|
|
||||||
|
let prometheus_rule = PrometheusRule {
|
||||||
|
metadata: ObjectMeta {
|
||||||
|
name: Some(self.name.clone()),
|
||||||
|
namespace: Some(self.namespace.clone()),
|
||||||
|
labels: Some(labels),
|
||||||
|
..ObjectMeta::default()
|
||||||
|
},
|
||||||
|
spec: PrometheusRuleSpec {
|
||||||
|
groups: self.rule_groups.clone(),
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
|
let client = topology
|
||||||
|
.k8s_client()
|
||||||
|
.await
|
||||||
|
.map_err(|e| InterpretError::new(format!("Failed to get k8s client: {e}")))?;
|
||||||
|
|
||||||
|
client
|
||||||
|
.apply(&prometheus_rule, Some(&self.namespace))
|
||||||
|
.await
|
||||||
|
.map_err(|e| InterpretError::new(e.to_string()))?;
|
||||||
|
|
||||||
|
Ok(Outcome::success(format!(
|
||||||
|
"PrometheusRule '{}' applied to namespace '{}' with {} rule group(s)",
|
||||||
|
self.name,
|
||||||
|
self.namespace,
|
||||||
|
self.rule_groups.len()
|
||||||
|
)))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_name(&self) -> InterpretName {
|
||||||
|
InterpretName::Custom("OpenshiftPrometheusRule")
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_version(&self) -> Version {
|
||||||
|
todo!()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_status(&self) -> InterpretStatus {
|
||||||
|
todo!()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_children(&self) -> Vec<Id> {
|
||||||
|
todo!()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Labels attached to the `PrometheusRule` when the score supplies none:
/// `prometheus=k8s` and `role=alert-rules`.
///
/// NOTE(review): presumably these are the labels the cluster's Prometheus
/// rule selector matches on; confirm against the target monitoring stack.
fn default_rule_labels() -> BTreeMap<String, String> {
    [("prometheus", "k8s"), ("role", "alert-rules")]
        .into_iter()
        .map(|(key, value)| (key.to_owned(), value.to_owned()))
        .collect()
}
|
||||||
@@ -1,5 +1,6 @@
|
|||||||
use crate::topology::oberservability::monitoring::AlertSender;
|
use crate::topology::oberservability::monitoring::AlertSender;
|
||||||
|
|
||||||
|
pub mod cluster_alert_rules;
|
||||||
pub mod cluster_monitoring;
|
pub mod cluster_monitoring;
|
||||||
pub(crate) mod config;
|
pub(crate) mod config;
|
||||||
pub mod enable_user_workload;
|
pub mod enable_user_workload;
|
||||||
|
|||||||
Reference in New Issue
Block a user