diff --git a/Cargo.lock b/Cargo.lock index db2929dc..007854cc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2873,6 +2873,17 @@ dependencies = [ "url", ] +[[package]] +name = "example-okd-ceph-alerts" +version = "0.1.0" +dependencies = [ + "harmony", + "harmony_cli", + "harmony_types", + "log", + "tokio", +] + [[package]] name = "example-okd-cluster-alerts" version = "0.1.0" diff --git a/examples/okd_ceph_alerts/Cargo.toml b/examples/okd_ceph_alerts/Cargo.toml new file mode 100644 index 00000000..7301242d --- /dev/null +++ b/examples/okd_ceph_alerts/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "example-okd-ceph-alerts" +edition = "2024" +version.workspace = true +readme.workspace = true +license.workspace = true +publish = false + +[dependencies] +harmony = { path = "../../harmony" } +harmony_cli = { path = "../../harmony_cli" } +harmony_types = { path = "../../harmony_types" } +tokio = { workspace = true } +log = { workspace = true } diff --git a/examples/okd_ceph_alerts/env.sh b/examples/okd_ceph_alerts/env.sh new file mode 100644 index 00000000..08072655 --- /dev/null +++ b/examples/okd_ceph_alerts/env.sh @@ -0,0 +1,4 @@ +export HARMONY_SECRET_NAMESPACE=okd_ceph_alerts_example +export HARMONY_SECRET_STORE=file +export HARMONY_DATABASE_URL=sqlite://harmony_okd_ceph_alerts_example.sqlite +export RUST_LOG=harmony=debug diff --git a/examples/okd_ceph_alerts/src/main.rs b/examples/okd_ceph_alerts/src/main.rs new file mode 100644 index 00000000..33bfa1ca --- /dev/null +++ b/examples/okd_ceph_alerts/src/main.rs @@ -0,0 +1,28 @@ +use harmony::{ + inventory::Inventory, + modules::monitoring::{ + ceph_alerts::ceph_alert_rule_groups, okd::cluster_alert_rules::OpenshiftPrometheusRuleScore, + }, + topology::K8sAnywhereTopology, +}; + +#[tokio::main] +async fn main() { + harmony_cli::cli_logger::init(); + + let ceph_rules = OpenshiftPrometheusRuleScore { + namespace: "rook-ceph".to_string(), + name: "ceph-alerts".to_string(), + rule_groups: ceph_alert_rule_groups(), + labels: 
None, + }; + + harmony_cli::run( + Inventory::autoload(), + K8sAnywhereTopology::from_env(), + vec![Box::new(ceph_rules)], + None, + ) + .await + .unwrap(); +} diff --git a/harmony/src/modules/monitoring/ceph_alerts.rs b/harmony/src/modules/monitoring/ceph_alerts.rs new file mode 100644 index 00000000..88044d75 --- /dev/null +++ b/harmony/src/modules/monitoring/ceph_alerts.rs @@ -0,0 +1,167 @@ +use std::collections::BTreeMap; + +use crate::modules::monitoring::kube_prometheus::crd::crd_prometheus_rules::{Rule, RuleGroup}; + +pub fn ceph_alert_rule_groups() -> Vec { + vec![ + RuleGroup { + name: "ceph-cluster-health".to_string(), + rules: vec![ + alert( + "CephHealthWarn", + "max(ceph_health_status) == 1", + Some("15m"), + "warning", + "Ceph cluster health is WARN", + "Ceph reports HEALTH_WARN for more than 15 minutes. Run `ceph -s` or check the Ceph dashboard to see active health checks.", + ), + alert( + "CephHealthErr", + "max(ceph_health_status) == 2", + Some("5m"), + "critical", + "Ceph cluster health is ERR", + "Ceph reports HEALTH_ERR for more than 5 minutes. Immediate investigation required.", + ), + alert( + "CephMonDown", + "count(max by (ceph_daemon) (ceph_mon_quorum_status == 0)) > 0", + Some("5m"), + "critical", + "Ceph monitor is out of quorum", + "One or more Ceph monitors are not in quorum. Quorum loss risks cluster availability.", + ), + alert( + "CephMgrAbsent", + "sum(max by (ceph_daemon) (ceph_mgr_status)) < 1", + Some("5m"), + "critical", + "No active Ceph manager", + "No Ceph manager daemon is currently active. Dashboards and orchestration will be unavailable.", + ), + ], + }, + RuleGroup { + name: "ceph-osd".to_string(), + rules: vec![ + alert( + "CephOSDDown", + "count(max by (ceph_daemon) (ceph_osd_up == 0)) > 0", + Some("5m"), + "warning", + "One or more Ceph OSDs are down", + "At least one OSD daemon is reporting down for 5 minutes. 
Data redundancy may be reduced.", + ), + alert( + "CephOSDNearFull", + "max by (ceph_daemon) (100 * ceph_osd_stat_bytes_used / ceph_osd_stat_bytes) > 80", + Some("15m"), + "warning", + "Ceph OSD is near full", + "OSD {{ $labels.ceph_daemon }} is above 80% utilization. Rebalance or add capacity.", + ), + alert( + "CephOSDFull", + "max by (ceph_daemon) (100 * ceph_osd_stat_bytes_used / ceph_osd_stat_bytes) > 90", + Some("5m"), + "critical", + "Ceph OSD is critically full", + "OSD {{ $labels.ceph_daemon }} is above 90% utilization. Writes may block. Act immediately.", + ), + ], + }, + RuleGroup { + name: "ceph-capacity".to_string(), + rules: vec![ + alert( + "CephClusterNearFull", + "100 * max(ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes) / max(ceph_cluster_total_bytes) > 75", + Some("15m"), + "warning", + "Ceph cluster is near full", + "Cluster raw utilization is above 75% for 15 minutes.", + ), + alert( + "CephClusterCriticallyFull", + "100 * max(ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes) / max(ceph_cluster_total_bytes) > 85", + Some("5m"), + "critical", + "Ceph cluster is critically full", + "Cluster raw utilization is above 85%. 
Imminent risk of write unavailability.", + ), + alert( + "CephPoolNearFull", + "100 * max by (pool_id) (ceph_pool_bytes_used) / (max by (pool_id) (ceph_pool_bytes_used) + max by (pool_id) (ceph_pool_max_avail)) > 80", + Some("15m"), + "warning", + "Ceph pool is near full", + "Pool (pool_id {{ $labels.pool_id }}) is above 80% usage.", + ), + alert( + "CephDaysUntilFull", + "(max(ceph_cluster_total_bytes) - max(ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes)) / clamp_min(deriv(max(ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes)[7d:1h]), 1) / 86400 < 30", + Some("1h"), + "warning", + "Ceph cluster predicted to fill within 30 days", + "Based on the 7-day usage trend, the cluster will reach capacity in less than 30 days.", + ), + ], + }, + RuleGroup { + name: "ceph-placement-groups".to_string(), + rules: vec![ + alert( + "CephPGsNotActiveClean", + "max(ceph_pg_total) - max(ceph_pg_clean) > 0", + Some("15m"), + "warning", + "Some placement groups are not active+clean", + "{{ $value }} PGs have been in a non-clean state for more than 15 minutes.", + ), + alert( + "CephSlowOps", + "max(ceph_healthcheck_slow_ops) > 0", + Some("5m"), + "warning", + "Ceph reports slow ops", + "Ceph has {{ $value }} slow operations outstanding for more than 5 minutes.", + ), + ], + }, + RuleGroup { + name: "ceph-nodes".to_string(), + rules: vec![alert( + "CephNodeRootDiskUsage", + "100 * (1 - (max by (instance, mountpoint) (node_filesystem_avail_bytes{mountpoint=~\"/|/var\",fstype!~\"tmpfs|overlay|squashfs|ramfs\"}) / max by (instance, mountpoint) (node_filesystem_size_bytes{mountpoint=~\"/|/var\",fstype!~\"tmpfs|overlay|squashfs|ramfs\"}))) > 85", + Some("10m"), + "warning", + "Ceph node root/var disk above 85%", + "Node {{ $labels.instance }} mountpoint {{ $labels.mountpoint }} is above 85% disk usage. 
OSDs on this node may be at risk.", + )], + }, + ] +} + +fn alert( + name: &str, + expr: &str, + for_: Option<&str>, + severity: &str, + summary: &str, + description: &str, +) -> Rule { + let mut labels = BTreeMap::new(); + labels.insert("severity".to_string(), severity.to_string()); + + let mut annotations = BTreeMap::new(); + annotations.insert("summary".to_string(), summary.to_string()); + annotations.insert("description".to_string(), description.to_string()); + + Rule { + alert: Some(name.to_string()), + expr: Some(expr.to_string()), + for_: for_.map(|s| s.to_string()), + labels: Some(labels), + annotations: Some(annotations), + } +} diff --git a/harmony/src/modules/monitoring/cluster_dashboards/dashboards/ceph.json b/harmony/src/modules/monitoring/cluster_dashboards/dashboards/ceph.json new file mode 100644 index 00000000..d555511d --- /dev/null +++ b/harmony/src/modules/monitoring/cluster_dashboards/dashboards/ceph.json @@ -0,0 +1,674 @@ +{ + "title": "Ceph Cluster", + "uid": "ceph-cluster", + "schemaVersion": 36, + "version": 1, + "refresh": "30s", + "time": { "from": "now-1h", "to": "now" }, + + "templating": { + "list": [ + { + "name": "pool", + "type": "query", + "label": "Pool", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "query": { "query": "label_values(ceph_pool_metadata, name)", "refId": "Pool" }, + "definition": "label_values(ceph_pool_metadata, name)", + "multi": true, + "includeAll": true, + "current": { "text": "All", "value": "$__all", "selected": false }, + "refresh": 1, + "sort": 1 + }, + { + "name": "osd", + "type": "query", + "label": "OSD", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "query": { "query": "label_values(ceph_osd_metadata, ceph_daemon)", "refId": "OSD" }, + "definition": "label_values(ceph_osd_metadata, ceph_daemon)", + "multi": true, + "includeAll": true, + "current": { "text": "All", "value": "$__all", "selected": false }, + "refresh": 1, + "sort": 1 + } + ] + }, + + 
"panels": [ + + { + "type": "row", "id": 1, "title": "Cluster Status", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 } + }, + + { + "type": "stat", "id": 2, "title": "Health", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "max(ceph_health_status)", "refId": "A" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 1 }, + { "color": "red", "value": 2 } + ]}, + "mappings": [{ + "type": "value", + "options": { + "0": { "text": "HEALTH_OK", "index": 0 }, + "1": { "text": "HEALTH_WARN", "index": 1 }, + "2": { "text": "HEALTH_ERR", "index": 2 } + } + }] + } + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"] }, + "colorMode": "background", "graphMode": "none", "textMode": "value" + }, + "gridPos": { "h": 5, "w": 4, "x": 0, "y": 1 } + }, + + { + "type": "stat", "id": 3, "title": "Mon Quorum", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "count(max by (ceph_daemon) (ceph_mon_quorum_status == 1)) or vector(0)", "refId": "A", "legendFormat": "In Quorum" }, + { "expr": "count(max by (ceph_daemon) (ceph_mon_metadata)) or vector(0)", "refId": "B", "legendFormat": "Total" } + ], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] } + } + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"] }, + "colorMode": "background", "graphMode": "none", "textMode": "auto", "orientation": "horizontal" + }, + "gridPos": { "h": 5, "w": 4, "x": 4, "y": 1 } + }, + + { + "type": "stat", "id": 4, "title": "MGR Active", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "sum(max by (ceph_daemon) (ceph_mgr_status)) or vector(0)", "refId": "A" }], + "fieldConfig": { + 
"defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ]} + } + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"] }, + "colorMode": "background", "graphMode": "none", "textMode": "auto" + }, + "gridPos": { "h": 5, "w": 3, "x": 8, "y": 1 } + }, + + { + "type": "stat", "id": 5, "title": "OSDs Up / In / Total", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "sum(max by (ceph_daemon) (ceph_osd_up)) or vector(0)", "refId": "A", "legendFormat": "Up" }, + { "expr": "sum(max by (ceph_daemon) (ceph_osd_in)) or vector(0)", "refId": "B", "legendFormat": "In" }, + { "expr": "count(max by (ceph_daemon) (ceph_osd_metadata)) or vector(0)", "refId": "C", "legendFormat": "Total" } + ], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] } + } + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"] }, + "colorMode": "background", "graphMode": "none", "textMode": "auto", "orientation": "horizontal" + }, + "gridPos": { "h": 5, "w": 5, "x": 11, "y": 1 } + }, + + { + "type": "stat", "id": 6, "title": "Pools", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(max by (pool_id) (ceph_pool_metadata)) or vector(0)", "refId": "A" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] } + } + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"] }, + "colorMode": "background", "graphMode": "none", "textMode": "auto" + }, + "gridPos": { "h": 5, "w": 3, "x": 16, "y": 1 } + }, + + { + "type": "stat", "id": 7, "title": "PGs Active+Clean / Total", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { 
"expr": "max(ceph_pg_clean) or vector(0)", "refId": "A", "legendFormat": "Active+Clean" }, + { "expr": "max(ceph_pg_total) or vector(0)", "refId": "B", "legendFormat": "Total" } + ], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] } + } + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"] }, + "colorMode": "background", "graphMode": "none", "textMode": "auto", "orientation": "horizontal" + }, + "gridPos": { "h": 5, "w": 5, "x": 19, "y": 1 } + }, + + { + "type": "row", "id": 8, "title": "Capacity", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 6 } + }, + + { + "type": "gauge", "id": 9, "title": "Cluster Used (%)", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "100 * max(ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes) / max(ceph_cluster_total_bytes)", + "refId": "A" + }], + "fieldConfig": { + "defaults": { + "unit": "percent", "min": 0, "max": 100, + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 70 }, + { "color": "red", "value": 85 } + ]} + } + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"] }, + "showThresholdLabels": true, "showThresholdMarkers": true + }, + "gridPos": { "h": 8, "w": 5, "x": 0, "y": 7 } + }, + + { + "type": "stat", "id": 10, "title": "Total / Used / Available", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "max(ceph_cluster_total_bytes)", "refId": "A", "legendFormat": "Total" }, + { "expr": "max(ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes)", "refId": "B", "legendFormat": "Used" }, + { "expr": "max(ceph_cluster_total_bytes) - max(ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes)", "refId": "C", "legendFormat": "Available" } + 
], + "fieldConfig": { + "defaults": { + "unit": "bytes", + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] } + } + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"] }, + "colorMode": "value", "graphMode": "none", "textMode": "auto", "orientation": "vertical" + }, + "gridPos": { "h": 8, "w": 4, "x": 5, "y": 7 } + }, + + { + "type": "timeseries", "id": 11, "title": "Capacity Over Time", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "max(ceph_cluster_total_bytes)", "refId": "A", "legendFormat": "Total" }, + { "expr": "max(ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes)", "refId": "B", "legendFormat": "Used" } + ], + "fieldConfig": { + "defaults": { + "unit": "bytes", + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 8 } + } + }, + "gridPos": { "h": 8, "w": 11, "x": 9, "y": 7 } + }, + + { + "type": "stat", "id": 12, "title": "Days Until Full (predicted, 7d trend)", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "(max(ceph_cluster_total_bytes) - max(ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes))\n/\nclamp_min(deriv(max(ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes)[7d:1h]), 1)\n/ 86400", + "refId": "A" + }], + "fieldConfig": { + "defaults": { + "unit": "d", + "decimals": 1, + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "red", "value": null }, + { "color": "yellow", "value": 14 }, + { "color": "green", "value": 60 } + ]} + } + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"] }, + "colorMode": "background", "graphMode": "none", "textMode": "auto" + }, + "gridPos": { "h": 8, "w": 4, "x": 20, "y": 7 } + }, + + { + "type": "bargauge", "id": 13, "title": "Pool Used (%)", + "datasource": { "type": "prometheus", "uid": 
"Prometheus-Cluster" }, + "targets": [{ + "expr": "(\n 100 * max by (pool_id) (ceph_pool_bytes_used)\n /\n (max by (pool_id) (ceph_pool_bytes_used) + max by (pool_id) (ceph_pool_max_avail))\n)\n* on(pool_id) group_left(name) max by (pool_id, name) (ceph_pool_metadata{name=~\"$pool\"})", + "refId": "A", + "legendFormat": "{{name}}", + "instant": true + }], + "fieldConfig": { + "defaults": { + "unit": "percent", "min": 0, "max": 100, + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 70 }, + { "color": "red", "value": 85 } + ]} + } + }, + "options": { + "orientation": "horizontal", + "reduceOptions": { "calcs": ["lastNotNull"] }, + "displayMode": "gradient", + "showUnfilled": true, + "valueMode": "color", + "sortBy": "Value", + "sortOrder": "desc" + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 15 } + }, + + { + "type": "bargauge", "id": 14, "title": "OSD Utilization (%)", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "100 * max by (ceph_daemon) (ceph_osd_stat_bytes_used{ceph_daemon=~\"$osd\"}) / max by (ceph_daemon) (ceph_osd_stat_bytes{ceph_daemon=~\"$osd\"})", + "refId": "A", + "legendFormat": "{{ceph_daemon}}" + }], + "fieldConfig": { + "defaults": { + "unit": "percent", "min": 0, "max": 100, + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 70 }, + { "color": "red", "value": 85 } + ]} + } + }, + "options": { + "orientation": "horizontal", + "reduceOptions": { "calcs": ["lastNotNull"] }, + "displayMode": "gradient", + "showUnfilled": true + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 15 } + }, + + { + "type": "row", "id": 15, "title": "Performance", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 23 } + }, + + { + "type": "timeseries", "id": 16, "title": "Cluster IOPS (Read / 
Write)", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "sum(max by (pool_id) (rate(ceph_pool_rd[5m])))", "refId": "A", "legendFormat": "Read" }, + { "expr": "sum(max by (pool_id) (rate(ceph_pool_wr[5m])))", "refId": "B", "legendFormat": "Write" } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 8 } + } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 } + }, + + { + "type": "timeseries", "id": 17, "title": "Cluster Throughput (Read / Write)", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "sum(max by (pool_id) (rate(ceph_pool_rd_bytes[5m])))", "refId": "A", "legendFormat": "Read" }, + { "expr": "sum(max by (pool_id) (rate(ceph_pool_wr_bytes[5m])))", "refId": "B", "legendFormat": "Write" } + ], + "fieldConfig": { + "defaults": { + "unit": "Bps", + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 8 } + } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 } + }, + + { + "type": "timeseries", "id": 18, "title": "Client Op Latency (Avg)", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "sum(rate(ceph_osd_op_r_latency_sum[5m])) / clamp_min(sum(rate(ceph_osd_op_r_latency_count[5m])), 1)", + "refId": "A", "legendFormat": "Read" + }, + { + "expr": "sum(rate(ceph_osd_op_w_latency_sum[5m])) / clamp_min(sum(rate(ceph_osd_op_w_latency_count[5m])), 1)", + "refId": "B", "legendFormat": "Write" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s", + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 8 } + } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 32 } + }, + + { + "type": "timeseries", "id": 19, "title": "Recovery Throughput", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": 
"sum(rate(ceph_osd_recovery_bytes[5m])) or vector(0)", "refId": "A", "legendFormat": "Recovery B/s" }, + { "expr": "sum(rate(ceph_osd_recovery_ops[5m])) or vector(0)", "refId": "B", "legendFormat": "Recovery ops/s" } + ], + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 8 } + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "Recovery B/s" }, "properties": [{ "id": "unit", "value": "Bps" }] }, + { "matcher": { "id": "byName", "options": "Recovery ops/s" }, "properties": [{ "id": "unit", "value": "ops" }] } + ] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 32 } + }, + + { + "type": "row", "id": 20, "title": "Placement Group Health", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 40 } + }, + + { + "type": "timeseries", "id": 21, "title": "PG States Over Time", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "max(ceph_pg_clean)", "refId": "A", "legendFormat": "clean" }, + { "expr": "max(ceph_pg_active)", "refId": "B", "legendFormat": "active" }, + { "expr": "max(ceph_pg_degraded)", "refId": "C", "legendFormat": "degraded" }, + { "expr": "max(ceph_pg_undersized)", "refId": "D", "legendFormat": "undersized" }, + { "expr": "max(ceph_pg_peering)", "refId": "E", "legendFormat": "peering" }, + { "expr": "max(ceph_pg_recovering)", "refId": "F", "legendFormat": "recovering" }, + { "expr": "max(ceph_pg_backfilling)", "refId": "G", "legendFormat": "backfilling" }, + { "expr": "max(ceph_pg_remapped)", "refId": "H", "legendFormat": "remapped" }, + { "expr": "max(ceph_pg_inconsistent)", "refId": "I", "legendFormat": "inconsistent" }, + { "expr": "max(ceph_pg_stale)", "refId": "J", "legendFormat": "stale" }, + { "expr": "max(ceph_pg_unknown)", "refId": "K", "legendFormat": "unknown" } + ], + "fieldConfig": { + "defaults": { + "unit": "short", + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, 
"fillOpacity": 0 } + } + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": ["max", "lastNotNull"], + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + } + }, + "gridPos": { "h": 8, "w": 16, "x": 0, "y": 41 } + }, + + { + "type": "stat", "id": 22, "title": "Slow Ops", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "max(ceph_healthcheck_slow_ops) or vector(0)", "refId": "A" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 1 }, + { "color": "red", "value": 10 } + ]} + } + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"] }, + "colorMode": "background", "graphMode": "area", "textMode": "auto" + }, + "gridPos": { "h": 4, "w": 8, "x": 16, "y": 41 } + }, + + { + "type": "stat", "id": 23, "title": "Misplaced / Degraded Objects", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "max(ceph_num_objects_misplaced) or vector(0)", "refId": "A", "legendFormat": "Misplaced" }, + { "expr": "max(ceph_num_objects_degraded) or vector(0)", "refId": "B", "legendFormat": "Degraded" } + ], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 1 } + ]} + } + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"] }, + "colorMode": "background", "graphMode": "none", "textMode": "auto", "orientation": "horizontal" + }, + "gridPos": { "h": 4, "w": 8, "x": 16, "y": 45 } + }, + + { + "type": "row", "id": 24, "title": "OSD Detail", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 49 } + }, + + { + "type": "table", "id": 25, "title": "OSDs", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { 
"expr": "max by (ceph_daemon) (ceph_osd_up{ceph_daemon=~\"$osd\"})", "refId": "A", "legendFormat": "Up", "format": "table", "instant": true }, + { "expr": "max by (ceph_daemon) (ceph_osd_in{ceph_daemon=~\"$osd\"})", "refId": "B", "legendFormat": "In", "format": "table", "instant": true }, + { "expr": "100 * max by (ceph_daemon) (ceph_osd_stat_bytes_used{ceph_daemon=~\"$osd\"}) / max by (ceph_daemon) (ceph_osd_stat_bytes{ceph_daemon=~\"$osd\"})", "refId": "C", "format": "table", "instant": true }, + { "expr": "max by (ceph_daemon) (ceph_osd_numpg{ceph_daemon=~\"$osd\"})", "refId": "D", "format": "table", "instant": true }, + { "expr": "max by (ceph_daemon) (ceph_osd_apply_latency_ms{ceph_daemon=~\"$osd\"})", "refId": "E", "format": "table", "instant": true }, + { "expr": "max by (ceph_daemon) (ceph_osd_commit_latency_ms{ceph_daemon=~\"$osd\"})", "refId": "F", "format": "table", "instant": true } + ], + "transformations": [ + { "id": "merge" }, + { + "id": "organize", + "options": { + "excludeByName": { "Time": true, "__name__": true, "job": true, "instance": true, "endpoint": true }, + "renameByName": { + "ceph_daemon": "OSD", + "Value #A": "Up", + "Value #B": "In", + "Value #C": "Util %", + "Value #D": "PGs", + "Value #E": "Apply Latency", + "Value #F": "Commit Latency" + }, + "indexByName": { + "OSD": 0, "Up": 1, "In": 2, "Util %": 3, "PGs": 4, "Apply Latency": 5, "Commit Latency": 6 + } + } + } + ], + "fieldConfig": { + "defaults": {}, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Util %" }, + "properties": [ + { "id": "unit", "value": "percent" }, + { "id": "decimals", "value": 1 }, + { "id": "custom.cellOptions", "value": { "type": "color-background", "mode": "gradient" } }, + { "id": "thresholds", "value": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 70 }, + { "color": "red", "value": 85 } + ]}} + ] + }, + { "matcher": { "id": "byName", "options": "Apply Latency" }, "properties": [{ 
"id": "unit", "value": "ms" }] }, + { "matcher": { "id": "byName", "options": "Commit Latency" }, "properties": [{ "id": "unit", "value": "ms" }] }, + { + "matcher": { "id": "byRegexp", "options": "Up|In" }, + "properties": [ + { "id": "mappings", "value": [{ "type": "value", "options": { "0": { "text": "✗", "index": 0 }, "1": { "text": "✓", "index": 1 }}}] }, + { "id": "custom.cellOptions", "value": { "type": "color-text" } }, + { "id": "thresholds", "value": { "mode": "absolute", "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ]}} + ] + } + ] + }, + "gridPos": { "h": 10, "w": 16, "x": 0, "y": 50 } + }, + + { + "type": "timeseries", "id": 26, "title": "OSD Apply + Commit Latency", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "max by (ceph_daemon) (ceph_osd_apply_latency_ms{ceph_daemon=~\"$osd\"})", "refId": "A", "legendFormat": "{{ceph_daemon}} apply" }, + { "expr": "max by (ceph_daemon) (ceph_osd_commit_latency_ms{ceph_daemon=~\"$osd\"})", "refId": "B", "legendFormat": "{{ceph_daemon}} commit" } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 1, "fillOpacity": 0 } + } + }, + "gridPos": { "h": 10, "w": 8, "x": 16, "y": 50 } + }, + + { + "type": "row", "id": 27, "title": "Pool Detail", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 60 } + }, + + { + "type": "table", "id": 28, "title": "Pools", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "max by (pool_id, name) (ceph_pool_metadata{name=~\"$pool\"})", "refId": "A", "format": "table", "instant": true }, + { "expr": "max by (pool_id) (ceph_pool_objects)", "refId": "B", "format": "table", "instant": true }, + { "expr": "max by (pool_id) (ceph_pool_bytes_used)", "refId": "C", "format": "table", "instant": true }, + { "expr": "max by (pool_id) (ceph_pool_max_avail)", "refId": "D", 
"format": "table", "instant": true }, + { "expr": "100 * max by (pool_id) (ceph_pool_bytes_used) / (max by (pool_id) (ceph_pool_bytes_used) + max by (pool_id) (ceph_pool_max_avail))", "refId": "E", "format": "table", "instant": true } + ], + "transformations": [ + { "id": "merge" }, + { + "id": "organize", + "options": { + "excludeByName": { "Time": true, "__name__": true, "job": true, "instance": true, "endpoint": true, "Value #A": true }, + "renameByName": { + "pool_id": "ID", + "name": "Pool", + "Value #B": "Objects", + "Value #C": "Used", + "Value #D": "Available", + "Value #E": "Used %" + }, + "indexByName": { "ID": 0, "Pool": 1, "Objects": 2, "Used": 3, "Available": 4, "Used %": 5 } + } + } + ], + "fieldConfig": { + "defaults": {}, + "overrides": [ + { "matcher": { "id": "byName", "options": "Used" }, "properties": [{ "id": "unit", "value": "bytes" }] }, + { "matcher": { "id": "byName", "options": "Available" }, "properties": [{ "id": "unit", "value": "bytes" }] }, + { + "matcher": { "id": "byName", "options": "Used %" }, + "properties": [ + { "id": "unit", "value": "percent" }, + { "id": "decimals", "value": 1 }, + { "id": "custom.cellOptions", "value": { "type": "color-background", "mode": "gradient" } }, + { "id": "thresholds", "value": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 70 }, + { "color": "red", "value": 85 } + ]}} + ] + } + ] + }, + "gridPos": { "h": 10, "w": 14, "x": 0, "y": 61 } + }, + + { + "type": "timeseries", "id": 29, "title": "Pool IOPS (Read / Write) — filtered by $pool", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "max by (pool_id) (rate(ceph_pool_rd[5m]))\n* on(pool_id) group_left(name) max by (pool_id, name) (ceph_pool_metadata{name=~\"$pool\"})", + "refId": "A", "legendFormat": "Read — {{name}}" + }, + { + "expr": "max by (pool_id) (rate(ceph_pool_wr[5m]))\n* on(pool_id) group_left(name) max by (pool_id, name) 
(ceph_pool_metadata{name=~\"$pool\"})", + "refId": "B", "legendFormat": "Write — {{name}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 8 } + } + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": ["max", "lastNotNull"], + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + } + }, + "gridPos": { "h": 10, "w": 10, "x": 14, "y": 61 } + } + + ] +} diff --git a/harmony/src/modules/monitoring/cluster_dashboards/dashboards/cluster-overview.json b/harmony/src/modules/monitoring/cluster_dashboards/dashboards/cluster-overview.json index 43079ce7..201f53a7 100644 --- a/harmony/src/modules/monitoring/cluster_dashboards/dashboards/cluster-overview.json +++ b/harmony/src/modules/monitoring/cluster_dashboards/dashboards/cluster-overview.json @@ -368,7 +368,7 @@ "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, "targets": [ { - "expr": "100 * (1 - (sum(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"})))", + "expr": "100 * (1 - (\n sum(node_filesystem_avail_bytes{mountpoint=\"/var\",fstype!~\"tmpfs|overlay|squashfs|ramfs\"} or node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay|squashfs|ramfs\"})\n /\n sum(node_filesystem_size_bytes{mountpoint=\"/var\",fstype!~\"tmpfs|overlay|squashfs|ramfs\"} or node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay|squashfs|ramfs\"})\n))", "refId": "A", "legendFormat": "Disk" } diff --git a/harmony/src/modules/monitoring/cluster_dashboards/dashboards/nodes-health.json b/harmony/src/modules/monitoring/cluster_dashboards/dashboards/nodes-health.json index 0b2fe9dd..01236b23 100644 --- a/harmony/src/modules/monitoring/cluster_dashboards/dashboards/nodes-health.json +++ b/harmony/src/modules/monitoring/cluster_dashboards/dashboards/nodes-health.json @@ -440,7 +440,7 
@@ "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, "targets": [ { - "expr": "100 * (1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"}))", + "expr": "100 * (1 - (\n max by (instance, mountpoint) (node_filesystem_avail_bytes{mountpoint=~\"/|/var\",fstype!~\"tmpfs|overlay|squashfs|ramfs\"})\n /\n max by (instance, mountpoint) (node_filesystem_size_bytes{mountpoint=~\"/|/var\",fstype!~\"tmpfs|overlay|squashfs|ramfs\"})\n))", "refId": "A", "legendFormat": "{{instance}}" } @@ -467,7 +467,7 @@ "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, "targets": [ { - "expr": "100 * (1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"}))", + "expr": "100 * (1 - (\n max by (instance, mountpoint) (node_filesystem_avail_bytes{mountpoint=~\"/|/var\",fstype!~\"tmpfs|overlay|squashfs|ramfs\"})\n /\n max by (instance, mountpoint) (node_filesystem_size_bytes{mountpoint=~\"/|/var\",fstype!~\"tmpfs|overlay|squashfs|ramfs\"})\n))", "refId": "A", "legendFormat": "{{instance}}" } diff --git a/harmony/src/modules/monitoring/cluster_dashboards/dashboards/storage.json b/harmony/src/modules/monitoring/cluster_dashboards/dashboards/storage.json index 3c581842..dfaccf62 100644 --- a/harmony/src/modules/monitoring/cluster_dashboards/dashboards/storage.json +++ b/harmony/src/modules/monitoring/cluster_dashboards/dashboards/storage.json @@ -1,6 +1,6 @@ { - "title": "Storage Health", - "uid": "storage-health", + "title": "Persistent Storage", + "uid": "persistent-storage", "schemaVersion": 36, "version": 1, "refresh": "30s", @@ -21,25 +21,17 @@ "title": "Bound PVCs", "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, "targets": [ - { - "expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Bound\"}) or vector(0)", - "refId": "A" - } + { "expr": 
"sum(kube_persistentvolumeclaim_status_phase{phase=\"Bound\"}) or vector(0)", "refId": "A" } ], "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, - "thresholds": { - "mode": "absolute", - "steps": [{ "color": "green", "value": null }] - } + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] } } }, "options": { "reduceOptions": { "calcs": ["lastNotNull"] }, - "colorMode": "background", - "graphMode": "none", - "textMode": "auto" + "colorMode": "background", "graphMode": "none", "textMode": "auto" }, "gridPos": { "h": 5, "w": 4, "x": 0, "y": 1 } }, @@ -50,28 +42,19 @@ "title": "Pending PVCs", "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, "targets": [ - { - "expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Pending\"}) or vector(0)", - "refId": "A" - } + { "expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Pending\"}) or vector(0)", "refId": "A" } ], "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, - "thresholds": { - "mode": "absolute", - "steps": [ - { "color": "green", "value": null }, - { "color": "yellow", "value": 1 } - ] - } + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, { "color": "yellow", "value": 1 } + ]} } }, "options": { "reduceOptions": { "calcs": ["lastNotNull"] }, - "colorMode": "background", - "graphMode": "none", - "textMode": "auto" + "colorMode": "background", "graphMode": "none", "textMode": "auto" }, "gridPos": { "h": 5, "w": 4, "x": 4, "y": 1 } }, @@ -82,28 +65,19 @@ "title": "Lost PVCs", "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, "targets": [ - { - "expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Lost\"}) or vector(0)", - "refId": "A" - } + { "expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Lost\"}) or vector(0)", "refId": "A" } ], "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, - "thresholds": { - "mode": "absolute", - "steps": [ 
- { "color": "green", "value": null }, - { "color": "red", "value": 1 } - ] - } + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, { "color": "red", "value": 1 } + ]} } }, "options": { "reduceOptions": { "calcs": ["lastNotNull"] }, - "colorMode": "background", - "graphMode": "none", - "textMode": "auto" + "colorMode": "background", "graphMode": "none", "textMode": "auto" }, "gridPos": { "h": 5, "w": 4, "x": 8, "y": 1 } }, @@ -114,201 +88,57 @@ "title": "Bound PVs / Available PVs", "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, "targets": [ - { - "expr": "sum(kube_persistentvolume_status_phase{phase=\"Bound\"}) or vector(0)", - "refId": "A", - "legendFormat": "Bound" - }, - { - "expr": "sum(kube_persistentvolume_status_phase{phase=\"Available\"}) or vector(0)", - "refId": "B", - "legendFormat": "Available" - } + { "expr": "sum(kube_persistentvolume_status_phase{phase=\"Bound\"}) or vector(0)", "refId": "A", "legendFormat": "Bound" }, + { "expr": "sum(kube_persistentvolume_status_phase{phase=\"Available\"}) or vector(0)", "refId": "B", "legendFormat": "Available" } ], "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, - "thresholds": { - "mode": "absolute", - "steps": [{ "color": "blue", "value": null }] - } + "thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] } } }, "options": { "reduceOptions": { "calcs": ["lastNotNull"] }, - "colorMode": "background", - "graphMode": "none", - "textMode": "auto" + "colorMode": "background", "graphMode": "none", "textMode": "auto" }, - "gridPos": { "h": 5, "w": 4, "x": 12, "y": 1 } + "gridPos": { "h": 5, "w": 6, "x": 12, "y": 1 } }, { - "type": "stat", + "type": "piechart", "id": 6, - "title": "Ceph Cluster Health", + "title": "PVC Phase Distribution", "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, "targets": [ - { - "expr": "ceph_health_status", - "refId": "A" - } + { "expr": 
"sum(kube_persistentvolumeclaim_status_phase{phase=\"Bound\"}) or vector(0)", "refId": "A", "legendFormat": "Bound" }, + { "expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Pending\"}) or vector(0)", "refId": "B", "legendFormat": "Pending" }, + { "expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Lost\"}) or vector(0)", "refId": "C", "legendFormat": "Lost" } ], - "fieldConfig": { - "defaults": { - "color": { "mode": "thresholds" }, - "thresholds": { - "mode": "absolute", - "steps": [ - { "color": "green", "value": null }, - { "color": "yellow", "value": 1 }, - { "color": "red", "value": 2 } - ] - }, - "mappings": [ - { - "type": "value", - "options": { - "0": { "text": "HEALTH_OK", "index": 0 }, - "1": { "text": "HEALTH_WARN", "index": 1 }, - "2": { "text": "HEALTH_ERR", "index": 2 } - } - } - ] - } - }, + "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" } } }, "options": { "reduceOptions": { "calcs": ["lastNotNull"] }, - "colorMode": "background", - "graphMode": "none", - "textMode": "value" + "pieType": "pie", + "legend": { "displayMode": "table", "placement": "right", "values": ["value", "percent"] } }, - "gridPos": { "h": 5, "w": 4, "x": 16, "y": 1 } - }, - - { - "type": "stat", - "id": 7, - "title": "OSDs Up / Total", - "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, - "targets": [ - { - "expr": "sum(ceph_osd_up) or vector(0)", - "refId": "A", - "legendFormat": "Up" - }, - { - "expr": "count(ceph_osd_metadata) or vector(0)", - "refId": "B", - "legendFormat": "Total" - } - ], - "fieldConfig": { - "defaults": { - "color": { "mode": "thresholds" }, - "thresholds": { - "mode": "absolute", - "steps": [{ "color": "green", "value": null }] - } - } - }, - "options": { - "reduceOptions": { "calcs": ["lastNotNull"] }, - "colorMode": "background", - "graphMode": "none", - "textMode": "auto" - }, - "gridPos": { "h": 5, "w": 4, "x": 20, "y": 1 } + "gridPos": { "h": 5, "w": 6, "x": 18, "y": 1 } }, { "type": "row", 
- "id": 8, - "title": "Cluster Capacity", + "id": 7, + "title": "Capacity by Storage Class", "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 6 } }, - { - "type": "gauge", - "id": 9, - "title": "Ceph Cluster Used (%)", - "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, - "targets": [ - { - "expr": "100 * (ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes) / ceph_cluster_total_bytes", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "unit": "percent", - "min": 0, - "max": 100, - "color": { "mode": "thresholds" }, - "thresholds": { - "mode": "absolute", - "steps": [ - { "color": "green", "value": null }, - { "color": "yellow", "value": 70 }, - { "color": "red", "value": 85 } - ] - } - } - }, - "options": { - "reduceOptions": { "calcs": ["lastNotNull"] }, - "showThresholdLabels": true, - "showThresholdMarkers": true - }, - "gridPos": { "h": 8, "w": 5, "x": 0, "y": 7 } - }, - - { - "type": "stat", - "id": 10, - "title": "Ceph Capacity — Total / Available", - "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, - "targets": [ - { - "expr": "ceph_cluster_total_bytes", - "refId": "A", - "legendFormat": "Total" - }, - { - "expr": "ceph_cluster_total_bytes - (ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes)", - "refId": "B", - "legendFormat": "Available" - } - ], - "fieldConfig": { - "defaults": { - "unit": "bytes", - "color": { "mode": "thresholds" }, - "thresholds": { - "mode": "absolute", - "steps": [{ "color": "blue", "value": null }] - } - } - }, - "options": { - "reduceOptions": { "calcs": ["lastNotNull"] }, - "colorMode": "value", - "graphMode": "none", - "textMode": "auto", - "orientation": "vertical" - }, - "gridPos": { "h": 8, "w": 4, "x": 5, "y": 7 } - }, - { "type": "bargauge", - "id": 11, + "id": 8, "title": "PV Allocated Capacity by Storage Class (Bound)", "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, "targets": [ { - "expr": "sum by 
(storageclass) (\n kube_persistentvolume_capacity_bytes\n * on(persistentvolume) group_left(storageclass)\n kube_persistentvolume_status_phase{phase=\"Bound\"}\n)", + "expr": "sum by (storageclass) (\n kube_persistentvolume_capacity_bytes\n * on(persistentvolume) group_left() (kube_persistentvolume_status_phase{phase=\"Bound\"} == 1)\n * on(persistentvolume) group_left(storageclass) kube_persistentvolume_info\n)", "refId": "A", "legendFormat": "{{storageclass}}" } @@ -316,11 +146,7 @@ "fieldConfig": { "defaults": { "unit": "bytes", - "color": { "mode": "palette-classic" }, - "thresholds": { - "mode": "absolute", - "steps": [{ "color": "blue", "value": null }] - } + "color": { "mode": "palette-classic" } } }, "options": { @@ -329,267 +155,214 @@ "displayMode": "gradient", "showUnfilled": true }, - "gridPos": { "h": 8, "w": 7, "x": 9, "y": 7 } + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 7 } }, { - "type": "piechart", - "id": 12, - "title": "PVC Phase Distribution", + "type": "bargauge", + "id": 9, + "title": "PVC Count by Storage Class", "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, "targets": [ { - "expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Bound\"}) or vector(0)", + "expr": "count by (storageclass) (kube_persistentvolumeclaim_info{storageclass!=\"\"})", "refId": "A", - "legendFormat": "Bound" - }, - { - "expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Pending\"}) or vector(0)", - "refId": "B", - "legendFormat": "Pending" - }, - { - "expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Lost\"}) or vector(0)", - "refId": "C", - "legendFormat": "Lost" + "legendFormat": "{{storageclass}}" } ], "fieldConfig": { - "defaults": { "color": { "mode": "palette-classic" } } + "defaults": { + "unit": "short", + "color": { "mode": "palette-classic" } + } }, "options": { + "orientation": "horizontal", "reduceOptions": { "calcs": ["lastNotNull"] }, - "pieType": "pie", - "legend": { - "displayMode": "table", - "placement": 
"right", - "values": ["value", "percent"] + "displayMode": "gradient", + "showUnfilled": true + }, + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 7 } + }, + + { + "type": "table", + "id": 10, + "title": "Storage Classes Summary", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "count by (storageclass) (kube_persistentvolume_info)", + "refId": "A", + "legendFormat": "PVs", + "format": "table", + "instant": true + }, + { + "expr": "sum by (storageclass) (kube_persistentvolume_capacity_bytes * on(persistentvolume) group_left(storageclass) kube_persistentvolume_info)", + "refId": "B", + "legendFormat": "Capacity", + "format": "table", + "instant": true } + ], + "transformations": [ + { "id": "merge" }, + { + "id": "organize", + "options": { + "excludeByName": { "Time": true, "__name__": true, "job": true, "instance": true }, + "renameByName": { "storageclass": "StorageClass", "Value #A": "PV Count", "Value #B": "Total Capacity" } + } + } + ], + "fieldConfig": { + "defaults": {}, + "overrides": [ + { "matcher": { "id": "byName", "options": "Total Capacity" }, "properties": [{ "id": "unit", "value": "bytes" }] } + ] }, "gridPos": { "h": 8, "w": 8, "x": 16, "y": 7 } }, { "type": "row", - "id": 13, - "title": "Ceph Performance", + "id": 11, + "title": "PVC Usage (kubelet volume stats)", "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 15 } }, { - "type": "timeseries", - "id": 14, - "title": "Ceph Pool IOPS (Read / Write)", + "type": "table", + "id": 12, + "title": "Top 20 PVCs by % Used", "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, "targets": [ { - "expr": "rate(ceph_pool_rd[5m])", + "expr": "topk(20,\n 100 * max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_used_bytes)\n /\n max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_capacity_bytes)\n)", "refId": "A", - "legendFormat": "Read — pool {{pool_id}}" - }, + "format": "table", + "instant": true + } + ], + 
"transformations": [ { - "expr": "rate(ceph_pool_wr[5m])", - "refId": "B", - "legendFormat": "Write — pool {{pool_id}}" + "id": "organize", + "options": { + "excludeByName": { "Time": true, "__name__": true, "job": true, "instance": true, "endpoint": true }, + "renameByName": { + "namespace": "Namespace", + "persistentvolumeclaim": "PVC", + "Value": "Used %" + }, + "indexByName": { "Namespace": 0, "PVC": 1, "Used %": 2 } + } } ], "fieldConfig": { - "defaults": { - "unit": "ops", - "color": { "mode": "palette-classic" }, - "custom": { "lineWidth": 2, "fillOpacity": 8 } - } - }, - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 } - }, - - { - "type": "timeseries", - "id": 15, - "title": "Ceph Pool Throughput (Read / Write)", - "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, - "targets": [ - { - "expr": "rate(ceph_pool_rd_bytes[5m])", - "refId": "A", - "legendFormat": "Read — pool {{pool_id}}" - }, - { - "expr": "rate(ceph_pool_wr_bytes[5m])", - "refId": "B", - "legendFormat": "Write — pool {{pool_id}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "Bps", - "color": { "mode": "palette-classic" }, - "custom": { "lineWidth": 2, "fillOpacity": 8 } - } - }, - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 } - }, - - { - "type": "row", - "id": 16, - "title": "Ceph OSD & Pool Details", - "collapsed": false, - "gridPos": { "h": 1, "w": 24, "x": 0, "y": 24 } - }, - - { - "type": "timeseries", - "id": 17, - "title": "Ceph Pool Space Used (%)", - "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, - "targets": [ - { - "expr": "100 * ceph_pool_bytes_used / (ceph_pool_bytes_used + ceph_pool_max_avail)", - "refId": "A", - "legendFormat": "Pool {{pool_id}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "percent", - "min": 0, - "max": 100, - "color": { "mode": "palette-classic" }, - "thresholds": { - "mode": "absolute", - "steps": [ - { "color": "green", "value": null }, - { "color": "yellow", "value": 70 }, - { "color": "red", 
"value": 85 } - ] - }, - "custom": { "lineWidth": 2, "fillOpacity": 10 } - } - }, - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 25 } - }, - - { - "type": "bargauge", - "id": 18, - "title": "OSD Status per Daemon (green = Up, red = Down)", - "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, - "targets": [ - { - "expr": "ceph_osd_up", - "refId": "A", - "legendFormat": "{{ceph_daemon}}" - } - ], - "fieldConfig": { - "defaults": { - "min": 0, - "max": 1, - "color": { "mode": "thresholds" }, - "thresholds": { - "mode": "absolute", - "steps": [ - { "color": "red", "value": null }, - { "color": "green", "value": 1 } - ] - }, - "mappings": [ - { - "type": "value", - "options": { - "0": { "text": "DOWN", "index": 0 }, - "1": { "text": "UP", "index": 1 } + "defaults": {}, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Used %" }, + "properties": [ + { "id": "unit", "value": "percent" }, + { "id": "decimals", "value": 1 }, + { + "id": "custom.cellOptions", + "value": { "type": "color-background", "mode": "gradient" } + }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 70 }, + { "color": "red", "value": 85 } + ] + } } - } - ] - } - }, - "options": { - "orientation": "horizontal", - "reduceOptions": { "calcs": ["lastNotNull"] }, - "displayMode": "basic", - "showUnfilled": true - }, - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 25 } - }, - - { - "type": "row", - "id": 19, - "title": "Node Disk Usage", - "collapsed": false, - "gridPos": { "h": 1, "w": 24, "x": 0, "y": 33 } - }, - - { - "type": "timeseries", - "id": 20, - "title": "Node Root Disk Usage Over Time (%)", - "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, - "targets": [ - { - "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} * 100)", - "refId": "A", - "legendFormat": 
"{{instance}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "percent", - "min": 0, - "max": 100, - "color": { "mode": "palette-classic" }, - "thresholds": { - "mode": "absolute", - "steps": [ - { "color": "green", "value": null }, - { "color": "yellow", "value": 70 }, - { "color": "red", "value": 85 } - ] - }, - "custom": { "lineWidth": 2, "fillOpacity": 10 } - } - }, - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 34 } - }, - - { - "type": "bargauge", - "id": 21, - "title": "Current Disk Usage — All Nodes & Mountpoints", - "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, - "targets": [ - { - "expr": "100 - (node_filesystem_avail_bytes{fstype!~\"tmpfs|overlay|squashfs\"} / node_filesystem_size_bytes{fstype!~\"tmpfs|overlay|squashfs\"} * 100)", - "refId": "A", - "legendFormat": "{{instance}} — {{mountpoint}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "percent", - "min": 0, - "max": 100, - "color": { "mode": "thresholds" }, - "thresholds": { - "mode": "absolute", - "steps": [ - { "color": "green", "value": null }, - { "color": "yellow", "value": 70 }, - { "color": "red", "value": 85 } ] } + ] + }, + "gridPos": { "h": 10, "w": 12, "x": 0, "y": 16 } + }, + + { + "type": "bargauge", + "id": 13, + "title": "Top 20 PVCs by Used Bytes", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "topk(20, max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_used_bytes))", + "refId": "A", + "legendFormat": "{{namespace}} / {{persistentvolumeclaim}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "bytes", + "color": { "mode": "palette-classic" } } }, "options": { "orientation": "horizontal", "reduceOptions": { "calcs": ["lastNotNull"] }, "displayMode": "gradient", - "showUnfilled": true + "showUnfilled": true, + "valueMode": "color", + "sortBy": "Value", + "sortOrder": "desc" }, - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 34 } + "gridPos": { "h": 10, "w": 12, 
"x": 12, "y": 16 } + }, + + { + "type": "timeseries", + "id": 14, + "title": "Top 5 PVCs Usage Over Time (%)", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "topk(5,\n 100 * max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_used_bytes)\n /\n max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_capacity_bytes)\n)", + "refId": "A", + "legendFormat": "{{namespace}} / {{persistentvolumeclaim}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", "min": 0, "max": 100, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 8 } + } + }, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 26 } + }, + + { + "type": "timeseries", + "id": 15, + "title": "PVC Inode Usage (%) — Top 20", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "topk(20,\n 100 * max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_inodes_used)\n /\n max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_inodes)\n)", + "refId": "A", + "legendFormat": "{{namespace}} / {{persistentvolumeclaim}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", "min": 0, "max": 100, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 1, "fillOpacity": 5 } + } + }, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 34 } } ] diff --git a/harmony/src/modules/monitoring/cluster_dashboards/score.rs b/harmony/src/modules/monitoring/cluster_dashboards/score.rs index 22f916d7..ed52ed12 100644 --- a/harmony/src/modules/monitoring/cluster_dashboards/score.rs +++ b/harmony/src/modules/monitoring/cluster_dashboards/score.rs @@ -101,7 +101,7 @@ impl Interpret for ClusterDashboardsInterpret { Ok(Outcome::success(format!( "Cluster dashboards resources in namespace '{}' with {} dashboards successfully created", - self.namespace, 8 + self.namespace, 9 ))) } @@ -494,7 +494,11 @@ impl ClusterDashboardsInterpret { 
include_str!("dashboards/workloads-health.json"), ), ("okd-networking", include_str!("dashboards/networking.json")), - ("storage-health", include_str!("dashboards/storage.json")), + ( + "persistent-storage", + include_str!("dashboards/storage.json"), + ), + ("ceph-cluster", include_str!("dashboards/ceph.json")), ("okd-etcd", include_str!("dashboards/etcd.json")), ( "okd-control-plane", diff --git a/harmony/src/modules/monitoring/mod.rs b/harmony/src/modules/monitoring/mod.rs index aa08e7a8..0c0336eb 100644 --- a/harmony/src/modules/monitoring/mod.rs +++ b/harmony/src/modules/monitoring/mod.rs @@ -1,6 +1,7 @@ pub mod alert_channel; pub mod alert_rule; pub mod application_monitoring; +pub mod ceph_alerts; pub mod cluster_dashboards; pub mod grafana; pub mod kube_prometheus;
diff --git a/harmony/src/modules/monitoring/okd/cluster_alert_rules.rs b/harmony/src/modules/monitoring/okd/cluster_alert_rules.rs
new file mode 100644
index 00000000..fb8c7189
--- /dev/null
+++ b/harmony/src/modules/monitoring/okd/cluster_alert_rules.rs
@@ -0,0 +1,138 @@
+use std::collections::BTreeMap;
+
+use async_trait::async_trait;
+use harmony_types::id::Id;
+use kube::api::ObjectMeta;
+use serde::Serialize;
+
+use crate::{
+    data::Version,
+    interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome},
+    inventory::Inventory,
+    modules::monitoring::kube_prometheus::crd::crd_prometheus_rules::{
+        PrometheusRule, PrometheusRuleSpec, RuleGroup,
+    },
+    score::Score,
+    topology::{K8sclient, Topology},
+};
+
+/// Score that applies a single `PrometheusRule` resource to a namespace.
+///
+/// `name` and `namespace` become the resource metadata and `rule_groups` its
+/// spec. When `labels` is `None`, the labels from `default_rule_labels` are used.
+#[derive(Clone, Debug, Serialize)]
+pub struct OpenshiftPrometheusRuleScore {
+    pub namespace: String,
+    pub name: String,
+    pub rule_groups: Vec<RuleGroup>,
+    pub labels: Option<BTreeMap<String, String>>,
+}
+
+impl<T: Topology + K8sclient> Score<T> for OpenshiftPrometheusRuleScore {
+    fn name(&self) -> String {
+        format!(
+            "OpenshiftPrometheusRuleScore({}/{})",
+            self.namespace, self.name
+        )
+    }
+
+    /// Builds the interpret from a copy of this score's fields.
+    fn create_interpret(&self) -> Box<dyn Interpret<T>> {
+        Box::new(OpenshiftPrometheusRuleInterpret {
+            namespace: self.namespace.clone(),
+            name: self.name.clone(),
+            rule_groups: self.rule_groups.clone(),
+            labels: self.labels.clone(),
+        })
+    }
+}
+
+/// Interpret that builds the `PrometheusRule` and applies it through the
+/// topology's Kubernetes client.
+#[derive(Debug, Clone)]
+pub struct OpenshiftPrometheusRuleInterpret {
+    namespace: String,
+    name: String,
+    rule_groups: Vec<RuleGroup>,
+    labels: Option<BTreeMap<String, String>>,
+}
+
+#[async_trait]
+impl<T: Topology + K8sclient> Interpret<T> for OpenshiftPrometheusRuleInterpret {
+    /// Applies the rule to `self.namespace`.
+    ///
+    /// # Errors
+    ///
+    /// Returns an `InterpretError` when the Kubernetes client cannot be obtained
+    /// or when applying the resource fails.
+    async fn execute(
+        &self,
+        _inventory: &Inventory,
+        topology: &T,
+    ) -> Result<Outcome, InterpretError> {
+        let labels = self.labels.clone().unwrap_or_else(default_rule_labels);
+
+        let prometheus_rule = PrometheusRule {
+            metadata: ObjectMeta {
+                name: Some(self.name.clone()),
+                namespace: Some(self.namespace.clone()),
+                labels: Some(labels),
+                ..ObjectMeta::default()
+            },
+            spec: PrometheusRuleSpec {
+                groups: self.rule_groups.clone(),
+            },
+        };
+
+        let client = topology
+            .k8s_client()
+            .await
+            .map_err(|e| InterpretError::new(format!("Failed to get k8s client: {e}")))?;
+
+        // Name the resource in the error so a failed apply can be traced.
+        client
+            .apply(&prometheus_rule, Some(&self.namespace))
+            .await
+            .map_err(|e| {
+                InterpretError::new(format!(
+                    "Failed to apply PrometheusRule '{}' in namespace '{}': {e}",
+                    self.name, self.namespace
+                ))
+            })?;
+
+        Ok(Outcome::success(format!(
+            "PrometheusRule '{}' applied to namespace '{}' with {} rule group(s)",
+            self.name,
+            self.namespace,
+            self.rule_groups.len()
+        )))
+    }
+
+    fn get_name(&self) -> InterpretName {
+        InterpretName::Custom("OpenshiftPrometheusRule")
+    }
+
+    // NOTE(review): still unimplemented; panics if anything calls it.
+    fn get_version(&self) -> Version {
+        todo!()
+    }
+
+    // NOTE(review): still unimplemented; panics if anything calls it.
+    fn get_status(&self) -> InterpretStatus {
+        todo!()
+    }
+
+    /// `execute` applies one resource directly, so there are no child interprets.
+    fn get_children(&self) -> Vec<Id> {
+        Vec::new()
+    }
+}
+
+/// Labels used when the score does not provide any:
+/// `prometheus=k8s` and `role=alert-rules`.
+fn default_rule_labels() -> BTreeMap<String, String> {
+    BTreeMap::from([
+        ("prometheus".to_string(), "k8s".to_string()),
+        ("role".to_string(), "alert-rules".to_string()),
+    ])
+}
diff --git a/harmony/src/modules/monitoring/okd/mod.rs b/harmony/src/modules/monitoring/okd/mod.rs index ac246c5f..76d8b58b 100644 --- a/harmony/src/modules/monitoring/okd/mod.rs +++ b/harmony/src/modules/monitoring/okd/mod.rs @@ -1,5 +1,6 @@ use crate::topology::oberservability::monitoring::AlertSender; +pub mod
cluster_alert_rules; pub mod cluster_monitoring; pub(crate) mod config; pub mod enable_user_workload;