From 391c44b369ccee574edf5bd0f55b671bfcd047df Mon Sep 17 00:00:00 2001 From: Sylvain Tremblay Date: Mon, 20 Apr 2026 15:29:54 -0400 Subject: [PATCH 1/3] feat: add the datadog-15-k8s-metrics dashboard --- .../dashboards/datadog-15-k8s-metrics.json | 852 ++++++++++++++++++ .../monitoring/cluster_dashboards/score.rs | 6 +- 2 files changed, 857 insertions(+), 1 deletion(-) create mode 100644 harmony/src/modules/monitoring/cluster_dashboards/dashboards/datadog-15-k8s-metrics.json diff --git a/harmony/src/modules/monitoring/cluster_dashboards/dashboards/datadog-15-k8s-metrics.json b/harmony/src/modules/monitoring/cluster_dashboards/dashboards/datadog-15-k8s-metrics.json new file mode 100644 index 00000000..af699af4 --- /dev/null +++ b/harmony/src/modules/monitoring/cluster_dashboards/dashboards/datadog-15-k8s-metrics.json @@ -0,0 +1,852 @@ +{ + "title": "Datadog — 15 Key Kubernetes Metrics", + "uid": "datadog-15-k8s-metrics", + "schemaVersion": 36, + "version": 1, + "refresh": "30s", + "time": { "from": "now-1h", "to": "now" }, + "tags": ["kubernetes", "datadog", "key-metrics", "cluster", "control-plane"], + "templating": { + "list": [ + { + "name": "namespace", + "type": "query", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "query": { "query": "label_values(kube_pod_info, namespace)", "refId": "A" }, + "refresh": 2, + "includeAll": true, + "multi": true, + "allValue": ".*", + "label": "Namespace", + "sort": 1, + "current": {}, + "options": [] + }, + { + "name": "node", + "type": "query", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "query": { "query": "label_values(kube_node_info, node)", "refId": "A" }, + "refresh": 2, + "includeAll": true, + "multi": true, + "allValue": ".*", + "label": "Node", + "sort": 1, + "current": {}, + "options": [] + } + ] + }, + "panels": [ + + { + "id": 100, "type": "row", "title": "Cluster State — metrics 1–3 (Node status, Desired vs current pods, Available vs unavailable pods)", + 
"collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 } + }, + + { + "id": 1, "type": "stat", "title": "Ready Nodes", + "description": "Metric 1 — Node status. Count of nodes with condition Ready=true. A node that drops out of Ready can no longer accept new pods; scheduling freezes until it recovers or is drained.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"true\"} == 1)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ]}, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 4, "x": 0, "y": 1 } + }, + + { + "id": 2, "type": "stat", "title": "Not Ready Nodes", + "description": "Nodes reporting Ready=false. These nodes cannot host new pods and existing pods may be evicted. 
Alert immediately.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"false\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 1 } + ]}, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 4, "x": 4, "y": 1 } + }, + + { + "id": 3, "type": "stat", "title": "MemoryPressure", + "description": "Nodes flagged by kubelet as being under memory pressure. The kubelet will begin evicting pods that most exceed their memory request.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(kube_node_status_condition{condition=\"MemoryPressure\",status=\"true\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 1 } + ]}, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 4, "x": 8, "y": 1 } + }, + + { + "id": 4, "type": "stat", "title": "DiskPressure", + "description": "Nodes under disk pressure. 
Kubelet runs GC (removing unused images and dead containers) and, if space stays low, starts evicting pods.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(kube_node_status_condition{condition=\"DiskPressure\",status=\"true\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 1 } + ]}, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 4, "x": 12, "y": 1 } + }, + + { + "id": 5, "type": "stat", "title": "PIDPressure", + "description": "Nodes that have exhausted their PID space. New processes / containers on the node will fail to start.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(kube_node_status_condition{condition=\"PIDPressure\",status=\"true\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 1 } + ]}, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 4, "x": 16, "y": 1 } + }, + + { + "id": 6, "type": "stat", "title": "NetworkUnavailable", + "description": "Nodes whose CNI has not (yet) wired the pod network. 
Pods cannot schedule onto the node until this clears.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(kube_node_status_condition{condition=\"NetworkUnavailable\",status=\"true\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 1 } + ]}, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 4, "x": 20, "y": 1 } + }, + + { + "id": 7, "type": "timeseries", "title": "Deployments — Desired vs Current pods", + "description": "Metric 2 — Desired vs current pods (Deployments). A persistent gap means pods cannot be scheduled: check node capacity, PodDisruptionBudgets, and image pull failures.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "sum(kube_deployment_spec_replicas{namespace=~\"$namespace\"})", "refId": "A", "legendFormat": "desired" }, + { "expr": "sum(kube_deployment_status_replicas{namespace=~\"$namespace\"})", "refId": "B", "legendFormat": "current" } + ], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "desired" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "current" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] } + ] + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": 
"bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 5 } + }, + + { + "id": 8, "type": "timeseries", "title": "Deployments — Available vs Unavailable pods", + "description": "Metric 3 — Available/unavailable (Deployments). Spikes in unavailable are customer-visible: crashes, failed readiness probes, or resource shortages.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "sum(kube_deployment_status_replicas_available{namespace=~\"$namespace\"})", "refId": "A", "legendFormat": "available" }, + { "expr": "sum(kube_deployment_status_replicas_unavailable{namespace=~\"$namespace\"})", "refId": "B", "legendFormat": "unavailable" } + ], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false } + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "available" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "unavailable" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] } + ] + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 5 } + }, + + { + "id": 9, "type": "table", "title": "Top Deployments with unavailable replicas", + "description": "Deployments that currently report unavailable replicas. 
Investigate pod events / readiness probes / resource quotas for these.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "topk(20, max by(namespace, deployment)(kube_deployment_status_replicas_unavailable{namespace=~\"$namespace\"}) > 0)", + "refId": "A", "legendFormat": "", "format": "table", "instant": true + }], + "fieldConfig": { + "defaults": { + "unit": "short", "custom": { "align": "auto" }, + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 1 } + ]} + } + }, + "options": { "showHeader": true }, + "transformations": [ + { "id": "organize", "options": { "excludeByName": { "Time": true, "__name__": true, "instance": true, "job": true, "endpoint": true, "service": true, "pod": true, "container": true, "prometheus": true, "container_name": true, "namespace_labels": true }, "renameByName": { "Value": "unavailable" } } } + ], + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 5 } + }, + + { + "id": 10, "type": "timeseries", "title": "DaemonSets — Desired vs Scheduled", + "description": "Metric 2 — Desired vs current pods (DaemonSets). 
DaemonSets should have one pod per matching node; a gap means the pod cannot be placed (taints, resources, node selectors).", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "sum(kube_daemonset_status_desired_number_scheduled{namespace=~\"$namespace\"})", "refId": "A", "legendFormat": "desired" }, + { "expr": "sum(kube_daemonset_status_current_number_scheduled{namespace=~\"$namespace\"})", "refId": "B", "legendFormat": "scheduled" } + ], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "desired" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "scheduled" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] } + ] + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 13 } + }, + + { + "id": 11, "type": "timeseries", "title": "DaemonSets — Available vs Unavailable", + "description": "Metric 3 — Available/unavailable (DaemonSets). 
Unavailable DaemonSet pods often mean per-node infrastructure pods (CNI, logging, monitoring agents) are failing on specific nodes.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "sum(kube_daemonset_status_number_available{namespace=~\"$namespace\"})", "refId": "A", "legendFormat": "available" }, + { "expr": "sum(kube_daemonset_status_number_unavailable{namespace=~\"$namespace\"})", "refId": "B", "legendFormat": "unavailable" } + ], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false } + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "available" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "unavailable" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] } + ] + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 13 } + }, + + { + "id": 200, "type": "row", "title": "Resources — Memory (metrics 4–6)", + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 21 } + }, + + { + "id": 20, "type": "timeseries", "title": "Cluster memory — usage vs requests vs limits", + "description": "Metrics 4–5 — aggregate. Compares how much memory containers actually consume (working set) to what they requested and what they are limited to. 
A pod that crosses its limit is OOMKilled.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "sum(container_memory_working_set_bytes{namespace=~\"$namespace\",container!=\"\",container!=\"POD\"})", "refId": "A", "legendFormat": "usage" }, + { "expr": "sum(kube_pod_container_resource_requests{namespace=~\"$namespace\",resource=\"memory\",container!=\"\"})", "refId": "B", "legendFormat": "requests" }, + { "expr": "sum(kube_pod_container_resource_limits{namespace=~\"$namespace\",resource=\"memory\",container!=\"\"})", "refId": "C", "legendFormat": "limits" } + ], + "fieldConfig": { + "defaults": { + "unit": "bytes", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "usage" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "requests" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "limits" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] } + ] + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 22 } + }, + + { + "id": 21, "type": "timeseries", "title": "Top 15 pods — memory usage / memory limit (%)", + "description": "Metric 4 — pod-level. Pods approaching 100% of their memory limit will be OOMKilled. 
If a pod persistently sits near the limit, either raise the limit or optimize memory use.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "topk(15,\n 100 * sum by(namespace, pod)(container_memory_working_set_bytes{namespace=~\"$namespace\",container!=\"\",container!=\"POD\"})\n /\n sum by(namespace, pod)(kube_pod_container_resource_limits{namespace=~\"$namespace\",resource=\"memory\",container!=\"\"})\n)", + "refId": "A", "legendFormat": "{{namespace}}/{{pod}}" + }], + "fieldConfig": { + "defaults": { + "unit": "percent", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 22 } + }, + + { + "id": 22, "type": "timeseries", "title": "Node memory — requests vs allocatable", + "description": "Metric 6 — per node. Compares the sum of pod memory requests placed on each node to the node's allocatable memory. 
If requests approach allocatable, the scheduler can no longer place new pods on that node.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "sum by(node)(kube_pod_container_resource_requests{resource=\"memory\",container!=\"\",node=~\"$node\"})", "refId": "A", "legendFormat": "{{node}} — requested" }, + { "expr": "sum by(node)(kube_node_status_allocatable{resource=\"memory\",node=~\"$node\"})", "refId": "B", "legendFormat": "{{node}} — allocatable" } + ], + "fieldConfig": { + "defaults": { + "unit": "bytes", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 30 } + }, + + { + "id": 23, "type": "bargauge", "title": "Node memory commitment (requests / allocatable)", + "description": "How full each node is in terms of scheduled (requested) memory. 
≥ 100% means no further pods requesting memory can be scheduled there.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "100 *\n sum by(node)(kube_pod_container_resource_requests{resource=\"memory\",container!=\"\",node=~\"$node\"})\n /\n sum by(node)(kube_node_status_allocatable{resource=\"memory\",node=~\"$node\"})", + "refId": "A", "legendFormat": "{{node}}", "instant": true + }], + "fieldConfig": { + "defaults": { + "unit": "percent", "min": 0, "max": 100, + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 70 }, + { "color": "red", "value": 90 } + ]} + } + }, + "options": { + "orientation": "horizontal", + "displayMode": "gradient", + "showUnfilled": true, + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 30 } + }, + + { + "id": 300, "type": "row", "title": "Resources — CPU (metrics 8–10)", + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 38 } + }, + + { + "id": 30, "type": "timeseries", "title": "Cluster CPU — usage vs requests vs limits", + "description": "Metrics 9–10 — aggregate. Unlike memory, CPU is compressible: exceeding a limit causes throttling (slow), not OOMKill. 
A persistent gap between usage and limits is fine; a persistent gap between usage and requests wastes capacity.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=~\"$namespace\",container!=\"\",container!=\"POD\"}[5m]))", "refId": "A", "legendFormat": "usage" }, + { "expr": "sum(kube_pod_container_resource_requests{namespace=~\"$namespace\",resource=\"cpu\",container!=\"\"})", "refId": "B", "legendFormat": "requests" }, + { "expr": "sum(kube_pod_container_resource_limits{namespace=~\"$namespace\",resource=\"cpu\",container!=\"\"})", "refId": "C", "legendFormat": "limits" } + ], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "usage" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "requests" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "limits" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] } + ] + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 39 } + }, + + { + "id": 31, "type": "timeseries", "title": "Top 15 pods — CPU usage / CPU limit (%)", + "description": "Metric 9 — pod-level. 
Pods that sit at or near 100% of their limit for long windows are being throttled by the kernel, which causes latency spikes even though the pod is not killed.",
+      "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
+      "targets": [{
+        "expr": "topk(15,\n 100 * sum by(namespace, pod)(rate(container_cpu_usage_seconds_total{namespace=~\"$namespace\",container!=\"\",container!=\"POD\"}[5m]))\n /\n sum by(namespace, pod)(kube_pod_container_resource_limits{namespace=~\"$namespace\",resource=\"cpu\",container!=\"\"})\n)",
+        "refId": "A", "legendFormat": "{{namespace}}/{{pod}}"
+      }],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percent", "min": 0,
+          "color": { "mode": "palette-classic" },
+          "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
+        }
+      },
+      "options": {
+        "tooltip": { "mode": "multi", "sort": "desc" },
+        "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["max"] }
+      },
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 39 }
+    },
+
+    {
+      "id": 32, "type": "timeseries", "title": "Node CPU — requests vs allocatable",
+      "description": "Metric 8 — per node. 
Same shape as memory: once requests saturate allocatable CPU, no more pods requesting CPU can be placed on the node.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "sum by(node)(kube_pod_container_resource_requests{resource=\"cpu\",container!=\"\",node=~\"$node\"})", "refId": "A", "legendFormat": "{{node}} — requested" }, + { "expr": "sum by(node)(kube_node_status_allocatable{resource=\"cpu\",node=~\"$node\"})", "refId": "B", "legendFormat": "{{node}} — allocatable" } + ], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 47 } + }, + + { + "id": 33, "type": "bargauge", "title": "Node CPU commitment (requests / allocatable)", + "description": "How full each node is in terms of scheduled (requested) CPU. 
≥ 100% means no further pods requesting CPU can be scheduled there.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "100 *\n sum by(node)(kube_pod_container_resource_requests{resource=\"cpu\",container!=\"\",node=~\"$node\"})\n /\n sum by(node)(kube_node_status_allocatable{resource=\"cpu\",node=~\"$node\"})", + "refId": "A", "legendFormat": "{{node}}", "instant": true + }], + "fieldConfig": { + "defaults": { + "unit": "percent", "min": 0, "max": 100, + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 70 }, + { "color": "red", "value": 90 } + ]} + } + }, + "options": { + "orientation": "horizontal", + "displayMode": "gradient", + "showUnfilled": true, + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 47 } + }, + + { + "id": 400, "type": "row", "title": "Resources — Disk (metric 7)", + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 55 } + }, + + { + "id": 40, "type": "timeseries", "title": "Node root filesystem usage (%)", + "description": "Metric 7 — node level. Disk is non-compressible: when it is exhausted, kubelet raises DiskPressure and evicts pods. 
Alert well before 100%.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "100 * (1 - (\n sum by(instance)(node_filesystem_avail_bytes{mountpoint=~\"/|/var\",fstype!~\"tmpfs|overlay|squashfs|ramfs\"})\n /\n sum by(instance)(node_filesystem_size_bytes{mountpoint=~\"/|/var\",fstype!~\"tmpfs|overlay|squashfs|ramfs\"})\n))", + "refId": "A", "legendFormat": "{{instance}}" + }], + "fieldConfig": { + "defaults": { + "unit": "percent", "min": 0, "max": 100, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 70 }, + { "color": "red", "value": 85 } + ]} + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 56 } + }, + + { + "id": 41, "type": "table", "title": "Top 20 PVC usage (%)", + "description": "Metric 7 — volume level. Persistent volumes that fill up cause write errors inside applications. 
Alert at ~80% so there is time to expand or free space.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "topk(20,\n 100 * max by(namespace, persistentvolumeclaim)(kubelet_volume_stats_used_bytes{namespace=~\"$namespace\"})\n /\n max by(namespace, persistentvolumeclaim)(kubelet_volume_stats_capacity_bytes{namespace=~\"$namespace\"})\n)", + "refId": "A", "legendFormat": "", "format": "table", "instant": true + }], + "fieldConfig": { + "defaults": { + "unit": "percent", "min": 0, "max": 100, + "custom": { "align": "auto", "cellOptions": { "type": "color-background" } }, + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 70 }, + { "color": "red", "value": 85 } + ]} + } + }, + "options": { "showHeader": true }, + "transformations": [ + { "id": "organize", "options": { "excludeByName": { "Time": true, "__name__": true, "instance": true, "job": true, "endpoint": true, "service": true, "pod": true, "container": true, "prometheus": true }, "renameByName": { "Value": "usage %" } } } + ], + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 56 } + }, + + { + "id": 500, "type": "row", "title": "Control plane — etcd (metrics 11–12)", + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 64 } + }, + + { + "id": 50, "type": "stat", "title": "etcd has leader", + "description": "Metric 11 — etcd_server_has_leader. Minimum across members. 
0 means at least one member does not see a leader — the cluster may be partitioned or mid-election.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "min(etcd_server_has_leader)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ]}, + "mappings": [{ + "type": "value", + "options": { + "0": { "text": "NO LEADER", "color": "red" }, + "1": { "text": "LEADER OK", "color": "green" } + } + }], + "unit": "short", "noValue": "?" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 6, "x": 0, "y": 65 } + }, + + { + "id": 51, "type": "stat", "title": "Leader changes (last 1h)", + "description": "Metric 12 — etcd_server_leader_changes_seen_total increase over 1h. Frequent elections usually mean network flapping or resource exhaustion on a member.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "sum(increase(etcd_server_leader_changes_seen_total[1h]))", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 1 }, + { "color": "red", "value": 3 } + ]}, + "unit": "short", "noValue": "0", "decimals": 0 + } + }, + "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 6, "x": 6, "y": 65 } + }, + + { + "id": 52, "type": "timeseries", "title": "Leader changes rate per etcd member", + "description": "Per-member rate of leader transitions. 
A steady drumbeat on a single member points to that node specifically (its disk, its network).", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "rate(etcd_server_leader_changes_seen_total[5m])", + "refId": "A", "legendFormat": "{{instance}}" + }], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 65 } + }, + + { + "id": 53, "type": "timeseries", "title": "etcd has-leader per member", + "description": "Per-member value of etcd_server_has_leader. Any dip to 0 is the start of a leader election; frequent dips warrant investigation.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "etcd_server_has_leader", "refId": "A", "legendFormat": "{{instance}}" }], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, "max": 1, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false, "drawStyle": "line", "lineInterpolation": "stepAfter" } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["min", "lastNotNull"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 69 } + }, + + { + "id": 600, "type": "row", "title": "Control plane — API Server (metric 13)", + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 77 } + }, + + { + "id": 60, "type": "timeseries", "title": "API server request rate by verb", + "description": "Metric 13 — request count. Non-streaming calls per second by verb. 
Read-heavy (GET/LIST) load is usually controllers; write-heavy (POST/PUT/PATCH/DELETE) is user activity or autoscaling.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum by(verb)(rate(apiserver_request_total{verb!~\"WATCH|CONNECT\"}[5m]))", + "refId": "A", "legendFormat": "{{verb}}" + }], + "fieldConfig": { + "defaults": { + "unit": "reqps", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 78 } + }, + + { + "id": 61, "type": "timeseries", "title": "API server latency p50 / p95 / p99", + "description": "Metric 13 — request duration. Rising p99 with flat p50 is classic tail-latency degradation — look at a single slow resource or an overloaded admission webhook.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "histogram_quantile(0.50, sum(rate(apiserver_request_duration_seconds_bucket{verb!~\"WATCH|CONNECT\"}[5m])) by (le))", "refId": "A", "legendFormat": "p50" }, + { "expr": "histogram_quantile(0.95, sum(rate(apiserver_request_duration_seconds_bucket{verb!~\"WATCH|CONNECT\"}[5m])) by (le))", "refId": "B", "legendFormat": "p95" }, + { "expr": "histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{verb!~\"WATCH|CONNECT\"}[5m])) by (le))", "refId": "C", "legendFormat": "p99" } + ], + "fieldConfig": { + "defaults": { + "unit": "s", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 0, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + 
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 78 } + }, + + { + "id": 62, "type": "timeseries", "title": "API server error rate (HTTP 4xx / 5xx)", + "description": "Error rate by code. 429 = inflight-limit/throttling; 422 = admission-webhook rejections / invalid objects; 500/503 = apiserver faults or etcd unavailability.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum by(code)(rate(apiserver_request_total{code=~\"[45]..\"}[5m]))", + "refId": "A", "legendFormat": "HTTP {{code}}" + }], + "fieldConfig": { + "defaults": { + "unit": "reqps", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 86 } + }, + + { + "id": 63, "type": "timeseries", "title": "API server p99 latency by resource", + "description": "Latency broken down by Kubernetes resource — helps identify which object kind (pods, secrets, events…) is the slow one.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "histogram_quantile(0.99,\n sum by(resource, le)(rate(apiserver_request_duration_seconds_bucket{verb!~\"WATCH|CONNECT\"}[5m]))\n)", + "refId": "A", "legendFormat": "{{resource}}" + }], + "fieldConfig": { + "defaults": { + "unit": "s", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 1, "fillOpacity": 0, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 86 } + }, + + { + "id": 700, "type": "row", "title": "Control plane — Controller Manager & Scheduler (metrics 14–15)", + 
"collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 94 } + }, + + { + "id": 70, "type": "timeseries", "title": "Workqueue wait (queue_duration) — p99 by queue", + "description": "Metric 14 — how long items sit in each controller's workqueue before being picked up. A rising line indicates the controller can no longer keep up with cluster changes.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "histogram_quantile(0.99,\n sum by(name, le)(rate(workqueue_queue_duration_seconds_bucket[5m]))\n)", + "refId": "A", "legendFormat": "{{name}}" + }], + "fieldConfig": { + "defaults": { + "unit": "s", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 1, "fillOpacity": 0, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 95 } + }, + + { + "id": 71, "type": "timeseries", "title": "Workqueue work (work_duration) — p99 by queue", + "description": "Metric 14 — how long each reconcile actually takes. 
A rising line points at slow API calls or a slow reconcile loop.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "histogram_quantile(0.99,\n sum by(name, le)(rate(workqueue_work_duration_seconds_bucket[5m]))\n)", + "refId": "A", "legendFormat": "{{name}}" + }], + "fieldConfig": { + "defaults": { + "unit": "s", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 1, "fillOpacity": 0, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 95 } + }, + + { + "id": 72, "type": "timeseries", "title": "Scheduler — attempts per second by result", + "description": "Metric 15 — scheduler_schedule_attempts_total. 'unschedulable' = no node meets the pod's requirements (resources, taints, selectors); 'error' = a bug or stale cache in the scheduler.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum by(result)(rate(scheduler_schedule_attempts_total[5m]))", + "refId": "A", "legendFormat": "{{result}}" + }], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false } + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "scheduled" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "unschedulable" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "error" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] } + ] + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": 
"list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 103 } + }, + + { + "id": 73, "type": "timeseries", "title": "Scheduler — scheduling attempt latency (p50 / p95 / p99)", + "description": "Metric 15 — scheduler attempt duration. The PDF's scheduler_e2e_scheduling_duration_seconds was removed in Kubernetes 1.23; the modern equivalent is scheduler_scheduling_attempt_duration_seconds (time from picking a pod off the queue to binding it). A rising p99 often correlates with an overloaded apiserver or large, highly-constrained pod fleets.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "histogram_quantile(0.50, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket[5m])) by (le))", "refId": "A", "legendFormat": "p50" }, + { "expr": "histogram_quantile(0.95, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket[5m])) by (le))", "refId": "B", "legendFormat": "p95" }, + { "expr": "histogram_quantile(0.99, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket[5m])) by (le))", "refId": "C", "legendFormat": "p99" } + ], + "fieldConfig": { + "defaults": { + "unit": "s", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 0, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 103 } + } + ] +} diff --git a/harmony/src/modules/monitoring/cluster_dashboards/score.rs b/harmony/src/modules/monitoring/cluster_dashboards/score.rs index ed52ed12..7364c3c4 100644 --- a/harmony/src/modules/monitoring/cluster_dashboards/score.rs +++ b/harmony/src/modules/monitoring/cluster_dashboards/score.rs @@ -101,7 +101,7 @@ impl Interpret for ClusterDashboardsInterpret { Ok(Outcome::success(format!( "Cluster dashboards 
resources in namespace '{}' with {} dashboards successfully created", - self.namespace, 9 + self.namespace, 10 ))) } @@ -508,6 +508,10 @@ impl ClusterDashboardsInterpret { "okd-alerts-events", include_str!("dashboards/alerts-events-problems.json"), ), + ( + "datadog-15-k8s-metrics", + include_str!("dashboards/datadog-15-k8s-metrics.json"), + ), ]; for (dashboard_name, json_content) in dashboards { -- 2.39.5 From c2718e843b0e928aa5b142b9e278e1a36808a0ef Mon Sep 17 00:00:00 2001 From: Sylvain Tremblay Date: Mon, 20 Apr 2026 15:47:12 -0400 Subject: [PATCH 2/3] feat: improve ceph dashboard - list alerts and WHY its NOT green --- .../cluster_dashboards/dashboards/ceph.json | 262 ++++++++++++++++-- 1 file changed, 240 insertions(+), 22 deletions(-) diff --git a/harmony/src/modules/monitoring/cluster_dashboards/dashboards/ceph.json b/harmony/src/modules/monitoring/cluster_dashboards/dashboards/ceph.json index d555511d..6f5e0cc7 100644 --- a/harmony/src/modules/monitoring/cluster_dashboards/dashboards/ceph.json +++ b/harmony/src/modules/monitoring/cluster_dashboards/dashboards/ceph.json @@ -172,10 +172,228 @@ }, { - "type": "row", "id": 8, "title": "Capacity", "collapsed": false, + "type": "row", "id": 100, "title": "Active Issues", "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 6 } }, + { + "type": "stat", "id": 101, "title": "Critical Ceph alerts firing", + "description": "Count of Ceph alert rules currently in firing state with severity=critical. Drives the red tile on the Health stat to concrete action. 
0 when the cluster is healthy.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "count(ALERTS{alertstate=\"firing\",alertname=~\"Ceph.*\",severity=\"critical\"}) or vector(0)", + "refId": "A", "legendFormat": "" + }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 1 } + ]}, + "unit": "short", "noValue": "0" + } + }, + "options": { + "colorMode": "background", "graphMode": "none", "justifyMode": "center", "textMode": "auto", + "reduceOptions": { "calcs": ["lastNotNull"] } + }, + "gridPos": { "h": 4, "w": 12, "x": 0, "y": 7 } + }, + + { + "type": "stat", "id": 102, "title": "Warning Ceph alerts firing", + "description": "Count of Ceph alert rules currently in firing state with severity=warning. Matches what drives the yellow HEALTH_WARN tile on this dashboard.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "count(ALERTS{alertstate=\"firing\",alertname=~\"Ceph.*\",severity=\"warning\"}) or vector(0)", + "refId": "A", "legendFormat": "" + }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 1 } + ]}, + "unit": "short", "noValue": "0" + } + }, + "options": { + "colorMode": "background", "graphMode": "none", "justifyMode": "center", "textMode": "auto", + "reduceOptions": { "calcs": ["lastNotNull"] } + }, + "gridPos": { "h": 4, "w": 12, "x": 12, "y": 7 } + }, + + { + "type": "row", "id": 104, "title": "Issue details — click to expand", "collapsed": true, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 11 }, + "panels": [ + + { + "type": "table", "id": 105, "title": "Active Ceph health checks (ceph health detail)", + "description": "Exactly what `ceph health detail` would show. 
One row per active health check; the Check column is the Ceph check code (OSD_DOWN, POOL_NEARFULL, PG_DEGRADED, MON_CLOCK_SKEW, etc.). Severity is the Ceph-native HEALTH_WARN / HEALTH_ERR label emitted by the mgr prometheus module. An empty table means Ceph reports no active health checks — the Health tile above should be HEALTH_OK. This is the primary answer to 'why isn't it green?'.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "ceph_health_detail == 1", + "refId": "A", "instant": true, "legendFormat": "" + }], + "transformations": [ + { "id": "labelsToFields", "options": { "mode": "columns" } }, + { + "id": "organize", + "options": { + "excludeByName": { + "__name__": true, + "Value": true, + "ceph_health_detail":true, + "Time": true, + "prometheus": true, + "container": true, + "endpoint": true, + "job": true, + "service": true, + "instance": true, + "pod": true, + "namespace": true + }, + "renameByName": { + "name": "Check", + "severity": "Severity" + }, + "indexByName": { + "severity": 0, + "name": 1 + } + } + } + ], + "fieldConfig": { + "defaults": { + "custom": { "align": "left" }, + "noValue": "— HEALTH_OK, no active checks —" + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Severity" }, + "properties": [ + { "id": "custom.displayMode", "value": "color-background" }, + { "id": "custom.width", "value": 150 }, + { + "id": "mappings", + "value": [{ + "type": "value", + "options": { + "HEALTH_ERR": { "text": "HEALTH_ERR", "color": "dark-red", "index": 0 }, + "HEALTH_WARN": { "text": "HEALTH_WARN", "color": "dark-yellow", "index": 1 } + } + }] + } + ] + }, + { "matcher": { "id": "byName", "options": "Check" }, "properties": [{ "id": "custom.width", "value": 320 }] } + ] + }, + "options": { + "sortBy": [{ "desc": false, "displayName": "Severity" }], + "footer": { "show": false } + }, + "gridPos": { "h": 6, "w": 12, "x": 0, "y": 12 } + }, + + { + "type": "table", "id": 103, "title": "Firing 
Ceph alerts (Alertmanager view)", + "description": "Instant-query view of every Ceph alert currently firing — the same set that pages oncall through Alertmanager. Usually matches the health-checks table above, plus derived alerts that have no direct ceph_health_detail counterpart (CephDaysUntilFull, CephNodeRootDiskUsage). The ALERTS metric carries labels only, not annotations: alert name plus daemon/pool/instance labels should be enough to identify the problem; run `oc -n openshift-monitoring get prometheusrule ceph-alerts -o yaml` or check Alertmanager for the full summary/description.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "ALERTS{alertstate=\"firing\",alertname=~\"Ceph.*\"}", + "refId": "A", "instant": true, "legendFormat": "" + }], + "transformations": [ + { "id": "labelsToFields", "options": { "mode": "columns" } }, + { + "id": "organize", + "options": { + "excludeByName": { + "alertstate": true, + "__name__": true, + "Value": true, + "ALERTS": true, + "Time": true, + "prometheus": true, + "container": true, + "endpoint": true, + "job": true, + "service": true + }, + "renameByName": { + "alertname": "Alert Name", + "severity": "Severity", + "ceph_daemon": "Ceph Daemon", + "pool_id": "Pool", + "instance": "Node / Instance", + "mountpoint": "Mountpoint", + "namespace": "Namespace" + }, + "indexByName": { + "severity": 0, + "alertname": 1, + "ceph_daemon": 2, + "pool_id": 3, + "instance": 4, + "mountpoint": 5, + "namespace": 6 + } + } + } + ], + "fieldConfig": { + "defaults": { + "custom": { "align": "left" }, + "noValue": "— no active Ceph issues —" + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Severity" }, + "properties": [ + { "id": "custom.displayMode", "value": "color-background" }, + { "id": "custom.width", "value": 110 }, + { + "id": "mappings", + "value": [{ + "type": "value", + "options": { + "critical": { "text": "CRITICAL", "color": "dark-red", "index": 0 }, + 
"warning": { "text": "WARNING", "color": "dark-yellow", "index": 1 }, + "info": { "text": "INFO", "color": "dark-blue", "index": 2 } + } + }] + } + ] + }, + { "matcher": { "id": "byName", "options": "Alert Name" }, "properties": [{ "id": "custom.width", "value": 280 }] }, + { "matcher": { "id": "byName", "options": "Ceph Daemon" }, "properties": [{ "id": "custom.width", "value": 180 }] }, + { "matcher": { "id": "byName", "options": "Pool" }, "properties": [{ "id": "custom.width", "value": 120 }] }, + { "matcher": { "id": "byName", "options": "Node / Instance" }, "properties": [{ "id": "custom.width", "value": 220 }] }, + { "matcher": { "id": "byName", "options": "Mountpoint" }, "properties": [{ "id": "custom.width", "value": 180 }] } + ] + }, + "options": { + "sortBy": [{ "desc": false, "displayName": "Severity" }], + "footer": { "show": false } + }, + "gridPos": { "h": 6, "w": 12, "x": 12, "y": 12 } + } + + ] + }, + + { + "type": "row", "id": 8, "title": "Capacity", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 12 } + }, + { "type": "gauge", "id": 9, "title": "Cluster Used (%)", "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, @@ -198,7 +416,7 @@ "reduceOptions": { "calcs": ["lastNotNull"] }, "showThresholdLabels": true, "showThresholdMarkers": true }, - "gridPos": { "h": 8, "w": 5, "x": 0, "y": 7 } + "gridPos": { "h": 8, "w": 5, "x": 0, "y": 13 } }, { @@ -220,7 +438,7 @@ "reduceOptions": { "calcs": ["lastNotNull"] }, "colorMode": "value", "graphMode": "none", "textMode": "auto", "orientation": "vertical" }, - "gridPos": { "h": 8, "w": 4, "x": 5, "y": 7 } + "gridPos": { "h": 8, "w": 4, "x": 5, "y": 13 } }, { @@ -237,7 +455,7 @@ "custom": { "lineWidth": 2, "fillOpacity": 8 } } }, - "gridPos": { "h": 8, "w": 11, "x": 9, "y": 7 } + "gridPos": { "h": 8, "w": 11, "x": 9, "y": 13 } }, { @@ -263,7 +481,7 @@ "reduceOptions": { "calcs": ["lastNotNull"] }, "colorMode": "background", "graphMode": "none", "textMode": "auto" }, - 
"gridPos": { "h": 8, "w": 4, "x": 20, "y": 7 } + "gridPos": { "h": 8, "w": 4, "x": 20, "y": 13 } }, { @@ -295,7 +513,7 @@ "sortBy": "Value", "sortOrder": "desc" }, - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 15 } + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 21 } }, { @@ -323,12 +541,12 @@ "displayMode": "gradient", "showUnfilled": true }, - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 15 } + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 21 } }, { "type": "row", "id": 15, "title": "Performance", "collapsed": false, - "gridPos": { "h": 1, "w": 24, "x": 0, "y": 23 } + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 29 } }, { @@ -345,7 +563,7 @@ "custom": { "lineWidth": 2, "fillOpacity": 8 } } }, - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 } + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 30 } }, { @@ -362,7 +580,7 @@ "custom": { "lineWidth": 2, "fillOpacity": 8 } } }, - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 } + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 30 } }, { @@ -385,7 +603,7 @@ "custom": { "lineWidth": 2, "fillOpacity": 8 } } }, - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 32 } + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 38 } }, { @@ -405,12 +623,12 @@ { "matcher": { "id": "byName", "options": "Recovery ops/s" }, "properties": [{ "id": "unit", "value": "ops" }] } ] }, - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 32 } + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 38 } }, { "type": "row", "id": 20, "title": "Placement Group Health", "collapsed": false, - "gridPos": { "h": 1, "w": 24, "x": 0, "y": 40 } + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 46 } }, { @@ -446,7 +664,7 @@ "sortDesc": true } }, - "gridPos": { "h": 8, "w": 16, "x": 0, "y": 41 } + "gridPos": { "h": 8, "w": 16, "x": 0, "y": 47 } }, { @@ -467,7 +685,7 @@ "reduceOptions": { "calcs": ["lastNotNull"] }, "colorMode": "background", "graphMode": "area", "textMode": "auto" }, - "gridPos": { "h": 4, "w": 8, "x": 16, "y": 41 } + "gridPos": { "h": 4, "w": 8, "x": 16, "y": 47 } }, { @@ -490,12 +708,12 @@ 
"reduceOptions": { "calcs": ["lastNotNull"] }, "colorMode": "background", "graphMode": "none", "textMode": "auto", "orientation": "horizontal" }, - "gridPos": { "h": 4, "w": 8, "x": 16, "y": 45 } + "gridPos": { "h": 4, "w": 8, "x": 16, "y": 51 } }, { "type": "row", "id": 24, "title": "OSD Detail", "collapsed": false, - "gridPos": { "h": 1, "w": 24, "x": 0, "y": 49 } + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 55 } }, { @@ -561,7 +779,7 @@ } ] }, - "gridPos": { "h": 10, "w": 16, "x": 0, "y": 50 } + "gridPos": { "h": 10, "w": 16, "x": 0, "y": 56 } }, { @@ -578,12 +796,12 @@ "custom": { "lineWidth": 1, "fillOpacity": 0 } } }, - "gridPos": { "h": 10, "w": 8, "x": 16, "y": 50 } + "gridPos": { "h": 10, "w": 8, "x": 16, "y": 56 } }, { "type": "row", "id": 27, "title": "Pool Detail", "collapsed": false, - "gridPos": { "h": 1, "w": 24, "x": 0, "y": 60 } + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 66 } }, { @@ -634,7 +852,7 @@ } ] }, - "gridPos": { "h": 10, "w": 14, "x": 0, "y": 61 } + "gridPos": { "h": 10, "w": 14, "x": 0, "y": 67 } }, { @@ -667,7 +885,7 @@ "sortDesc": true } }, - "gridPos": { "h": 10, "w": 10, "x": 14, "y": 61 } + "gridPos": { "h": 10, "w": 10, "x": 14, "y": 67 } } ] -- 2.39.5 From 349c2a13583f19cf49c6e6d9b69fbed8c31d538a Mon Sep 17 00:00:00 2001 From: Sylvain Tremblay Date: Mon, 20 Apr 2026 15:58:52 -0400 Subject: [PATCH 3/3] feat: improve ceph dashboard --- .../monitoring/cluster_dashboards/dashboards/ceph.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/harmony/src/modules/monitoring/cluster_dashboards/dashboards/ceph.json b/harmony/src/modules/monitoring/cluster_dashboards/dashboards/ceph.json index 6f5e0cc7..f54db405 100644 --- a/harmony/src/modules/monitoring/cluster_dashboards/dashboards/ceph.json +++ b/harmony/src/modules/monitoring/cluster_dashboards/dashboards/ceph.json @@ -278,7 +278,7 @@ { "matcher": { "id": "byName", "options": "Severity" }, "properties": [ - { "id": "custom.displayMode", "value": "color-background" 
}, + { "id": "custom.cellOptions", "value": { "type": "color-background", "mode": "basic" } }, { "id": "custom.width", "value": 150 }, { "id": "mappings", @@ -357,7 +357,7 @@ { "matcher": { "id": "byName", "options": "Severity" }, "properties": [ - { "id": "custom.displayMode", "value": "color-background" }, + { "id": "custom.cellOptions", "value": { "type": "color-background", "mode": "basic" } }, { "id": "custom.width", "value": 110 }, { "id": "mappings", -- 2.39.5