feat(monitoring): Ceph alerts integrated with OKD's native alerting stack #265

Merged
stremblay merged 3 commits from feat/ceph-alerts into master 2026-04-20 18:14:45 +00:00
13 changed files with 1222 additions and 431 deletions

11
Cargo.lock generated
View File

@@ -2873,6 +2873,17 @@ dependencies = [
"url", "url",
] ]
[[package]]
name = "example-okd-ceph-alerts"
version = "0.1.0"
dependencies = [
"harmony",
"harmony_cli",
"harmony_types",
"log",
"tokio",
]
[[package]] [[package]]
name = "example-okd-cluster-alerts" name = "example-okd-cluster-alerts"
version = "0.1.0" version = "0.1.0"

View File

@@ -0,0 +1,14 @@
[package]
name = "example-okd-ceph-alerts"
edition = "2024"
version.workspace = true
readme.workspace = true
license.workspace = true
publish = false
[dependencies]
harmony = { path = "../../harmony" }
harmony_cli = { path = "../../harmony_cli" }
harmony_types = { path = "../../harmony_types" }
tokio = { workspace = true }
log = { workspace = true }

View File

@@ -0,0 +1,4 @@
# Exports three HARMONY_* settings and RUST_LOG; the values are named after
# the okd_ceph_alerts example.
# NOTE(review): presumably meant to be sourced before running the example — confirm.
export HARMONY_SECRET_NAMESPACE=okd_ceph_alerts_example
# NOTE(review): "file" presumably selects a file-backed secret store — confirm.
export HARMONY_SECRET_STORE=file
# SQLite URL pointing at a relative path, i.e. relative to the working directory.
export HARMONY_DATABASE_URL=sqlite://harmony_okd_ceph_alerts_example.sqlite
# NOTE(review): presumably enables debug-level logging for the `harmony` target — confirm.
export RUST_LOG=harmony=debug

View File

@@ -0,0 +1,28 @@
use harmony::{
inventory::Inventory,
modules::monitoring::{
ceph_alerts::ceph_alert_rule_groups, okd::cluster_alert_rules::OpenshiftPrometheusRuleScore,
},
topology::K8sAnywhereTopology,
};
/// Example entry point.
///
/// Initialises the CLI logger, builds an `OpenshiftPrometheusRuleScore` named
/// `ceph-alerts` for the `rook-ceph` namespace carrying the groups returned by
/// `ceph_alert_rule_groups`, and passes it to `harmony_cli::run` together with
/// an autoloaded inventory and a topology read from the environment.
///
/// # Panics
///
/// Panics when the value awaited from `harmony_cli::run` cannot be unwrapped.
#[tokio::main]
async fn main() {
    harmony_cli::cli_logger::init();

    let namespace = "rook-ceph".to_string();
    let name = "ceph-alerts".to_string();
    let score = OpenshiftPrometheusRuleScore {
        namespace,
        name,
        rule_groups: ceph_alert_rule_groups(),
        labels: None,
    };

    // Bound in the same order the arguments were originally evaluated.
    let inventory = Inventory::autoload();
    let topology = K8sAnywhereTopology::from_env();

    // The vector stays inline so it coerces to the score type `run` expects.
    let outcome = harmony_cli::run(inventory, topology, vec![Box::new(score)], None).await;
    outcome.unwrap();
}

View File

@@ -0,0 +1,167 @@
use std::collections::BTreeMap;
use crate::modules::monitoring::kube_prometheus::crd::crd_prometheus_rules::{Rule, RuleGroup};
/// Returns the Ceph alerting rules, organised into five [`RuleGroup`]s:
/// `ceph-cluster-health`, `ceph-osd`, `ceph-capacity`,
/// `ceph-placement-groups` and `ceph-nodes`.
///
/// Every rule is built with the `alert` helper, so each one carries a
/// `severity` label plus `summary` and `description` annotations.
pub fn ceph_alert_rule_groups() -> Vec<RuleGroup> {
    vec![
        RuleGroup {
            name: "ceph-cluster-health".to_string(),
            rules: vec![
                alert(
                    "CephHealthWarn",
                    "max(ceph_health_status) == 1",
                    Some("15m"),
                    "warning",
                    "Ceph cluster health is WARN",
                    "Ceph reports HEALTH_WARN for more than 15 minutes. Run `ceph -s` or check the Ceph dashboard to see active health checks.",
                ),
                alert(
                    "CephHealthErr",
                    "max(ceph_health_status) == 2",
                    Some("5m"),
                    "critical",
                    "Ceph cluster health is ERR",
                    "Ceph reports HEALTH_ERR for more than 5 minutes. Immediate investigation required.",
                ),
                alert(
                    "CephMonDown",
                    "count(max by (ceph_daemon) (ceph_mon_quorum_status == 0)) > 0",
                    Some("5m"),
                    "critical",
                    "Ceph monitor is out of quorum",
                    "One or more Ceph monitors are not in quorum. Quorum loss risks cluster availability.",
                ),
                // A comparison against an empty vector yields no result, so the
                // `or vector(0)` fallback is what lets this alert fire when no
                // `ceph_mgr_status` series exist at all.
                // NOTE(review): presumably that is the situation when no manager
                // is serving metrics — confirm against the scrape setup.
                alert(
                    "CephMgrAbsent",
                    "(sum(max by (ceph_daemon) (ceph_mgr_status)) or vector(0)) < 1",
                    Some("5m"),
                    "critical",
                    "No active Ceph manager",
                    "No Ceph manager daemon is currently active. Dashboards and orchestration will be unavailable.",
                ),
            ],
        },
        RuleGroup {
            name: "ceph-osd".to_string(),
            rules: vec![
                alert(
                    "CephOSDDown",
                    "count(max by (ceph_daemon) (ceph_osd_up == 0)) > 0",
                    Some("5m"),
                    "warning",
                    "One or more Ceph OSDs are down",
                    "At least one OSD daemon is reporting down for 5 minutes. Data redundancy may be reduced.",
                ),
                alert(
                    "CephOSDNearFull",
                    "max by (ceph_daemon) (100 * ceph_osd_stat_bytes_used / ceph_osd_stat_bytes) > 80",
                    Some("15m"),
                    "warning",
                    "Ceph OSD is near full",
                    "OSD {{ $labels.ceph_daemon }} is above 80% utilization. Rebalance or add capacity.",
                ),
                alert(
                    "CephOSDFull",
                    "max by (ceph_daemon) (100 * ceph_osd_stat_bytes_used / ceph_osd_stat_bytes) > 90",
                    Some("5m"),
                    "critical",
                    "Ceph OSD is critically full",
                    "OSD {{ $labels.ceph_daemon }} is above 90% utilization. Writes may block. Act immediately.",
                ),
            ],
        },
        RuleGroup {
            name: "ceph-capacity".to_string(),
            rules: vec![
                alert(
                    "CephClusterNearFull",
                    "100 * max(ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes) / max(ceph_cluster_total_bytes) > 75",
                    Some("15m"),
                    "warning",
                    "Ceph cluster is near full",
                    "Cluster raw utilization is above 75% for 15 minutes.",
                ),
                alert(
                    "CephClusterCriticallyFull",
                    "100 * max(ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes) / max(ceph_cluster_total_bytes) > 85",
                    Some("5m"),
                    "critical",
                    "Ceph cluster is critically full",
                    "Cluster raw utilization is above 85%. Imminent risk of write unavailability.",
                ),
                alert(
                    "CephPoolNearFull",
                    "100 * max by (pool_id) (ceph_pool_bytes_used) / (max by (pool_id) (ceph_pool_bytes_used) + max by (pool_id) (ceph_pool_max_avail)) > 80",
                    Some("15m"),
                    "warning",
                    "Ceph pool is near full",
                    "Pool (pool_id {{ $labels.pool_id }}) is above 80% usage.",
                ),
                // `clamp_min(..., 1)` keeps the divisor at 1 or above, so a flat
                // or shrinking usage trend cannot cause a division by zero or a
                // negative estimate.
                alert(
                    "CephDaysUntilFull",
                    "(max(ceph_cluster_total_bytes) - max(ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes)) / clamp_min(deriv(max(ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes)[7d:1h]), 1) / 86400 < 30",
                    Some("1h"),
                    "warning",
                    "Ceph cluster predicted to fill within 30 days",
                    "Based on the 7-day usage trend, the cluster will reach capacity in less than 30 days.",
                ),
            ],
        },
        RuleGroup {
            name: "ceph-placement-groups".to_string(),
            rules: vec![
                alert(
                    "CephPGsNotActiveClean",
                    "max(ceph_pg_total) - max(ceph_pg_clean) > 0",
                    Some("15m"),
                    "warning",
                    "Some placement groups are not active+clean",
                    "{{ $value }} PGs have been in a non-clean state for more than 15 minutes.",
                ),
                alert(
                    "CephSlowOps",
                    "max(ceph_healthcheck_slow_ops) > 0",
                    Some("5m"),
                    "warning",
                    "Ceph reports slow ops",
                    "Ceph has {{ $value }} slow operations outstanding for more than 5 minutes.",
                ),
            ],
        },
        RuleGroup {
            name: "ceph-nodes".to_string(),
            rules: vec![alert(
                "CephNodeRootDiskUsage",
                "100 * (1 - (max by (instance, mountpoint) (node_filesystem_avail_bytes{mountpoint=~\"/|/var\",fstype!~\"tmpfs|overlay|squashfs|ramfs\"}) / max by (instance, mountpoint) (node_filesystem_size_bytes{mountpoint=~\"/|/var\",fstype!~\"tmpfs|overlay|squashfs|ramfs\"}))) > 85",
                Some("10m"),
                "warning",
                "Ceph node root/var disk above 85%",
                "Node {{ $labels.instance }} mountpoint {{ $labels.mountpoint }} is above 85% disk usage. OSDs on this node may be at risk.",
            )],
        },
    ]
}
/// Assembles a single alerting [`Rule`].
///
/// * `name` – stored as the rule's `alert` field.
/// * `expr` – stored as the rule's `expr` field.
/// * `for_` – optional value for the rule's `for_` field; `None` leaves it unset.
/// * `severity` – value of the `severity` label (the only label set).
/// * `summary` / `description` – values of the two annotations of the same names.
fn alert(
    name: &str,
    expr: &str,
    for_: Option<&str>,
    severity: &str,
    summary: &str,
    description: &str,
) -> Rule {
    let label_set = BTreeMap::from([("severity".to_string(), severity.to_string())]);
    let annotation_set = BTreeMap::from([
        ("summary".to_string(), summary.to_string()),
        ("description".to_string(), description.to_string()),
    ]);

    Rule {
        alert: Some(name.to_owned()),
        expr: Some(expr.to_owned()),
        for_: for_.map(str::to_owned),
        labels: Some(label_set),
        annotations: Some(annotation_set),
    }
}

View File

@@ -0,0 +1,674 @@
{
"title": "Ceph Cluster",
"uid": "ceph-cluster",
"schemaVersion": 36,
"version": 1,
"refresh": "30s",
"time": { "from": "now-1h", "to": "now" },
"templating": {
"list": [
{
"name": "pool",
"type": "query",
"label": "Pool",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"query": { "query": "label_values(ceph_pool_metadata, name)", "refId": "Pool" },
"definition": "label_values(ceph_pool_metadata, name)",
"multi": true,
"includeAll": true,
"current": { "text": "All", "value": "$__all", "selected": false },
"refresh": 1,
"sort": 1
},
{
"name": "osd",
"type": "query",
"label": "OSD",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"query": { "query": "label_values(ceph_osd_metadata, ceph_daemon)", "refId": "OSD" },
"definition": "label_values(ceph_osd_metadata, ceph_daemon)",
"multi": true,
"includeAll": true,
"current": { "text": "All", "value": "$__all", "selected": false },
"refresh": 1,
"sort": 1
}
]
},
"panels": [
{
"type": "row", "id": 1, "title": "Cluster Status", "collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }
},
{
"type": "stat", "id": 2, "title": "Health",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [{ "expr": "max(ceph_health_status)", "refId": "A" }],
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"thresholds": { "mode": "absolute", "steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 1 },
{ "color": "red", "value": 2 }
]},
"mappings": [{
"type": "value",
"options": {
"0": { "text": "HEALTH_OK", "index": 0 },
"1": { "text": "HEALTH_WARN", "index": 1 },
"2": { "text": "HEALTH_ERR", "index": 2 }
}
}]
}
},
"options": {
"reduceOptions": { "calcs": ["lastNotNull"] },
"colorMode": "background", "graphMode": "none", "textMode": "value"
},
"gridPos": { "h": 5, "w": 4, "x": 0, "y": 1 }
},
{
"type": "stat", "id": 3, "title": "Mon Quorum",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [
{ "expr": "count(max by (ceph_daemon) (ceph_mon_quorum_status == 1)) or vector(0)", "refId": "A", "legendFormat": "In Quorum" },
{ "expr": "count(max by (ceph_daemon) (ceph_mon_metadata)) or vector(0)", "refId": "B", "legendFormat": "Total" }
],
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] }
}
},
"options": {
"reduceOptions": { "calcs": ["lastNotNull"] },
"colorMode": "background", "graphMode": "none", "textMode": "auto", "orientation": "horizontal"
},
"gridPos": { "h": 5, "w": 4, "x": 4, "y": 1 }
},
{
"type": "stat", "id": 4, "title": "MGR Active",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [{ "expr": "sum(max by (ceph_daemon) (ceph_mgr_status)) or vector(0)", "refId": "A" }],
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"thresholds": { "mode": "absolute", "steps": [
{ "color": "red", "value": null },
{ "color": "green", "value": 1 }
]}
}
},
"options": {
"reduceOptions": { "calcs": ["lastNotNull"] },
"colorMode": "background", "graphMode": "none", "textMode": "auto"
},
"gridPos": { "h": 5, "w": 3, "x": 8, "y": 1 }
},
{
"type": "stat", "id": 5, "title": "OSDs Up / In / Total",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [
{ "expr": "sum(max by (ceph_daemon) (ceph_osd_up)) or vector(0)", "refId": "A", "legendFormat": "Up" },
{ "expr": "sum(max by (ceph_daemon) (ceph_osd_in)) or vector(0)", "refId": "B", "legendFormat": "In" },
{ "expr": "count(max by (ceph_daemon) (ceph_osd_metadata)) or vector(0)", "refId": "C", "legendFormat": "Total" }
],
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] }
}
},
"options": {
"reduceOptions": { "calcs": ["lastNotNull"] },
"colorMode": "background", "graphMode": "none", "textMode": "auto", "orientation": "horizontal"
},
"gridPos": { "h": 5, "w": 5, "x": 11, "y": 1 }
},
{
"type": "stat", "id": 6, "title": "Pools",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [{ "expr": "count(max by (pool_id) (ceph_pool_metadata)) or vector(0)", "refId": "A" }],
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] }
}
},
"options": {
"reduceOptions": { "calcs": ["lastNotNull"] },
"colorMode": "background", "graphMode": "none", "textMode": "auto"
},
"gridPos": { "h": 5, "w": 3, "x": 16, "y": 1 }
},
{
"type": "stat", "id": 7, "title": "PGs Active+Clean / Total",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [
{ "expr": "max(ceph_pg_clean) or vector(0)", "refId": "A", "legendFormat": "Active+Clean" },
{ "expr": "max(ceph_pg_total) or vector(0)", "refId": "B", "legendFormat": "Total" }
],
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] }
}
},
"options": {
"reduceOptions": { "calcs": ["lastNotNull"] },
"colorMode": "background", "graphMode": "none", "textMode": "auto", "orientation": "horizontal"
},
"gridPos": { "h": 5, "w": 5, "x": 19, "y": 1 }
},
{
"type": "row", "id": 8, "title": "Capacity", "collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 6 }
},
{
"type": "gauge", "id": 9, "title": "Cluster Used (%)",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [{
"expr": "100 * max(ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes) / max(ceph_cluster_total_bytes)",
"refId": "A"
}],
"fieldConfig": {
"defaults": {
"unit": "percent", "min": 0, "max": 100,
"color": { "mode": "thresholds" },
"thresholds": { "mode": "absolute", "steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 70 },
{ "color": "red", "value": 85 }
]}
}
},
"options": {
"reduceOptions": { "calcs": ["lastNotNull"] },
"showThresholdLabels": true, "showThresholdMarkers": true
},
"gridPos": { "h": 8, "w": 5, "x": 0, "y": 7 }
},
{
"type": "stat", "id": 10, "title": "Total / Used / Available",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [
{ "expr": "max(ceph_cluster_total_bytes)", "refId": "A", "legendFormat": "Total" },
{ "expr": "max(ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes)", "refId": "B", "legendFormat": "Used" },
{ "expr": "max(ceph_cluster_total_bytes) - max(ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes)", "refId": "C", "legendFormat": "Available" }
],
"fieldConfig": {
"defaults": {
"unit": "bytes",
"color": { "mode": "thresholds" },
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] }
}
},
"options": {
"reduceOptions": { "calcs": ["lastNotNull"] },
"colorMode": "value", "graphMode": "none", "textMode": "auto", "orientation": "vertical"
},
"gridPos": { "h": 8, "w": 4, "x": 5, "y": 7 }
},
{
"type": "timeseries", "id": 11, "title": "Capacity Over Time",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [
{ "expr": "max(ceph_cluster_total_bytes)", "refId": "A", "legendFormat": "Total" },
{ "expr": "max(ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes)", "refId": "B", "legendFormat": "Used" }
],
"fieldConfig": {
"defaults": {
"unit": "bytes",
"color": { "mode": "palette-classic" },
"custom": { "lineWidth": 2, "fillOpacity": 8 }
}
},
"gridPos": { "h": 8, "w": 11, "x": 9, "y": 7 }
},
{
"type": "stat", "id": 12, "title": "Days Until Full (predicted, 7d trend)",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [{
"expr": "(max(ceph_cluster_total_bytes) - max(ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes))\n/\nclamp_min(deriv(max(ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes)[7d:1h]), 1)\n/ 86400",
"refId": "A"
}],
"fieldConfig": {
"defaults": {
"unit": "d",
"decimals": 1,
"color": { "mode": "thresholds" },
"thresholds": { "mode": "absolute", "steps": [
{ "color": "red", "value": null },
{ "color": "yellow", "value": 14 },
{ "color": "green", "value": 60 }
]}
}
},
"options": {
"reduceOptions": { "calcs": ["lastNotNull"] },
"colorMode": "background", "graphMode": "none", "textMode": "auto"
},
"gridPos": { "h": 8, "w": 4, "x": 20, "y": 7 }
},
{
"type": "bargauge", "id": 13, "title": "Pool Used (%)",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [{
"expr": "(\n 100 * max by (pool_id) (ceph_pool_bytes_used)\n /\n (max by (pool_id) (ceph_pool_bytes_used) + max by (pool_id) (ceph_pool_max_avail))\n)\n* on(pool_id) group_left(name) max by (pool_id, name) (ceph_pool_metadata{name=~\"$pool\"})",
"refId": "A",
"legendFormat": "{{name}}",
"instant": true
}],
"fieldConfig": {
"defaults": {
"unit": "percent", "min": 0, "max": 100,
"color": { "mode": "thresholds" },
"thresholds": { "mode": "absolute", "steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 70 },
{ "color": "red", "value": 85 }
]}
}
},
"options": {
"orientation": "horizontal",
"reduceOptions": { "calcs": ["lastNotNull"] },
"displayMode": "gradient",
"showUnfilled": true,
"valueMode": "color",
"sortBy": "Value",
"sortOrder": "desc"
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 15 }
},
{
"type": "bargauge", "id": 14, "title": "OSD Utilization (%)",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [{
"expr": "100 * max by (ceph_daemon) (ceph_osd_stat_bytes_used{ceph_daemon=~\"$osd\"}) / max by (ceph_daemon) (ceph_osd_stat_bytes{ceph_daemon=~\"$osd\"})",
"refId": "A",
"legendFormat": "{{ceph_daemon}}"
}],
"fieldConfig": {
"defaults": {
"unit": "percent", "min": 0, "max": 100,
"color": { "mode": "thresholds" },
"thresholds": { "mode": "absolute", "steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 70 },
{ "color": "red", "value": 85 }
]}
}
},
"options": {
"orientation": "horizontal",
"reduceOptions": { "calcs": ["lastNotNull"] },
"displayMode": "gradient",
"showUnfilled": true
},
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 15 }
},
{
"type": "row", "id": 15, "title": "Performance", "collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 23 }
},
{
"type": "timeseries", "id": 16, "title": "Cluster IOPS (Read / Write)",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [
{ "expr": "sum(max by (pool_id) (rate(ceph_pool_rd[5m])))", "refId": "A", "legendFormat": "Read" },
{ "expr": "sum(max by (pool_id) (rate(ceph_pool_wr[5m])))", "refId": "B", "legendFormat": "Write" }
],
"fieldConfig": {
"defaults": {
"unit": "ops",
"color": { "mode": "palette-classic" },
"custom": { "lineWidth": 2, "fillOpacity": 8 }
}
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 }
},
{
"type": "timeseries", "id": 17, "title": "Cluster Throughput (Read / Write)",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [
{ "expr": "sum(max by (pool_id) (rate(ceph_pool_rd_bytes[5m])))", "refId": "A", "legendFormat": "Read" },
{ "expr": "sum(max by (pool_id) (rate(ceph_pool_wr_bytes[5m])))", "refId": "B", "legendFormat": "Write" }
],
"fieldConfig": {
"defaults": {
"unit": "Bps",
"color": { "mode": "palette-classic" },
"custom": { "lineWidth": 2, "fillOpacity": 8 }
}
},
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 }
},
{
"type": "timeseries", "id": 18, "title": "Client Op Latency (Avg)",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [
{
"expr": "sum(rate(ceph_osd_op_r_latency_sum[5m])) / clamp_min(sum(rate(ceph_osd_op_r_latency_count[5m])), 1)",
"refId": "A", "legendFormat": "Read"
},
{
"expr": "sum(rate(ceph_osd_op_w_latency_sum[5m])) / clamp_min(sum(rate(ceph_osd_op_w_latency_count[5m])), 1)",
"refId": "B", "legendFormat": "Write"
}
],
"fieldConfig": {
"defaults": {
"unit": "s",
"color": { "mode": "palette-classic" },
"custom": { "lineWidth": 2, "fillOpacity": 8 }
}
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 32 }
},
{
"type": "timeseries", "id": 19, "title": "Recovery Throughput",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [
{ "expr": "sum(rate(ceph_osd_recovery_bytes[5m])) or vector(0)", "refId": "A", "legendFormat": "Recovery B/s" },
{ "expr": "sum(rate(ceph_osd_recovery_ops[5m])) or vector(0)", "refId": "B", "legendFormat": "Recovery ops/s" }
],
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": { "lineWidth": 2, "fillOpacity": 8 }
},
"overrides": [
{ "matcher": { "id": "byName", "options": "Recovery B/s" }, "properties": [{ "id": "unit", "value": "Bps" }] },
{ "matcher": { "id": "byName", "options": "Recovery ops/s" }, "properties": [{ "id": "unit", "value": "ops" }] }
]
},
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 32 }
},
{
"type": "row", "id": 20, "title": "Placement Group Health", "collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 40 }
},
{
"type": "timeseries", "id": 21, "title": "PG States Over Time",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [
{ "expr": "max(ceph_pg_clean)", "refId": "A", "legendFormat": "clean" },
{ "expr": "max(ceph_pg_active)", "refId": "B", "legendFormat": "active" },
{ "expr": "max(ceph_pg_degraded)", "refId": "C", "legendFormat": "degraded" },
{ "expr": "max(ceph_pg_undersized)", "refId": "D", "legendFormat": "undersized" },
{ "expr": "max(ceph_pg_peering)", "refId": "E", "legendFormat": "peering" },
{ "expr": "max(ceph_pg_recovering)", "refId": "F", "legendFormat": "recovering" },
{ "expr": "max(ceph_pg_backfilling)", "refId": "G", "legendFormat": "backfilling" },
{ "expr": "max(ceph_pg_remapped)", "refId": "H", "legendFormat": "remapped" },
{ "expr": "max(ceph_pg_inconsistent)", "refId": "I", "legendFormat": "inconsistent" },
{ "expr": "max(ceph_pg_stale)", "refId": "J", "legendFormat": "stale" },
{ "expr": "max(ceph_pg_unknown)", "refId": "K", "legendFormat": "unknown" }
],
"fieldConfig": {
"defaults": {
"unit": "short",
"color": { "mode": "palette-classic" },
"custom": { "lineWidth": 2, "fillOpacity": 0 }
}
},
"options": {
"legend": {
"displayMode": "table",
"placement": "right",
"calcs": ["max", "lastNotNull"],
"showLegend": true,
"sortBy": "Max",
"sortDesc": true
}
},
"gridPos": { "h": 8, "w": 16, "x": 0, "y": 41 }
},
{
"type": "stat", "id": 22, "title": "Slow Ops",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [{ "expr": "max(ceph_healthcheck_slow_ops) or vector(0)", "refId": "A" }],
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"thresholds": { "mode": "absolute", "steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 1 },
{ "color": "red", "value": 10 }
]}
}
},
"options": {
"reduceOptions": { "calcs": ["lastNotNull"] },
"colorMode": "background", "graphMode": "area", "textMode": "auto"
},
"gridPos": { "h": 4, "w": 8, "x": 16, "y": 41 }
},
{
"type": "stat", "id": 23, "title": "Misplaced / Degraded Objects",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [
{ "expr": "max(ceph_num_objects_misplaced) or vector(0)", "refId": "A", "legendFormat": "Misplaced" },
{ "expr": "max(ceph_num_objects_degraded) or vector(0)", "refId": "B", "legendFormat": "Degraded" }
],
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"thresholds": { "mode": "absolute", "steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 1 }
]}
}
},
"options": {
"reduceOptions": { "calcs": ["lastNotNull"] },
"colorMode": "background", "graphMode": "none", "textMode": "auto", "orientation": "horizontal"
},
"gridPos": { "h": 4, "w": 8, "x": 16, "y": 45 }
},
{
"type": "row", "id": 24, "title": "OSD Detail", "collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 49 }
},
{
"type": "table", "id": 25, "title": "OSDs",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [
{ "expr": "max by (ceph_daemon) (ceph_osd_up{ceph_daemon=~\"$osd\"})", "refId": "A", "legendFormat": "Up", "format": "table", "instant": true },
{ "expr": "max by (ceph_daemon) (ceph_osd_in{ceph_daemon=~\"$osd\"})", "refId": "B", "legendFormat": "In", "format": "table", "instant": true },
{ "expr": "100 * max by (ceph_daemon) (ceph_osd_stat_bytes_used{ceph_daemon=~\"$osd\"}) / max by (ceph_daemon) (ceph_osd_stat_bytes{ceph_daemon=~\"$osd\"})", "refId": "C", "format": "table", "instant": true },
{ "expr": "max by (ceph_daemon) (ceph_osd_numpg{ceph_daemon=~\"$osd\"})", "refId": "D", "format": "table", "instant": true },
{ "expr": "max by (ceph_daemon) (ceph_osd_apply_latency_ms{ceph_daemon=~\"$osd\"})", "refId": "E", "format": "table", "instant": true },
{ "expr": "max by (ceph_daemon) (ceph_osd_commit_latency_ms{ceph_daemon=~\"$osd\"})", "refId": "F", "format": "table", "instant": true }
],
"transformations": [
{ "id": "merge" },
{
"id": "organize",
"options": {
"excludeByName": { "Time": true, "__name__": true, "job": true, "instance": true, "endpoint": true },
"renameByName": {
"ceph_daemon": "OSD",
"Value #A": "Up",
"Value #B": "In",
"Value #C": "Util %",
"Value #D": "PGs",
"Value #E": "Apply Latency",
"Value #F": "Commit Latency"
},
"indexByName": {
"OSD": 0, "Up": 1, "In": 2, "Util %": 3, "PGs": 4, "Apply Latency": 5, "Commit Latency": 6
}
}
}
],
"fieldConfig": {
"defaults": {},
"overrides": [
{
"matcher": { "id": "byName", "options": "Util %" },
"properties": [
{ "id": "unit", "value": "percent" },
{ "id": "decimals", "value": 1 },
{ "id": "custom.cellOptions", "value": { "type": "color-background", "mode": "gradient" } },
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 70 },
{ "color": "red", "value": 85 }
]}}
]
},
{ "matcher": { "id": "byName", "options": "Apply Latency" }, "properties": [{ "id": "unit", "value": "ms" }] },
{ "matcher": { "id": "byName", "options": "Commit Latency" }, "properties": [{ "id": "unit", "value": "ms" }] },
{
"matcher": { "id": "byRegexp", "options": "Up|In" },
"properties": [
{ "id": "mappings", "value": [{ "type": "value", "options": { "0": { "text": "✗", "index": 0 }, "1": { "text": "✓", "index": 1 }}}] },
{ "id": "custom.cellOptions", "value": { "type": "color-text" } },
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [
{ "color": "red", "value": null },
{ "color": "green", "value": 1 }
]}}
]
}
]
},
"gridPos": { "h": 10, "w": 16, "x": 0, "y": 50 }
},
{
"type": "timeseries", "id": 26, "title": "OSD Apply + Commit Latency",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [
{ "expr": "max by (ceph_daemon) (ceph_osd_apply_latency_ms{ceph_daemon=~\"$osd\"})", "refId": "A", "legendFormat": "{{ceph_daemon}} apply" },
{ "expr": "max by (ceph_daemon) (ceph_osd_commit_latency_ms{ceph_daemon=~\"$osd\"})", "refId": "B", "legendFormat": "{{ceph_daemon}} commit" }
],
"fieldConfig": {
"defaults": {
"unit": "ms",
"color": { "mode": "palette-classic" },
"custom": { "lineWidth": 1, "fillOpacity": 0 }
}
},
"gridPos": { "h": 10, "w": 8, "x": 16, "y": 50 }
},
{
"type": "row", "id": 27, "title": "Pool Detail", "collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 60 }
},
{
"type": "table", "id": 28, "title": "Pools",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [
{ "expr": "max by (pool_id, name) (ceph_pool_metadata{name=~\"$pool\"})", "refId": "A", "format": "table", "instant": true },
{ "expr": "max by (pool_id) (ceph_pool_objects)", "refId": "B", "format": "table", "instant": true },
{ "expr": "max by (pool_id) (ceph_pool_bytes_used)", "refId": "C", "format": "table", "instant": true },
{ "expr": "max by (pool_id) (ceph_pool_max_avail)", "refId": "D", "format": "table", "instant": true },
{ "expr": "100 * max by (pool_id) (ceph_pool_bytes_used) / (max by (pool_id) (ceph_pool_bytes_used) + max by (pool_id) (ceph_pool_max_avail))", "refId": "E", "format": "table", "instant": true }
],
"transformations": [
{ "id": "merge" },
{
"id": "organize",
"options": {
"excludeByName": { "Time": true, "__name__": true, "job": true, "instance": true, "endpoint": true, "Value #A": true },
"renameByName": {
"pool_id": "ID",
"name": "Pool",
"Value #B": "Objects",
"Value #C": "Used",
"Value #D": "Available",
"Value #E": "Used %"
},
"indexByName": { "ID": 0, "Pool": 1, "Objects": 2, "Used": 3, "Available": 4, "Used %": 5 }
}
}
],
"fieldConfig": {
"defaults": {},
"overrides": [
{ "matcher": { "id": "byName", "options": "Used" }, "properties": [{ "id": "unit", "value": "bytes" }] },
{ "matcher": { "id": "byName", "options": "Available" }, "properties": [{ "id": "unit", "value": "bytes" }] },
{
"matcher": { "id": "byName", "options": "Used %" },
"properties": [
{ "id": "unit", "value": "percent" },
{ "id": "decimals", "value": 1 },
{ "id": "custom.cellOptions", "value": { "type": "color-background", "mode": "gradient" } },
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 70 },
{ "color": "red", "value": 85 }
]}}
]
}
]
},
"gridPos": { "h": 10, "w": 14, "x": 0, "y": 61 }
},
{
"type": "timeseries", "id": 29, "title": "Pool IOPS (Read / Write) — filtered by $pool",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [
{
"expr": "max by (pool_id) (rate(ceph_pool_rd[5m]))\n* on(pool_id) group_left(name) max by (pool_id, name) (ceph_pool_metadata{name=~\"$pool\"})",
"refId": "A", "legendFormat": "Read — {{name}}"
},
{
"expr": "max by (pool_id) (rate(ceph_pool_wr[5m]))\n* on(pool_id) group_left(name) max by (pool_id, name) (ceph_pool_metadata{name=~\"$pool\"})",
"refId": "B", "legendFormat": "Write — {{name}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "ops",
"color": { "mode": "palette-classic" },
"custom": { "lineWidth": 2, "fillOpacity": 8 }
}
},
"options": {
"legend": {
"displayMode": "table",
"placement": "right",
"calcs": ["max", "lastNotNull"],
"showLegend": true,
"sortBy": "Max",
"sortDesc": true
}
},
"gridPos": { "h": 10, "w": 10, "x": 14, "y": 61 }
}
]
}

View File

@@ -368,7 +368,7 @@
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [ "targets": [
{ {
"expr": "100 * (1 - (sum(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"})))", "expr": "100 * (1 - (\n sum(node_filesystem_avail_bytes{mountpoint=\"/var\",fstype!~\"tmpfs|overlay|squashfs|ramfs\"} or node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay|squashfs|ramfs\"})\n /\n sum(node_filesystem_size_bytes{mountpoint=\"/var\",fstype!~\"tmpfs|overlay|squashfs|ramfs\"} or node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay|squashfs|ramfs\"})\n))",
"refId": "A", "refId": "A",
"legendFormat": "Disk" "legendFormat": "Disk"
} }

View File

@@ -440,7 +440,7 @@
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [ "targets": [
{ {
"expr": "100 * (1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"}))", "expr": "100 * (1 - (\n max by (instance, mountpoint) (node_filesystem_avail_bytes{mountpoint=~\"/|/var\",fstype!~\"tmpfs|overlay|squashfs|ramfs\"})\n /\n max by (instance, mountpoint) (node_filesystem_size_bytes{mountpoint=~\"/|/var\",fstype!~\"tmpfs|overlay|squashfs|ramfs\"})\n))",
"refId": "A", "refId": "A",
"legendFormat": "{{instance}}" "legendFormat": "{{instance}}"
} }
@@ -467,7 +467,7 @@
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [ "targets": [
{ {
"expr": "100 * (1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"}))", "expr": "100 * (1 - (\n max by (instance, mountpoint) (node_filesystem_avail_bytes{mountpoint=~\"/|/var\",fstype!~\"tmpfs|overlay|squashfs|ramfs\"})\n /\n max by (instance, mountpoint) (node_filesystem_size_bytes{mountpoint=~\"/|/var\",fstype!~\"tmpfs|overlay|squashfs|ramfs\"})\n))",
"refId": "A", "refId": "A",
"legendFormat": "{{instance}}" "legendFormat": "{{instance}}"
} }

View File

@@ -1,6 +1,6 @@
{ {
"title": "Storage Health", "title": "Persistent Storage",
"uid": "storage-health", "uid": "persistent-storage",
"schemaVersion": 36, "schemaVersion": 36,
"version": 1, "version": 1,
"refresh": "30s", "refresh": "30s",
@@ -21,25 +21,17 @@
"title": "Bound PVCs", "title": "Bound PVCs",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [ "targets": [
{ { "expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Bound\"}) or vector(0)", "refId": "A" }
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Bound\"}) or vector(0)",
"refId": "A"
}
], ],
"fieldConfig": { "fieldConfig": {
"defaults": { "defaults": {
"color": { "mode": "thresholds" }, "color": { "mode": "thresholds" },
"thresholds": { "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] }
"mode": "absolute",
"steps": [{ "color": "green", "value": null }]
}
} }
}, },
"options": { "options": {
"reduceOptions": { "calcs": ["lastNotNull"] }, "reduceOptions": { "calcs": ["lastNotNull"] },
"colorMode": "background", "colorMode": "background", "graphMode": "none", "textMode": "auto"
"graphMode": "none",
"textMode": "auto"
}, },
"gridPos": { "h": 5, "w": 4, "x": 0, "y": 1 } "gridPos": { "h": 5, "w": 4, "x": 0, "y": 1 }
}, },
@@ -50,28 +42,19 @@
"title": "Pending PVCs", "title": "Pending PVCs",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [ "targets": [
{ { "expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Pending\"}) or vector(0)", "refId": "A" }
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Pending\"}) or vector(0)",
"refId": "A"
}
], ],
"fieldConfig": { "fieldConfig": {
"defaults": { "defaults": {
"color": { "mode": "thresholds" }, "color": { "mode": "thresholds" },
"thresholds": { "thresholds": { "mode": "absolute", "steps": [
"mode": "absolute", { "color": "green", "value": null }, { "color": "yellow", "value": 1 }
"steps": [ ]}
{ "color": "green", "value": null },
{ "color": "yellow", "value": 1 }
]
}
} }
}, },
"options": { "options": {
"reduceOptions": { "calcs": ["lastNotNull"] }, "reduceOptions": { "calcs": ["lastNotNull"] },
"colorMode": "background", "colorMode": "background", "graphMode": "none", "textMode": "auto"
"graphMode": "none",
"textMode": "auto"
}, },
"gridPos": { "h": 5, "w": 4, "x": 4, "y": 1 } "gridPos": { "h": 5, "w": 4, "x": 4, "y": 1 }
}, },
@@ -82,28 +65,19 @@
"title": "Lost PVCs", "title": "Lost PVCs",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [ "targets": [
{ { "expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Lost\"}) or vector(0)", "refId": "A" }
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Lost\"}) or vector(0)",
"refId": "A"
}
], ],
"fieldConfig": { "fieldConfig": {
"defaults": { "defaults": {
"color": { "mode": "thresholds" }, "color": { "mode": "thresholds" },
"thresholds": { "thresholds": { "mode": "absolute", "steps": [
"mode": "absolute", { "color": "green", "value": null }, { "color": "red", "value": 1 }
"steps": [ ]}
{ "color": "green", "value": null },
{ "color": "red", "value": 1 }
]
}
} }
}, },
"options": { "options": {
"reduceOptions": { "calcs": ["lastNotNull"] }, "reduceOptions": { "calcs": ["lastNotNull"] },
"colorMode": "background", "colorMode": "background", "graphMode": "none", "textMode": "auto"
"graphMode": "none",
"textMode": "auto"
}, },
"gridPos": { "h": 5, "w": 4, "x": 8, "y": 1 } "gridPos": { "h": 5, "w": 4, "x": 8, "y": 1 }
}, },
@@ -114,201 +88,57 @@
"title": "Bound PVs / Available PVs", "title": "Bound PVs / Available PVs",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [ "targets": [
{ { "expr": "sum(kube_persistentvolume_status_phase{phase=\"Bound\"}) or vector(0)", "refId": "A", "legendFormat": "Bound" },
"expr": "sum(kube_persistentvolume_status_phase{phase=\"Bound\"}) or vector(0)", { "expr": "sum(kube_persistentvolume_status_phase{phase=\"Available\"}) or vector(0)", "refId": "B", "legendFormat": "Available" }
"refId": "A",
"legendFormat": "Bound"
},
{
"expr": "sum(kube_persistentvolume_status_phase{phase=\"Available\"}) or vector(0)",
"refId": "B",
"legendFormat": "Available"
}
], ],
"fieldConfig": { "fieldConfig": {
"defaults": { "defaults": {
"color": { "mode": "thresholds" }, "color": { "mode": "thresholds" },
"thresholds": { "thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] }
"mode": "absolute",
"steps": [{ "color": "blue", "value": null }]
}
} }
}, },
"options": { "options": {
"reduceOptions": { "calcs": ["lastNotNull"] }, "reduceOptions": { "calcs": ["lastNotNull"] },
"colorMode": "background", "colorMode": "background", "graphMode": "none", "textMode": "auto"
"graphMode": "none",
"textMode": "auto"
}, },
"gridPos": { "h": 5, "w": 4, "x": 12, "y": 1 } "gridPos": { "h": 5, "w": 6, "x": 12, "y": 1 }
}, },
{ {
"type": "stat", "type": "piechart",
"id": 6, "id": 6,
"title": "Ceph Cluster Health", "title": "PVC Phase Distribution",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [ "targets": [
{ { "expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Bound\"}) or vector(0)", "refId": "A", "legendFormat": "Bound" },
"expr": "ceph_health_status", { "expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Pending\"}) or vector(0)", "refId": "B", "legendFormat": "Pending" },
"refId": "A" { "expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Lost\"}) or vector(0)", "refId": "C", "legendFormat": "Lost" }
}
], ],
"fieldConfig": { "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" } } },
"defaults": {
"color": { "mode": "thresholds" },
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 1 },
{ "color": "red", "value": 2 }
]
},
"mappings": [
{
"type": "value",
"options": {
"0": { "text": "HEALTH_OK", "index": 0 },
"1": { "text": "HEALTH_WARN", "index": 1 },
"2": { "text": "HEALTH_ERR", "index": 2 }
}
}
]
}
},
"options": { "options": {
"reduceOptions": { "calcs": ["lastNotNull"] }, "reduceOptions": { "calcs": ["lastNotNull"] },
"colorMode": "background", "pieType": "pie",
"graphMode": "none", "legend": { "displayMode": "table", "placement": "right", "values": ["value", "percent"] }
"textMode": "value"
}, },
"gridPos": { "h": 5, "w": 4, "x": 16, "y": 1 } "gridPos": { "h": 5, "w": 6, "x": 18, "y": 1 }
},
{
"type": "stat",
"id": 7,
"title": "OSDs Up / Total",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [
{
"expr": "sum(ceph_osd_up) or vector(0)",
"refId": "A",
"legendFormat": "Up"
},
{
"expr": "count(ceph_osd_metadata) or vector(0)",
"refId": "B",
"legendFormat": "Total"
}
],
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }]
}
}
},
"options": {
"reduceOptions": { "calcs": ["lastNotNull"] },
"colorMode": "background",
"graphMode": "none",
"textMode": "auto"
},
"gridPos": { "h": 5, "w": 4, "x": 20, "y": 1 }
}, },
{ {
"type": "row", "type": "row",
"id": 8, "id": 7,
"title": "Cluster Capacity", "title": "Capacity by Storage Class",
"collapsed": false, "collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 6 } "gridPos": { "h": 1, "w": 24, "x": 0, "y": 6 }
}, },
{
"type": "gauge",
"id": 9,
"title": "Ceph Cluster Used (%)",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [
{
"expr": "100 * (ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes) / ceph_cluster_total_bytes",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"min": 0,
"max": 100,
"color": { "mode": "thresholds" },
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 70 },
{ "color": "red", "value": 85 }
]
}
}
},
"options": {
"reduceOptions": { "calcs": ["lastNotNull"] },
"showThresholdLabels": true,
"showThresholdMarkers": true
},
"gridPos": { "h": 8, "w": 5, "x": 0, "y": 7 }
},
{
"type": "stat",
"id": 10,
"title": "Ceph Capacity — Total / Available",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [
{
"expr": "ceph_cluster_total_bytes",
"refId": "A",
"legendFormat": "Total"
},
{
"expr": "ceph_cluster_total_bytes - (ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes)",
"refId": "B",
"legendFormat": "Available"
}
],
"fieldConfig": {
"defaults": {
"unit": "bytes",
"color": { "mode": "thresholds" },
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "blue", "value": null }]
}
}
},
"options": {
"reduceOptions": { "calcs": ["lastNotNull"] },
"colorMode": "value",
"graphMode": "none",
"textMode": "auto",
"orientation": "vertical"
},
"gridPos": { "h": 8, "w": 4, "x": 5, "y": 7 }
},
{ {
"type": "bargauge", "type": "bargauge",
"id": 11, "id": 8,
"title": "PV Allocated Capacity by Storage Class (Bound)", "title": "PV Allocated Capacity by Storage Class (Bound)",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [ "targets": [
{ {
"expr": "sum by (storageclass) (\n kube_persistentvolume_capacity_bytes\n * on(persistentvolume) group_left(storageclass)\n kube_persistentvolume_status_phase{phase=\"Bound\"}\n)", "expr": "sum by (storageclass) (\n kube_persistentvolume_capacity_bytes\n * on(persistentvolume) group_left() (kube_persistentvolume_status_phase{phase=\"Bound\"} == 1)\n * on(persistentvolume) group_left(storageclass) kube_persistentvolume_info\n)",
"refId": "A", "refId": "A",
"legendFormat": "{{storageclass}}" "legendFormat": "{{storageclass}}"
} }
@@ -316,11 +146,7 @@
"fieldConfig": { "fieldConfig": {
"defaults": { "defaults": {
"unit": "bytes", "unit": "bytes",
"color": { "mode": "palette-classic" }, "color": { "mode": "palette-classic" }
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "blue", "value": null }]
}
} }
}, },
"options": { "options": {
@@ -329,267 +155,214 @@
"displayMode": "gradient", "displayMode": "gradient",
"showUnfilled": true "showUnfilled": true
}, },
"gridPos": { "h": 8, "w": 7, "x": 9, "y": 7 } "gridPos": { "h": 8, "w": 8, "x": 0, "y": 7 }
}, },
{ {
"type": "piechart", "type": "bargauge",
"id": 12, "id": 9,
"title": "PVC Phase Distribution", "title": "PVC Count by Storage Class",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [ "targets": [
{ {
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Bound\"}) or vector(0)", "expr": "count by (storageclass) (kube_persistentvolumeclaim_info{storageclass!=\"\"})",
"refId": "A", "refId": "A",
"legendFormat": "Bound" "legendFormat": "{{storageclass}}"
},
{
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Pending\"}) or vector(0)",
"refId": "B",
"legendFormat": "Pending"
},
{
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Lost\"}) or vector(0)",
"refId": "C",
"legendFormat": "Lost"
} }
], ],
"fieldConfig": { "fieldConfig": {
"defaults": { "color": { "mode": "palette-classic" } } "defaults": {
"unit": "short",
"color": { "mode": "palette-classic" }
}
}, },
"options": { "options": {
"orientation": "horizontal",
"reduceOptions": { "calcs": ["lastNotNull"] }, "reduceOptions": { "calcs": ["lastNotNull"] },
"pieType": "pie", "displayMode": "gradient",
"legend": { "showUnfilled": true
"displayMode": "table", },
"placement": "right", "gridPos": { "h": 8, "w": 8, "x": 8, "y": 7 }
"values": ["value", "percent"] },
{
"type": "table",
"id": 10,
"title": "Storage Classes Summary",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [
{
"expr": "count by (storageclass) (kube_persistentvolume_info)",
"refId": "A",
"legendFormat": "PVs",
"format": "table",
"instant": true
},
{
"expr": "sum by (storageclass) (kube_persistentvolume_capacity_bytes * on(persistentvolume) group_left(storageclass) kube_persistentvolume_info)",
"refId": "B",
"legendFormat": "Capacity",
"format": "table",
"instant": true
} }
],
"transformations": [
{ "id": "merge" },
{
"id": "organize",
"options": {
"excludeByName": { "Time": true, "__name__": true, "job": true, "instance": true },
"renameByName": { "storageclass": "StorageClass", "Value #A": "PV Count", "Value #B": "Total Capacity" }
}
}
],
"fieldConfig": {
"defaults": {},
"overrides": [
{ "matcher": { "id": "byName", "options": "Total Capacity" }, "properties": [{ "id": "unit", "value": "bytes" }] }
]
}, },
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 7 } "gridPos": { "h": 8, "w": 8, "x": 16, "y": 7 }
}, },
{ {
"type": "row", "type": "row",
"id": 13, "id": 11,
"title": "Ceph Performance", "title": "PVC Usage (kubelet volume stats)",
"collapsed": false, "collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 15 } "gridPos": { "h": 1, "w": 24, "x": 0, "y": 15 }
}, },
{ {
"type": "timeseries", "type": "table",
"id": 14, "id": 12,
"title": "Ceph Pool IOPS (Read / Write)", "title": "Top 20 PVCs by % Used",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [ "targets": [
{ {
"expr": "rate(ceph_pool_rd[5m])", "expr": "topk(20,\n 100 * max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_used_bytes)\n /\n max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_capacity_bytes)\n)",
"refId": "A", "refId": "A",
"legendFormat": "Read — pool {{pool_id}}" "format": "table",
}, "instant": true
}
],
"transformations": [
{ {
"expr": "rate(ceph_pool_wr[5m])", "id": "organize",
"refId": "B", "options": {
"legendFormat": "Write — pool {{pool_id}}" "excludeByName": { "Time": true, "__name__": true, "job": true, "instance": true, "endpoint": true },
"renameByName": {
"namespace": "Namespace",
"persistentvolumeclaim": "PVC",
"Value": "Used %"
},
"indexByName": { "Namespace": 0, "PVC": 1, "Used %": 2 }
}
} }
], ],
"fieldConfig": { "fieldConfig": {
"defaults": { "defaults": {},
"unit": "ops", "overrides": [
"color": { "mode": "palette-classic" },
"custom": { "lineWidth": 2, "fillOpacity": 8 }
}
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 }
},
{ {
"type": "timeseries", "matcher": { "id": "byName", "options": "Used %" },
"id": 15, "properties": [
"title": "Ceph Pool Throughput (Read / Write)", { "id": "unit", "value": "percent" },
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, { "id": "decimals", "value": 1 },
"targets": [
{ {
"expr": "rate(ceph_pool_rd_bytes[5m])", "id": "custom.cellOptions",
"refId": "A", "value": { "type": "color-background", "mode": "gradient" }
"legendFormat": "Read — pool {{pool_id}}"
}, },
{ {
"expr": "rate(ceph_pool_wr_bytes[5m])", "id": "thresholds",
"refId": "B", "value": {
"legendFormat": "Write — pool {{pool_id}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "Bps",
"color": { "mode": "palette-classic" },
"custom": { "lineWidth": 2, "fillOpacity": 8 }
}
},
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 }
},
{
"type": "row",
"id": 16,
"title": "Ceph OSD & Pool Details",
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 24 }
},
{
"type": "timeseries",
"id": 17,
"title": "Ceph Pool Space Used (%)",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [
{
"expr": "100 * ceph_pool_bytes_used / (ceph_pool_bytes_used + ceph_pool_max_avail)",
"refId": "A",
"legendFormat": "Pool {{pool_id}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"min": 0,
"max": 100,
"color": { "mode": "palette-classic" },
"thresholds": {
"mode": "absolute", "mode": "absolute",
"steps": [ "steps": [
{ "color": "green", "value": null }, { "color": "green", "value": null },
{ "color": "yellow", "value": 70 }, { "color": "yellow", "value": 70 },
{ "color": "red", "value": 85 } { "color": "red", "value": 85 }
] ]
},
"custom": { "lineWidth": 2, "fillOpacity": 10 }
} }
}
]
}
]
}, },
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 25 } "gridPos": { "h": 10, "w": 12, "x": 0, "y": 16 }
}, },
{ {
"type": "bargauge", "type": "bargauge",
"id": 18, "id": 13,
"title": "OSD Status per Daemon (green = Up, red = Down)", "title": "Top 20 PVCs by Used Bytes",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [ "targets": [
{ {
"expr": "ceph_osd_up", "expr": "topk(20, max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_used_bytes))",
"refId": "A", "refId": "A",
"legendFormat": "{{ceph_daemon}}" "legendFormat": "{{namespace}} / {{persistentvolumeclaim}}",
"instant": true
} }
], ],
"fieldConfig": { "fieldConfig": {
"defaults": { "defaults": {
"min": 0, "unit": "bytes",
"max": 1, "color": { "mode": "palette-classic" }
"color": { "mode": "thresholds" },
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "red", "value": null },
{ "color": "green", "value": 1 }
]
},
"mappings": [
{
"type": "value",
"options": {
"0": { "text": "DOWN", "index": 0 },
"1": { "text": "UP", "index": 1 }
}
}
]
}
},
"options": {
"orientation": "horizontal",
"reduceOptions": { "calcs": ["lastNotNull"] },
"displayMode": "basic",
"showUnfilled": true
},
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 25 }
},
{
"type": "row",
"id": 19,
"title": "Node Disk Usage",
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 33 }
},
{
"type": "timeseries",
"id": 20,
"title": "Node Root Disk Usage Over Time (%)",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [
{
"expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} * 100)",
"refId": "A",
"legendFormat": "{{instance}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"min": 0,
"max": 100,
"color": { "mode": "palette-classic" },
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 70 },
{ "color": "red", "value": 85 }
]
},
"custom": { "lineWidth": 2, "fillOpacity": 10 }
}
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 34 }
},
{
"type": "bargauge",
"id": 21,
"title": "Current Disk Usage — All Nodes & Mountpoints",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [
{
"expr": "100 - (node_filesystem_avail_bytes{fstype!~\"tmpfs|overlay|squashfs\"} / node_filesystem_size_bytes{fstype!~\"tmpfs|overlay|squashfs\"} * 100)",
"refId": "A",
"legendFormat": "{{instance}} — {{mountpoint}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"min": 0,
"max": 100,
"color": { "mode": "thresholds" },
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 70 },
{ "color": "red", "value": 85 }
]
}
} }
}, },
"options": { "options": {
"orientation": "horizontal", "orientation": "horizontal",
"reduceOptions": { "calcs": ["lastNotNull"] }, "reduceOptions": { "calcs": ["lastNotNull"] },
"displayMode": "gradient", "displayMode": "gradient",
"showUnfilled": true "showUnfilled": true,
"valueMode": "color",
"sortBy": "Value",
"sortOrder": "desc"
}, },
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 34 } "gridPos": { "h": 10, "w": 12, "x": 12, "y": 16 }
},
{
"type": "timeseries",
"id": 14,
"title": "Top 5 PVCs Usage Over Time (%)",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [
{
"expr": "topk(5,\n 100 * max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_used_bytes)\n /\n max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_capacity_bytes)\n)",
"refId": "A",
"legendFormat": "{{namespace}} / {{persistentvolumeclaim}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent", "min": 0, "max": 100,
"color": { "mode": "palette-classic" },
"custom": { "lineWidth": 2, "fillOpacity": 8 }
}
},
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 26 }
},
{
"type": "timeseries",
"id": 15,
"title": "PVC Inode Usage (%) — Top 20",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [
{
"expr": "topk(20,\n 100 * max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_inodes_used)\n /\n max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_inodes)\n)",
"refId": "A",
"legendFormat": "{{namespace}} / {{persistentvolumeclaim}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent", "min": 0, "max": 100,
"color": { "mode": "palette-classic" },
"custom": { "lineWidth": 1, "fillOpacity": 5 }
}
},
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 34 }
} }
] ]

View File

@@ -101,7 +101,7 @@ impl<T: Topology + K8sclient> Interpret<T> for ClusterDashboardsInterpret {
Ok(Outcome::success(format!( Ok(Outcome::success(format!(
"Cluster dashboards resources in namespace '{}' with {} dashboards successfully created", "Cluster dashboards resources in namespace '{}' with {} dashboards successfully created",
self.namespace, 8 self.namespace, 9
))) )))
} }
@@ -494,7 +494,11 @@ impl ClusterDashboardsInterpret {
include_str!("dashboards/workloads-health.json"), include_str!("dashboards/workloads-health.json"),
), ),
("okd-networking", include_str!("dashboards/networking.json")), ("okd-networking", include_str!("dashboards/networking.json")),
("storage-health", include_str!("dashboards/storage.json")), (
"persistent-storage",
include_str!("dashboards/storage.json"),
),
("ceph-cluster", include_str!("dashboards/ceph.json")),
("okd-etcd", include_str!("dashboards/etcd.json")), ("okd-etcd", include_str!("dashboards/etcd.json")),
( (
"okd-control-plane", "okd-control-plane",

View File

@@ -1,6 +1,7 @@
pub mod alert_channel; pub mod alert_channel;
pub mod alert_rule; pub mod alert_rule;
pub mod application_monitoring; pub mod application_monitoring;
pub mod ceph_alerts;
pub mod cluster_dashboards; pub mod cluster_dashboards;
pub mod grafana; pub mod grafana;
pub mod kube_prometheus; pub mod kube_prometheus;

View File

@@ -0,0 +1,114 @@
use std::collections::BTreeMap;
use async_trait::async_trait;
use harmony_types::id::Id;
use kube::api::ObjectMeta;
use serde::Serialize;
use crate::{
data::Version,
interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome},
inventory::Inventory,
modules::monitoring::kube_prometheus::crd::crd_prometheus_rules::{
PrometheusRule, PrometheusRuleSpec, RuleGroup,
},
score::Score,
topology::{K8sclient, Topology},
};
/// Score describing one `PrometheusRule` resource to apply to a cluster.
///
/// `create_interpret` copies every field into an
/// `OpenshiftPrometheusRuleInterpret`, which builds the resource and applies it
/// through the topology's Kubernetes client.
#[derive(Clone, Debug, Serialize)]
pub struct OpenshiftPrometheusRuleScore {
/// Namespace written into the resource metadata and passed to the apply call.
pub namespace: String,
/// Name written into the resource metadata.
pub name: String,
/// Rule groups placed in the resource's `spec.groups`.
pub rule_groups: Vec<RuleGroup>,
/// Labels for the resource metadata. When `None`, the interpret falls back to
/// `prometheus=k8s` and `role=alert-rules`.
pub labels: Option<BTreeMap<String, String>>,
}
impl<T: Topology + K8sclient> Score<T> for OpenshiftPrometheusRuleScore {
    /// Returns a display name of the form
    /// `OpenshiftPrometheusRuleScore(<namespace>/<name>)`.
    fn name(&self) -> String {
        let Self {
            namespace, name, ..
        } = self;
        format!("OpenshiftPrometheusRuleScore({}/{})", namespace, name)
    }

    /// Builds the interpret that will apply this rule, handing it an
    /// independent copy of every field of the score.
    fn create_interpret(&self) -> Box<dyn Interpret<T>> {
        let Self {
            namespace,
            name,
            rule_groups,
            labels,
        } = self;

        let interpret = OpenshiftPrometheusRuleInterpret {
            namespace: namespace.clone(),
            name: name.clone(),
            rule_groups: rule_groups.clone(),
            labels: labels.clone(),
        };
        Box::new(interpret)
    }
}
/// Interpret that applies a single `PrometheusRule` resource.
///
/// Built by `OpenshiftPrometheusRuleScore::create_interpret`, which clones each
/// of the score's fields into the matching field here.
#[derive(Debug, Clone)]
pub struct OpenshiftPrometheusRuleInterpret {
// Namespace set on the resource metadata and passed to the apply call.
namespace: String,
// Name set on the resource metadata.
name: String,
// Rule groups copied into the resource's `spec.groups`.
rule_groups: Vec<RuleGroup>,
// Optional metadata labels; `execute` substitutes `default_rule_labels()` when `None`.
labels: Option<BTreeMap<String, String>>,
}
#[async_trait]
impl<T: Topology + K8sclient> Interpret<T> for OpenshiftPrometheusRuleInterpret {
async fn execute(
&self,
_inventory: &Inventory,
topology: &T,
) -> Result<Outcome, InterpretError> {
let labels = self.labels.clone().unwrap_or_else(default_rule_labels);
let prometheus_rule = PrometheusRule {
metadata: ObjectMeta {
name: Some(self.name.clone()),
namespace: Some(self.namespace.clone()),
labels: Some(labels),
..ObjectMeta::default()
},
spec: PrometheusRuleSpec {
groups: self.rule_groups.clone(),
},
};
let client = topology
.k8s_client()
.await
.map_err(|e| InterpretError::new(format!("Failed to get k8s client: {e}")))?;
client
.apply(&prometheus_rule, Some(&self.namespace))
.await
.map_err(|e| InterpretError::new(e.to_string()))?;
Ok(Outcome::success(format!(
"PrometheusRule '{}' applied to namespace '{}' with {} rule group(s)",
self.name,
self.namespace,
self.rule_groups.len()
)))
}
fn get_name(&self) -> InterpretName {
InterpretName::Custom("OpenshiftPrometheusRule")
}
fn get_version(&self) -> Version {
todo!()
}
fn get_status(&self) -> InterpretStatus {
todo!()
}
fn get_children(&self) -> Vec<Id> {
todo!()
}
}
/// Returns the labels used when a rule is created without explicit labels:
/// `prometheus=k8s` and `role=alert-rules`.
fn default_rule_labels() -> BTreeMap<String, String> {
    [("prometheus", "k8s"), ("role", "alert-rules")]
        .into_iter()
        .map(|(key, value)| (key.to_string(), value.to_string()))
        .collect()
}

View File

@@ -1,5 +1,6 @@
use crate::topology::oberservability::monitoring::AlertSender; use crate::topology::oberservability::monitoring::AlertSender;
pub mod cluster_alert_rules;
pub mod cluster_monitoring; pub mod cluster_monitoring;
pub(crate) mod config; pub(crate) mod config;
pub mod enable_user_workload; pub mod enable_user_workload;