feat(monitoring): Ceph alerts integrated with OKD's native alerting stack #265
11
Cargo.lock
generated
11
Cargo.lock
generated
@@ -2873,6 +2873,17 @@ dependencies = [
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "example-okd-ceph-alerts"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"harmony",
|
||||
"harmony_cli",
|
||||
"harmony_types",
|
||||
"log",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "example-okd-cluster-alerts"
|
||||
version = "0.1.0"
|
||||
|
||||
14
examples/okd_ceph_alerts/Cargo.toml
Normal file
14
examples/okd_ceph_alerts/Cargo.toml
Normal file
@@ -0,0 +1,14 @@
|
||||
[package]
|
||||
name = "example-okd-ceph-alerts"
|
||||
edition = "2024"
|
||||
version.workspace = true
|
||||
readme.workspace = true
|
||||
license.workspace = true
|
||||
publish = false
|
||||
|
||||
[dependencies]
|
||||
harmony = { path = "../../harmony" }
|
||||
harmony_cli = { path = "../../harmony_cli" }
|
||||
harmony_types = { path = "../../harmony_types" }
|
||||
tokio = { workspace = true }
|
||||
log = { workspace = true }
|
||||
4
examples/okd_ceph_alerts/env.sh
Normal file
4
examples/okd_ceph_alerts/env.sh
Normal file
@@ -0,0 +1,4 @@
|
||||
export HARMONY_SECRET_NAMESPACE=okd_ceph_alerts_example
|
||||
export HARMONY_SECRET_STORE=file
|
||||
export HARMONY_DATABASE_URL=sqlite://harmony_okd_ceph_alerts_example.sqlite
|
||||
export RUST_LOG=harmony=debug
|
||||
28
examples/okd_ceph_alerts/src/main.rs
Normal file
28
examples/okd_ceph_alerts/src/main.rs
Normal file
@@ -0,0 +1,28 @@
|
||||
use harmony::{
|
||||
inventory::Inventory,
|
||||
modules::monitoring::{
|
||||
ceph_alerts::ceph_alert_rule_groups, okd::cluster_alert_rules::OpenshiftPrometheusRuleScore,
|
||||
},
|
||||
topology::K8sAnywhereTopology,
|
||||
};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() {
|
||||
harmony_cli::cli_logger::init();
|
||||
|
||||
let ceph_rules = OpenshiftPrometheusRuleScore {
|
||||
namespace: "rook-ceph".to_string(),
|
||||
name: "ceph-alerts".to_string(),
|
||||
rule_groups: ceph_alert_rule_groups(),
|
||||
labels: None,
|
||||
};
|
||||
|
||||
harmony_cli::run(
|
||||
Inventory::autoload(),
|
||||
K8sAnywhereTopology::from_env(),
|
||||
vec![Box::new(ceph_rules)],
|
||||
None,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
167
harmony/src/modules/monitoring/ceph_alerts.rs
Normal file
167
harmony/src/modules/monitoring/ceph_alerts.rs
Normal file
@@ -0,0 +1,167 @@
|
||||
use std::collections::BTreeMap;
|
||||
|
||||
use crate::modules::monitoring::kube_prometheus::crd::crd_prometheus_rules::{Rule, RuleGroup};
|
||||
|
||||
pub fn ceph_alert_rule_groups() -> Vec<RuleGroup> {
|
||||
vec![
|
||||
RuleGroup {
|
||||
name: "ceph-cluster-health".to_string(),
|
||||
rules: vec![
|
||||
alert(
|
||||
"CephHealthWarn",
|
||||
"max(ceph_health_status) == 1",
|
||||
Some("15m"),
|
||||
"warning",
|
||||
"Ceph cluster health is WARN",
|
||||
"Ceph reports HEALTH_WARN for more than 15 minutes. Run `ceph -s` or check the Ceph dashboard to see active health checks.",
|
||||
),
|
||||
alert(
|
||||
"CephHealthErr",
|
||||
"max(ceph_health_status) == 2",
|
||||
Some("5m"),
|
||||
"critical",
|
||||
"Ceph cluster health is ERR",
|
||||
"Ceph reports HEALTH_ERR for more than 5 minutes. Immediate investigation required.",
|
||||
),
|
||||
alert(
|
||||
"CephMonDown",
|
||||
"count(max by (ceph_daemon) (ceph_mon_quorum_status == 0)) > 0",
|
||||
Some("5m"),
|
||||
"critical",
|
||||
"Ceph monitor is out of quorum",
|
||||
"One or more Ceph monitors are not in quorum. Quorum loss risks cluster availability.",
|
||||
),
|
||||
alert(
|
||||
"CephMgrAbsent",
|
||||
"sum(max by (ceph_daemon) (ceph_mgr_status)) < 1",
|
||||
Some("5m"),
|
||||
"critical",
|
||||
"No active Ceph manager",
|
||||
"No Ceph manager daemon is currently active. Dashboards and orchestration will be unavailable.",
|
||||
),
|
||||
],
|
||||
},
|
||||
RuleGroup {
|
||||
name: "ceph-osd".to_string(),
|
||||
rules: vec![
|
||||
alert(
|
||||
"CephOSDDown",
|
||||
"count(max by (ceph_daemon) (ceph_osd_up == 0)) > 0",
|
||||
Some("5m"),
|
||||
"warning",
|
||||
"One or more Ceph OSDs are down",
|
||||
"At least one OSD daemon is reporting down for 5 minutes. Data redundancy may be reduced.",
|
||||
),
|
||||
alert(
|
||||
"CephOSDNearFull",
|
||||
"max by (ceph_daemon) (100 * ceph_osd_stat_bytes_used / ceph_osd_stat_bytes) > 80",
|
||||
Some("15m"),
|
||||
"warning",
|
||||
"Ceph OSD is near full",
|
||||
"OSD {{ $labels.ceph_daemon }} is above 80% utilization. Rebalance or add capacity.",
|
||||
),
|
||||
alert(
|
||||
"CephOSDFull",
|
||||
"max by (ceph_daemon) (100 * ceph_osd_stat_bytes_used / ceph_osd_stat_bytes) > 90",
|
||||
Some("5m"),
|
||||
"critical",
|
||||
"Ceph OSD is critically full",
|
||||
"OSD {{ $labels.ceph_daemon }} is above 90% utilization. Writes may block. Act immediately.",
|
||||
),
|
||||
],
|
||||
},
|
||||
RuleGroup {
|
||||
name: "ceph-capacity".to_string(),
|
||||
rules: vec![
|
||||
alert(
|
||||
"CephClusterNearFull",
|
||||
"100 * max(ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes) / max(ceph_cluster_total_bytes) > 75",
|
||||
Some("15m"),
|
||||
"warning",
|
||||
"Ceph cluster is near full",
|
||||
"Cluster raw utilization is above 75% for 15 minutes.",
|
||||
),
|
||||
alert(
|
||||
"CephClusterCriticallyFull",
|
||||
"100 * max(ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes) / max(ceph_cluster_total_bytes) > 85",
|
||||
Some("5m"),
|
||||
"critical",
|
||||
"Ceph cluster is critically full",
|
||||
"Cluster raw utilization is above 85%. Imminent risk of write unavailability.",
|
||||
),
|
||||
alert(
|
||||
"CephPoolNearFull",
|
||||
"100 * max by (pool_id) (ceph_pool_bytes_used) / (max by (pool_id) (ceph_pool_bytes_used) + max by (pool_id) (ceph_pool_max_avail)) > 80",
|
||||
Some("15m"),
|
||||
"warning",
|
||||
"Ceph pool is near full",
|
||||
"Pool (pool_id {{ $labels.pool_id }}) is above 80% usage.",
|
||||
),
|
||||
alert(
|
||||
"CephDaysUntilFull",
|
||||
"(max(ceph_cluster_total_bytes) - max(ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes)) / clamp_min(deriv(max(ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes)[7d:1h]), 1) / 86400 < 30",
|
||||
Some("1h"),
|
||||
"warning",
|
||||
"Ceph cluster predicted to fill within 30 days",
|
||||
"Based on the 7-day usage trend, the cluster will reach capacity in less than 30 days.",
|
||||
),
|
||||
],
|
||||
},
|
||||
RuleGroup {
|
||||
name: "ceph-placement-groups".to_string(),
|
||||
rules: vec![
|
||||
alert(
|
||||
"CephPGsNotActiveClean",
|
||||
"max(ceph_pg_total) - max(ceph_pg_clean) > 0",
|
||||
Some("15m"),
|
||||
"warning",
|
||||
"Some placement groups are not active+clean",
|
||||
"{{ $value }} PGs have been in a non-clean state for more than 15 minutes.",
|
||||
),
|
||||
alert(
|
||||
"CephSlowOps",
|
||||
"max(ceph_healthcheck_slow_ops) > 0",
|
||||
Some("5m"),
|
||||
"warning",
|
||||
"Ceph reports slow ops",
|
||||
"Ceph has {{ $value }} slow operations outstanding for more than 5 minutes.",
|
||||
),
|
||||
],
|
||||
},
|
||||
RuleGroup {
|
||||
name: "ceph-nodes".to_string(),
|
||||
rules: vec![alert(
|
||||
"CephNodeRootDiskUsage",
|
||||
"100 * (1 - (max by (instance, mountpoint) (node_filesystem_avail_bytes{mountpoint=~\"/|/var\",fstype!~\"tmpfs|overlay|squashfs|ramfs\"}) / max by (instance, mountpoint) (node_filesystem_size_bytes{mountpoint=~\"/|/var\",fstype!~\"tmpfs|overlay|squashfs|ramfs\"}))) > 85",
|
||||
Some("10m"),
|
||||
"warning",
|
||||
"Ceph node root/var disk above 85%",
|
||||
"Node {{ $labels.instance }} mountpoint {{ $labels.mountpoint }} is above 85% disk usage. OSDs on this node may be at risk.",
|
||||
)],
|
||||
},
|
||||
]
|
||||
}
|
||||
|
||||
fn alert(
|
||||
name: &str,
|
||||
expr: &str,
|
||||
for_: Option<&str>,
|
||||
severity: &str,
|
||||
summary: &str,
|
||||
description: &str,
|
||||
) -> Rule {
|
||||
let mut labels = BTreeMap::new();
|
||||
labels.insert("severity".to_string(), severity.to_string());
|
||||
|
||||
let mut annotations = BTreeMap::new();
|
||||
annotations.insert("summary".to_string(), summary.to_string());
|
||||
annotations.insert("description".to_string(), description.to_string());
|
||||
|
||||
Rule {
|
||||
alert: Some(name.to_string()),
|
||||
expr: Some(expr.to_string()),
|
||||
for_: for_.map(|s| s.to_string()),
|
||||
labels: Some(labels),
|
||||
annotations: Some(annotations),
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,674 @@
|
||||
{
|
||||
"title": "Ceph Cluster",
|
||||
"uid": "ceph-cluster",
|
||||
"schemaVersion": 36,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "pool",
|
||||
"type": "query",
|
||||
"label": "Pool",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"query": { "query": "label_values(ceph_pool_metadata, name)", "refId": "Pool" },
|
||||
"definition": "label_values(ceph_pool_metadata, name)",
|
||||
"multi": true,
|
||||
"includeAll": true,
|
||||
"current": { "text": "All", "value": "$__all", "selected": false },
|
||||
"refresh": 1,
|
||||
"sort": 1
|
||||
},
|
||||
{
|
||||
"name": "osd",
|
||||
"type": "query",
|
||||
"label": "OSD",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"query": { "query": "label_values(ceph_osd_metadata, ceph_daemon)", "refId": "OSD" },
|
||||
"definition": "label_values(ceph_osd_metadata, ceph_daemon)",
|
||||
"multi": true,
|
||||
"includeAll": true,
|
||||
"current": { "text": "All", "value": "$__all", "selected": false },
|
||||
"refresh": 1,
|
||||
"sort": 1
|
||||
}
|
||||
]
|
||||
},
|
||||
|
||||
"panels": [
|
||||
|
||||
{
|
||||
"type": "row", "id": 1, "title": "Cluster Status", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat", "id": 2, "title": "Health",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "max(ceph_health_status)", "refId": "A" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 2 }
|
||||
]},
|
||||
"mappings": [{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"0": { "text": "HEALTH_OK", "index": 0 },
|
||||
"1": { "text": "HEALTH_WARN", "index": 1 },
|
||||
"2": { "text": "HEALTH_ERR", "index": 2 }
|
||||
}
|
||||
}]
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "background", "graphMode": "none", "textMode": "value"
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 4, "x": 0, "y": 1 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat", "id": 3, "title": "Mon Quorum",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "count(max by (ceph_daemon) (ceph_mon_quorum_status == 1)) or vector(0)", "refId": "A", "legendFormat": "In Quorum" },
|
||||
{ "expr": "count(max by (ceph_daemon) (ceph_mon_metadata)) or vector(0)", "refId": "B", "legendFormat": "Total" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "background", "graphMode": "none", "textMode": "auto", "orientation": "horizontal"
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 4, "x": 4, "y": 1 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat", "id": 4, "title": "MGR Active",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "sum(max by (ceph_daemon) (ceph_mgr_status)) or vector(0)", "refId": "A" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "green", "value": 1 }
|
||||
]}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "background", "graphMode": "none", "textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 3, "x": 8, "y": 1 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat", "id": 5, "title": "OSDs Up / In / Total",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "sum(max by (ceph_daemon) (ceph_osd_up)) or vector(0)", "refId": "A", "legendFormat": "Up" },
|
||||
{ "expr": "sum(max by (ceph_daemon) (ceph_osd_in)) or vector(0)", "refId": "B", "legendFormat": "In" },
|
||||
{ "expr": "count(max by (ceph_daemon) (ceph_osd_metadata)) or vector(0)", "refId": "C", "legendFormat": "Total" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "background", "graphMode": "none", "textMode": "auto", "orientation": "horizontal"
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 5, "x": 11, "y": 1 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat", "id": 6, "title": "Pools",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(max by (pool_id) (ceph_pool_metadata)) or vector(0)", "refId": "A" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "background", "graphMode": "none", "textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 3, "x": 16, "y": 1 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat", "id": 7, "title": "PGs Active+Clean / Total",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "max(ceph_pg_clean) or vector(0)", "refId": "A", "legendFormat": "Active+Clean" },
|
||||
{ "expr": "max(ceph_pg_total) or vector(0)", "refId": "B", "legendFormat": "Total" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "background", "graphMode": "none", "textMode": "auto", "orientation": "horizontal"
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 5, "x": 19, "y": 1 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "row", "id": 8, "title": "Capacity", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 6 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "gauge", "id": 9, "title": "Cluster Used (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "100 * max(ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes) / max(ceph_cluster_total_bytes)",
|
||||
"refId": "A"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 70 },
|
||||
{ "color": "red", "value": 85 }
|
||||
]}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"showThresholdLabels": true, "showThresholdMarkers": true
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 5, "x": 0, "y": 7 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat", "id": 10, "title": "Total / Used / Available",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "max(ceph_cluster_total_bytes)", "refId": "A", "legendFormat": "Total" },
|
||||
{ "expr": "max(ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes)", "refId": "B", "legendFormat": "Used" },
|
||||
{ "expr": "max(ceph_cluster_total_bytes) - max(ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes)", "refId": "C", "legendFormat": "Available" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes",
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "value", "graphMode": "none", "textMode": "auto", "orientation": "vertical"
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 4, "x": 5, "y": 7 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "timeseries", "id": 11, "title": "Capacity Over Time",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "max(ceph_cluster_total_bytes)", "refId": "A", "legendFormat": "Total" },
|
||||
{ "expr": "max(ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes)", "refId": "B", "legendFormat": "Used" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes",
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 8 }
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 11, "x": 9, "y": 7 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat", "id": 12, "title": "Days Until Full (predicted, 7d trend)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "(max(ceph_cluster_total_bytes) - max(ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes))\n/\nclamp_min(deriv(max(ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes)[7d:1h]), 1)\n/ 86400",
|
||||
"refId": "A"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "d",
|
||||
"decimals": 1,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 14 },
|
||||
{ "color": "green", "value": 60 }
|
||||
]}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "background", "graphMode": "none", "textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 4, "x": 20, "y": 7 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "bargauge", "id": 13, "title": "Pool Used (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "(\n 100 * max by (pool_id) (ceph_pool_bytes_used)\n /\n (max by (pool_id) (ceph_pool_bytes_used) + max by (pool_id) (ceph_pool_max_avail))\n)\n* on(pool_id) group_left(name) max by (pool_id, name) (ceph_pool_metadata{name=~\"$pool\"})",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{name}}",
|
||||
"instant": true
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 70 },
|
||||
{ "color": "red", "value": 85 }
|
||||
]}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true,
|
||||
"valueMode": "color",
|
||||
"sortBy": "Value",
|
||||
"sortOrder": "desc"
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 15 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "bargauge", "id": 14, "title": "OSD Utilization (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "100 * max by (ceph_daemon) (ceph_osd_stat_bytes_used{ceph_daemon=~\"$osd\"}) / max by (ceph_daemon) (ceph_osd_stat_bytes{ceph_daemon=~\"$osd\"})",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{ceph_daemon}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 70 },
|
||||
{ "color": "red", "value": 85 }
|
||||
]}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 15 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "row", "id": 15, "title": "Performance", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 23 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "timeseries", "id": 16, "title": "Cluster IOPS (Read / Write)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "sum(max by (pool_id) (rate(ceph_pool_rd[5m])))", "refId": "A", "legendFormat": "Read" },
|
||||
{ "expr": "sum(max by (pool_id) (rate(ceph_pool_wr[5m])))", "refId": "B", "legendFormat": "Write" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ops",
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 8 }
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "timeseries", "id": 17, "title": "Cluster Throughput (Read / Write)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "sum(max by (pool_id) (rate(ceph_pool_rd_bytes[5m])))", "refId": "A", "legendFormat": "Read" },
|
||||
{ "expr": "sum(max by (pool_id) (rate(ceph_pool_wr_bytes[5m])))", "refId": "B", "legendFormat": "Write" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps",
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 8 }
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "timeseries", "id": 18, "title": "Client Op Latency (Avg)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(ceph_osd_op_r_latency_sum[5m])) / clamp_min(sum(rate(ceph_osd_op_r_latency_count[5m])), 1)",
|
||||
"refId": "A", "legendFormat": "Read"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(ceph_osd_op_w_latency_sum[5m])) / clamp_min(sum(rate(ceph_osd_op_w_latency_count[5m])), 1)",
|
||||
"refId": "B", "legendFormat": "Write"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s",
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 8 }
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 32 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "timeseries", "id": 19, "title": "Recovery Throughput",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "sum(rate(ceph_osd_recovery_bytes[5m])) or vector(0)", "refId": "A", "legendFormat": "Recovery B/s" },
|
||||
{ "expr": "sum(rate(ceph_osd_recovery_ops[5m])) or vector(0)", "refId": "B", "legendFormat": "Recovery ops/s" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 8 }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "Recovery B/s" }, "properties": [{ "id": "unit", "value": "Bps" }] },
|
||||
{ "matcher": { "id": "byName", "options": "Recovery ops/s" }, "properties": [{ "id": "unit", "value": "ops" }] }
|
||||
]
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 32 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "row", "id": 20, "title": "Placement Group Health", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 40 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "timeseries", "id": 21, "title": "PG States Over Time",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "max(ceph_pg_clean)", "refId": "A", "legendFormat": "clean" },
|
||||
{ "expr": "max(ceph_pg_active)", "refId": "B", "legendFormat": "active" },
|
||||
{ "expr": "max(ceph_pg_degraded)", "refId": "C", "legendFormat": "degraded" },
|
||||
{ "expr": "max(ceph_pg_undersized)", "refId": "D", "legendFormat": "undersized" },
|
||||
{ "expr": "max(ceph_pg_peering)", "refId": "E", "legendFormat": "peering" },
|
||||
{ "expr": "max(ceph_pg_recovering)", "refId": "F", "legendFormat": "recovering" },
|
||||
{ "expr": "max(ceph_pg_backfilling)", "refId": "G", "legendFormat": "backfilling" },
|
||||
{ "expr": "max(ceph_pg_remapped)", "refId": "H", "legendFormat": "remapped" },
|
||||
{ "expr": "max(ceph_pg_inconsistent)", "refId": "I", "legendFormat": "inconsistent" },
|
||||
{ "expr": "max(ceph_pg_stale)", "refId": "J", "legendFormat": "stale" },
|
||||
{ "expr": "max(ceph_pg_unknown)", "refId": "K", "legendFormat": "unknown" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short",
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 0 }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": ["max", "lastNotNull"],
|
||||
"showLegend": true,
|
||||
"sortBy": "Max",
|
||||
"sortDesc": true
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 16, "x": 0, "y": 41 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat", "id": 22, "title": "Slow Ops",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "max(ceph_healthcheck_slow_ops) or vector(0)", "refId": "A" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 10 }
|
||||
]}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "background", "graphMode": "area", "textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 8, "x": 16, "y": 41 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat", "id": 23, "title": "Misplaced / Degraded Objects",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "max(ceph_num_objects_misplaced) or vector(0)", "refId": "A", "legendFormat": "Misplaced" },
|
||||
{ "expr": "max(ceph_num_objects_degraded) or vector(0)", "refId": "B", "legendFormat": "Degraded" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 }
|
||||
]}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "background", "graphMode": "none", "textMode": "auto", "orientation": "horizontal"
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 8, "x": 16, "y": 45 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "row", "id": 24, "title": "OSD Detail", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 49 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "table", "id": 25, "title": "OSDs",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "max by (ceph_daemon) (ceph_osd_up{ceph_daemon=~\"$osd\"})", "refId": "A", "legendFormat": "Up", "format": "table", "instant": true },
|
||||
{ "expr": "max by (ceph_daemon) (ceph_osd_in{ceph_daemon=~\"$osd\"})", "refId": "B", "legendFormat": "In", "format": "table", "instant": true },
|
||||
{ "expr": "100 * max by (ceph_daemon) (ceph_osd_stat_bytes_used{ceph_daemon=~\"$osd\"}) / max by (ceph_daemon) (ceph_osd_stat_bytes{ceph_daemon=~\"$osd\"})", "refId": "C", "format": "table", "instant": true },
|
||||
{ "expr": "max by (ceph_daemon) (ceph_osd_numpg{ceph_daemon=~\"$osd\"})", "refId": "D", "format": "table", "instant": true },
|
||||
{ "expr": "max by (ceph_daemon) (ceph_osd_apply_latency_ms{ceph_daemon=~\"$osd\"})", "refId": "E", "format": "table", "instant": true },
|
||||
{ "expr": "max by (ceph_daemon) (ceph_osd_commit_latency_ms{ceph_daemon=~\"$osd\"})", "refId": "F", "format": "table", "instant": true }
|
||||
],
|
||||
"transformations": [
|
||||
{ "id": "merge" },
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": { "Time": true, "__name__": true, "job": true, "instance": true, "endpoint": true },
|
||||
"renameByName": {
|
||||
"ceph_daemon": "OSD",
|
||||
"Value #A": "Up",
|
||||
"Value #B": "In",
|
||||
"Value #C": "Util %",
|
||||
"Value #D": "PGs",
|
||||
"Value #E": "Apply Latency",
|
||||
"Value #F": "Commit Latency"
|
||||
},
|
||||
"indexByName": {
|
||||
"OSD": 0, "Up": 1, "In": 2, "Util %": 3, "PGs": 4, "Apply Latency": 5, "Commit Latency": 6
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Util %" },
|
||||
"properties": [
|
||||
{ "id": "unit", "value": "percent" },
|
||||
{ "id": "decimals", "value": 1 },
|
||||
{ "id": "custom.cellOptions", "value": { "type": "color-background", "mode": "gradient" } },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 70 },
|
||||
{ "color": "red", "value": 85 }
|
||||
]}}
|
||||
]
|
||||
},
|
||||
{ "matcher": { "id": "byName", "options": "Apply Latency" }, "properties": [{ "id": "unit", "value": "ms" }] },
|
||||
{ "matcher": { "id": "byName", "options": "Commit Latency" }, "properties": [{ "id": "unit", "value": "ms" }] },
|
||||
{
|
||||
"matcher": { "id": "byRegexp", "options": "Up|In" },
|
||||
"properties": [
|
||||
{ "id": "mappings", "value": [{ "type": "value", "options": { "0": { "text": "✗", "index": 0 }, "1": { "text": "✓", "index": 1 }}}] },
|
||||
{ "id": "custom.cellOptions", "value": { "type": "color-text" } },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "green", "value": 1 }
|
||||
]}}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"gridPos": { "h": 10, "w": 16, "x": 0, "y": 50 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "timeseries", "id": 26, "title": "OSD Apply + Commit Latency",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "max by (ceph_daemon) (ceph_osd_apply_latency_ms{ceph_daemon=~\"$osd\"})", "refId": "A", "legendFormat": "{{ceph_daemon}} apply" },
|
||||
{ "expr": "max by (ceph_daemon) (ceph_osd_commit_latency_ms{ceph_daemon=~\"$osd\"})", "refId": "B", "legendFormat": "{{ceph_daemon}} commit" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ms",
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 1, "fillOpacity": 0 }
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 10, "w": 8, "x": 16, "y": 50 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "row", "id": 27, "title": "Pool Detail", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 60 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "table", "id": 28, "title": "Pools",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "max by (pool_id, name) (ceph_pool_metadata{name=~\"$pool\"})", "refId": "A", "format": "table", "instant": true },
|
||||
{ "expr": "max by (pool_id) (ceph_pool_objects)", "refId": "B", "format": "table", "instant": true },
|
||||
{ "expr": "max by (pool_id) (ceph_pool_bytes_used)", "refId": "C", "format": "table", "instant": true },
|
||||
{ "expr": "max by (pool_id) (ceph_pool_max_avail)", "refId": "D", "format": "table", "instant": true },
|
||||
{ "expr": "100 * max by (pool_id) (ceph_pool_bytes_used) / (max by (pool_id) (ceph_pool_bytes_used) + max by (pool_id) (ceph_pool_max_avail))", "refId": "E", "format": "table", "instant": true }
|
||||
],
|
||||
"transformations": [
|
||||
{ "id": "merge" },
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": { "Time": true, "__name__": true, "job": true, "instance": true, "endpoint": true, "Value #A": true },
|
||||
"renameByName": {
|
||||
"pool_id": "ID",
|
||||
"name": "Pool",
|
||||
"Value #B": "Objects",
|
||||
"Value #C": "Used",
|
||||
"Value #D": "Available",
|
||||
"Value #E": "Used %"
|
||||
},
|
||||
"indexByName": { "ID": 0, "Pool": 1, "Objects": 2, "Used": 3, "Available": 4, "Used %": 5 }
|
||||
}
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "Used" }, "properties": [{ "id": "unit", "value": "bytes" }] },
|
||||
{ "matcher": { "id": "byName", "options": "Available" }, "properties": [{ "id": "unit", "value": "bytes" }] },
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Used %" },
|
||||
"properties": [
|
||||
{ "id": "unit", "value": "percent" },
|
||||
{ "id": "decimals", "value": 1 },
|
||||
{ "id": "custom.cellOptions", "value": { "type": "color-background", "mode": "gradient" } },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 70 },
|
||||
{ "color": "red", "value": 85 }
|
||||
]}}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"gridPos": { "h": 10, "w": 14, "x": 0, "y": 61 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "timeseries", "id": 29, "title": "Pool IOPS (Read / Write) — filtered by $pool",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "max by (pool_id) (rate(ceph_pool_rd[5m]))\n* on(pool_id) group_left(name) max by (pool_id, name) (ceph_pool_metadata{name=~\"$pool\"})",
|
||||
"refId": "A", "legendFormat": "Read — {{name}}"
|
||||
},
|
||||
{
|
||||
"expr": "max by (pool_id) (rate(ceph_pool_wr[5m]))\n* on(pool_id) group_left(name) max by (pool_id, name) (ceph_pool_metadata{name=~\"$pool\"})",
|
||||
"refId": "B", "legendFormat": "Write — {{name}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ops",
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 8 }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": ["max", "lastNotNull"],
|
||||
"showLegend": true,
|
||||
"sortBy": "Max",
|
||||
"sortDesc": true
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 10, "w": 10, "x": 14, "y": 61 }
|
||||
}
|
||||
|
||||
]
|
||||
}
|
||||
@@ -368,7 +368,7 @@
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (sum(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"})))",
|
||||
"expr": "100 * (1 - (\n sum(node_filesystem_avail_bytes{mountpoint=\"/var\",fstype!~\"tmpfs|overlay|squashfs|ramfs\"} or node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay|squashfs|ramfs\"})\n /\n sum(node_filesystem_size_bytes{mountpoint=\"/var\",fstype!~\"tmpfs|overlay|squashfs|ramfs\"} or node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay|squashfs|ramfs\"})\n))",
|
||||
"refId": "A",
|
||||
"legendFormat": "Disk"
|
||||
}
|
||||
|
||||
@@ -440,7 +440,7 @@
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"}))",
|
||||
"expr": "100 * (1 - (\n max by (instance, mountpoint) (node_filesystem_avail_bytes{mountpoint=~\"/|/var\",fstype!~\"tmpfs|overlay|squashfs|ramfs\"})\n /\n max by (instance, mountpoint) (node_filesystem_size_bytes{mountpoint=~\"/|/var\",fstype!~\"tmpfs|overlay|squashfs|ramfs\"})\n))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
@@ -467,7 +467,7 @@
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"}))",
|
||||
"expr": "100 * (1 - (\n max by (instance, mountpoint) (node_filesystem_avail_bytes{mountpoint=~\"/|/var\",fstype!~\"tmpfs|overlay|squashfs|ramfs\"})\n /\n max by (instance, mountpoint) (node_filesystem_size_bytes{mountpoint=~\"/|/var\",fstype!~\"tmpfs|overlay|squashfs|ramfs\"})\n))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"title": "Storage Health",
|
||||
"uid": "storage-health",
|
||||
"title": "Persistent Storage",
|
||||
"uid": "persistent-storage",
|
||||
"schemaVersion": 36,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
@@ -21,25 +21,17 @@
|
||||
"title": "Bound PVCs",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Bound\"}) or vector(0)",
|
||||
"refId": "A"
|
||||
}
|
||||
{ "expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Bound\"}) or vector(0)", "refId": "A" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }]
|
||||
}
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"textMode": "auto"
|
||||
"colorMode": "background", "graphMode": "none", "textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 4, "x": 0, "y": 1 }
|
||||
},
|
||||
@@ -50,28 +42,19 @@
|
||||
"title": "Pending PVCs",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Pending\"}) or vector(0)",
|
||||
"refId": "A"
|
||||
}
|
||||
{ "expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Pending\"}) or vector(0)", "refId": "A" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 }
|
||||
]
|
||||
}
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null }, { "color": "yellow", "value": 1 }
|
||||
]}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"textMode": "auto"
|
||||
"colorMode": "background", "graphMode": "none", "textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 4, "x": 4, "y": 1 }
|
||||
},
|
||||
@@ -82,28 +65,19 @@
|
||||
"title": "Lost PVCs",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Lost\"}) or vector(0)",
|
||||
"refId": "A"
|
||||
}
|
||||
{ "expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Lost\"}) or vector(0)", "refId": "A" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]
|
||||
}
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null }, { "color": "red", "value": 1 }
|
||||
]}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"textMode": "auto"
|
||||
"colorMode": "background", "graphMode": "none", "textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 4, "x": 8, "y": 1 }
|
||||
},
|
||||
@@ -114,201 +88,57 @@
|
||||
"title": "Bound PVs / Available PVs",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(kube_persistentvolume_status_phase{phase=\"Bound\"}) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": "Bound"
|
||||
},
|
||||
{
|
||||
"expr": "sum(kube_persistentvolume_status_phase{phase=\"Available\"}) or vector(0)",
|
||||
"refId": "B",
|
||||
"legendFormat": "Available"
|
||||
}
|
||||
{ "expr": "sum(kube_persistentvolume_status_phase{phase=\"Bound\"}) or vector(0)", "refId": "A", "legendFormat": "Bound" },
|
||||
{ "expr": "sum(kube_persistentvolume_status_phase{phase=\"Available\"}) or vector(0)", "refId": "B", "legendFormat": "Available" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "blue", "value": null }]
|
||||
}
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"textMode": "auto"
|
||||
"colorMode": "background", "graphMode": "none", "textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 4, "x": 12, "y": 1 }
|
||||
"gridPos": { "h": 5, "w": 6, "x": 12, "y": 1 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat",
|
||||
"type": "piechart",
|
||||
"id": 6,
|
||||
"title": "Ceph Cluster Health",
|
||||
"title": "PVC Phase Distribution",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "ceph_health_status",
|
||||
"refId": "A"
|
||||
}
|
||||
{ "expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Bound\"}) or vector(0)", "refId": "A", "legendFormat": "Bound" },
|
||||
{ "expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Pending\"}) or vector(0)", "refId": "B", "legendFormat": "Pending" },
|
||||
{ "expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Lost\"}) or vector(0)", "refId": "C", "legendFormat": "Lost" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 2 }
|
||||
]
|
||||
},
|
||||
"mappings": [
|
||||
{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"0": { "text": "HEALTH_OK", "index": 0 },
|
||||
"1": { "text": "HEALTH_WARN", "index": 1 },
|
||||
"2": { "text": "HEALTH_ERR", "index": 2 }
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"fieldConfig": { "defaults": { "color": { "mode": "palette-classic" } } },
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"textMode": "value"
|
||||
"pieType": "pie",
|
||||
"legend": { "displayMode": "table", "placement": "right", "values": ["value", "percent"] }
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 4, "x": 16, "y": 1 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 7,
|
||||
"title": "OSDs Up / Total",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(ceph_osd_up) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": "Up"
|
||||
},
|
||||
{
|
||||
"expr": "count(ceph_osd_metadata) or vector(0)",
|
||||
"refId": "B",
|
||||
"legendFormat": "Total"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 4, "x": 20, "y": 1 }
|
||||
"gridPos": { "h": 5, "w": 6, "x": 18, "y": 1 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "row",
|
||||
"id": 8,
|
||||
"title": "Cluster Capacity",
|
||||
"id": 7,
|
||||
"title": "Capacity by Storage Class",
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 6 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "gauge",
|
||||
"id": 9,
|
||||
"title": "Ceph Cluster Used (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes) / ceph_cluster_total_bytes",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 70 },
|
||||
{ "color": "red", "value": 85 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"showThresholdLabels": true,
|
||||
"showThresholdMarkers": true
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 5, "x": 0, "y": 7 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 10,
|
||||
"title": "Ceph Capacity — Total / Available",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "ceph_cluster_total_bytes",
|
||||
"refId": "A",
|
||||
"legendFormat": "Total"
|
||||
},
|
||||
{
|
||||
"expr": "ceph_cluster_total_bytes - (ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes)",
|
||||
"refId": "B",
|
||||
"legendFormat": "Available"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes",
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "blue", "value": null }]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "value",
|
||||
"graphMode": "none",
|
||||
"textMode": "auto",
|
||||
"orientation": "vertical"
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 4, "x": 5, "y": 7 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "bargauge",
|
||||
"id": 11,
|
||||
"id": 8,
|
||||
"title": "PV Allocated Capacity by Storage Class (Bound)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by (storageclass) (\n kube_persistentvolume_capacity_bytes\n * on(persistentvolume) group_left(storageclass)\n kube_persistentvolume_status_phase{phase=\"Bound\"}\n)",
|
||||
"expr": "sum by (storageclass) (\n kube_persistentvolume_capacity_bytes\n * on(persistentvolume) group_left() (kube_persistentvolume_status_phase{phase=\"Bound\"} == 1)\n * on(persistentvolume) group_left(storageclass) kube_persistentvolume_info\n)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{storageclass}}"
|
||||
}
|
||||
@@ -316,11 +146,7 @@
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes",
|
||||
"color": { "mode": "palette-classic" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "blue", "value": null }]
|
||||
}
|
||||
"color": { "mode": "palette-classic" }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
@@ -329,267 +155,214 @@
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 7, "x": 9, "y": 7 }
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 7 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "piechart",
|
||||
"id": 12,
|
||||
"title": "PVC Phase Distribution",
|
||||
"type": "bargauge",
|
||||
"id": 9,
|
||||
"title": "PVC Count by Storage Class",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Bound\"}) or vector(0)",
|
||||
"expr": "count by (storageclass) (kube_persistentvolumeclaim_info{storageclass!=\"\"})",
|
||||
"refId": "A",
|
||||
"legendFormat": "Bound"
|
||||
},
|
||||
{
|
||||
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Pending\"}) or vector(0)",
|
||||
"refId": "B",
|
||||
"legendFormat": "Pending"
|
||||
},
|
||||
{
|
||||
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Lost\"}) or vector(0)",
|
||||
"refId": "C",
|
||||
"legendFormat": "Lost"
|
||||
"legendFormat": "{{storageclass}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "color": { "mode": "palette-classic" } }
|
||||
"defaults": {
|
||||
"unit": "short",
|
||||
"color": { "mode": "palette-classic" }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"pieType": "pie",
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"values": ["value", "percent"]
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 7 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "table",
|
||||
"id": 10,
|
||||
"title": "Storage Classes Summary",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count by (storageclass) (kube_persistentvolume_info)",
|
||||
"refId": "A",
|
||||
"legendFormat": "PVs",
|
||||
"format": "table",
|
||||
"instant": true
|
||||
},
|
||||
{
|
||||
"expr": "sum by (storageclass) (kube_persistentvolume_capacity_bytes * on(persistentvolume) group_left(storageclass) kube_persistentvolume_info)",
|
||||
"refId": "B",
|
||||
"legendFormat": "Capacity",
|
||||
"format": "table",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{ "id": "merge" },
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": { "Time": true, "__name__": true, "job": true, "instance": true },
|
||||
"renameByName": { "storageclass": "StorageClass", "Value #A": "PV Count", "Value #B": "Total Capacity" }
|
||||
}
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "Total Capacity" }, "properties": [{ "id": "unit", "value": "bytes" }] }
|
||||
]
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 7 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "row",
|
||||
"id": 13,
|
||||
"title": "Ceph Performance",
|
||||
"id": 11,
|
||||
"title": "PVC Usage (kubelet volume stats)",
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 15 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "timeseries",
|
||||
"id": 14,
|
||||
"title": "Ceph Pool IOPS (Read / Write)",
|
||||
"type": "table",
|
||||
"id": 12,
|
||||
"title": "Top 20 PVCs by % Used",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(ceph_pool_rd[5m])",
|
||||
"expr": "topk(20,\n 100 * max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_used_bytes)\n /\n max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_capacity_bytes)\n)",
|
||||
"refId": "A",
|
||||
"legendFormat": "Read — pool {{pool_id}}"
|
||||
},
|
||||
"format": "table",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"expr": "rate(ceph_pool_wr[5m])",
|
||||
"refId": "B",
|
||||
"legendFormat": "Write — pool {{pool_id}}"
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": { "Time": true, "__name__": true, "job": true, "instance": true, "endpoint": true },
|
||||
"renameByName": {
|
||||
"namespace": "Namespace",
|
||||
"persistentvolumeclaim": "PVC",
|
||||
"Value": "Used %"
|
||||
},
|
||||
"indexByName": { "Namespace": 0, "PVC": 1, "Used %": 2 }
|
||||
}
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ops",
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 8 }
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 }
|
||||
},
|
||||
|
||||
"defaults": {},
|
||||
"overrides": [
|
||||
{
|
||||
"type": "timeseries",
|
||||
"id": 15,
|
||||
"title": "Ceph Pool Throughput (Read / Write)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
"matcher": { "id": "byName", "options": "Used %" },
|
||||
"properties": [
|
||||
{ "id": "unit", "value": "percent" },
|
||||
{ "id": "decimals", "value": 1 },
|
||||
{
|
||||
"expr": "rate(ceph_pool_rd_bytes[5m])",
|
||||
"refId": "A",
|
||||
"legendFormat": "Read — pool {{pool_id}}"
|
||||
"id": "custom.cellOptions",
|
||||
"value": { "type": "color-background", "mode": "gradient" }
|
||||
},
|
||||
{
|
||||
"expr": "rate(ceph_pool_wr_bytes[5m])",
|
||||
"refId": "B",
|
||||
"legendFormat": "Write — pool {{pool_id}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps",
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 8 }
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "row",
|
||||
"id": 16,
|
||||
"title": "Ceph OSD & Pool Details",
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 24 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "timeseries",
|
||||
"id": 17,
|
||||
"title": "Ceph Pool Space Used (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * ceph_pool_bytes_used / (ceph_pool_bytes_used + ceph_pool_max_avail)",
|
||||
"refId": "A",
|
||||
"legendFormat": "Pool {{pool_id}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"thresholds": {
|
||||
"id": "thresholds",
|
||||
"value": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 70 },
|
||||
{ "color": "red", "value": 85 }
|
||||
]
|
||||
},
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10 }
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 25 }
|
||||
"gridPos": { "h": 10, "w": 12, "x": 0, "y": 16 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "bargauge",
|
||||
"id": 18,
|
||||
"title": "OSD Status per Daemon (green = Up, red = Down)",
|
||||
"id": 13,
|
||||
"title": "Top 20 PVCs by Used Bytes",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "ceph_osd_up",
|
||||
"expr": "topk(20, max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_used_bytes))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{ceph_daemon}}"
|
||||
"legendFormat": "{{namespace}} / {{persistentvolumeclaim}}",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"min": 0,
|
||||
"max": 1,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "green", "value": 1 }
|
||||
]
|
||||
},
|
||||
"mappings": [
|
||||
{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"0": { "text": "DOWN", "index": 0 },
|
||||
"1": { "text": "UP", "index": 1 }
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"displayMode": "basic",
|
||||
"showUnfilled": true
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 25 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "row",
|
||||
"id": 19,
|
||||
"title": "Node Disk Usage",
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 33 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "timeseries",
|
||||
"id": 20,
|
||||
"title": "Node Root Disk Usage Over Time (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} * 100)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 70 },
|
||||
{ "color": "red", "value": 85 }
|
||||
]
|
||||
},
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10 }
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 34 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "bargauge",
|
||||
"id": 21,
|
||||
"title": "Current Disk Usage — All Nodes & Mountpoints",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 - (node_filesystem_avail_bytes{fstype!~\"tmpfs|overlay|squashfs\"} / node_filesystem_size_bytes{fstype!~\"tmpfs|overlay|squashfs\"} * 100)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}} — {{mountpoint}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 70 },
|
||||
{ "color": "red", "value": 85 }
|
||||
]
|
||||
}
|
||||
"unit": "bytes",
|
||||
"color": { "mode": "palette-classic" }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true
|
||||
"showUnfilled": true,
|
||||
"valueMode": "color",
|
||||
"sortBy": "Value",
|
||||
"sortOrder": "desc"
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 34 }
|
||||
"gridPos": { "h": 10, "w": 12, "x": 12, "y": 16 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "timeseries",
|
||||
"id": 14,
|
||||
"title": "Top 5 PVCs Usage Over Time (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "topk(5,\n 100 * max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_used_bytes)\n /\n max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_capacity_bytes)\n)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}} / {{persistentvolumeclaim}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 100,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 8 }
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 26 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "timeseries",
|
||||
"id": 15,
|
||||
"title": "PVC Inode Usage (%) — Top 20",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "topk(20,\n 100 * max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_inodes_used)\n /\n max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_inodes)\n)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}} / {{persistentvolumeclaim}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 100,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 1, "fillOpacity": 5 }
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 34 }
|
||||
}
|
||||
|
||||
]
|
||||
|
||||
@@ -101,7 +101,7 @@ impl<T: Topology + K8sclient> Interpret<T> for ClusterDashboardsInterpret {
|
||||
|
||||
Ok(Outcome::success(format!(
|
||||
"Cluster dashboards resources in namespace '{}' with {} dashboards successfully created",
|
||||
self.namespace, 8
|
||||
self.namespace, 9
|
||||
)))
|
||||
}
|
||||
|
||||
@@ -494,7 +494,11 @@ impl ClusterDashboardsInterpret {
|
||||
include_str!("dashboards/workloads-health.json"),
|
||||
),
|
||||
("okd-networking", include_str!("dashboards/networking.json")),
|
||||
("storage-health", include_str!("dashboards/storage.json")),
|
||||
(
|
||||
"persistent-storage",
|
||||
include_str!("dashboards/storage.json"),
|
||||
),
|
||||
("ceph-cluster", include_str!("dashboards/ceph.json")),
|
||||
("okd-etcd", include_str!("dashboards/etcd.json")),
|
||||
(
|
||||
"okd-control-plane",
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
pub mod alert_channel;
|
||||
pub mod alert_rule;
|
||||
pub mod application_monitoring;
|
||||
pub mod ceph_alerts;
|
||||
pub mod cluster_dashboards;
|
||||
pub mod grafana;
|
||||
pub mod kube_prometheus;
|
||||
|
||||
114
harmony/src/modules/monitoring/okd/cluster_alert_rules.rs
Normal file
114
harmony/src/modules/monitoring/okd/cluster_alert_rules.rs
Normal file
@@ -0,0 +1,114 @@
|
||||
use std::collections::BTreeMap;
|
||||
|
||||
use async_trait::async_trait;
|
||||
use harmony_types::id::Id;
|
||||
use kube::api::ObjectMeta;
|
||||
use serde::Serialize;
|
||||
|
||||
use crate::{
|
||||
data::Version,
|
||||
interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome},
|
||||
inventory::Inventory,
|
||||
modules::monitoring::kube_prometheus::crd::crd_prometheus_rules::{
|
||||
PrometheusRule, PrometheusRuleSpec, RuleGroup,
|
||||
},
|
||||
score::Score,
|
||||
topology::{K8sclient, Topology},
|
||||
};
|
||||
|
||||
#[derive(Clone, Debug, Serialize)]
|
||||
pub struct OpenshiftPrometheusRuleScore {
|
||||
pub namespace: String,
|
||||
pub name: String,
|
||||
pub rule_groups: Vec<RuleGroup>,
|
||||
pub labels: Option<BTreeMap<String, String>>,
|
||||
}
|
||||
|
||||
impl<T: Topology + K8sclient> Score<T> for OpenshiftPrometheusRuleScore {
|
||||
fn name(&self) -> String {
|
||||
format!(
|
||||
"OpenshiftPrometheusRuleScore({}/{})",
|
||||
self.namespace, self.name
|
||||
)
|
||||
}
|
||||
|
||||
fn create_interpret(&self) -> Box<dyn Interpret<T>> {
|
||||
Box::new(OpenshiftPrometheusRuleInterpret {
|
||||
namespace: self.namespace.clone(),
|
||||
name: self.name.clone(),
|
||||
rule_groups: self.rule_groups.clone(),
|
||||
labels: self.labels.clone(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct OpenshiftPrometheusRuleInterpret {
|
||||
namespace: String,
|
||||
name: String,
|
||||
rule_groups: Vec<RuleGroup>,
|
||||
labels: Option<BTreeMap<String, String>>,
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl<T: Topology + K8sclient> Interpret<T> for OpenshiftPrometheusRuleInterpret {
|
||||
async fn execute(
|
||||
&self,
|
||||
_inventory: &Inventory,
|
||||
topology: &T,
|
||||
) -> Result<Outcome, InterpretError> {
|
||||
let labels = self.labels.clone().unwrap_or_else(default_rule_labels);
|
||||
|
||||
let prometheus_rule = PrometheusRule {
|
||||
metadata: ObjectMeta {
|
||||
name: Some(self.name.clone()),
|
||||
namespace: Some(self.namespace.clone()),
|
||||
labels: Some(labels),
|
||||
..ObjectMeta::default()
|
||||
},
|
||||
spec: PrometheusRuleSpec {
|
||||
groups: self.rule_groups.clone(),
|
||||
},
|
||||
};
|
||||
|
||||
let client = topology
|
||||
.k8s_client()
|
||||
.await
|
||||
.map_err(|e| InterpretError::new(format!("Failed to get k8s client: {e}")))?;
|
||||
|
||||
client
|
||||
.apply(&prometheus_rule, Some(&self.namespace))
|
||||
.await
|
||||
.map_err(|e| InterpretError::new(e.to_string()))?;
|
||||
|
||||
Ok(Outcome::success(format!(
|
||||
"PrometheusRule '{}' applied to namespace '{}' with {} rule group(s)",
|
||||
self.name,
|
||||
self.namespace,
|
||||
self.rule_groups.len()
|
||||
)))
|
||||
}
|
||||
|
||||
fn get_name(&self) -> InterpretName {
|
||||
InterpretName::Custom("OpenshiftPrometheusRule")
|
||||
}
|
||||
|
||||
fn get_version(&self) -> Version {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn get_status(&self) -> InterpretStatus {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn get_children(&self) -> Vec<Id> {
|
||||
todo!()
|
||||
}
|
||||
}
|
||||
|
||||
fn default_rule_labels() -> BTreeMap<String, String> {
|
||||
let mut labels = BTreeMap::new();
|
||||
labels.insert("prometheus".to_string(), "k8s".to_string());
|
||||
labels.insert("role".to_string(), "alert-rules".to_string());
|
||||
labels
|
||||
}
|
||||
@@ -1,5 +1,6 @@
|
||||
use crate::topology::oberservability::monitoring::AlertSender;
|
||||
|
||||
pub mod cluster_alert_rules;
|
||||
pub mod cluster_monitoring;
|
||||
pub(crate) mod config;
|
||||
pub mod enable_user_workload;
|
||||
|
||||
Reference in New Issue
Block a user