From 6267c2757f306f9d123ffe4e283eae814339e66c Mon Sep 17 00:00:00 2001 From: Jean-Gabriel Gill-Couture Date: Wed, 15 Apr 2026 15:48:22 -0400 Subject: [PATCH 01/57] feat: Disable ipv4 address conflict detection score. This is useful when setting up bonds as the wrong mac may get a dhcp offer and then the system will perceive it as a conflict when it sets up the bond correctly --- Cargo.lock | 16 --- harmony/src/modules/okd/crd/machine_config.rs | 133 ++++++++++++++++++ harmony/src/modules/okd/crd/mod.rs | 1 + harmony/src/modules/okd/disable_dad_score.rs | 35 +++++ harmony/src/modules/okd/mod.rs | 1 + 5 files changed, 170 insertions(+), 16 deletions(-) create mode 100644 harmony/src/modules/okd/crd/machine_config.rs create mode 100644 harmony/src/modules/okd/disable_dad_score.rs diff --git a/Cargo.lock b/Cargo.lock index 4cf88ddc..b53845bb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1262,22 +1262,6 @@ dependencies = [ "url", ] -[[package]] -name = "brocade-switch-oricom-configuration" -version = "0.1.0" -dependencies = [ - "async-trait", - "brocade", - "env_logger", - "harmony", - "harmony_cli", - "harmony_macros", - "harmony_types", - "log", - "serde", - "tokio", -] - [[package]] name = "brotli" version = "8.0.2" diff --git a/harmony/src/modules/okd/crd/machine_config.rs b/harmony/src/modules/okd/crd/machine_config.rs new file mode 100644 index 00000000..f0f252af --- /dev/null +++ b/harmony/src/modules/okd/crd/machine_config.rs @@ -0,0 +1,133 @@ +use std::collections::BTreeMap; + +use base64::prelude::*; +use kube::{CustomResource, api::ObjectMeta}; +use serde::{Deserialize, Serialize}; + +#[derive(CustomResource, Deserialize, Serialize, Clone, Debug, Default)] +#[kube( + group = "machineconfiguration.openshift.io", + version = "v1", + kind = "MachineConfig", + plural = "machineconfigs", + namespaced = false, + schema = "disabled" +)] +#[serde(rename_all = "camelCase")] +pub struct MachineConfigSpec { + #[serde(skip_serializing_if = "Option::is_none")] + pub 
config: Option, +} + +impl Default for MachineConfig { + fn default() -> Self { + Self { + metadata: ObjectMeta::default(), + spec: MachineConfigSpec::default(), + } + } +} + +#[derive(Deserialize, Serialize, Clone, Debug, Default)] +#[serde(rename_all = "camelCase")] +pub struct IgnitionConfig { + #[serde(skip_serializing_if = "Option::is_none")] + pub ignition: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub storage: Option, +} + +#[derive(Deserialize, Serialize, Clone, Debug, Default)] +pub struct Ignition { + #[serde(skip_serializing_if = "Option::is_none")] + pub version: Option, +} + +#[derive(Deserialize, Serialize, Clone, Debug, Default)] +pub struct Storage { + #[serde(skip_serializing_if = "Vec::is_empty", default)] + pub files: Vec, +} + +#[derive(Deserialize, Serialize, Clone, Debug)] +pub struct IgnitionFile { + pub path: String, + #[serde(skip_serializing_if = "Option::is_none")] + pub mode: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub overwrite: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub contents: Option, +} + +#[derive(Deserialize, Serialize, Clone, Debug)] +pub struct IgnitionFileContents { + #[serde(skip_serializing_if = "Option::is_none")] + pub source: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub compression: Option, +} + +impl MachineConfig { + pub fn disable_ipv4_dad(pool: MachineConfigPoolRole) -> Self { + let conf_content = "# Disable IPv4 Address Conflict Detection (ACD/DAD)\n\ +# Workaround for false positive conflict detection on\n\ +# 802.3ad LACP bonds where the second member's permanent\n\ +# MAC address triggers a spurious duplicate detection.\n\ +[connection]\n\ +ipv4.dad-timeout=0\n"; + + let encoded = BASE64_STANDARD.encode(conf_content); + let source = format!("data:text/plain;charset=utf-8;base64,{encoded}"); + + Self { + metadata: ObjectMeta { + name: Some(format!("99-{}-disable-dad", pool.label_value())), + labels: Some(pool.labels()), + 
..Default::default() + }, + spec: MachineConfigSpec { + config: Some(IgnitionConfig { + ignition: Some(Ignition { + version: Some("3.2.0".to_string()), + }), + storage: Some(Storage { + files: vec![IgnitionFile { + path: "/etc/NetworkManager/conf.d/99-disable-ipv4-dad.conf".to_string(), + mode: Some(0o644), + overwrite: Some(true), + contents: Some(IgnitionFileContents { + source: Some(source), + compression: None, + }), + }], + }), + }), + }, + } + } +} + +#[derive(Debug, Clone, Copy, Serialize)] +pub enum MachineConfigPoolRole { + Master, + Worker, +} + +impl MachineConfigPoolRole { + pub fn label_value(&self) -> &'static str { + match self { + Self::Master => "master", + Self::Worker => "worker", + } + } + + pub fn labels(&self) -> BTreeMap { + let mut labels = BTreeMap::new(); + labels.insert( + "machineconfiguration.openshift.io/role".to_string(), + self.label_value().to_string(), + ); + labels + } +} diff --git a/harmony/src/modules/okd/crd/mod.rs b/harmony/src/modules/okd/crd/mod.rs index dae9c51e..f2af9239 100644 --- a/harmony/src/modules/okd/crd/mod.rs +++ b/harmony/src/modules/okd/crd/mod.rs @@ -1,4 +1,5 @@ pub mod ingresses_config; pub mod kubelet_config; +pub mod machine_config; pub mod nmstate; pub mod route; diff --git a/harmony/src/modules/okd/disable_dad_score.rs b/harmony/src/modules/okd/disable_dad_score.rs new file mode 100644 index 00000000..9583f30a --- /dev/null +++ b/harmony/src/modules/okd/disable_dad_score.rs @@ -0,0 +1,35 @@ +use serde::Serialize; + +use crate::{ + interpret::Interpret, + modules::{ + k8s::resource::K8sResourceScore, + okd::crd::machine_config::{MachineConfig, MachineConfigPoolRole}, + }, + score::Score, + topology::{K8sclient, Topology}, +}; + +#[derive(Debug, Clone, Serialize)] +pub struct DisableDadScore { + pub pool: MachineConfigPoolRole, +} + +impl Default for DisableDadScore { + fn default() -> Self { + Self { + pool: MachineConfigPoolRole::Worker, + } + } +} + +impl Score for DisableDadScore { + fn name(&self) -> 
String { + "DisableDadScore".to_string() + } + + fn create_interpret(&self) -> Box> { + let mc = MachineConfig::disable_ipv4_dad(self.pool); + K8sResourceScore::single(mc, None).create_interpret() + } +} diff --git a/harmony/src/modules/okd/mod.rs b/harmony/src/modules/okd/mod.rs index e1719e99..bd5acf96 100644 --- a/harmony/src/modules/okd/mod.rs +++ b/harmony/src/modules/okd/mod.rs @@ -25,5 +25,6 @@ pub use bootstrap_05_sanity_check::*; pub use bootstrap_06_installation_report::*; pub use bootstrap_persist_network_bond::*; pub mod crd; +pub mod disable_dad_score; pub mod host_network; pub mod system_reserved_score; -- 2.39.5 From 54ef3f70bded4add5ec97fdac864dbe736023807 Mon Sep 17 00:00:00 2001 From: Jean-Gabriel Gill-Couture Date: Fri, 17 Apr 2026 16:56:06 -0400 Subject: [PATCH 02/57] feat: Refactor dad score into reusable node file score using machine config --- harmony/src/modules/okd/crd/machine_config.rs | 23 +++++---- harmony/src/modules/okd/disable_dad_score.rs | 18 +++++-- harmony/src/modules/okd/mod.rs | 1 + harmony/src/modules/okd/node_file_score.rs | 49 +++++++++++++++++++ opencode.json | 3 ++ 5 files changed, 79 insertions(+), 15 deletions(-) create mode 100644 harmony/src/modules/okd/node_file_score.rs diff --git a/harmony/src/modules/okd/crd/machine_config.rs b/harmony/src/modules/okd/crd/machine_config.rs index f0f252af..d8513648 100644 --- a/harmony/src/modules/okd/crd/machine_config.rs +++ b/harmony/src/modules/okd/crd/machine_config.rs @@ -69,20 +69,19 @@ pub struct IgnitionFileContents { } impl MachineConfig { - pub fn disable_ipv4_dad(pool: MachineConfigPoolRole) -> Self { - let conf_content = "# Disable IPv4 Address Conflict Detection (ACD/DAD)\n\ -# Workaround for false positive conflict detection on\n\ -# 802.3ad LACP bonds where the second member's permanent\n\ -# MAC address triggers a spurious duplicate detection.\n\ -[connection]\n\ -ipv4.dad-timeout=0\n"; - - let encoded = BASE64_STANDARD.encode(conf_content); + pub fn with_file( + 
pool: MachineConfigPoolRole, + resource_name: &str, + path: &str, + content: &str, + mode: Option, + ) -> Self { + let encoded = BASE64_STANDARD.encode(content); let source = format!("data:text/plain;charset=utf-8;base64,{encoded}"); Self { metadata: ObjectMeta { - name: Some(format!("99-{}-disable-dad", pool.label_value())), + name: Some(format!("{}-{}", pool.label_value(), resource_name)), labels: Some(pool.labels()), ..Default::default() }, @@ -93,8 +92,8 @@ ipv4.dad-timeout=0\n"; }), storage: Some(Storage { files: vec![IgnitionFile { - path: "/etc/NetworkManager/conf.d/99-disable-ipv4-dad.conf".to_string(), - mode: Some(0o644), + path: path.to_string(), + mode, overwrite: Some(true), contents: Some(IgnitionFileContents { source: Some(source), diff --git a/harmony/src/modules/okd/disable_dad_score.rs b/harmony/src/modules/okd/disable_dad_score.rs index 9583f30a..dfbf7345 100644 --- a/harmony/src/modules/okd/disable_dad_score.rs +++ b/harmony/src/modules/okd/disable_dad_score.rs @@ -4,7 +4,7 @@ use crate::{ interpret::Interpret, modules::{ k8s::resource::K8sResourceScore, - okd::crd::machine_config::{MachineConfig, MachineConfigPoolRole}, + okd::{crd::machine_config::MachineConfigPoolRole, node_file_score::NodeFileScore}, }, score::Score, topology::{K8sclient, Topology}, @@ -29,7 +29,19 @@ impl Score for DisableDadScore { } fn create_interpret(&self) -> Box> { - let mc = MachineConfig::disable_ipv4_dad(self.pool); - K8sResourceScore::single(mc, None).create_interpret() + let score = NodeFileScore { + pool: self.pool, + resource_name: "disable-dad".to_string(), + path: "/etc/NetworkManager/conf.d/99-disable-ipv4-dad.conf".to_string(), + content: "# Disable IPv4 Address Conflict Detection (ACD/DAD)\n\ +# Workaround for false positive conflict detection on\n\ +# 802.3ad LACP bonds where the second member's permanent\n\ +# MAC address triggers a spurious duplicate detection.\n\ +[connection]\n\ +ipv4.dad-timeout=0\n" + .to_string(), + mode: Some(0o644), + }; + 
score.create_interpret() } } diff --git a/harmony/src/modules/okd/mod.rs b/harmony/src/modules/okd/mod.rs index bd5acf96..5fafe15d 100644 --- a/harmony/src/modules/okd/mod.rs +++ b/harmony/src/modules/okd/mod.rs @@ -27,4 +27,5 @@ pub use bootstrap_persist_network_bond::*; pub mod crd; pub mod disable_dad_score; pub mod host_network; +pub mod node_file_score; pub mod system_reserved_score; diff --git a/harmony/src/modules/okd/node_file_score.rs b/harmony/src/modules/okd/node_file_score.rs new file mode 100644 index 00000000..c1710e3f --- /dev/null +++ b/harmony/src/modules/okd/node_file_score.rs @@ -0,0 +1,49 @@ +use serde::Serialize; + +use crate::{ + interpret::Interpret, + modules::{ + k8s::resource::K8sResourceScore, + okd::crd::machine_config::{MachineConfig, MachineConfigPoolRole}, + }, + score::Score, + topology::{K8sclient, Topology}, +}; + +#[derive(Debug, Clone, Serialize)] +pub struct NodeFileScore { + pub pool: MachineConfigPoolRole, + pub resource_name: String, + pub path: String, + pub content: String, + pub mode: Option, +} + +impl Default for NodeFileScore { + fn default() -> Self { + Self { + pool: MachineConfigPoolRole::Worker, + resource_name: "generic-file".to_string(), + path: "/etc/placeholder".to_string(), + content: "".to_string(), + mode: None, + } + } +} + +impl Score for NodeFileScore { + fn name(&self) -> String { + format!("NodeFileScore({})", self.path) + } + + fn create_interpret(&self) -> Box> { + let mc = MachineConfig::with_file( + self.pool, + &self.resource_name, + &self.path, + &self.content, + self.mode, + ); + K8sResourceScore::single(mc, None).create_interpret() + } +} diff --git a/opencode.json b/opencode.json index 536a5bfd..81b3ae2c 100644 --- a/opencode.json +++ b/opencode.json @@ -10,6 +10,9 @@ "models": { "qwen3-coder-next:q4_K_M": { "name": "qwen3-coder-next:q4_K_M" + }, + "gemma4:31b": { + "name": "Gemma 4 31b" } } }, -- 2.39.5 From 7265d8a4f3d0ede049b098469e89cc1f53076c2b Mon Sep 17 00:00:00 2001 From: Sylvain 
Tremblay Date: Mon, 20 Apr 2026 12:01:25 -0400 Subject: [PATCH 03/57] fix: fix ceph dashboard for root volumes not populated --- .../dashboards/cluster-overview.json | 2 +- .../dashboards/nodes-health.json | 4 +-- .../dashboards/storage.json | 28 +++++++++---------- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/harmony/src/modules/monitoring/cluster_dashboards/dashboards/cluster-overview.json b/harmony/src/modules/monitoring/cluster_dashboards/dashboards/cluster-overview.json index 43079ce7..201f53a7 100644 --- a/harmony/src/modules/monitoring/cluster_dashboards/dashboards/cluster-overview.json +++ b/harmony/src/modules/monitoring/cluster_dashboards/dashboards/cluster-overview.json @@ -368,7 +368,7 @@ "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, "targets": [ { - "expr": "100 * (1 - (sum(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"})))", + "expr": "100 * (1 - (\n sum(node_filesystem_avail_bytes{mountpoint=\"/var\",fstype!~\"tmpfs|overlay|squashfs|ramfs\"} or node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay|squashfs|ramfs\"})\n /\n sum(node_filesystem_size_bytes{mountpoint=\"/var\",fstype!~\"tmpfs|overlay|squashfs|ramfs\"} or node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay|squashfs|ramfs\"})\n))", "refId": "A", "legendFormat": "Disk" } diff --git a/harmony/src/modules/monitoring/cluster_dashboards/dashboards/nodes-health.json b/harmony/src/modules/monitoring/cluster_dashboards/dashboards/nodes-health.json index 0b2fe9dd..01236b23 100644 --- a/harmony/src/modules/monitoring/cluster_dashboards/dashboards/nodes-health.json +++ b/harmony/src/modules/monitoring/cluster_dashboards/dashboards/nodes-health.json @@ -440,7 +440,7 @@ "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, "targets": [ { - "expr": "100 * (1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / 
node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"}))", + "expr": "100 * (1 - (\n max by (instance, mountpoint) (node_filesystem_avail_bytes{mountpoint=~\"/|/var\",fstype!~\"tmpfs|overlay|squashfs|ramfs\"})\n /\n max by (instance, mountpoint) (node_filesystem_size_bytes{mountpoint=~\"/|/var\",fstype!~\"tmpfs|overlay|squashfs|ramfs\"})\n))", "refId": "A", "legendFormat": "{{instance}}" } @@ -467,7 +467,7 @@ "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, "targets": [ { - "expr": "100 * (1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"}))", + "expr": "100 * (1 - (\n max by (instance, mountpoint) (node_filesystem_avail_bytes{mountpoint=~\"/|/var\",fstype!~\"tmpfs|overlay|squashfs|ramfs\"})\n /\n max by (instance, mountpoint) (node_filesystem_size_bytes{mountpoint=~\"/|/var\",fstype!~\"tmpfs|overlay|squashfs|ramfs\"})\n))", "refId": "A", "legendFormat": "{{instance}}" } diff --git a/harmony/src/modules/monitoring/cluster_dashboards/dashboards/storage.json b/harmony/src/modules/monitoring/cluster_dashboards/dashboards/storage.json index 3c581842..c1473c5c 100644 --- a/harmony/src/modules/monitoring/cluster_dashboards/dashboards/storage.json +++ b/harmony/src/modules/monitoring/cluster_dashboards/dashboards/storage.json @@ -150,7 +150,7 @@ "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, "targets": [ { - "expr": "ceph_health_status", + "expr": "max(ceph_health_status)", "refId": "A" } ], @@ -193,12 +193,12 @@ "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, "targets": [ { - "expr": "sum(ceph_osd_up) or vector(0)", + "expr": "sum(max by (ceph_daemon) (ceph_osd_up)) or vector(0)", "refId": "A", "legendFormat": "Up" }, { - "expr": "count(ceph_osd_metadata) or vector(0)", + "expr": "count(max by (ceph_daemon) (ceph_osd_metadata)) or vector(0)", "refId": "B", "legendFormat": "Total" } @@ -236,7 +236,7 @@ "datasource": { 
"type": "prometheus", "uid": "Prometheus-Cluster" }, "targets": [ { - "expr": "100 * (ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes) / ceph_cluster_total_bytes", + "expr": "100 * max(ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes) / max(ceph_cluster_total_bytes)", "refId": "A" } ], @@ -271,12 +271,12 @@ "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, "targets": [ { - "expr": "ceph_cluster_total_bytes", + "expr": "max(ceph_cluster_total_bytes)", "refId": "A", "legendFormat": "Total" }, { - "expr": "ceph_cluster_total_bytes - (ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes)", + "expr": "max(ceph_cluster_total_bytes) - max(ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes)", "refId": "B", "legendFormat": "Available" } @@ -308,7 +308,7 @@ "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, "targets": [ { - "expr": "sum by (storageclass) (\n kube_persistentvolume_capacity_bytes\n * on(persistentvolume) group_left(storageclass)\n kube_persistentvolume_status_phase{phase=\"Bound\"}\n)", + "expr": "sum by (storageclass) (\n kube_persistentvolume_capacity_bytes\n * on(persistentvolume) group_left() (kube_persistentvolume_status_phase{phase=\"Bound\"} == 1)\n * on(persistentvolume) group_left(storageclass) kube_persistentvolume_info\n)", "refId": "A", "legendFormat": "{{storageclass}}" } @@ -384,12 +384,12 @@ "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, "targets": [ { - "expr": "rate(ceph_pool_rd[5m])", + "expr": "max by (pool_id) (rate(ceph_pool_rd[5m]))", "refId": "A", "legendFormat": "Read — pool {{pool_id}}" }, { - "expr": "rate(ceph_pool_wr[5m])", + "expr": "max by (pool_id) (rate(ceph_pool_wr[5m]))", "refId": "B", "legendFormat": "Write — pool {{pool_id}}" } @@ -411,12 +411,12 @@ "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, "targets": [ { - "expr": "rate(ceph_pool_rd_bytes[5m])", + "expr": "max by (pool_id) 
(rate(ceph_pool_rd_bytes[5m]))", "refId": "A", "legendFormat": "Read — pool {{pool_id}}" }, { - "expr": "rate(ceph_pool_wr_bytes[5m])", + "expr": "max by (pool_id) (rate(ceph_pool_wr_bytes[5m]))", "refId": "B", "legendFormat": "Write — pool {{pool_id}}" } @@ -446,7 +446,7 @@ "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, "targets": [ { - "expr": "100 * ceph_pool_bytes_used / (ceph_pool_bytes_used + ceph_pool_max_avail)", + "expr": "100 * max by (pool_id) (ceph_pool_bytes_used) / (max by (pool_id) (ceph_pool_bytes_used) + max by (pool_id) (ceph_pool_max_avail))", "refId": "A", "legendFormat": "Pool {{pool_id}}" } @@ -478,7 +478,7 @@ "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, "targets": [ { - "expr": "ceph_osd_up", + "expr": "max by (ceph_daemon) (ceph_osd_up)", "refId": "A", "legendFormat": "{{ceph_daemon}}" } @@ -530,7 +530,7 @@ "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, "targets": [ { - "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} * 100)", + "expr": "100 - (\n max by (instance, mountpoint) (node_filesystem_avail_bytes{mountpoint=~\"/|/var\",fstype!~\"tmpfs|overlay|squashfs|ramfs\"})\n /\n max by (instance, mountpoint) (node_filesystem_size_bytes{mountpoint=~\"/|/var\",fstype!~\"tmpfs|overlay|squashfs|ramfs\"})\n * 100\n)", "refId": "A", "legendFormat": "{{instance}}" } -- 2.39.5 From 126390bb63f26be6334a2e1496fc404940be4987 Mon Sep 17 00:00:00 2001 From: Sylvain Tremblay Date: Mon, 20 Apr 2026 13:09:34 -0400 Subject: [PATCH 04/57] feat: split storage dashboard in two : ceph + persistent storage --- .../dashboards/storage.json | 623 ++++++------------ .../monitoring/cluster_dashboards/score.rs | 8 +- 2 files changed, 204 insertions(+), 427 deletions(-) diff --git a/harmony/src/modules/monitoring/cluster_dashboards/dashboards/storage.json 
b/harmony/src/modules/monitoring/cluster_dashboards/dashboards/storage.json index c1473c5c..dfaccf62 100644 --- a/harmony/src/modules/monitoring/cluster_dashboards/dashboards/storage.json +++ b/harmony/src/modules/monitoring/cluster_dashboards/dashboards/storage.json @@ -1,6 +1,6 @@ { - "title": "Storage Health", - "uid": "storage-health", + "title": "Persistent Storage", + "uid": "persistent-storage", "schemaVersion": 36, "version": 1, "refresh": "30s", @@ -21,25 +21,17 @@ "title": "Bound PVCs", "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, "targets": [ - { - "expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Bound\"}) or vector(0)", - "refId": "A" - } + { "expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Bound\"}) or vector(0)", "refId": "A" } ], "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, - "thresholds": { - "mode": "absolute", - "steps": [{ "color": "green", "value": null }] - } + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] } } }, "options": { "reduceOptions": { "calcs": ["lastNotNull"] }, - "colorMode": "background", - "graphMode": "none", - "textMode": "auto" + "colorMode": "background", "graphMode": "none", "textMode": "auto" }, "gridPos": { "h": 5, "w": 4, "x": 0, "y": 1 } }, @@ -50,28 +42,19 @@ "title": "Pending PVCs", "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, "targets": [ - { - "expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Pending\"}) or vector(0)", - "refId": "A" - } + { "expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Pending\"}) or vector(0)", "refId": "A" } ], "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, - "thresholds": { - "mode": "absolute", - "steps": [ - { "color": "green", "value": null }, - { "color": "yellow", "value": 1 } - ] - } + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, { "color": "yellow", "value": 1 } + ]} } }, 
"options": { "reduceOptions": { "calcs": ["lastNotNull"] }, - "colorMode": "background", - "graphMode": "none", - "textMode": "auto" + "colorMode": "background", "graphMode": "none", "textMode": "auto" }, "gridPos": { "h": 5, "w": 4, "x": 4, "y": 1 } }, @@ -82,28 +65,19 @@ "title": "Lost PVCs", "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, "targets": [ - { - "expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Lost\"}) or vector(0)", - "refId": "A" - } + { "expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Lost\"}) or vector(0)", "refId": "A" } ], "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, - "thresholds": { - "mode": "absolute", - "steps": [ - { "color": "green", "value": null }, - { "color": "red", "value": 1 } - ] - } + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, { "color": "red", "value": 1 } + ]} } }, "options": { "reduceOptions": { "calcs": ["lastNotNull"] }, - "colorMode": "background", - "graphMode": "none", - "textMode": "auto" + "colorMode": "background", "graphMode": "none", "textMode": "auto" }, "gridPos": { "h": 5, "w": 4, "x": 8, "y": 1 } }, @@ -114,196 +88,52 @@ "title": "Bound PVs / Available PVs", "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, "targets": [ - { - "expr": "sum(kube_persistentvolume_status_phase{phase=\"Bound\"}) or vector(0)", - "refId": "A", - "legendFormat": "Bound" - }, - { - "expr": "sum(kube_persistentvolume_status_phase{phase=\"Available\"}) or vector(0)", - "refId": "B", - "legendFormat": "Available" - } + { "expr": "sum(kube_persistentvolume_status_phase{phase=\"Bound\"}) or vector(0)", "refId": "A", "legendFormat": "Bound" }, + { "expr": "sum(kube_persistentvolume_status_phase{phase=\"Available\"}) or vector(0)", "refId": "B", "legendFormat": "Available" } ], "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, - "thresholds": { - "mode": "absolute", - "steps": [{ "color": "blue", 
"value": null }] - } + "thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] } } }, "options": { "reduceOptions": { "calcs": ["lastNotNull"] }, - "colorMode": "background", - "graphMode": "none", - "textMode": "auto" + "colorMode": "background", "graphMode": "none", "textMode": "auto" }, - "gridPos": { "h": 5, "w": 4, "x": 12, "y": 1 } + "gridPos": { "h": 5, "w": 6, "x": 12, "y": 1 } }, { - "type": "stat", + "type": "piechart", "id": 6, - "title": "Ceph Cluster Health", + "title": "PVC Phase Distribution", "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, "targets": [ - { - "expr": "max(ceph_health_status)", - "refId": "A" - } + { "expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Bound\"}) or vector(0)", "refId": "A", "legendFormat": "Bound" }, + { "expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Pending\"}) or vector(0)", "refId": "B", "legendFormat": "Pending" }, + { "expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Lost\"}) or vector(0)", "refId": "C", "legendFormat": "Lost" } ], - "fieldConfig": { - "defaults": { - "color": { "mode": "thresholds" }, - "thresholds": { - "mode": "absolute", - "steps": [ - { "color": "green", "value": null }, - { "color": "yellow", "value": 1 }, - { "color": "red", "value": 2 } - ] - }, - "mappings": [ - { - "type": "value", - "options": { - "0": { "text": "HEALTH_OK", "index": 0 }, - "1": { "text": "HEALTH_WARN", "index": 1 }, - "2": { "text": "HEALTH_ERR", "index": 2 } - } - } - ] - } - }, + "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" } } }, "options": { "reduceOptions": { "calcs": ["lastNotNull"] }, - "colorMode": "background", - "graphMode": "none", - "textMode": "value" + "pieType": "pie", + "legend": { "displayMode": "table", "placement": "right", "values": ["value", "percent"] } }, - "gridPos": { "h": 5, "w": 4, "x": 16, "y": 1 } - }, - - { - "type": "stat", - "id": 7, - "title": "OSDs Up / Total", - "datasource": { 
"type": "prometheus", "uid": "Prometheus-Cluster" }, - "targets": [ - { - "expr": "sum(max by (ceph_daemon) (ceph_osd_up)) or vector(0)", - "refId": "A", - "legendFormat": "Up" - }, - { - "expr": "count(max by (ceph_daemon) (ceph_osd_metadata)) or vector(0)", - "refId": "B", - "legendFormat": "Total" - } - ], - "fieldConfig": { - "defaults": { - "color": { "mode": "thresholds" }, - "thresholds": { - "mode": "absolute", - "steps": [{ "color": "green", "value": null }] - } - } - }, - "options": { - "reduceOptions": { "calcs": ["lastNotNull"] }, - "colorMode": "background", - "graphMode": "none", - "textMode": "auto" - }, - "gridPos": { "h": 5, "w": 4, "x": 20, "y": 1 } + "gridPos": { "h": 5, "w": 6, "x": 18, "y": 1 } }, { "type": "row", - "id": 8, - "title": "Cluster Capacity", + "id": 7, + "title": "Capacity by Storage Class", "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 6 } }, - { - "type": "gauge", - "id": 9, - "title": "Ceph Cluster Used (%)", - "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, - "targets": [ - { - "expr": "100 * max(ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes) / max(ceph_cluster_total_bytes)", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "unit": "percent", - "min": 0, - "max": 100, - "color": { "mode": "thresholds" }, - "thresholds": { - "mode": "absolute", - "steps": [ - { "color": "green", "value": null }, - { "color": "yellow", "value": 70 }, - { "color": "red", "value": 85 } - ] - } - } - }, - "options": { - "reduceOptions": { "calcs": ["lastNotNull"] }, - "showThresholdLabels": true, - "showThresholdMarkers": true - }, - "gridPos": { "h": 8, "w": 5, "x": 0, "y": 7 } - }, - - { - "type": "stat", - "id": 10, - "title": "Ceph Capacity — Total / Available", - "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, - "targets": [ - { - "expr": "max(ceph_cluster_total_bytes)", - "refId": "A", - "legendFormat": "Total" - }, - { - "expr": 
"max(ceph_cluster_total_bytes) - max(ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes)", - "refId": "B", - "legendFormat": "Available" - } - ], - "fieldConfig": { - "defaults": { - "unit": "bytes", - "color": { "mode": "thresholds" }, - "thresholds": { - "mode": "absolute", - "steps": [{ "color": "blue", "value": null }] - } - } - }, - "options": { - "reduceOptions": { "calcs": ["lastNotNull"] }, - "colorMode": "value", - "graphMode": "none", - "textMode": "auto", - "orientation": "vertical" - }, - "gridPos": { "h": 8, "w": 4, "x": 5, "y": 7 } - }, - { "type": "bargauge", - "id": 11, + "id": 8, "title": "PV Allocated Capacity by Storage Class (Bound)", "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, "targets": [ @@ -316,11 +146,7 @@ "fieldConfig": { "defaults": { "unit": "bytes", - "color": { "mode": "palette-classic" }, - "thresholds": { - "mode": "absolute", - "steps": [{ "color": "blue", "value": null }] - } + "color": { "mode": "palette-classic" } } }, "options": { @@ -329,267 +155,214 @@ "displayMode": "gradient", "showUnfilled": true }, - "gridPos": { "h": 8, "w": 7, "x": 9, "y": 7 } + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 7 } }, { - "type": "piechart", - "id": 12, - "title": "PVC Phase Distribution", + "type": "bargauge", + "id": 9, + "title": "PVC Count by Storage Class", "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, "targets": [ { - "expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Bound\"}) or vector(0)", + "expr": "count by (storageclass) (kube_persistentvolumeclaim_info{storageclass!=\"\"})", "refId": "A", - "legendFormat": "Bound" - }, - { - "expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Pending\"}) or vector(0)", - "refId": "B", - "legendFormat": "Pending" - }, - { - "expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Lost\"}) or vector(0)", - "refId": "C", - "legendFormat": "Lost" + "legendFormat": "{{storageclass}}" } ], "fieldConfig": { - "defaults": 
{ "color": { "mode": "palette-classic" } } + "defaults": { + "unit": "short", + "color": { "mode": "palette-classic" } + } }, "options": { + "orientation": "horizontal", "reduceOptions": { "calcs": ["lastNotNull"] }, - "pieType": "pie", - "legend": { - "displayMode": "table", - "placement": "right", - "values": ["value", "percent"] + "displayMode": "gradient", + "showUnfilled": true + }, + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 7 } + }, + + { + "type": "table", + "id": 10, + "title": "Storage Classes Summary", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "count by (storageclass) (kube_persistentvolume_info)", + "refId": "A", + "legendFormat": "PVs", + "format": "table", + "instant": true + }, + { + "expr": "sum by (storageclass) (kube_persistentvolume_capacity_bytes * on(persistentvolume) group_left(storageclass) kube_persistentvolume_info)", + "refId": "B", + "legendFormat": "Capacity", + "format": "table", + "instant": true } + ], + "transformations": [ + { "id": "merge" }, + { + "id": "organize", + "options": { + "excludeByName": { "Time": true, "__name__": true, "job": true, "instance": true }, + "renameByName": { "storageclass": "StorageClass", "Value #A": "PV Count", "Value #B": "Total Capacity" } + } + } + ], + "fieldConfig": { + "defaults": {}, + "overrides": [ + { "matcher": { "id": "byName", "options": "Total Capacity" }, "properties": [{ "id": "unit", "value": "bytes" }] } + ] }, "gridPos": { "h": 8, "w": 8, "x": 16, "y": 7 } }, { "type": "row", - "id": 13, - "title": "Ceph Performance", + "id": 11, + "title": "PVC Usage (kubelet volume stats)", "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 15 } }, { - "type": "timeseries", - "id": 14, - "title": "Ceph Pool IOPS (Read / Write)", + "type": "table", + "id": 12, + "title": "Top 20 PVCs by % Used", "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, "targets": [ { - "expr": "max by (pool_id) (rate(ceph_pool_rd[5m]))", 
+ "expr": "topk(20,\n 100 * max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_used_bytes)\n /\n max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_capacity_bytes)\n)", "refId": "A", - "legendFormat": "Read — pool {{pool_id}}" - }, + "format": "table", + "instant": true + } + ], + "transformations": [ { - "expr": "max by (pool_id) (rate(ceph_pool_wr[5m]))", - "refId": "B", - "legendFormat": "Write — pool {{pool_id}}" + "id": "organize", + "options": { + "excludeByName": { "Time": true, "__name__": true, "job": true, "instance": true, "endpoint": true }, + "renameByName": { + "namespace": "Namespace", + "persistentvolumeclaim": "PVC", + "Value": "Used %" + }, + "indexByName": { "Namespace": 0, "PVC": 1, "Used %": 2 } + } } ], "fieldConfig": { - "defaults": { - "unit": "ops", - "color": { "mode": "palette-classic" }, - "custom": { "lineWidth": 2, "fillOpacity": 8 } - } - }, - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 } - }, - - { - "type": "timeseries", - "id": 15, - "title": "Ceph Pool Throughput (Read / Write)", - "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, - "targets": [ - { - "expr": "max by (pool_id) (rate(ceph_pool_rd_bytes[5m]))", - "refId": "A", - "legendFormat": "Read — pool {{pool_id}}" - }, - { - "expr": "max by (pool_id) (rate(ceph_pool_wr_bytes[5m]))", - "refId": "B", - "legendFormat": "Write — pool {{pool_id}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "Bps", - "color": { "mode": "palette-classic" }, - "custom": { "lineWidth": 2, "fillOpacity": 8 } - } - }, - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 } - }, - - { - "type": "row", - "id": 16, - "title": "Ceph OSD & Pool Details", - "collapsed": false, - "gridPos": { "h": 1, "w": 24, "x": 0, "y": 24 } - }, - - { - "type": "timeseries", - "id": 17, - "title": "Ceph Pool Space Used (%)", - "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, - "targets": [ - { - "expr": "100 * max by (pool_id) (ceph_pool_bytes_used) / 
(max by (pool_id) (ceph_pool_bytes_used) + max by (pool_id) (ceph_pool_max_avail))", - "refId": "A", - "legendFormat": "Pool {{pool_id}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "percent", - "min": 0, - "max": 100, - "color": { "mode": "palette-classic" }, - "thresholds": { - "mode": "absolute", - "steps": [ - { "color": "green", "value": null }, - { "color": "yellow", "value": 70 }, - { "color": "red", "value": 85 } - ] - }, - "custom": { "lineWidth": 2, "fillOpacity": 10 } - } - }, - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 25 } - }, - - { - "type": "bargauge", - "id": 18, - "title": "OSD Status per Daemon (green = Up, red = Down)", - "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, - "targets": [ - { - "expr": "max by (ceph_daemon) (ceph_osd_up)", - "refId": "A", - "legendFormat": "{{ceph_daemon}}" - } - ], - "fieldConfig": { - "defaults": { - "min": 0, - "max": 1, - "color": { "mode": "thresholds" }, - "thresholds": { - "mode": "absolute", - "steps": [ - { "color": "red", "value": null }, - { "color": "green", "value": 1 } - ] - }, - "mappings": [ - { - "type": "value", - "options": { - "0": { "text": "DOWN", "index": 0 }, - "1": { "text": "UP", "index": 1 } + "defaults": {}, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Used %" }, + "properties": [ + { "id": "unit", "value": "percent" }, + { "id": "decimals", "value": 1 }, + { + "id": "custom.cellOptions", + "value": { "type": "color-background", "mode": "gradient" } + }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 70 }, + { "color": "red", "value": 85 } + ] + } } - } - ] - } - }, - "options": { - "orientation": "horizontal", - "reduceOptions": { "calcs": ["lastNotNull"] }, - "displayMode": "basic", - "showUnfilled": true - }, - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 25 } - }, - - { - "type": "row", - "id": 19, - "title": "Node Disk Usage", - 
"collapsed": false, - "gridPos": { "h": 1, "w": 24, "x": 0, "y": 33 } - }, - - { - "type": "timeseries", - "id": 20, - "title": "Node Root Disk Usage Over Time (%)", - "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, - "targets": [ - { - "expr": "100 - (\n max by (instance, mountpoint) (node_filesystem_avail_bytes{mountpoint=~\"/|/var\",fstype!~\"tmpfs|overlay|squashfs|ramfs\"})\n /\n max by (instance, mountpoint) (node_filesystem_size_bytes{mountpoint=~\"/|/var\",fstype!~\"tmpfs|overlay|squashfs|ramfs\"})\n * 100\n)", - "refId": "A", - "legendFormat": "{{instance}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "percent", - "min": 0, - "max": 100, - "color": { "mode": "palette-classic" }, - "thresholds": { - "mode": "absolute", - "steps": [ - { "color": "green", "value": null }, - { "color": "yellow", "value": 70 }, - { "color": "red", "value": 85 } - ] - }, - "custom": { "lineWidth": 2, "fillOpacity": 10 } - } - }, - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 34 } - }, - - { - "type": "bargauge", - "id": 21, - "title": "Current Disk Usage — All Nodes & Mountpoints", - "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, - "targets": [ - { - "expr": "100 - (node_filesystem_avail_bytes{fstype!~\"tmpfs|overlay|squashfs\"} / node_filesystem_size_bytes{fstype!~\"tmpfs|overlay|squashfs\"} * 100)", - "refId": "A", - "legendFormat": "{{instance}} — {{mountpoint}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "percent", - "min": 0, - "max": 100, - "color": { "mode": "thresholds" }, - "thresholds": { - "mode": "absolute", - "steps": [ - { "color": "green", "value": null }, - { "color": "yellow", "value": 70 }, - { "color": "red", "value": 85 } ] } + ] + }, + "gridPos": { "h": 10, "w": 12, "x": 0, "y": 16 } + }, + + { + "type": "bargauge", + "id": 13, + "title": "Top 20 PVCs by Used Bytes", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "topk(20, max by (namespace, 
persistentvolumeclaim) (kubelet_volume_stats_used_bytes))", + "refId": "A", + "legendFormat": "{{namespace}} / {{persistentvolumeclaim}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "bytes", + "color": { "mode": "palette-classic" } } }, "options": { "orientation": "horizontal", "reduceOptions": { "calcs": ["lastNotNull"] }, "displayMode": "gradient", - "showUnfilled": true + "showUnfilled": true, + "valueMode": "color", + "sortBy": "Value", + "sortOrder": "desc" }, - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 34 } + "gridPos": { "h": 10, "w": 12, "x": 12, "y": 16 } + }, + + { + "type": "timeseries", + "id": 14, + "title": "Top 5 PVCs Usage Over Time (%)", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "topk(5,\n 100 * max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_used_bytes)\n /\n max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_capacity_bytes)\n)", + "refId": "A", + "legendFormat": "{{namespace}} / {{persistentvolumeclaim}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", "min": 0, "max": 100, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 8 } + } + }, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 26 } + }, + + { + "type": "timeseries", + "id": 15, + "title": "PVC Inode Usage (%) — Top 20", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "topk(20,\n 100 * max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_inodes_used)\n /\n max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_inodes)\n)", + "refId": "A", + "legendFormat": "{{namespace}} / {{persistentvolumeclaim}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", "min": 0, "max": 100, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 1, "fillOpacity": 5 } + } + }, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 34 } } ] diff --git 
a/harmony/src/modules/monitoring/cluster_dashboards/score.rs b/harmony/src/modules/monitoring/cluster_dashboards/score.rs index 22f916d7..ed52ed12 100644 --- a/harmony/src/modules/monitoring/cluster_dashboards/score.rs +++ b/harmony/src/modules/monitoring/cluster_dashboards/score.rs @@ -101,7 +101,7 @@ impl Interpret for ClusterDashboardsInterpret { Ok(Outcome::success(format!( "Cluster dashboards resources in namespace '{}' with {} dashboards successfully created", - self.namespace, 8 + self.namespace, 9 ))) } @@ -494,7 +494,11 @@ impl ClusterDashboardsInterpret { include_str!("dashboards/workloads-health.json"), ), ("okd-networking", include_str!("dashboards/networking.json")), - ("storage-health", include_str!("dashboards/storage.json")), + ( + "persistent-storage", + include_str!("dashboards/storage.json"), + ), + ("ceph-cluster", include_str!("dashboards/ceph.json")), ("okd-etcd", include_str!("dashboards/etcd.json")), ( "okd-control-plane", -- 2.39.5 From 8acd9de2754856dbda852a3374a957759850475f Mon Sep 17 00:00:00 2001 From: Sylvain Tremblay Date: Mon, 20 Apr 2026 13:52:36 -0400 Subject: [PATCH 05/57] feat: score to create ceph alerts in the okd default alerting stack --- Cargo.lock | 11 + examples/okd_ceph_alerts/Cargo.toml | 14 + examples/okd_ceph_alerts/env.sh | 4 + examples/okd_ceph_alerts/src/main.rs | 28 + harmony/src/modules/monitoring/ceph_alerts.rs | 167 +++++ .../cluster_dashboards/dashboards/ceph.json | 674 ++++++++++++++++++ harmony/src/modules/monitoring/mod.rs | 1 + .../monitoring/okd/cluster_alert_rules.rs | 114 +++ harmony/src/modules/monitoring/okd/mod.rs | 1 + 9 files changed, 1014 insertions(+) create mode 100644 examples/okd_ceph_alerts/Cargo.toml create mode 100644 examples/okd_ceph_alerts/env.sh create mode 100644 examples/okd_ceph_alerts/src/main.rs create mode 100644 harmony/src/modules/monitoring/ceph_alerts.rs create mode 100644 harmony/src/modules/monitoring/cluster_dashboards/dashboards/ceph.json create mode 100644 
harmony/src/modules/monitoring/okd/cluster_alert_rules.rs diff --git a/Cargo.lock b/Cargo.lock index db2929dc..007854cc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2873,6 +2873,17 @@ dependencies = [ "url", ] +[[package]] +name = "example-okd-ceph-alerts" +version = "0.1.0" +dependencies = [ + "harmony", + "harmony_cli", + "harmony_types", + "log", + "tokio", +] + [[package]] name = "example-okd-cluster-alerts" version = "0.1.0" diff --git a/examples/okd_ceph_alerts/Cargo.toml b/examples/okd_ceph_alerts/Cargo.toml new file mode 100644 index 00000000..7301242d --- /dev/null +++ b/examples/okd_ceph_alerts/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "example-okd-ceph-alerts" +edition = "2024" +version.workspace = true +readme.workspace = true +license.workspace = true +publish = false + +[dependencies] +harmony = { path = "../../harmony" } +harmony_cli = { path = "../../harmony_cli" } +harmony_types = { path = "../../harmony_types" } +tokio = { workspace = true } +log = { workspace = true } diff --git a/examples/okd_ceph_alerts/env.sh b/examples/okd_ceph_alerts/env.sh new file mode 100644 index 00000000..08072655 --- /dev/null +++ b/examples/okd_ceph_alerts/env.sh @@ -0,0 +1,4 @@ +export HARMONY_SECRET_NAMESPACE=okd_ceph_alerts_example +export HARMONY_SECRET_STORE=file +export HARMONY_DATABASE_URL=sqlite://harmony_okd_ceph_alerts_example.sqlite +export RUST_LOG=harmony=debug diff --git a/examples/okd_ceph_alerts/src/main.rs b/examples/okd_ceph_alerts/src/main.rs new file mode 100644 index 00000000..33bfa1ca --- /dev/null +++ b/examples/okd_ceph_alerts/src/main.rs @@ -0,0 +1,28 @@ +use harmony::{ + inventory::Inventory, + modules::monitoring::{ + ceph_alerts::ceph_alert_rule_groups, okd::cluster_alert_rules::OpenshiftPrometheusRuleScore, + }, + topology::K8sAnywhereTopology, +}; + +#[tokio::main] +async fn main() { + harmony_cli::cli_logger::init(); + + let ceph_rules = OpenshiftPrometheusRuleScore { + namespace: "rook-ceph".to_string(), + name: 
"ceph-alerts".to_string(), + rule_groups: ceph_alert_rule_groups(), + labels: None, + }; + + harmony_cli::run( + Inventory::autoload(), + K8sAnywhereTopology::from_env(), + vec![Box::new(ceph_rules)], + None, + ) + .await + .unwrap(); +} diff --git a/harmony/src/modules/monitoring/ceph_alerts.rs b/harmony/src/modules/monitoring/ceph_alerts.rs new file mode 100644 index 00000000..88044d75 --- /dev/null +++ b/harmony/src/modules/monitoring/ceph_alerts.rs @@ -0,0 +1,167 @@ +use std::collections::BTreeMap; + +use crate::modules::monitoring::kube_prometheus::crd::crd_prometheus_rules::{Rule, RuleGroup}; + +pub fn ceph_alert_rule_groups() -> Vec { + vec![ + RuleGroup { + name: "ceph-cluster-health".to_string(), + rules: vec![ + alert( + "CephHealthWarn", + "max(ceph_health_status) == 1", + Some("15m"), + "warning", + "Ceph cluster health is WARN", + "Ceph reports HEALTH_WARN for more than 15 minutes. Run `ceph -s` or check the Ceph dashboard to see active health checks.", + ), + alert( + "CephHealthErr", + "max(ceph_health_status) == 2", + Some("5m"), + "critical", + "Ceph cluster health is ERR", + "Ceph reports HEALTH_ERR for more than 5 minutes. Immediate investigation required.", + ), + alert( + "CephMonDown", + "count(max by (ceph_daemon) (ceph_mon_quorum_status == 0)) > 0", + Some("5m"), + "critical", + "Ceph monitor is out of quorum", + "One or more Ceph monitors are not in quorum. Quorum loss risks cluster availability.", + ), + alert( + "CephMgrAbsent", + "sum(max by (ceph_daemon) (ceph_mgr_status)) < 1", + Some("5m"), + "critical", + "No active Ceph manager", + "No Ceph manager daemon is currently active. Dashboards and orchestration will be unavailable.", + ), + ], + }, + RuleGroup { + name: "ceph-osd".to_string(), + rules: vec![ + alert( + "CephOSDDown", + "count(max by (ceph_daemon) (ceph_osd_up == 0)) > 0", + Some("5m"), + "warning", + "One or more Ceph OSDs are down", + "At least one OSD daemon is reporting down for 5 minutes. 
Data redundancy may be reduced.", + ), + alert( + "CephOSDNearFull", + "max by (ceph_daemon) (100 * ceph_osd_stat_bytes_used / ceph_osd_stat_bytes) > 80", + Some("15m"), + "warning", + "Ceph OSD is near full", + "OSD {{ $labels.ceph_daemon }} is above 80% utilization. Rebalance or add capacity.", + ), + alert( + "CephOSDFull", + "max by (ceph_daemon) (100 * ceph_osd_stat_bytes_used / ceph_osd_stat_bytes) > 90", + Some("5m"), + "critical", + "Ceph OSD is critically full", + "OSD {{ $labels.ceph_daemon }} is above 90% utilization. Writes may block. Act immediately.", + ), + ], + }, + RuleGroup { + name: "ceph-capacity".to_string(), + rules: vec![ + alert( + "CephClusterNearFull", + "100 * max(ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes) / max(ceph_cluster_total_bytes) > 75", + Some("15m"), + "warning", + "Ceph cluster is near full", + "Cluster raw utilization is above 75% for 15 minutes.", + ), + alert( + "CephClusterCriticallyFull", + "100 * max(ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes) / max(ceph_cluster_total_bytes) > 85", + Some("5m"), + "critical", + "Ceph cluster is critically full", + "Cluster raw utilization is above 85%. 
Imminent risk of write unavailability.", + ), + alert( + "CephPoolNearFull", + "100 * max by (pool_id) (ceph_pool_bytes_used) / (max by (pool_id) (ceph_pool_bytes_used) + max by (pool_id) (ceph_pool_max_avail)) > 80", + Some("15m"), + "warning", + "Ceph pool is near full", + "Pool (pool_id {{ $labels.pool_id }}) is above 80% usage.", + ), + alert( + "CephDaysUntilFull", + "(max(ceph_cluster_total_bytes) - max(ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes)) / clamp_min(deriv(max(ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes)[7d:1h]), 1) / 86400 < 30", + Some("1h"), + "warning", + "Ceph cluster predicted to fill within 30 days", + "Based on the 7-day usage trend, the cluster will reach capacity in less than 30 days.", + ), + ], + }, + RuleGroup { + name: "ceph-placement-groups".to_string(), + rules: vec![ + alert( + "CephPGsNotActiveClean", + "max(ceph_pg_total) - max(ceph_pg_clean) > 0", + Some("15m"), + "warning", + "Some placement groups are not active+clean", + "{{ $value }} PGs have been in a non-clean state for more than 15 minutes.", + ), + alert( + "CephSlowOps", + "max(ceph_healthcheck_slow_ops) > 0", + Some("5m"), + "warning", + "Ceph reports slow ops", + "Ceph has {{ $value }} slow operations outstanding for more than 5 minutes.", + ), + ], + }, + RuleGroup { + name: "ceph-nodes".to_string(), + rules: vec![alert( + "CephNodeRootDiskUsage", + "100 * (1 - (max by (instance, mountpoint) (node_filesystem_avail_bytes{mountpoint=~\"/|/var\",fstype!~\"tmpfs|overlay|squashfs|ramfs\"}) / max by (instance, mountpoint) (node_filesystem_size_bytes{mountpoint=~\"/|/var\",fstype!~\"tmpfs|overlay|squashfs|ramfs\"}))) > 85", + Some("10m"), + "warning", + "Ceph node root/var disk above 85%", + "Node {{ $labels.instance }} mountpoint {{ $labels.mountpoint }} is above 85% disk usage. 
OSDs on this node may be at risk.", + )], + }, + ] +} + +fn alert( + name: &str, + expr: &str, + for_: Option<&str>, + severity: &str, + summary: &str, + description: &str, +) -> Rule { + let mut labels = BTreeMap::new(); + labels.insert("severity".to_string(), severity.to_string()); + + let mut annotations = BTreeMap::new(); + annotations.insert("summary".to_string(), summary.to_string()); + annotations.insert("description".to_string(), description.to_string()); + + Rule { + alert: Some(name.to_string()), + expr: Some(expr.to_string()), + for_: for_.map(|s| s.to_string()), + labels: Some(labels), + annotations: Some(annotations), + } +} diff --git a/harmony/src/modules/monitoring/cluster_dashboards/dashboards/ceph.json b/harmony/src/modules/monitoring/cluster_dashboards/dashboards/ceph.json new file mode 100644 index 00000000..d555511d --- /dev/null +++ b/harmony/src/modules/monitoring/cluster_dashboards/dashboards/ceph.json @@ -0,0 +1,674 @@ +{ + "title": "Ceph Cluster", + "uid": "ceph-cluster", + "schemaVersion": 36, + "version": 1, + "refresh": "30s", + "time": { "from": "now-1h", "to": "now" }, + + "templating": { + "list": [ + { + "name": "pool", + "type": "query", + "label": "Pool", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "query": { "query": "label_values(ceph_pool_metadata, name)", "refId": "Pool" }, + "definition": "label_values(ceph_pool_metadata, name)", + "multi": true, + "includeAll": true, + "current": { "text": "All", "value": "$__all", "selected": false }, + "refresh": 1, + "sort": 1 + }, + { + "name": "osd", + "type": "query", + "label": "OSD", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "query": { "query": "label_values(ceph_osd_metadata, ceph_daemon)", "refId": "OSD" }, + "definition": "label_values(ceph_osd_metadata, ceph_daemon)", + "multi": true, + "includeAll": true, + "current": { "text": "All", "value": "$__all", "selected": false }, + "refresh": 1, + "sort": 1 + } + ] + }, + + 
"panels": [ + + { + "type": "row", "id": 1, "title": "Cluster Status", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 } + }, + + { + "type": "stat", "id": 2, "title": "Health", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "max(ceph_health_status)", "refId": "A" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 1 }, + { "color": "red", "value": 2 } + ]}, + "mappings": [{ + "type": "value", + "options": { + "0": { "text": "HEALTH_OK", "index": 0 }, + "1": { "text": "HEALTH_WARN", "index": 1 }, + "2": { "text": "HEALTH_ERR", "index": 2 } + } + }] + } + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"] }, + "colorMode": "background", "graphMode": "none", "textMode": "value" + }, + "gridPos": { "h": 5, "w": 4, "x": 0, "y": 1 } + }, + + { + "type": "stat", "id": 3, "title": "Mon Quorum", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "count(max by (ceph_daemon) (ceph_mon_quorum_status == 1)) or vector(0)", "refId": "A", "legendFormat": "In Quorum" }, + { "expr": "count(max by (ceph_daemon) (ceph_mon_metadata)) or vector(0)", "refId": "B", "legendFormat": "Total" } + ], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] } + } + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"] }, + "colorMode": "background", "graphMode": "none", "textMode": "auto", "orientation": "horizontal" + }, + "gridPos": { "h": 5, "w": 4, "x": 4, "y": 1 } + }, + + { + "type": "stat", "id": 4, "title": "MGR Active", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "sum(max by (ceph_daemon) (ceph_mgr_status)) or vector(0)", "refId": "A" }], + "fieldConfig": { + 
"defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ]} + } + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"] }, + "colorMode": "background", "graphMode": "none", "textMode": "auto" + }, + "gridPos": { "h": 5, "w": 3, "x": 8, "y": 1 } + }, + + { + "type": "stat", "id": 5, "title": "OSDs Up / In / Total", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "sum(max by (ceph_daemon) (ceph_osd_up)) or vector(0)", "refId": "A", "legendFormat": "Up" }, + { "expr": "sum(max by (ceph_daemon) (ceph_osd_in)) or vector(0)", "refId": "B", "legendFormat": "In" }, + { "expr": "count(max by (ceph_daemon) (ceph_osd_metadata)) or vector(0)", "refId": "C", "legendFormat": "Total" } + ], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] } + } + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"] }, + "colorMode": "background", "graphMode": "none", "textMode": "auto", "orientation": "horizontal" + }, + "gridPos": { "h": 5, "w": 5, "x": 11, "y": 1 } + }, + + { + "type": "stat", "id": 6, "title": "Pools", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(max by (pool_id) (ceph_pool_metadata)) or vector(0)", "refId": "A" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] } + } + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"] }, + "colorMode": "background", "graphMode": "none", "textMode": "auto" + }, + "gridPos": { "h": 5, "w": 3, "x": 16, "y": 1 } + }, + + { + "type": "stat", "id": 7, "title": "PGs Active+Clean / Total", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { 
"expr": "max(ceph_pg_clean) or vector(0)", "refId": "A", "legendFormat": "Active+Clean" }, + { "expr": "max(ceph_pg_total) or vector(0)", "refId": "B", "legendFormat": "Total" } + ], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] } + } + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"] }, + "colorMode": "background", "graphMode": "none", "textMode": "auto", "orientation": "horizontal" + }, + "gridPos": { "h": 5, "w": 5, "x": 19, "y": 1 } + }, + + { + "type": "row", "id": 8, "title": "Capacity", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 6 } + }, + + { + "type": "gauge", "id": 9, "title": "Cluster Used (%)", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "100 * max(ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes) / max(ceph_cluster_total_bytes)", + "refId": "A" + }], + "fieldConfig": { + "defaults": { + "unit": "percent", "min": 0, "max": 100, + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 70 }, + { "color": "red", "value": 85 } + ]} + } + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"] }, + "showThresholdLabels": true, "showThresholdMarkers": true + }, + "gridPos": { "h": 8, "w": 5, "x": 0, "y": 7 } + }, + + { + "type": "stat", "id": 10, "title": "Total / Used / Available", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "max(ceph_cluster_total_bytes)", "refId": "A", "legendFormat": "Total" }, + { "expr": "max(ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes)", "refId": "B", "legendFormat": "Used" }, + { "expr": "max(ceph_cluster_total_bytes) - max(ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes)", "refId": "C", "legendFormat": "Available" } + 
], + "fieldConfig": { + "defaults": { + "unit": "bytes", + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] } + } + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"] }, + "colorMode": "value", "graphMode": "none", "textMode": "auto", "orientation": "vertical" + }, + "gridPos": { "h": 8, "w": 4, "x": 5, "y": 7 } + }, + + { + "type": "timeseries", "id": 11, "title": "Capacity Over Time", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "max(ceph_cluster_total_bytes)", "refId": "A", "legendFormat": "Total" }, + { "expr": "max(ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes)", "refId": "B", "legendFormat": "Used" } + ], + "fieldConfig": { + "defaults": { + "unit": "bytes", + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 8 } + } + }, + "gridPos": { "h": 8, "w": 11, "x": 9, "y": 7 } + }, + + { + "type": "stat", "id": 12, "title": "Days Until Full (predicted, 7d trend)", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "(max(ceph_cluster_total_bytes) - max(ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes))\n/\nclamp_min(deriv(max(ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes)[7d:1h]), 1)\n/ 86400", + "refId": "A" + }], + "fieldConfig": { + "defaults": { + "unit": "d", + "decimals": 1, + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "red", "value": null }, + { "color": "yellow", "value": 14 }, + { "color": "green", "value": 60 } + ]} + } + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"] }, + "colorMode": "background", "graphMode": "none", "textMode": "auto" + }, + "gridPos": { "h": 8, "w": 4, "x": 20, "y": 7 } + }, + + { + "type": "bargauge", "id": 13, "title": "Pool Used (%)", + "datasource": { "type": "prometheus", "uid": 
"Prometheus-Cluster" }, + "targets": [{ + "expr": "(\n 100 * max by (pool_id) (ceph_pool_bytes_used)\n /\n (max by (pool_id) (ceph_pool_bytes_used) + max by (pool_id) (ceph_pool_max_avail))\n)\n* on(pool_id) group_left(name) max by (pool_id, name) (ceph_pool_metadata{name=~\"$pool\"})", + "refId": "A", + "legendFormat": "{{name}}", + "instant": true + }], + "fieldConfig": { + "defaults": { + "unit": "percent", "min": 0, "max": 100, + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 70 }, + { "color": "red", "value": 85 } + ]} + } + }, + "options": { + "orientation": "horizontal", + "reduceOptions": { "calcs": ["lastNotNull"] }, + "displayMode": "gradient", + "showUnfilled": true, + "valueMode": "color", + "sortBy": "Value", + "sortOrder": "desc" + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 15 } + }, + + { + "type": "bargauge", "id": 14, "title": "OSD Utilization (%)", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "100 * max by (ceph_daemon) (ceph_osd_stat_bytes_used{ceph_daemon=~\"$osd\"}) / max by (ceph_daemon) (ceph_osd_stat_bytes{ceph_daemon=~\"$osd\"})", + "refId": "A", + "legendFormat": "{{ceph_daemon}}" + }], + "fieldConfig": { + "defaults": { + "unit": "percent", "min": 0, "max": 100, + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 70 }, + { "color": "red", "value": 85 } + ]} + } + }, + "options": { + "orientation": "horizontal", + "reduceOptions": { "calcs": ["lastNotNull"] }, + "displayMode": "gradient", + "showUnfilled": true + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 15 } + }, + + { + "type": "row", "id": 15, "title": "Performance", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 23 } + }, + + { + "type": "timeseries", "id": 16, "title": "Cluster IOPS (Read / 
Write)", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "sum(max by (pool_id) (rate(ceph_pool_rd[5m])))", "refId": "A", "legendFormat": "Read" }, + { "expr": "sum(max by (pool_id) (rate(ceph_pool_wr[5m])))", "refId": "B", "legendFormat": "Write" } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 8 } + } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 } + }, + + { + "type": "timeseries", "id": 17, "title": "Cluster Throughput (Read / Write)", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "sum(max by (pool_id) (rate(ceph_pool_rd_bytes[5m])))", "refId": "A", "legendFormat": "Read" }, + { "expr": "sum(max by (pool_id) (rate(ceph_pool_wr_bytes[5m])))", "refId": "B", "legendFormat": "Write" } + ], + "fieldConfig": { + "defaults": { + "unit": "Bps", + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 8 } + } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 } + }, + + { + "type": "timeseries", "id": 18, "title": "Client Op Latency (Avg)", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "sum(rate(ceph_osd_op_r_latency_sum[5m])) / clamp_min(sum(rate(ceph_osd_op_r_latency_count[5m])), 1)", + "refId": "A", "legendFormat": "Read" + }, + { + "expr": "sum(rate(ceph_osd_op_w_latency_sum[5m])) / clamp_min(sum(rate(ceph_osd_op_w_latency_count[5m])), 1)", + "refId": "B", "legendFormat": "Write" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s", + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 8 } + } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 32 } + }, + + { + "type": "timeseries", "id": 19, "title": "Recovery Throughput", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": 
"sum(rate(ceph_osd_recovery_bytes[5m])) or vector(0)", "refId": "A", "legendFormat": "Recovery B/s" }, + { "expr": "sum(rate(ceph_osd_recovery_ops[5m])) or vector(0)", "refId": "B", "legendFormat": "Recovery ops/s" } + ], + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 8 } + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "Recovery B/s" }, "properties": [{ "id": "unit", "value": "Bps" }] }, + { "matcher": { "id": "byName", "options": "Recovery ops/s" }, "properties": [{ "id": "unit", "value": "ops" }] } + ] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 32 } + }, + + { + "type": "row", "id": 20, "title": "Placement Group Health", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 40 } + }, + + { + "type": "timeseries", "id": 21, "title": "PG States Over Time", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "max(ceph_pg_clean)", "refId": "A", "legendFormat": "clean" }, + { "expr": "max(ceph_pg_active)", "refId": "B", "legendFormat": "active" }, + { "expr": "max(ceph_pg_degraded)", "refId": "C", "legendFormat": "degraded" }, + { "expr": "max(ceph_pg_undersized)", "refId": "D", "legendFormat": "undersized" }, + { "expr": "max(ceph_pg_peering)", "refId": "E", "legendFormat": "peering" }, + { "expr": "max(ceph_pg_recovering)", "refId": "F", "legendFormat": "recovering" }, + { "expr": "max(ceph_pg_backfilling)", "refId": "G", "legendFormat": "backfilling" }, + { "expr": "max(ceph_pg_remapped)", "refId": "H", "legendFormat": "remapped" }, + { "expr": "max(ceph_pg_inconsistent)", "refId": "I", "legendFormat": "inconsistent" }, + { "expr": "max(ceph_pg_stale)", "refId": "J", "legendFormat": "stale" }, + { "expr": "max(ceph_pg_unknown)", "refId": "K", "legendFormat": "unknown" } + ], + "fieldConfig": { + "defaults": { + "unit": "short", + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, 
"fillOpacity": 0 } + } + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": ["max", "lastNotNull"], + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + } + }, + "gridPos": { "h": 8, "w": 16, "x": 0, "y": 41 } + }, + + { + "type": "stat", "id": 22, "title": "Slow Ops", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "max(ceph_healthcheck_slow_ops) or vector(0)", "refId": "A" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 1 }, + { "color": "red", "value": 10 } + ]} + } + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"] }, + "colorMode": "background", "graphMode": "area", "textMode": "auto" + }, + "gridPos": { "h": 4, "w": 8, "x": 16, "y": 41 } + }, + + { + "type": "stat", "id": 23, "title": "Misplaced / Degraded Objects", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "max(ceph_num_objects_misplaced) or vector(0)", "refId": "A", "legendFormat": "Misplaced" }, + { "expr": "max(ceph_num_objects_degraded) or vector(0)", "refId": "B", "legendFormat": "Degraded" } + ], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 1 } + ]} + } + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"] }, + "colorMode": "background", "graphMode": "none", "textMode": "auto", "orientation": "horizontal" + }, + "gridPos": { "h": 4, "w": 8, "x": 16, "y": 45 } + }, + + { + "type": "row", "id": 24, "title": "OSD Detail", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 49 } + }, + + { + "type": "table", "id": 25, "title": "OSDs", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { 
"expr": "max by (ceph_daemon) (ceph_osd_up{ceph_daemon=~\"$osd\"})", "refId": "A", "legendFormat": "Up", "format": "table", "instant": true }, + { "expr": "max by (ceph_daemon) (ceph_osd_in{ceph_daemon=~\"$osd\"})", "refId": "B", "legendFormat": "In", "format": "table", "instant": true }, + { "expr": "100 * max by (ceph_daemon) (ceph_osd_stat_bytes_used{ceph_daemon=~\"$osd\"}) / max by (ceph_daemon) (ceph_osd_stat_bytes{ceph_daemon=~\"$osd\"})", "refId": "C", "format": "table", "instant": true }, + { "expr": "max by (ceph_daemon) (ceph_osd_numpg{ceph_daemon=~\"$osd\"})", "refId": "D", "format": "table", "instant": true }, + { "expr": "max by (ceph_daemon) (ceph_osd_apply_latency_ms{ceph_daemon=~\"$osd\"})", "refId": "E", "format": "table", "instant": true }, + { "expr": "max by (ceph_daemon) (ceph_osd_commit_latency_ms{ceph_daemon=~\"$osd\"})", "refId": "F", "format": "table", "instant": true } + ], + "transformations": [ + { "id": "merge" }, + { + "id": "organize", + "options": { + "excludeByName": { "Time": true, "__name__": true, "job": true, "instance": true, "endpoint": true }, + "renameByName": { + "ceph_daemon": "OSD", + "Value #A": "Up", + "Value #B": "In", + "Value #C": "Util %", + "Value #D": "PGs", + "Value #E": "Apply Latency", + "Value #F": "Commit Latency" + }, + "indexByName": { + "OSD": 0, "Up": 1, "In": 2, "Util %": 3, "PGs": 4, "Apply Latency": 5, "Commit Latency": 6 + } + } + } + ], + "fieldConfig": { + "defaults": {}, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Util %" }, + "properties": [ + { "id": "unit", "value": "percent" }, + { "id": "decimals", "value": 1 }, + { "id": "custom.cellOptions", "value": { "type": "color-background", "mode": "gradient" } }, + { "id": "thresholds", "value": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 70 }, + { "color": "red", "value": 85 } + ]}} + ] + }, + { "matcher": { "id": "byName", "options": "Apply Latency" }, "properties": [{ 
"id": "unit", "value": "ms" }] }, + { "matcher": { "id": "byName", "options": "Commit Latency" }, "properties": [{ "id": "unit", "value": "ms" }] }, + { + "matcher": { "id": "byRegexp", "options": "Up|In" }, + "properties": [ + { "id": "mappings", "value": [{ "type": "value", "options": { "0": { "text": "✗", "index": 0 }, "1": { "text": "✓", "index": 1 }}}] }, + { "id": "custom.cellOptions", "value": { "type": "color-text" } }, + { "id": "thresholds", "value": { "mode": "absolute", "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ]}} + ] + } + ] + }, + "gridPos": { "h": 10, "w": 16, "x": 0, "y": 50 } + }, + + { + "type": "timeseries", "id": 26, "title": "OSD Apply + Commit Latency", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "max by (ceph_daemon) (ceph_osd_apply_latency_ms{ceph_daemon=~\"$osd\"})", "refId": "A", "legendFormat": "{{ceph_daemon}} apply" }, + { "expr": "max by (ceph_daemon) (ceph_osd_commit_latency_ms{ceph_daemon=~\"$osd\"})", "refId": "B", "legendFormat": "{{ceph_daemon}} commit" } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 1, "fillOpacity": 0 } + } + }, + "gridPos": { "h": 10, "w": 8, "x": 16, "y": 50 } + }, + + { + "type": "row", "id": 27, "title": "Pool Detail", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 60 } + }, + + { + "type": "table", "id": 28, "title": "Pools", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "max by (pool_id, name) (ceph_pool_metadata{name=~\"$pool\"})", "refId": "A", "format": "table", "instant": true }, + { "expr": "max by (pool_id) (ceph_pool_objects)", "refId": "B", "format": "table", "instant": true }, + { "expr": "max by (pool_id) (ceph_pool_bytes_used)", "refId": "C", "format": "table", "instant": true }, + { "expr": "max by (pool_id) (ceph_pool_max_avail)", "refId": "D", 
"format": "table", "instant": true }, + { "expr": "100 * max by (pool_id) (ceph_pool_bytes_used) / (max by (pool_id) (ceph_pool_bytes_used) + max by (pool_id) (ceph_pool_max_avail))", "refId": "E", "format": "table", "instant": true } + ], + "transformations": [ + { "id": "merge" }, + { + "id": "organize", + "options": { + "excludeByName": { "Time": true, "__name__": true, "job": true, "instance": true, "endpoint": true, "Value #A": true }, + "renameByName": { + "pool_id": "ID", + "name": "Pool", + "Value #B": "Objects", + "Value #C": "Used", + "Value #D": "Available", + "Value #E": "Used %" + }, + "indexByName": { "ID": 0, "Pool": 1, "Objects": 2, "Used": 3, "Available": 4, "Used %": 5 } + } + } + ], + "fieldConfig": { + "defaults": {}, + "overrides": [ + { "matcher": { "id": "byName", "options": "Used" }, "properties": [{ "id": "unit", "value": "bytes" }] }, + { "matcher": { "id": "byName", "options": "Available" }, "properties": [{ "id": "unit", "value": "bytes" }] }, + { + "matcher": { "id": "byName", "options": "Used %" }, + "properties": [ + { "id": "unit", "value": "percent" }, + { "id": "decimals", "value": 1 }, + { "id": "custom.cellOptions", "value": { "type": "color-background", "mode": "gradient" } }, + { "id": "thresholds", "value": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 70 }, + { "color": "red", "value": 85 } + ]}} + ] + } + ] + }, + "gridPos": { "h": 10, "w": 14, "x": 0, "y": 61 } + }, + + { + "type": "timeseries", "id": 29, "title": "Pool IOPS (Read / Write) — filtered by $pool", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "max by (pool_id) (rate(ceph_pool_rd[5m]))\n* on(pool_id) group_left(name) max by (pool_id, name) (ceph_pool_metadata{name=~\"$pool\"})", + "refId": "A", "legendFormat": "Read — {{name}}" + }, + { + "expr": "max by (pool_id) (rate(ceph_pool_wr[5m]))\n* on(pool_id) group_left(name) max by (pool_id, name) 
(ceph_pool_metadata{name=~\"$pool\"})", + "refId": "B", "legendFormat": "Write — {{name}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 8 } + } + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": ["max", "lastNotNull"], + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + } + }, + "gridPos": { "h": 10, "w": 10, "x": 14, "y": 61 } + } + + ] +} diff --git a/harmony/src/modules/monitoring/mod.rs b/harmony/src/modules/monitoring/mod.rs index aa08e7a8..0c0336eb 100644 --- a/harmony/src/modules/monitoring/mod.rs +++ b/harmony/src/modules/monitoring/mod.rs @@ -1,6 +1,7 @@ pub mod alert_channel; pub mod alert_rule; pub mod application_monitoring; +pub mod ceph_alerts; pub mod cluster_dashboards; pub mod grafana; pub mod kube_prometheus; diff --git a/harmony/src/modules/monitoring/okd/cluster_alert_rules.rs b/harmony/src/modules/monitoring/okd/cluster_alert_rules.rs new file mode 100644 index 00000000..fb8c7189 --- /dev/null +++ b/harmony/src/modules/monitoring/okd/cluster_alert_rules.rs @@ -0,0 +1,114 @@ +use std::collections::BTreeMap; + +use async_trait::async_trait; +use harmony_types::id::Id; +use kube::api::ObjectMeta; +use serde::Serialize; + +use crate::{ + data::Version, + interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome}, + inventory::Inventory, + modules::monitoring::kube_prometheus::crd::crd_prometheus_rules::{ + PrometheusRule, PrometheusRuleSpec, RuleGroup, + }, + score::Score, + topology::{K8sclient, Topology}, +}; + +#[derive(Clone, Debug, Serialize)] +pub struct OpenshiftPrometheusRuleScore { + pub namespace: String, + pub name: String, + pub rule_groups: Vec, + pub labels: Option>, +} + +impl Score for OpenshiftPrometheusRuleScore { + fn name(&self) -> String { + format!( + "OpenshiftPrometheusRuleScore({}/{})", + self.namespace, self.name + ) + } + + fn 
create_interpret(&self) -> Box> { + Box::new(OpenshiftPrometheusRuleInterpret { + namespace: self.namespace.clone(), + name: self.name.clone(), + rule_groups: self.rule_groups.clone(), + labels: self.labels.clone(), + }) + } +} + +#[derive(Debug, Clone)] +pub struct OpenshiftPrometheusRuleInterpret { + namespace: String, + name: String, + rule_groups: Vec, + labels: Option>, +} + +#[async_trait] +impl Interpret for OpenshiftPrometheusRuleInterpret { + async fn execute( + &self, + _inventory: &Inventory, + topology: &T, + ) -> Result { + let labels = self.labels.clone().unwrap_or_else(default_rule_labels); + + let prometheus_rule = PrometheusRule { + metadata: ObjectMeta { + name: Some(self.name.clone()), + namespace: Some(self.namespace.clone()), + labels: Some(labels), + ..ObjectMeta::default() + }, + spec: PrometheusRuleSpec { + groups: self.rule_groups.clone(), + }, + }; + + let client = topology + .k8s_client() + .await + .map_err(|e| InterpretError::new(format!("Failed to get k8s client: {e}")))?; + + client + .apply(&prometheus_rule, Some(&self.namespace)) + .await + .map_err(|e| InterpretError::new(e.to_string()))?; + + Ok(Outcome::success(format!( + "PrometheusRule '{}' applied to namespace '{}' with {} rule group(s)", + self.name, + self.namespace, + self.rule_groups.len() + ))) + } + + fn get_name(&self) -> InterpretName { + InterpretName::Custom("OpenshiftPrometheusRule") + } + + fn get_version(&self) -> Version { + todo!() + } + + fn get_status(&self) -> InterpretStatus { + todo!() + } + + fn get_children(&self) -> Vec { + todo!() + } +} + +fn default_rule_labels() -> BTreeMap { + let mut labels = BTreeMap::new(); + labels.insert("prometheus".to_string(), "k8s".to_string()); + labels.insert("role".to_string(), "alert-rules".to_string()); + labels +} diff --git a/harmony/src/modules/monitoring/okd/mod.rs b/harmony/src/modules/monitoring/okd/mod.rs index ac246c5f..76d8b58b 100644 --- a/harmony/src/modules/monitoring/okd/mod.rs +++ 
b/harmony/src/modules/monitoring/okd/mod.rs @@ -1,5 +1,6 @@ use crate::topology::oberservability::monitoring::AlertSender; +pub mod cluster_alert_rules; pub mod cluster_monitoring; pub(crate) mod config; pub mod enable_user_workload; -- 2.39.5 From 391c44b369ccee574edf5bd0f55b671bfcd047df Mon Sep 17 00:00:00 2001 From: Sylvain Tremblay Date: Mon, 20 Apr 2026 15:29:54 -0400 Subject: [PATCH 06/57] feat: add the datadog-15-k8s-metrics dashboard --- .../dashboards/datadog-15-k8s-metrics.json | 852 ++++++++++++++++++ .../monitoring/cluster_dashboards/score.rs | 6 +- 2 files changed, 857 insertions(+), 1 deletion(-) create mode 100644 harmony/src/modules/monitoring/cluster_dashboards/dashboards/datadog-15-k8s-metrics.json diff --git a/harmony/src/modules/monitoring/cluster_dashboards/dashboards/datadog-15-k8s-metrics.json b/harmony/src/modules/monitoring/cluster_dashboards/dashboards/datadog-15-k8s-metrics.json new file mode 100644 index 00000000..af699af4 --- /dev/null +++ b/harmony/src/modules/monitoring/cluster_dashboards/dashboards/datadog-15-k8s-metrics.json @@ -0,0 +1,852 @@ +{ + "title": "Datadog — 15 Key Kubernetes Metrics", + "uid": "datadog-15-k8s-metrics", + "schemaVersion": 36, + "version": 1, + "refresh": "30s", + "time": { "from": "now-1h", "to": "now" }, + "tags": ["kubernetes", "datadog", "key-metrics", "cluster", "control-plane"], + "templating": { + "list": [ + { + "name": "namespace", + "type": "query", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "query": { "query": "label_values(kube_pod_info, namespace)", "refId": "A" }, + "refresh": 2, + "includeAll": true, + "multi": true, + "allValue": ".*", + "label": "Namespace", + "sort": 1, + "current": {}, + "options": [] + }, + { + "name": "node", + "type": "query", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "query": { "query": "label_values(kube_node_info, node)", "refId": "A" }, + "refresh": 2, + "includeAll": true, + "multi": true, + "allValue": 
".*", + "label": "Node", + "sort": 1, + "current": {}, + "options": [] + } + ] + }, + "panels": [ + + { + "id": 100, "type": "row", "title": "Cluster State — metrics 1–3 (Node status, Desired vs current pods, Available vs unavailable pods)", + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 } + }, + + { + "id": 1, "type": "stat", "title": "Ready Nodes", + "description": "Metric 1 — Node status. Count of nodes with condition Ready=true. A node that drops out of Ready can no longer accept new pods; scheduling freezes until it recovers or is drained.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"true\"} == 1)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ]}, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 4, "x": 0, "y": 1 } + }, + + { + "id": 2, "type": "stat", "title": "Not Ready Nodes", + "description": "Nodes reporting Ready=false. These nodes cannot host new pods and existing pods may be evicted. 
Alert immediately.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"false\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 1 } + ]}, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 4, "x": 4, "y": 1 } + }, + + { + "id": 3, "type": "stat", "title": "MemoryPressure", + "description": "Nodes flagged by kubelet as being under memory pressure. The kubelet will begin evicting pods that most exceed their memory request.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(kube_node_status_condition{condition=\"MemoryPressure\",status=\"true\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 1 } + ]}, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 4, "x": 8, "y": 1 } + }, + + { + "id": 4, "type": "stat", "title": "DiskPressure", + "description": "Nodes under disk pressure. 
Kubelet runs GC (removing unused images and dead containers) and, if space stays low, starts evicting pods.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(kube_node_status_condition{condition=\"DiskPressure\",status=\"true\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 1 } + ]}, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 4, "x": 12, "y": 1 } + }, + + { + "id": 5, "type": "stat", "title": "PIDPressure", + "description": "Nodes that have exhausted their PID space. New processes / containers on the node will fail to start.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(kube_node_status_condition{condition=\"PIDPressure\",status=\"true\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 1 } + ]}, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 4, "x": 16, "y": 1 } + }, + + { + "id": 6, "type": "stat", "title": "NetworkUnavailable", + "description": "Nodes whose CNI has not (yet) wired the pod network. 
Pods cannot schedule onto the node until this clears.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(kube_node_status_condition{condition=\"NetworkUnavailable\",status=\"true\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 1 } + ]}, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 4, "x": 20, "y": 1 } + }, + + { + "id": 7, "type": "timeseries", "title": "Deployments — Desired vs Current pods", + "description": "Metric 2 — Desired vs current pods (Deployments). A persistent gap means pods cannot be scheduled: check node capacity, PodDisruptionBudgets, and image pull failures.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "sum(kube_deployment_spec_replicas{namespace=~\"$namespace\"})", "refId": "A", "legendFormat": "desired" }, + { "expr": "sum(kube_deployment_status_replicas{namespace=~\"$namespace\"})", "refId": "B", "legendFormat": "current" } + ], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "desired" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "current" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] } + ] + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": 
"bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 5 } + }, + + { + "id": 8, "type": "timeseries", "title": "Deployments — Available vs Unavailable pods", + "description": "Metric 3 — Available/unavailable (Deployments). Spikes in unavailable are customer-visible: crashes, failed readiness probes, or resource shortages.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "sum(kube_deployment_status_replicas_available{namespace=~\"$namespace\"})", "refId": "A", "legendFormat": "available" }, + { "expr": "sum(kube_deployment_status_replicas_unavailable{namespace=~\"$namespace\"})", "refId": "B", "legendFormat": "unavailable" } + ], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false } + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "available" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "unavailable" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] } + ] + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 5 } + }, + + { + "id": 9, "type": "table", "title": "Top Deployments with unavailable replicas", + "description": "Deployments that currently report unavailable replicas. 
Investigate pod events / readiness probes / resource quotas for these.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "topk(20, max by(namespace, deployment)(kube_deployment_status_replicas_unavailable{namespace=~\"$namespace\"}) > 0)", + "refId": "A", "legendFormat": "", "format": "table", "instant": true + }], + "fieldConfig": { + "defaults": { + "unit": "short", "custom": { "align": "auto" }, + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 1 } + ]} + } + }, + "options": { "showHeader": true }, + "transformations": [ + { "id": "organize", "options": { "excludeByName": { "Time": true, "__name__": true, "instance": true, "job": true, "endpoint": true, "service": true, "pod": true, "container": true, "prometheus": true, "container_name": true, "namespace_labels": true }, "renameByName": { "Value": "unavailable" } } } + ], + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 5 } + }, + + { + "id": 10, "type": "timeseries", "title": "DaemonSets — Desired vs Scheduled", + "description": "Metric 2 — Desired vs current pods (DaemonSets). 
DaemonSets should have one pod per matching node; a gap means the pod cannot be placed (taints, resources, node selectors).", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "sum(kube_daemonset_status_desired_number_scheduled{namespace=~\"$namespace\"})", "refId": "A", "legendFormat": "desired" }, + { "expr": "sum(kube_daemonset_status_current_number_scheduled{namespace=~\"$namespace\"})", "refId": "B", "legendFormat": "scheduled" } + ], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "desired" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "scheduled" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] } + ] + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 13 } + }, + + { + "id": 11, "type": "timeseries", "title": "DaemonSets — Available vs Unavailable", + "description": "Metric 3 — Available/unavailable (DaemonSets). 
Unavailable DaemonSet pods often mean per-node infrastructure pods (CNI, logging, monitoring agents) are failing on specific nodes.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "sum(kube_daemonset_status_number_available{namespace=~\"$namespace\"})", "refId": "A", "legendFormat": "available" }, + { "expr": "sum(kube_daemonset_status_number_unavailable{namespace=~\"$namespace\"})", "refId": "B", "legendFormat": "unavailable" } + ], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false } + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "available" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "unavailable" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] } + ] + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 13 } + }, + + { + "id": 200, "type": "row", "title": "Resources — Memory (metrics 4–6)", + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 21 } + }, + + { + "id": 20, "type": "timeseries", "title": "Cluster memory — usage vs requests vs limits", + "description": "Metrics 4–5 — aggregate. Compares how much memory containers actually consume (working set) to what they requested and what they are limited to. 
A pod that crosses its limit is OOMKilled.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "sum(container_memory_working_set_bytes{namespace=~\"$namespace\",container!=\"\",container!=\"POD\"})", "refId": "A", "legendFormat": "usage" }, + { "expr": "sum(kube_pod_container_resource_requests{namespace=~\"$namespace\",resource=\"memory\",container!=\"\"})", "refId": "B", "legendFormat": "requests" }, + { "expr": "sum(kube_pod_container_resource_limits{namespace=~\"$namespace\",resource=\"memory\",container!=\"\"})", "refId": "C", "legendFormat": "limits" } + ], + "fieldConfig": { + "defaults": { + "unit": "bytes", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "usage" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "requests" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "limits" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] } + ] + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 22 } + }, + + { + "id": 21, "type": "timeseries", "title": "Top 15 pods — memory usage / memory limit (%)", + "description": "Metric 4 — pod-level. Pods approaching 100% of their memory limit will be OOMKilled. 
If a pod persistently sits near the limit, either raise the limit or optimize memory use.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "topk(15,\n 100 * sum by(namespace, pod)(container_memory_working_set_bytes{namespace=~\"$namespace\",container!=\"\",container!=\"POD\"})\n /\n sum by(namespace, pod)(kube_pod_container_resource_limits{namespace=~\"$namespace\",resource=\"memory\",container!=\"\"})\n)", + "refId": "A", "legendFormat": "{{namespace}}/{{pod}}" + }], + "fieldConfig": { + "defaults": { + "unit": "percent", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 22 } + }, + + { + "id": 22, "type": "timeseries", "title": "Node memory — requests vs allocatable", + "description": "Metric 6 — per node. Compares the sum of pod memory requests placed on each node to the node's allocatable memory. 
If requests approach allocatable, the scheduler can no longer place new pods on that node.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "sum by(node)(kube_pod_container_resource_requests{resource=\"memory\",container!=\"\",node=~\"$node\"})", "refId": "A", "legendFormat": "{{node}} — requested" }, + { "expr": "sum by(node)(kube_node_status_allocatable{resource=\"memory\",node=~\"$node\"})", "refId": "B", "legendFormat": "{{node}} — allocatable" } + ], + "fieldConfig": { + "defaults": { + "unit": "bytes", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 30 } + }, + + { + "id": 23, "type": "bargauge", "title": "Node memory commitment (requests / allocatable)", + "description": "How full each node is in terms of scheduled (requested) memory. 
≥ 100% means no further pods requesting memory can be scheduled there.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "100 *\n sum by(node)(kube_pod_container_resource_requests{resource=\"memory\",container!=\"\",node=~\"$node\"})\n /\n sum by(node)(kube_node_status_allocatable{resource=\"memory\",node=~\"$node\"})", + "refId": "A", "legendFormat": "{{node}}", "instant": true + }], + "fieldConfig": { + "defaults": { + "unit": "percent", "min": 0, "max": 100, + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 70 }, + { "color": "red", "value": 90 } + ]} + } + }, + "options": { + "orientation": "horizontal", + "displayMode": "gradient", + "showUnfilled": true, + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 30 } + }, + + { + "id": 300, "type": "row", "title": "Resources — CPU (metrics 8–10)", + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 38 } + }, + + { + "id": 30, "type": "timeseries", "title": "Cluster CPU — usage vs requests vs limits", + "description": "Metrics 9–10 — aggregate. Unlike memory, CPU is compressible: exceeding a limit causes throttling (slow), not OOMKill. 
A persistent gap between usage and limits is fine; a persistent gap between usage and requests wastes capacity.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=~\"$namespace\",container!=\"\",container!=\"POD\"}[5m]))", "refId": "A", "legendFormat": "usage" }, + { "expr": "sum(kube_pod_container_resource_requests{namespace=~\"$namespace\",resource=\"cpu\",container!=\"\"})", "refId": "B", "legendFormat": "requests" }, + { "expr": "sum(kube_pod_container_resource_limits{namespace=~\"$namespace\",resource=\"cpu\",container!=\"\"})", "refId": "C", "legendFormat": "limits" } + ], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "usage" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "requests" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "limits" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] } + ] + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 39 } + }, + + { + "id": 31, "type": "timeseries", "title": "Top 15 pods — CPU usage / CPU limit (%)", + "description": "Metric 9 — pod-level. 
Pods that sit at or near 100% for long windows are being throttled by the kernel, which causes latency spikes even though the pod is not killed.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "topk(15,\n 100 * sum by(namespace, pod)(rate(container_cpu_usage_seconds_total{namespace=~\"$namespace\",container!=\"\",container!=\"POD\"}[5m]))\n /\n sum by(namespace, pod)(kube_pod_container_resource_limits{namespace=~\"$namespace\",resource=\"cpu\",container!=\"\"})\n)", + "refId": "A", "legendFormat": "{{namespace}}/{{pod}}" + }], + "fieldConfig": { + "defaults": { + "unit": "percent", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 39 } + }, + + { + "id": 32, "type": "timeseries", "title": "Node CPU — requests vs allocatable", + "description": "Metric 8 — per node. 
Same shape as memory: once requests saturate allocatable CPU, no more pods requesting CPU can be placed on the node.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "sum by(node)(kube_pod_container_resource_requests{resource=\"cpu\",container!=\"\",node=~\"$node\"})", "refId": "A", "legendFormat": "{{node}} — requested" }, + { "expr": "sum by(node)(kube_node_status_allocatable{resource=\"cpu\",node=~\"$node\"})", "refId": "B", "legendFormat": "{{node}} — allocatable" } + ], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 47 } + }, + + { + "id": 33, "type": "bargauge", "title": "Node CPU commitment (requests / allocatable)", + "description": "How full each node is in terms of scheduled (requested) CPU. 
≥ 100% means no further pods requesting CPU can be scheduled there.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "100 *\n sum by(node)(kube_pod_container_resource_requests{resource=\"cpu\",container!=\"\",node=~\"$node\"})\n /\n sum by(node)(kube_node_status_allocatable{resource=\"cpu\",node=~\"$node\"})", + "refId": "A", "legendFormat": "{{node}}", "instant": true + }], + "fieldConfig": { + "defaults": { + "unit": "percent", "min": 0, "max": 100, + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 70 }, + { "color": "red", "value": 90 } + ]} + } + }, + "options": { + "orientation": "horizontal", + "displayMode": "gradient", + "showUnfilled": true, + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 47 } + }, + + { + "id": 400, "type": "row", "title": "Resources — Disk (metric 7)", + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 55 } + }, + + { + "id": 40, "type": "timeseries", "title": "Node root filesystem usage (%)", + "description": "Metric 7 — node level. Disk is non-compressible: when it is exhausted, kubelet raises DiskPressure and evicts pods. 
Alert well before 100%.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "100 * (1 - (\n sum by(instance)(node_filesystem_avail_bytes{mountpoint=~\"/|/var\",fstype!~\"tmpfs|overlay|squashfs|ramfs\"})\n /\n sum by(instance)(node_filesystem_size_bytes{mountpoint=~\"/|/var\",fstype!~\"tmpfs|overlay|squashfs|ramfs\"})\n))", + "refId": "A", "legendFormat": "{{instance}}" + }], + "fieldConfig": { + "defaults": { + "unit": "percent", "min": 0, "max": 100, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 70 }, + { "color": "red", "value": 85 } + ]} + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 56 } + }, + + { + "id": 41, "type": "table", "title": "Top 20 PVC usage (%)", + "description": "Metric 7 — volume level. Persistent volumes that fill up cause write errors inside applications. 
Alert at ~80% so there is time to expand or free space.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "topk(20,\n 100 * max by(namespace, persistentvolumeclaim)(kubelet_volume_stats_used_bytes{namespace=~\"$namespace\"})\n /\n max by(namespace, persistentvolumeclaim)(kubelet_volume_stats_capacity_bytes{namespace=~\"$namespace\"})\n)", + "refId": "A", "legendFormat": "", "format": "table", "instant": true + }], + "fieldConfig": { + "defaults": { + "unit": "percent", "min": 0, "max": 100, + "custom": { "align": "auto", "cellOptions": { "type": "color-background" } }, + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 70 }, + { "color": "red", "value": 85 } + ]} + } + }, + "options": { "showHeader": true }, + "transformations": [ + { "id": "organize", "options": { "excludeByName": { "Time": true, "__name__": true, "instance": true, "job": true, "endpoint": true, "service": true, "pod": true, "container": true, "prometheus": true }, "renameByName": { "Value": "usage %" } } } + ], + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 56 } + }, + + { + "id": 500, "type": "row", "title": "Control plane — etcd (metrics 11–12)", + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 64 } + }, + + { + "id": 50, "type": "stat", "title": "etcd has leader", + "description": "Metric 11 — etcd_server_has_leader. Minimum across members. 
0 means at least one member does not see a leader — the cluster may be partitioned or mid-election.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "min(etcd_server_has_leader)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ]}, + "mappings": [{ + "type": "value", + "options": { + "0": { "text": "NO LEADER", "color": "red" }, + "1": { "text": "LEADER OK", "color": "green" } + } + }], + "unit": "short", "noValue": "?" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 6, "x": 0, "y": 65 } + }, + + { + "id": 51, "type": "stat", "title": "Leader changes (last 1h)", + "description": "Metric 12 — etcd_server_leader_changes_seen_total increase over 1h. Frequent elections usually mean network flapping or resource exhaustion on a member.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "sum(increase(etcd_server_leader_changes_seen_total[1h]))", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 1 }, + { "color": "red", "value": 3 } + ]}, + "unit": "short", "noValue": "0", "decimals": 0 + } + }, + "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 6, "x": 6, "y": 65 } + }, + + { + "id": 52, "type": "timeseries", "title": "Leader changes rate per etcd member", + "description": "Per-member rate of leader transitions. 
A steady drumbeat on a single member points to that node specifically (its disk, its network).", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "rate(etcd_server_leader_changes_seen_total[5m])", + "refId": "A", "legendFormat": "{{instance}}" + }], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 65 } + }, + + { + "id": 53, "type": "timeseries", "title": "etcd has-leader per member", + "description": "Per-member value of etcd_server_has_leader. Any dip to 0 is the start of a leader election; frequent dips warrant investigation.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "etcd_server_has_leader", "refId": "A", "legendFormat": "{{instance}}" }], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, "max": 1, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false, "drawStyle": "line", "lineInterpolation": "stepAfter" } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["min", "lastNotNull"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 69 } + }, + + { + "id": 600, "type": "row", "title": "Control plane — API Server (metric 13)", + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 77 } + }, + + { + "id": 60, "type": "timeseries", "title": "API server request rate by verb", + "description": "Metric 13 — request count. Non-streaming calls per second by verb. 
Read-heavy (GET/LIST) load is usually controllers; write-heavy (POST/PUT/PATCH/DELETE) is user activity or autoscaling.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum by(verb)(rate(apiserver_request_total{verb!~\"WATCH|CONNECT\"}[5m]))", + "refId": "A", "legendFormat": "{{verb}}" + }], + "fieldConfig": { + "defaults": { + "unit": "reqps", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 78 } + }, + + { + "id": 61, "type": "timeseries", "title": "API server latency p50 / p95 / p99", + "description": "Metric 13 — request duration. Rising p99 with flat p50 is classic tail-latency degradation — look at a single slow resource or an overloaded admission webhook.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "histogram_quantile(0.50, sum(rate(apiserver_request_duration_seconds_bucket{verb!~\"WATCH|CONNECT\"}[5m])) by (le))", "refId": "A", "legendFormat": "p50" }, + { "expr": "histogram_quantile(0.95, sum(rate(apiserver_request_duration_seconds_bucket{verb!~\"WATCH|CONNECT\"}[5m])) by (le))", "refId": "B", "legendFormat": "p95" }, + { "expr": "histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{verb!~\"WATCH|CONNECT\"}[5m])) by (le))", "refId": "C", "legendFormat": "p99" } + ], + "fieldConfig": { + "defaults": { + "unit": "s", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 0, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + 
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 78 } + }, + + { + "id": 62, "type": "timeseries", "title": "API server error rate (HTTP 4xx / 5xx)", + "description": "Error rate by code. 429 = inflight-limit/throttling; 422 = admission-webhook rejections / invalid objects; 500/503 = apiserver faults or etcd unavailability.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum by(code)(rate(apiserver_request_total{code=~\"[45]..\"}[5m]))", + "refId": "A", "legendFormat": "HTTP {{code}}" + }], + "fieldConfig": { + "defaults": { + "unit": "reqps", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 86 } + }, + + { + "id": 63, "type": "timeseries", "title": "API server p99 latency by resource", + "description": "Latency broken down by Kubernetes resource — helps identify which object kind (pods, secrets, events…) is the slow one.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "histogram_quantile(0.99,\n sum by(resource, le)(rate(apiserver_request_duration_seconds_bucket{verb!~\"WATCH|CONNECT\"}[5m]))\n)", + "refId": "A", "legendFormat": "{{resource}}" + }], + "fieldConfig": { + "defaults": { + "unit": "s", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 1, "fillOpacity": 0, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 86 } + }, + + { + "id": 700, "type": "row", "title": "Control plane — Controller Manager & Scheduler (metrics 14–15)", + 
"collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 94 } + }, + + { + "id": 70, "type": "timeseries", "title": "Workqueue wait (queue_duration) — p99 by queue", + "description": "Metric 14 — how long items sit in each controller's workqueue before being picked up. A rising line indicates the controller can no longer keep up with cluster changes.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "histogram_quantile(0.99,\n sum by(name, le)(rate(workqueue_queue_duration_seconds_bucket[5m]))\n)", + "refId": "A", "legendFormat": "{{name}}" + }], + "fieldConfig": { + "defaults": { + "unit": "s", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 1, "fillOpacity": 0, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 95 } + }, + + { + "id": 71, "type": "timeseries", "title": "Workqueue work (work_duration) — p99 by queue", + "description": "Metric 14 — how long each reconcile actually takes. 
A rising line points at slow API calls or a slow reconcile loop.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "histogram_quantile(0.99,\n sum by(name, le)(rate(workqueue_work_duration_seconds_bucket[5m]))\n)", + "refId": "A", "legendFormat": "{{name}}" + }], + "fieldConfig": { + "defaults": { + "unit": "s", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 1, "fillOpacity": 0, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 95 } + }, + + { + "id": 72, "type": "timeseries", "title": "Scheduler — attempts per second by result", + "description": "Metric 15 — scheduler_schedule_attempts_total. 'unschedulable' = no node meets the pod's requirements (resources, taints, selectors); 'error' = a bug or stale cache in the scheduler.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum by(result)(rate(scheduler_schedule_attempts_total[5m]))", + "refId": "A", "legendFormat": "{{result}}" + }], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false } + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "scheduled" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "unschedulable" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "error" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] } + ] + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": 
"list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 103 } + }, + + { + "id": 73, "type": "timeseries", "title": "Scheduler — scheduling attempt latency (p50 / p95 / p99)", + "description": "Metric 15 — scheduler attempt duration. The PDF's scheduler_e2e_scheduling_duration_seconds was removed in Kubernetes 1.23; the modern equivalent is scheduler_scheduling_attempt_duration_seconds (time from picking a pod off the queue to binding it). A rising p99 often correlates with an overloaded apiserver or large, highly-constrained pod fleets.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "histogram_quantile(0.50, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket[5m])) by (le))", "refId": "A", "legendFormat": "p50" }, + { "expr": "histogram_quantile(0.95, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket[5m])) by (le))", "refId": "B", "legendFormat": "p95" }, + { "expr": "histogram_quantile(0.99, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket[5m])) by (le))", "refId": "C", "legendFormat": "p99" } + ], + "fieldConfig": { + "defaults": { + "unit": "s", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 0, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 103 } + } + ] +} diff --git a/harmony/src/modules/monitoring/cluster_dashboards/score.rs b/harmony/src/modules/monitoring/cluster_dashboards/score.rs index ed52ed12..7364c3c4 100644 --- a/harmony/src/modules/monitoring/cluster_dashboards/score.rs +++ b/harmony/src/modules/monitoring/cluster_dashboards/score.rs @@ -101,7 +101,7 @@ impl Interpret for ClusterDashboardsInterpret { Ok(Outcome::success(format!( "Cluster dashboards 
resources in namespace '{}' with {} dashboards successfully created", - self.namespace, 9 + self.namespace, 10 ))) } @@ -508,6 +508,10 @@ impl ClusterDashboardsInterpret { "okd-alerts-events", include_str!("dashboards/alerts-events-problems.json"), ), + ( + "datadog-15-k8s-metrics", + include_str!("dashboards/datadog-15-k8s-metrics.json"), + ), ]; for (dashboard_name, json_content) in dashboards { -- 2.39.5 From c2718e843b0e928aa5b142b9e278e1a36808a0ef Mon Sep 17 00:00:00 2001 From: Sylvain Tremblay Date: Mon, 20 Apr 2026 15:47:12 -0400 Subject: [PATCH 07/57] feat: improve ceph dashboard - list alerts and WHY it's NOT green --- .../cluster_dashboards/dashboards/ceph.json | 262 ++++++++++++++++-- 1 file changed, 240 insertions(+), 22 deletions(-) diff --git a/harmony/src/modules/monitoring/cluster_dashboards/dashboards/ceph.json b/harmony/src/modules/monitoring/cluster_dashboards/dashboards/ceph.json index d555511d..6f5e0cc7 100644 --- a/harmony/src/modules/monitoring/cluster_dashboards/dashboards/ceph.json +++ b/harmony/src/modules/monitoring/cluster_dashboards/dashboards/ceph.json @@ -172,10 +172,228 @@ }, { - "type": "row", "id": 8, "title": "Capacity", "collapsed": false, + "type": "row", "id": 100, "title": "Active Issues", "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 6 } }, + { + "type": "stat", "id": 101, "title": "Critical Ceph alerts firing", + "description": "Count of Ceph alert rules currently in firing state with severity=critical. Turns a red Health tile into a concrete number of alerts to act on. 
0 when the cluster is healthy.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "count(ALERTS{alertstate=\"firing\",alertname=~\"Ceph.*\",severity=\"critical\"}) or vector(0)", + "refId": "A", "legendFormat": "" + }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 1 } + ]}, + "unit": "short", "noValue": "0" + } + }, + "options": { + "colorMode": "background", "graphMode": "none", "justifyMode": "center", "textMode": "auto", + "reduceOptions": { "calcs": ["lastNotNull"] } + }, + "gridPos": { "h": 4, "w": 12, "x": 0, "y": 7 } + }, + + { + "type": "stat", "id": 102, "title": "Warning Ceph alerts firing", + "description": "Count of Ceph alert rules currently in firing state with severity=warning. Matches what drives the yellow HEALTH_WARN tile on this dashboard.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "count(ALERTS{alertstate=\"firing\",alertname=~\"Ceph.*\",severity=\"warning\"}) or vector(0)", + "refId": "A", "legendFormat": "" + }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 1 } + ]}, + "unit": "short", "noValue": "0" + } + }, + "options": { + "colorMode": "background", "graphMode": "none", "justifyMode": "center", "textMode": "auto", + "reduceOptions": { "calcs": ["lastNotNull"] } + }, + "gridPos": { "h": 4, "w": 12, "x": 12, "y": 7 } + }, + + { + "type": "row", "id": 104, "title": "Issue details — click to expand", "collapsed": true, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 11 }, + "panels": [ + + { + "type": "table", "id": 105, "title": "Active Ceph health checks (ceph health detail)", + "description": "Exactly what `ceph health detail` would show. 
One row per active health check; the Check column is the Ceph check code (OSD_DOWN, POOL_NEARFULL, PG_DEGRADED, MON_CLOCK_SKEW, etc.). Severity is the Ceph-native HEALTH_WARN / HEALTH_ERR label emitted by the mgr prometheus module. An empty table means Ceph reports no active health checks — the Health tile above should be HEALTH_OK. This is the primary answer to 'why isn't it green?'.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "ceph_health_detail == 1", + "refId": "A", "instant": true, "legendFormat": "" + }], + "transformations": [ + { "id": "labelsToFields", "options": { "mode": "columns" } }, + { + "id": "organize", + "options": { + "excludeByName": { + "__name__": true, + "Value": true, + "ceph_health_detail":true, + "Time": true, + "prometheus": true, + "container": true, + "endpoint": true, + "job": true, + "service": true, + "instance": true, + "pod": true, + "namespace": true + }, + "renameByName": { + "name": "Check", + "severity": "Severity" + }, + "indexByName": { + "severity": 0, + "name": 1 + } + } + } + ], + "fieldConfig": { + "defaults": { + "custom": { "align": "left" }, + "noValue": "— HEALTH_OK, no active checks —" + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Severity" }, + "properties": [ + { "id": "custom.displayMode", "value": "color-background" }, + { "id": "custom.width", "value": 150 }, + { + "id": "mappings", + "value": [{ + "type": "value", + "options": { + "HEALTH_ERR": { "text": "HEALTH_ERR", "color": "dark-red", "index": 0 }, + "HEALTH_WARN": { "text": "HEALTH_WARN", "color": "dark-yellow", "index": 1 } + } + }] + } + ] + }, + { "matcher": { "id": "byName", "options": "Check" }, "properties": [{ "id": "custom.width", "value": 320 }] } + ] + }, + "options": { + "sortBy": [{ "desc": false, "displayName": "Severity" }], + "footer": { "show": false } + }, + "gridPos": { "h": 6, "w": 12, "x": 0, "y": 12 } + }, + + { + "type": "table", "id": 103, "title": "Firing 
Ceph alerts (Alertmanager view)", + "description": "Instant-query view of every Ceph alert currently firing — the same set that pages oncall through Alertmanager. Usually matches the health-checks table above, plus derived alerts that have no direct ceph_health_detail counterpart (CephDaysUntilFull, CephNodeRootDiskUsage). The ALERTS metric carries labels only, not annotations: alert name plus daemon/pool/instance labels should be enough to identify the problem; run `oc -n openshift-monitoring get prometheusrule ceph-alerts -o yaml` or check Alertmanager for the full summary/description.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "ALERTS{alertstate=\"firing\",alertname=~\"Ceph.*\"}", + "refId": "A", "instant": true, "legendFormat": "" + }], + "transformations": [ + { "id": "labelsToFields", "options": { "mode": "columns" } }, + { + "id": "organize", + "options": { + "excludeByName": { + "alertstate": true, + "__name__": true, + "Value": true, + "ALERTS": true, + "Time": true, + "prometheus": true, + "container": true, + "endpoint": true, + "job": true, + "service": true + }, + "renameByName": { + "alertname": "Alert Name", + "severity": "Severity", + "ceph_daemon": "Ceph Daemon", + "pool_id": "Pool", + "instance": "Node / Instance", + "mountpoint": "Mountpoint", + "namespace": "Namespace" + }, + "indexByName": { + "severity": 0, + "alertname": 1, + "ceph_daemon": 2, + "pool_id": 3, + "instance": 4, + "mountpoint": 5, + "namespace": 6 + } + } + } + ], + "fieldConfig": { + "defaults": { + "custom": { "align": "left" }, + "noValue": "— no active Ceph issues —" + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Severity" }, + "properties": [ + { "id": "custom.displayMode", "value": "color-background" }, + { "id": "custom.width", "value": 110 }, + { + "id": "mappings", + "value": [{ + "type": "value", + "options": { + "critical": { "text": "CRITICAL", "color": "dark-red", "index": 0 }, + 
"warning": { "text": "WARNING", "color": "dark-yellow", "index": 1 }, + "info": { "text": "INFO", "color": "dark-blue", "index": 2 } + } + }] + } + ] + }, + { "matcher": { "id": "byName", "options": "Alert Name" }, "properties": [{ "id": "custom.width", "value": 280 }] }, + { "matcher": { "id": "byName", "options": "Ceph Daemon" }, "properties": [{ "id": "custom.width", "value": 180 }] }, + { "matcher": { "id": "byName", "options": "Pool" }, "properties": [{ "id": "custom.width", "value": 120 }] }, + { "matcher": { "id": "byName", "options": "Node / Instance" }, "properties": [{ "id": "custom.width", "value": 220 }] }, + { "matcher": { "id": "byName", "options": "Mountpoint" }, "properties": [{ "id": "custom.width", "value": 180 }] } + ] + }, + "options": { + "sortBy": [{ "desc": false, "displayName": "Severity" }], + "footer": { "show": false } + }, + "gridPos": { "h": 6, "w": 12, "x": 12, "y": 12 } + } + + ] + }, + + { + "type": "row", "id": 8, "title": "Capacity", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 12 } + }, + { "type": "gauge", "id": 9, "title": "Cluster Used (%)", "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, @@ -198,7 +416,7 @@ "reduceOptions": { "calcs": ["lastNotNull"] }, "showThresholdLabels": true, "showThresholdMarkers": true }, - "gridPos": { "h": 8, "w": 5, "x": 0, "y": 7 } + "gridPos": { "h": 8, "w": 5, "x": 0, "y": 13 } }, { @@ -220,7 +438,7 @@ "reduceOptions": { "calcs": ["lastNotNull"] }, "colorMode": "value", "graphMode": "none", "textMode": "auto", "orientation": "vertical" }, - "gridPos": { "h": 8, "w": 4, "x": 5, "y": 7 } + "gridPos": { "h": 8, "w": 4, "x": 5, "y": 13 } }, { @@ -237,7 +455,7 @@ "custom": { "lineWidth": 2, "fillOpacity": 8 } } }, - "gridPos": { "h": 8, "w": 11, "x": 9, "y": 7 } + "gridPos": { "h": 8, "w": 11, "x": 9, "y": 13 } }, { @@ -263,7 +481,7 @@ "reduceOptions": { "calcs": ["lastNotNull"] }, "colorMode": "background", "graphMode": "none", "textMode": "auto" }, - 
"gridPos": { "h": 8, "w": 4, "x": 20, "y": 7 } + "gridPos": { "h": 8, "w": 4, "x": 20, "y": 13 } }, { @@ -295,7 +513,7 @@ "sortBy": "Value", "sortOrder": "desc" }, - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 15 } + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 21 } }, { @@ -323,12 +541,12 @@ "displayMode": "gradient", "showUnfilled": true }, - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 15 } + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 21 } }, { "type": "row", "id": 15, "title": "Performance", "collapsed": false, - "gridPos": { "h": 1, "w": 24, "x": 0, "y": 23 } + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 29 } }, { @@ -345,7 +563,7 @@ "custom": { "lineWidth": 2, "fillOpacity": 8 } } }, - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 } + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 30 } }, { @@ -362,7 +580,7 @@ "custom": { "lineWidth": 2, "fillOpacity": 8 } } }, - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 } + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 30 } }, { @@ -385,7 +603,7 @@ "custom": { "lineWidth": 2, "fillOpacity": 8 } } }, - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 32 } + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 38 } }, { @@ -405,12 +623,12 @@ { "matcher": { "id": "byName", "options": "Recovery ops/s" }, "properties": [{ "id": "unit", "value": "ops" }] } ] }, - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 32 } + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 38 } }, { "type": "row", "id": 20, "title": "Placement Group Health", "collapsed": false, - "gridPos": { "h": 1, "w": 24, "x": 0, "y": 40 } + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 46 } }, { @@ -446,7 +664,7 @@ "sortDesc": true } }, - "gridPos": { "h": 8, "w": 16, "x": 0, "y": 41 } + "gridPos": { "h": 8, "w": 16, "x": 0, "y": 47 } }, { @@ -467,7 +685,7 @@ "reduceOptions": { "calcs": ["lastNotNull"] }, "colorMode": "background", "graphMode": "area", "textMode": "auto" }, - "gridPos": { "h": 4, "w": 8, "x": 16, "y": 41 } + "gridPos": { "h": 4, "w": 8, "x": 16, "y": 47 } }, { @@ -490,12 +708,12 @@ 
"reduceOptions": { "calcs": ["lastNotNull"] }, "colorMode": "background", "graphMode": "none", "textMode": "auto", "orientation": "horizontal" }, - "gridPos": { "h": 4, "w": 8, "x": 16, "y": 45 } + "gridPos": { "h": 4, "w": 8, "x": 16, "y": 51 } }, { "type": "row", "id": 24, "title": "OSD Detail", "collapsed": false, - "gridPos": { "h": 1, "w": 24, "x": 0, "y": 49 } + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 55 } }, { @@ -561,7 +779,7 @@ } ] }, - "gridPos": { "h": 10, "w": 16, "x": 0, "y": 50 } + "gridPos": { "h": 10, "w": 16, "x": 0, "y": 56 } }, { @@ -578,12 +796,12 @@ "custom": { "lineWidth": 1, "fillOpacity": 0 } } }, - "gridPos": { "h": 10, "w": 8, "x": 16, "y": 50 } + "gridPos": { "h": 10, "w": 8, "x": 16, "y": 56 } }, { "type": "row", "id": 27, "title": "Pool Detail", "collapsed": false, - "gridPos": { "h": 1, "w": 24, "x": 0, "y": 60 } + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 66 } }, { @@ -634,7 +852,7 @@ } ] }, - "gridPos": { "h": 10, "w": 14, "x": 0, "y": 61 } + "gridPos": { "h": 10, "w": 14, "x": 0, "y": 67 } }, { @@ -667,7 +885,7 @@ "sortDesc": true } }, - "gridPos": { "h": 10, "w": 10, "x": 14, "y": 61 } + "gridPos": { "h": 10, "w": 10, "x": 14, "y": 67 } } ] -- 2.39.5 From 349c2a13583f19cf49c6e6d9b69fbed8c31d538a Mon Sep 17 00:00:00 2001 From: Sylvain Tremblay Date: Mon, 20 Apr 2026 15:58:52 -0400 Subject: [PATCH 08/57] feat: improve ceph dashboard --- .../monitoring/cluster_dashboards/dashboards/ceph.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/harmony/src/modules/monitoring/cluster_dashboards/dashboards/ceph.json b/harmony/src/modules/monitoring/cluster_dashboards/dashboards/ceph.json index 6f5e0cc7..f54db405 100644 --- a/harmony/src/modules/monitoring/cluster_dashboards/dashboards/ceph.json +++ b/harmony/src/modules/monitoring/cluster_dashboards/dashboards/ceph.json @@ -278,7 +278,7 @@ { "matcher": { "id": "byName", "options": "Severity" }, "properties": [ - { "id": "custom.displayMode", "value": 
"color-background" }, + { "id": "custom.cellOptions", "value": { "type": "color-background", "mode": "basic" } }, { "id": "custom.width", "value": 150 }, { "id": "mappings", @@ -357,7 +357,7 @@ { "matcher": { "id": "byName", "options": "Severity" }, "properties": [ - { "id": "custom.displayMode", "value": "color-background" }, + { "id": "custom.cellOptions", "value": { "type": "color-background", "mode": "basic" } }, { "id": "custom.width", "value": 110 }, { "id": "mappings", -- 2.39.5 From bf4f300383b6086b94646ecab0ef7f1901aa9a6f Mon Sep 17 00:00:00 2001 From: Sylvain Tremblay Date: Tue, 21 Apr 2026 10:13:58 -0400 Subject: [PATCH 09/57] feat(discovery): capture bond, blacklist and bond-mode intent per host Extend DiscoverHostForRoleScore with three new interactive prompts after the installation-disk selection: - "Configure a network bond?" (only when host has >= 2 NICs), followed by a multi-select of bond members (min 2) and a bond-mode picker (LACP / active-backup / balance-rr / balance-xor / broadcast / balance-tlb / balance-alb). - "Blacklist any remaining interface?", with candidates limited to NICs not already claimed by the bond. The answers are persisted as a JSON-encoded NetworkConfig on a new host_role_mapping.network_config column. HostConfig now exposes network_config alongside installation_device so downstream scores can honor the user's intent. Also adds a new harmony_host_discovery example that discovers a single host on 192.168.40.0/24:25000. 
--- ...cd256d74f572629b8c0764782066e705c50c.json} | 6 +- ...52a9193dcb09a4b917f0fde9f39058e0f276.json} | 10 +- ...090c94a222115c543231f2140cba27bd0f067.json | 2 +- Cargo.lock | 13 ++ examples/harmony_host_discovery/Cargo.toml | 15 ++ examples/harmony_host_discovery/env.sh | 4 + examples/harmony_host_discovery/src/main.rs | 27 +++ harmony/src/domain/inventory/repository.rs | 6 +- harmony/src/domain/topology/host_binding.rs | 59 +++++- harmony/src/infra/inventory/sqlite.rs | 21 ++- harmony/src/modules/inventory/discovery.rs | 170 +++++++++++++++++- ...dd_network_config_to_host_role_mapping.sql | 3 + 12 files changed, 321 insertions(+), 15 deletions(-) rename .sqlx/{query-6fcc29cfdbdf3b2cee94a4844e227f09b245dd8f079832a9a7b774151cb03af6.json => query-165b944d13c8f7810b4e3ef891e5cd256d74f572629b8c0764782066e705c50c.json} (50%) rename .sqlx/{query-24f719d57144ecf4daa55f0aa5836c165872d70164401c0388e8d625f1b72d7b.json => query-43cfa7b6dda8b9745ef74eb45f3f52a9193dcb09a4b917f0fde9f39058e0f276.json} (55%) create mode 100644 examples/harmony_host_discovery/Cargo.toml create mode 100644 examples/harmony_host_discovery/env.sh create mode 100644 examples/harmony_host_discovery/src/main.rs create mode 100644 migrations/20260421000000_add_network_config_to_host_role_mapping.sql diff --git a/.sqlx/query-6fcc29cfdbdf3b2cee94a4844e227f09b245dd8f079832a9a7b774151cb03af6.json b/.sqlx/query-165b944d13c8f7810b4e3ef891e5cd256d74f572629b8c0764782066e705c50c.json similarity index 50% rename from .sqlx/query-6fcc29cfdbdf3b2cee94a4844e227f09b245dd8f079832a9a7b774151cb03af6.json rename to .sqlx/query-165b944d13c8f7810b4e3ef891e5cd256d74f572629b8c0764782066e705c50c.json index d3f774b8..deacd686 100644 --- a/.sqlx/query-6fcc29cfdbdf3b2cee94a4844e227f09b245dd8f079832a9a7b774151cb03af6.json +++ b/.sqlx/query-165b944d13c8f7810b4e3ef891e5cd256d74f572629b8c0764782066e705c50c.json @@ -1,12 +1,12 @@ { "db_name": "SQLite", - "query": "\n INSERT INTO host_role_mapping (host_id, role, 
installation_device)\n VALUES (?, ?, ?)\n ", + "query": "\n INSERT INTO host_role_mapping (host_id, role, installation_device, network_config)\n VALUES (?, ?, ?, ?)\n ", "describe": { "columns": [], "parameters": { - "Right": 3 + "Right": 4 }, "nullable": [] }, - "hash": "6fcc29cfdbdf3b2cee94a4844e227f09b245dd8f079832a9a7b774151cb03af6" + "hash": "165b944d13c8f7810b4e3ef891e5cd256d74f572629b8c0764782066e705c50c" } diff --git a/.sqlx/query-24f719d57144ecf4daa55f0aa5836c165872d70164401c0388e8d625f1b72d7b.json b/.sqlx/query-43cfa7b6dda8b9745ef74eb45f3f52a9193dcb09a4b917f0fde9f39058e0f276.json similarity index 55% rename from .sqlx/query-24f719d57144ecf4daa55f0aa5836c165872d70164401c0388e8d625f1b72d7b.json rename to .sqlx/query-43cfa7b6dda8b9745ef74eb45f3f52a9193dcb09a4b917f0fde9f39058e0f276.json index 60209751..b899023d 100644 --- a/.sqlx/query-24f719d57144ecf4daa55f0aa5836c165872d70164401c0388e8d625f1b72d7b.json +++ b/.sqlx/query-43cfa7b6dda8b9745ef74eb45f3f52a9193dcb09a4b917f0fde9f39058e0f276.json @@ -1,6 +1,6 @@ { "db_name": "SQLite", - "query": "SELECT host_id, installation_device FROM host_role_mapping WHERE role = ?", + "query": "SELECT host_id, installation_device, network_config FROM host_role_mapping WHERE role = ?", "describe": { "columns": [ { @@ -12,6 +12,11 @@ "name": "installation_device", "ordinal": 1, "type_info": "Text" + }, + { + "name": "network_config", + "ordinal": 2, + "type_info": "Text" } ], "parameters": { @@ -19,8 +24,9 @@ }, "nullable": [ false, + true, true ] }, - "hash": "24f719d57144ecf4daa55f0aa5836c165872d70164401c0388e8d625f1b72d7b" + "hash": "43cfa7b6dda8b9745ef74eb45f3f52a9193dcb09a4b917f0fde9f39058e0f276" } diff --git a/.sqlx/query-8d247918eca10a88b784ee353db090c94a222115c543231f2140cba27bd0f067.json b/.sqlx/query-8d247918eca10a88b784ee353db090c94a222115c543231f2140cba27bd0f067.json index 0b92e37a..ba998bc8 100644 --- a/.sqlx/query-8d247918eca10a88b784ee353db090c94a222115c543231f2140cba27bd0f067.json +++ 
b/.sqlx/query-8d247918eca10a88b784ee353db090c94a222115c543231f2140cba27bd0f067.json @@ -16,7 +16,7 @@ { "name": "data: Json", "ordinal": 2, - "type_info": "Blob" + "type_info": "Null" } ], "parameters": { diff --git a/Cargo.lock b/Cargo.lock index 007854cc..86a77a4b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3819,6 +3819,19 @@ dependencies = [ "thiserror 2.0.18", ] +[[package]] +name = "harmony_host_discovery" +version = "0.1.0" +dependencies = [ + "cidr", + "harmony", + "harmony_cli", + "harmony_macros", + "harmony_types", + "tokio", + "url", +] + [[package]] name = "harmony_i18n" version = "0.1.0" diff --git a/examples/harmony_host_discovery/Cargo.toml b/examples/harmony_host_discovery/Cargo.toml new file mode 100644 index 00000000..c043f434 --- /dev/null +++ b/examples/harmony_host_discovery/Cargo.toml @@ -0,0 +1,15 @@ +[package] +name = "harmony_host_discovery" +edition = "2024" +version.workspace = true +readme.workspace = true +license.workspace = true + +[dependencies] +harmony = { path = "../../harmony" } +harmony_cli = { path = "../../harmony_cli" } +harmony_macros = { path = "../../harmony_macros" } +harmony_types = { path = "../../harmony_types" } +tokio.workspace = true +url.workspace = true +cidr.workspace = true diff --git a/examples/harmony_host_discovery/env.sh b/examples/harmony_host_discovery/env.sh new file mode 100644 index 00000000..0b9da4f6 --- /dev/null +++ b/examples/harmony_host_discovery/env.sh @@ -0,0 +1,4 @@ +export HARMONY_SECRET_NAMESPACE=host-discovery +export HARMONY_SECRET_STORE=file +export HARMONY_DATABASE_URL=sqlite://harmony_host_discovery.sqlite +export RUST_LOG=harmony=debug diff --git a/examples/harmony_host_discovery/src/main.rs b/examples/harmony_host_discovery/src/main.rs new file mode 100644 index 00000000..98140d03 --- /dev/null +++ b/examples/harmony_host_discovery/src/main.rs @@ -0,0 +1,27 @@ +use harmony::{ + inventory::{HostRole, Inventory}, + modules::inventory::{DiscoverHostForRoleScore, 
HarmonyDiscoveryStrategy}, + topology::LocalhostTopology, +}; +use harmony_macros::cidrv4; + +#[tokio::main] +async fn main() { + let discover_one_host = DiscoverHostForRoleScore { + role: HostRole::Worker, + number_desired_hosts: 1, + discovery_strategy: HarmonyDiscoveryStrategy::SUBNET { + cidr: cidrv4!("192.168.40.0/24"), + port: 25000, + }, + }; + + harmony_cli::run( + Inventory::autoload(), + LocalhostTopology::new(), + vec![Box::new(discover_one_host)], + None, + ) + .await + .unwrap(); +} diff --git a/harmony/src/domain/inventory/repository.rs b/harmony/src/domain/inventory/repository.rs index e6a4eea8..de291528 100644 --- a/harmony/src/domain/inventory/repository.rs +++ b/harmony/src/domain/inventory/repository.rs @@ -1,7 +1,10 @@ use async_trait::async_trait; use crate::{ - hardware::PhysicalHost, interpret::InterpretError, inventory::HostRole, topology::HostConfig, + hardware::PhysicalHost, + interpret::InterpretError, + inventory::HostRole, + topology::{HostConfig, NetworkConfig}, }; /// Errors that can occur within the repository layer. @@ -40,5 +43,6 @@ pub trait InventoryRepository: Send + Sync + 'static { role: &HostRole, host: &PhysicalHost, installation_device: &String, + network_config: &NetworkConfig, ) -> Result<(), RepoError>; } diff --git a/harmony/src/domain/topology/host_binding.rs b/harmony/src/domain/topology/host_binding.rs index 63352762..90186fea 100644 --- a/harmony/src/domain/topology/host_binding.rs +++ b/harmony/src/domain/topology/host_binding.rs @@ -1,5 +1,5 @@ use derive_new::new; -use serde::Serialize; +use serde::{Deserialize, Serialize}; use crate::hardware::PhysicalHost; @@ -20,4 +20,61 @@ pub struct HostBinding { #[derive(Debug, new, Clone, Serialize)] pub struct HostConfig { pub installation_device: Option, + #[new(default)] + pub network_config: NetworkConfig, +} + +/// User-provided networking intent captured at discovery time. 
+/// +/// Produced by the interactive discovery flow and persisted alongside the role +/// mapping so downstream Scores can act on it (e.g. configuring a bond on the +/// chosen interfaces and avoiding blacklisted ones). +#[derive(Debug, Default, Clone, Serialize, Deserialize)] +pub struct NetworkConfig { + pub bond: Option, + pub blacklisted_interfaces: Vec, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct BondConfig { + pub interfaces: Vec, + pub mode: BondMode, +} + +/// Linux kernel bonding modes. +/// +/// Names match the `bonding` driver's `mode` parameter. See +/// for +/// detail on each mode's failover and load-balancing behaviour. +#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)] +pub enum BondMode { + /// mode 0 — round-robin across slaves. + BalanceRr, + /// mode 1 — only one slave active at a time; the other(s) take over on failure. + ActiveBackup, + /// mode 2 — XOR-based slave selection by (src MAC ⊕ dst MAC). + BalanceXor, + /// mode 3 — transmit everything on every slave. + Broadcast, + /// mode 4 — IEEE 802.3ad dynamic link aggregation (LACP). Requires switch support. + Lacp, + /// mode 5 — adaptive transmit load balancing; no switch support required. + BalanceTlb, + /// mode 6 — adaptive load balancing (TLB + receive load balancing via ARP negotiation). 
+ BalanceAlb, +} + +impl std::fmt::Display for BondMode { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let s = match self { + BondMode::BalanceRr => "balance-rr (mode 0) — round-robin", + BondMode::ActiveBackup => "active-backup (mode 1) — failover, no switch support needed", + BondMode::BalanceXor => "balance-xor (mode 2) — XOR hash", + BondMode::Broadcast => "broadcast (mode 3) — transmit on all slaves", + BondMode::Lacp => "802.3ad / LACP (mode 4) — dynamic link aggregation", + BondMode::BalanceTlb => "balance-tlb (mode 5) — adaptive transmit load balancing", + BondMode::BalanceAlb => "balance-alb (mode 6) — adaptive load balancing", + }; + f.write_str(s) + } } diff --git a/harmony/src/infra/inventory/sqlite.rs b/harmony/src/infra/inventory/sqlite.rs index 3ce1654f..56c3a4fd 100644 --- a/harmony/src/infra/inventory/sqlite.rs +++ b/harmony/src/infra/inventory/sqlite.rs @@ -1,7 +1,7 @@ use crate::{ hardware::PhysicalHost, inventory::{HostRole, InventoryRepository, RepoError}, - topology::HostConfig, + topology::{HostConfig, NetworkConfig}, }; use async_trait::async_trait; use harmony_types::id::Id; @@ -109,17 +109,21 @@ impl InventoryRepository for SqliteInventoryRepository { role: &HostRole, host: &PhysicalHost, installation_device: &String, + network_config: &NetworkConfig, ) -> Result<(), RepoError> { let host_id = host.id.to_string(); + let network_config_json = serde_json::to_string(network_config) + .map_err(|e| RepoError::Serialization(e.to_string()))?; sqlx::query!( r#" - INSERT INTO host_role_mapping (host_id, role, installation_device) - VALUES (?, ?, ?) + INSERT INTO host_role_mapping (host_id, role, installation_device, network_config) + VALUES (?, ?, ?, ?) 
"#, host_id, role, - installation_device + installation_device, + network_config_json, ) .execute(&self.pool) .await?; @@ -136,13 +140,14 @@ impl InventoryRepository for SqliteInventoryRepository { struct HostIdRow { host_id: String, installation_device: Option, + network_config: Option, } let role_str = format!("{:?}", role); let host_id_rows = sqlx::query_as!( HostIdRow, - "SELECT host_id, installation_device FROM host_role_mapping WHERE role = ?", + "SELECT host_id, installation_device, network_config FROM host_role_mapping WHERE role = ?", role_str ) .fetch_all(&self.pool) @@ -159,8 +164,14 @@ impl InventoryRepository for SqliteInventoryRepository { ))); } }; + let network_config = match row.network_config.as_deref() { + Some(json) => serde_json::from_str(json) + .map_err(|e| RepoError::Deserialization(e.to_string()))?, + None => NetworkConfig::default(), + }; let host_config = HostConfig { installation_device: row.installation_device, + network_config, }; hosts.push((physical_host, host_config)); } diff --git a/harmony/src/modules/inventory/discovery.rs b/harmony/src/modules/inventory/discovery.rs index bd3f7186..9d037c3e 100644 --- a/harmony/src/modules/inventory/discovery.rs +++ b/harmony/src/modules/inventory/discovery.rs @@ -1,16 +1,18 @@ use async_trait::async_trait; +use harmony_inventory_agent::hwinfo::NetworkInterface; use harmony_types::id::Id; use log::{error, info}; use serde::{Deserialize, Serialize}; use crate::{ data::Version, + hardware::PhysicalHost, infra::inventory::InventoryRepositoryFactory, interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome}, inventory::{HostRole, Inventory}, modules::inventory::{HarmonyDiscoveryStrategy, LaunchDiscoverInventoryAgentScore}, score::Score, - topology::Topology, + topology::{BondConfig, BondMode, NetworkConfig, Topology}, }; #[derive(Debug, Clone, Serialize, Deserialize)] @@ -117,8 +119,16 @@ impl Interpret for DiscoverHostForRoleInterpret { .map(|(_, name)| name.clone()) 
.unwrap(); info!("Selected disk {} for node {}", disk_name, choice.summary()); + + let network_config = prompt_network_config(&choice)?; + host_repo - .save_role_mapping(&self.score.role, &choice, &disk_name) + .save_role_mapping( + &self.score.role, + &choice, + &disk_name, + &network_config, + ) .await?; chosen_hosts.push(choice); } @@ -179,3 +189,159 @@ impl Interpret for DiscoverHostForRoleInterpret { todo!() } } + +/// Interactively ask the user how the host's networking should be set up. +/// +/// Skips both prompts when the host has fewer than two network interfaces +/// — bonding requires at least two, and blacklisting a single NIC would leave +/// the host unreachable. The resulting [`NetworkConfig`] is persisted alongside +/// the role mapping so downstream Scores can act on it later. +fn prompt_network_config(host: &PhysicalHost) -> Result { + if host.network.len() < 2 { + info!( + "Host {} has {} network interface(s); skipping bond/blacklist prompts", + host.summary(), + host.network.len() + ); + return Ok(NetworkConfig::default()); + } + + let format_iface = |nic: &NetworkInterface| -> String { + let speed = nic + .speed_mbps + .map(|s| format!("{}Mbps", s)) + .unwrap_or_else(|| "?Mbps".to_string()); + let state = if nic.is_up { "up" } else { "down" }; + let ips = if nic.ipv4_addresses.is_empty() { + String::new() + } else { + format!(" [{}]", nic.ipv4_addresses.join(",")) + }; + format!( + "{} ({}) - {} - {} - driver {}{}", + nic.name, nic.mac_address, speed, state, nic.driver, ips + ) + }; + + let options: Vec<(String, String)> = host + .network + .iter() + .map(|nic| (format_iface(nic), nic.name.clone())) + .collect(); + + // --- Bond --- + let wants_bond = inquire::Confirm::new(&format!( + "Host {} has {} interfaces. 
Configure a network bond?", + host.summary(), + host.network.len() + )) + .with_default(false) + .prompt() + .map_err(|e| InterpretError::new(format!("Could not ask about bond: {e}")))?; + + let bond = if wants_bond { + let display_refs: Vec<&str> = options.iter().map(|(d, _)| d.as_str()).collect(); + let selected = inquire::MultiSelect::new( + "Select the interfaces to include in the bond:", + display_refs, + ) + .with_validator(|choices: &[inquire::list_option::ListOption<&&str>]| { + if choices.len() < 2 { + Ok(inquire::validator::Validation::Invalid( + "Select at least two interfaces for a bond".into(), + )) + } else { + Ok(inquire::validator::Validation::Valid) + } + }) + .prompt() + .map_err(|e| InterpretError::new(format!("Could not select bond interfaces: {e}")))?; + + let interfaces: Vec = options + .iter() + .filter(|(display, _)| selected.iter().any(|s| *s == display.as_str())) + .map(|(_, name)| name.clone()) + .collect(); + + let mode_choices = vec![ + BondMode::Lacp, + BondMode::ActiveBackup, + BondMode::BalanceRr, + BondMode::BalanceXor, + BondMode::Broadcast, + BondMode::BalanceTlb, + BondMode::BalanceAlb, + ]; + let mode = inquire::Select::new("Select the bond mode:", mode_choices) + .with_starting_cursor(0) + .prompt() + .map_err(|e| InterpretError::new(format!("Could not select bond mode: {e}")))?; + + info!( + "Bond configured for host {} on interfaces [{}] with mode {}", + host.summary(), + interfaces.join(", "), + mode + ); + Some(BondConfig { interfaces, mode }) + } else { + None + }; + + // --- Blacklist --- + // Candidates exclude any interface already claimed by the bond. 
+ let bond_members: Vec<&String> = bond + .as_ref() + .map(|b| b.interfaces.iter().collect()) + .unwrap_or_default(); + + let blacklist_candidates: Vec<(String, String)> = options + .iter() + .filter(|(_, name)| !bond_members.iter().any(|b| *b == name)) + .cloned() + .collect(); + + let blacklisted_interfaces = if blacklist_candidates.is_empty() { + Vec::new() + } else { + let wants_blacklist = inquire::Confirm::new("Blacklist any remaining interface?") + .with_default(false) + .prompt() + .map_err(|e| InterpretError::new(format!("Could not ask about blacklist: {e}")))?; + + if wants_blacklist { + let display_refs: Vec<&str> = blacklist_candidates + .iter() + .map(|(d, _)| d.as_str()) + .collect(); + let selected = + inquire::MultiSelect::new("Select the interfaces to blacklist:", display_refs) + .prompt() + .map_err(|e| { + InterpretError::new(format!("Could not select blacklisted interfaces: {e}")) + })?; + + let names: Vec = blacklist_candidates + .iter() + .filter(|(display, _)| selected.iter().any(|s| *s == display.as_str())) + .map(|(_, name)| name.clone()) + .collect(); + + if !names.is_empty() { + info!( + "Blacklisted interfaces on host {}: {}", + host.summary(), + names.join(", ") + ); + } + names + } else { + Vec::new() + } + }; + + Ok(NetworkConfig { + bond, + blacklisted_interfaces, + }) +} diff --git a/migrations/20260421000000_add_network_config_to_host_role_mapping.sql b/migrations/20260421000000_add_network_config_to_host_role_mapping.sql new file mode 100644 index 00000000..98a213d7 --- /dev/null +++ b/migrations/20260421000000_add_network_config_to_host_role_mapping.sql @@ -0,0 +1,3 @@ +-- Add network_config column to host_role_mapping. +-- Stores a JSON-encoded NetworkConfig (bond selection + interface blacklist). 
+ALTER TABLE host_role_mapping ADD COLUMN network_config TEXT; -- 2.39.5 From bdba4dda275b0cfb21b9f4c7d607ba50b5fd86e6 Mon Sep 17 00:00:00 2001 From: Sylvain Tremblay Date: Tue, 21 Apr 2026 10:35:48 -0400 Subject: [PATCH 10/57] feat(discovery): tighten host summary and readability of prompts - PhysicalHost::summary() becomes terser and more informative: - Storage: "400 GB [8 GB, 477 GB]" (was "400 GB Storage (2 Disks [8 GB, 477 GB])"). Single-disk collapses to just the total. - Network: list every NIC as "[ip, mac]" with a count prefix (e.g. "3 NICs: [192.168.40.10, 98:fa:9b:03:17:6f], [00:e0:ed:7a:ec:4d], ..."). Single-NIC form drops the count and "s": "NIC: [ip, mac]". NICs without an IPv4 render as "[mac]". - Promote the inventory agent's Chipset { vendor, name } into a "system-product-name" label during host conversion (both MDNS and CIDR flows), so summary()'s first field shows "LENOVO 3136" instead of falling back to the HostCategory string ("Server"). Extracted into build_discovered_host_labels() to keep the two conversion sites in sync. When the chipset is blank, the old category fallback still applies. - Print a blank line before every interactive inquire prompt in the discovery flow (role pick, disk pick, bond confirm/multi-select/mode, blacklist confirm/multi-select) so prompts stand out from the preceding log output on the terminal. 
--- harmony/src/domain/hardware/mod.rs | 46 ++++++++------------ harmony/src/modules/inventory/discovery.rs | 7 ++++ harmony/src/modules/inventory/mod.rs | 49 +++++++++++++++++----- 3 files changed, 63 insertions(+), 39 deletions(-) diff --git a/harmony/src/domain/hardware/mod.rs b/harmony/src/domain/hardware/mod.rs index 2d7a0347..b883318b 100644 --- a/harmony/src/domain/hardware/mod.rs +++ b/harmony/src/domain/hardware/mod.rs @@ -94,7 +94,6 @@ impl PhysicalHost { if !self.storage.is_empty() { let total_storage_bytes = self.storage.iter().map(|d| d.size_bytes).sum::(); let drive_count = self.storage.len(); - let first_drive_model = &self.storage[0].model; // Helper to format bytes into TB or GB let format_storage = |bytes: u64| { @@ -115,40 +114,31 @@ impl PhysicalHost { .collect::>() .join(", "); - format!( - "{} Storage ({} Disks [{}])", - format_storage(total_storage_bytes), - drive_count, - drive_sizes - ) + format!("{} [{}]", format_storage(total_storage_bytes), drive_sizes) } else { - format!( - "{} Storage ({})", - format_storage(total_storage_bytes), - first_drive_model - ) + format_storage(total_storage_bytes) }; parts.push(storage_summary); } - // Part 5: Network Information - // Prioritize an "up" interface with an IPv4 address - let best_nic = self - .network - .iter() - .find(|n| n.is_up && !n.ipv4_addresses.is_empty()) - .or_else(|| self.network.first()); + // Part 5: Network Information — list every NIC with its IPv4 (when present) and MAC. 
+ if !self.network.is_empty() { + let per_nic: Vec = self + .network + .iter() + .map(|nic| { + let mac = nic.mac_address.to_string(); + match nic.ipv4_addresses.first() { + Some(ip) => format!("[{}, {}]", ip, mac), + None => format!("[{}]", mac), + } + }) + .collect(); - if let Some(nic) = best_nic { - let speed = nic - .speed_mbps - .map(|s| format!("{}Gbps", s / 1000)) - .unwrap_or_else(|| "N/A".to_string()); - let mac = nic.mac_address.to_string(); - let nic_summary = if let Some(ip) = nic.ipv4_addresses.first() { - format!("NIC: {} ({}, {})", speed, ip, mac) + let nic_summary = if per_nic.len() == 1 { + format!("NIC: {}", per_nic[0]) } else { - format!("NIC: {} ({})", speed, mac) + format!("{} NICs: {}", per_nic.len(), per_nic.join(", ")) }; parts.push(nic_summary); } diff --git a/harmony/src/modules/inventory/discovery.rs b/harmony/src/modules/inventory/discovery.rs index 9d037c3e..9aedef30 100644 --- a/harmony/src/modules/inventory/discovery.rs +++ b/harmony/src/modules/inventory/discovery.rs @@ -70,6 +70,7 @@ impl Interpret for DiscoverHostForRoleInterpret { continue; } + println!(); let ans = inquire::Select::new( &format!("Select the node to be used for role {:?}:", self.score.role), all_hosts, @@ -105,6 +106,7 @@ impl Interpret for DiscoverHostForRoleInterpret { let display_refs: Vec<&str> = disk_choices.iter().map(|(d, _)| d.as_str()).collect(); + println!(); let disk_choice = inquire::Select::new( &format!("Select the disk to use on host {}:", choice.summary()), display_refs, @@ -230,6 +232,7 @@ fn prompt_network_config(host: &PhysicalHost) -> Result Result = options.iter().map(|(d, _)| d.as_str()).collect(); + println!(); let selected = inquire::MultiSelect::new( "Select the interfaces to include in the bond:", display_refs, @@ -272,6 +276,7 @@ fn prompt_network_config(host: &PhysicalHost) -> Result Result Result Vec