diff --git a/Cargo.lock b/Cargo.lock index 4cf88dd..db2929d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1262,22 +1262,6 @@ dependencies = [ "url", ] -[[package]] -name = "brocade-switch-oricom-configuration" -version = "0.1.0" -dependencies = [ - "async-trait", - "brocade", - "env_logger", - "harmony", - "harmony_cli", - "harmony_macros", - "harmony_types", - "log", - "serde", - "tokio", -] - [[package]] name = "brotli" version = "8.0.2" @@ -2650,6 +2634,29 @@ dependencies = [ "url", ] +[[package]] +name = "example-cluster-dashboards" +version = "0.1.0" +dependencies = [ + "env_logger", + "harmony", + "harmony_cli", + "harmony_types", + "log", + "tokio", +] + +[[package]] +name = "example-grafana" +version = "0.1.0" +dependencies = [ + "harmony", + "harmony_cli", + "harmony_types", + "log", + "tokio", +] + [[package]] name = "example-harmony-sso" version = "0.1.0" diff --git a/examples/cluster_dashboards/Cargo.toml b/examples/cluster_dashboards/Cargo.toml new file mode 100644 index 0000000..7845145 --- /dev/null +++ b/examples/cluster_dashboards/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "example-cluster-dashboards" +edition = "2021" +version = "0.1.0" +license = "GNU AGPL v3" +publish = false + +[dependencies] +harmony = { path = "../../harmony" } +harmony_cli = { path = "../../harmony_cli" } +harmony_types = { path = "../../harmony_types" } +tokio = { version = "1.40", features = ["macros", "rt-multi-thread"] } +log = "0.4" +env_logger = "0.11" diff --git a/examples/cluster_dashboards/src/main.rs b/examples/cluster_dashboards/src/main.rs new file mode 100644 index 0000000..d587016 --- /dev/null +++ b/examples/cluster_dashboards/src/main.rs @@ -0,0 +1,20 @@ +use harmony::{ + inventory::Inventory, modules::monitoring::cluster_dashboards::ClusterDashboardsScore, + topology::K8sAnywhereTopology, +}; + +#[tokio::main] +async fn main() { + harmony_cli::cli_logger::init(); + + let cluster_dashboards_score = ClusterDashboardsScore::default(); + + harmony_cli::run( + 
Inventory::autoload(), + K8sAnywhereTopology::from_env(), + vec![Box::new(cluster_dashboards_score)], + None, + ) + .await + .unwrap(); +} diff --git a/examples/grafana/Cargo.toml b/examples/grafana/Cargo.toml new file mode 100644 index 0000000..0758351 --- /dev/null +++ b/examples/grafana/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "example-grafana" +edition = "2021" +version = "0.1.0" +license = "GNU AGPL v3" +publish = false + +[dependencies] +harmony = { path = "../../harmony" } +harmony_cli = { path = "../../harmony_cli" } +harmony_types = { path = "../../harmony_types" } +tokio = { version = "1.40", features = ["macros", "rt-multi-thread"] } +log = "0.4" diff --git a/examples/grafana/env.sh b/examples/grafana/env.sh new file mode 100644 index 0000000..5a58931 --- /dev/null +++ b/examples/grafana/env.sh @@ -0,0 +1,5 @@ +export HARMONY_SECRET_NAMESPACE=example-grafana +export HARMONY_SECRET_STORE=file +export HARMONY_DATABASE_URL=sqlite://harmony_grafana.sqlite +export RUST_LOG=harmony=debug +export HARMONY_USE_LOCAL_K3D=false diff --git a/examples/grafana/src/main.rs b/examples/grafana/src/main.rs new file mode 100644 index 0000000..58a374a --- /dev/null +++ b/examples/grafana/src/main.rs @@ -0,0 +1,31 @@ +use harmony::{ + inventory::Inventory, + modules::monitoring::{ + cluster_dashboards::ClusterDashboardsScore, + grafana::helm::helm_grafana::GrafanaOperatorScore, + }, + topology::K8sAnywhereTopology, +}; + +const GRAFANA_OPERATOR_CHART_VERSION: &str = "v5.22.2"; + +#[tokio::main] +async fn main() { + harmony_cli::cli_logger::init(); + + let grafana_operator = + GrafanaOperatorScore::new("grafana", Some(GRAFANA_OPERATOR_CHART_VERSION)); + let cluster_dashboards_score = ClusterDashboardsScore::default(); + + harmony_cli::run( + Inventory::autoload(), + K8sAnywhereTopology::from_env(), + vec![ + Box::new(grafana_operator), + Box::new(cluster_dashboards_score), + ], + None, + ) + .await + .unwrap(); +} diff --git a/harmony-k8s/src/domain.rs 
b/harmony-k8s/src/domain.rs new file mode 100644 index 0000000..d1115c5 --- /dev/null +++ b/harmony-k8s/src/domain.rs @@ -0,0 +1,117 @@ +use kube::Error; +use kube::api::GroupVersionKind; +use log::{debug, trace, warn}; + +use crate::client::K8sClient; +use crate::types::KubernetesDistribution; + +impl K8sClient { + /// Resolve an external hostname for the given service name by querying the + /// cluster's ingress infrastructure. + /// + /// Detection order: + /// 1. **OpenShift** — reads `status.domain` from the default + /// `IngressController` in `openshift-ingress-operator`. + /// 2. **NGINX Ingress Controller** — looks for well-known Services in + /// common namespaces and extracts the LoadBalancer hostname. + /// 3. **Fallback** — returns internal cluster DNS + /// (`{service}.default.svc.cluster.local`). + pub async fn get_domain(&self, service: &str) -> Result { + let distribution = self.get_k8s_distribution().await?; + + if matches!(distribution, KubernetesDistribution::OpenshiftFamily) { + if let Some(domain) = self.try_openshift_ingress_domain().await? { + return Ok(format!("{service}.{domain}")); + } + } + + if let Some(domain) = self.try_nginx_lb_domain().await? 
{ + return Ok(format!("{service}.{domain}")); + } + + warn!("Could not determine external ingress domain; falling back to internal-only DNS"); + Ok(format!("{service}.default.svc.cluster.local")) + } + + async fn try_openshift_ingress_domain(&self) -> Result, Error> { + let gvk = GroupVersionKind { + group: "operator.openshift.io".into(), + version: "v1".into(), + kind: "IngressController".into(), + }; + + let ic = match self + .get_resource_json_value("default", Some("openshift-ingress-operator"), &gvk) + .await + { + Ok(ic) => ic, + Err(e) => { + debug!("Could not fetch OpenShift IngressController: {e}"); + return Ok(None); + } + }; + + let replicas = ic.data["status"]["availableReplicas"].as_i64().unwrap_or(0); + if replicas < 1 { + debug!("OpenShift IngressController present but no available replicas"); + return Ok(None); + } + + if let Some(domain) = ic.data["status"]["domain"].as_str() { + trace!("OpenShift IngressController domain: {domain}"); + return Ok(Some(domain.to_string())); + } + + warn!("OpenShift IngressController present but no status.domain set"); + Ok(None) + } + + async fn try_nginx_lb_domain(&self) -> Result, Error> { + let svc_gvk = GroupVersionKind { + group: "".into(), + version: "v1".into(), + kind: "Service".into(), + }; + + let candidates = [ + ("ingress-nginx", "ingress-nginx-controller"), + ("ingress-nginx", "ingress-nginx-controller-internal"), + ("ingress-nginx", "ingress-nginx"), + ("kube-system", "ingress-nginx-controller"), + ]; + + for (ns, name) in candidates { + trace!("Checking NGINX Service {ns}/{name} for LoadBalancer hostname"); + if let Ok(svc) = self.get_resource_json_value(name, Some(ns), &svc_gvk).await { + let lb_hosts = svc.data["status"]["loadBalancer"]["ingress"] + .as_array() + .cloned() + .unwrap_or_default(); + for entry in lb_hosts { + if let Some(host) = entry.get("hostname").and_then(|v| v.as_str()) { + debug!("Found NGINX LB hostname: {host}"); + if let Some(domain) = extract_base_domain(host) { + return 
Ok(Some(domain)); + } else { + return Ok(Some(host.to_string())); + } + } + if let Some(ip) = entry.get("ip").and_then(|v| v.as_str()) { + debug!("NGINX LB exposes IP {ip} (no hostname); skipping"); + } + } + } + } + + Ok(None) + } +} + +fn extract_base_domain(host: &str) -> Option { + let parts: Vec<&str> = host.split('.').collect(); + if parts.len() >= 2 { + Some(parts[parts.len() - 2..].join(".")) + } else { + None + } +} diff --git a/harmony-k8s/src/lib.rs b/harmony-k8s/src/lib.rs index 2704c65..1943540 100644 --- a/harmony-k8s/src/lib.rs +++ b/harmony-k8s/src/lib.rs @@ -3,6 +3,7 @@ pub mod bundle; pub mod client; pub mod config; pub mod discovery; +pub mod domain; pub mod helper; pub mod node; pub mod pod; diff --git a/harmony/src/domain/topology/k8s_anywhere/k8s_anywhere.rs b/harmony/src/domain/topology/k8s_anywhere/k8s_anywhere.rs index f457610..75acc9a 100644 --- a/harmony/src/domain/topology/k8s_anywhere/k8s_anywhere.rs +++ b/harmony/src/domain/topology/k8s_anywhere/k8s_anywhere.rs @@ -742,18 +742,17 @@ impl K8sAnywhereTopology { labels: Some(labels.clone()), ..Default::default() }, - spec: GrafanaSpec { - config: None, - admin_user: None, - admin_password: None, - ingress: None, - persistence: None, - resources: None, - }, + spec: GrafanaSpec::default(), }; grafana } + // NOTE: This creates a harmony-owned Ingress resource, separate from the + // grafana-operator. The newer pattern (used in `ClusterDashboardsScore`) + // delegates Ingress creation to grafana-operator via `.spec.ingress` on + // the Grafana CR, using `K8sClient::get_domain()` for hostname + // resolution. This method is kept for backward compatibility with the + // `install_grafana()` flow. 
async fn build_grafana_ingress(&self, ns: &str) -> K8sIngressScore { let domain = self.get_domain(&format!("grafana-{}", ns)).await.unwrap(); let name = format!("{}-grafana", ns); @@ -1083,7 +1082,7 @@ impl K8sAnywhereTopology { if tenant.is_some() { namespace_scope = true; } - let _grafana_operator_score = grafana_helm_chart_score(namespace, namespace_scope) + let _grafana_operator_score = grafana_helm_chart_score(namespace, namespace_scope, None) .interpret(inventory, self) .await .map_err(|e| PreparationError::new(e.to_string())); @@ -1317,134 +1316,18 @@ impl TenantManager for K8sAnywhereTopology { #[async_trait] impl Ingress for K8sAnywhereTopology { async fn get_domain(&self, service: &str) -> Result { - use log::{trace, warn}; + // k3d local-dev shortcut (topology-specific state not available on K8sClient) + if let Some(Some(k8s_state)) = self.k8s_state.get() { + if matches!(k8s_state.source, K8sSource::LocalK3d) { + return Ok(format!("{service}.local.k3d")); + } + } let client = self.k8s_client().await?; - - if let Some(Some(k8s_state)) = self.k8s_state.get() { - match k8s_state.source { - K8sSource::LocalK3d => { - // Local developer UX - return Ok(format!("{service}.local.k3d")); - } - K8sSource::Kubeconfig => { - trace!("K8sSource is kubeconfig; attempting to detect domain"); - - // 1) Try OpenShift IngressController domain (backward compatible) - if self.openshift_ingress_operator_available().await.is_ok() { - trace!("OpenShift ingress operator detected; using IngressController"); - let gvk = GroupVersionKind { - group: "operator.openshift.io".into(), - version: "v1".into(), - kind: "IngressController".into(), - }; - let ic = client - .get_resource_json_value( - "default", - Some("openshift-ingress-operator"), - &gvk, - ) - .await - .map_err(|_| { - PreparationError::new( - "Failed to fetch IngressController".to_string(), - ) - })?; - - if let Some(domain) = ic.data["status"]["domain"].as_str() { - return Ok(format!("{service}.{domain}")); - } else { - 
warn!("OpenShift IngressController present but no status.domain set"); - } - } else { - trace!( - "OpenShift ingress operator not detected; trying generic Kubernetes" - ); - } - - // 2) Try NGINX Ingress Controller common setups - // 2.a) Well-known namespace/name for the controller Service - // - upstream default: namespace "ingress-nginx", service "ingress-nginx-controller" - // - some distros: "ingress-nginx-controller" svc in "ingress-nginx" ns - // If found with LoadBalancer ingress hostname, use its base domain. - if let Some(domain) = try_nginx_lb_domain(&client).await? { - return Ok(format!("{service}.{domain}")); - } - - // 3) Fallback: internal cluster DNS suffix (service.namespace.svc.cluster.local) - // We don't have tenant namespace here, so we fallback to 'default' with a warning. - warn!( - "Could not determine external ingress domain; falling back to internal-only DNS" - ); - let internal = format!("{service}.default.svc.cluster.local"); - Ok(internal) - } - } - } else { - Err(PreparationError::new( - "Cannot get domain: unable to detect K8s state".to_string(), - )) - } - } -} - -async fn try_nginx_lb_domain(client: &K8sClient) -> Result, PreparationError> { - use log::{debug, trace}; - - // Try common service path: svc/ingress-nginx-controller in ns/ingress-nginx - let svc_gvk = GroupVersionKind { - group: "".into(), // core - version: "v1".into(), - kind: "Service".into(), - }; - - let candidates = [ - ("ingress-nginx", "ingress-nginx-controller"), - ("ingress-nginx", "ingress-nginx-controller-internal"), - ("ingress-nginx", "ingress-nginx"), // some charts name the svc like this - ("kube-system", "ingress-nginx-controller"), // less common but seen - ]; - - for (ns, name) in candidates { - trace!("Checking NGINX Service {ns}/{name} for LoadBalancer hostname"); - if let Ok(svc) = client - .get_resource_json_value(ns, Some(name), &svc_gvk) + client + .get_domain(service) .await - { - let lb_hosts = svc.data["status"]["loadBalancer"]["ingress"] - 
.as_array() - .cloned() - .unwrap_or_default(); - for entry in lb_hosts { - if let Some(host) = entry.get("hostname").and_then(|v| v.as_str()) { - debug!("Found NGINX LB hostname: {host}"); - if let Some(domain) = extract_base_domain(host) { - return Ok(Some(domain.to_string())); - } else { - return Ok(Some(host.to_string())); // already a domain - } - } - if let Some(ip) = entry.get("ip").and_then(|v| v.as_str()) { - // If only an IP is exposed, we can't create a hostname; return None to keep searching - debug!("NGINX LB exposes IP {ip} (no hostname); skipping"); - } - } - } - } - - Ok(None) -} - -fn extract_base_domain(host: &str) -> Option { - // For a host like a1b2c3d4e5f6abcdef.elb.amazonaws.com -> base domain elb.amazonaws.com - // For a managed DNS like xyz.example.com -> base domain example.com (keep 2+ labels) - // Heuristic: keep last 2 labels by default; special-case known multi-label TLDs if needed. - let parts: Vec<&str> = host.split('.').collect(); - if parts.len() >= 2 { - // Very conservative: last 2 labels - Some(parts[parts.len() - 2..].join(".")) - } else { - None + .map_err(|e| PreparationError::new(e.to_string())) } } diff --git a/harmony/src/modules/helm/chart.rs b/harmony/src/modules/helm/chart.rs index d447126..cbdc7cb 100644 --- a/harmony/src/modules/helm/chart.rs +++ b/harmony/src/modules/helm/chart.rs @@ -60,7 +60,69 @@ impl Score for HelmChartScore { pub struct HelmChartInterpret { pub score: HelmChartScore, } +#[derive(serde::Deserialize)] +struct HelmListEntry { + name: String, + chart: String, +} + impl HelmChartInterpret { + fn find_installed_release( + &self, + topology: &T, + ns: &str, + ) -> Result, InterpretError> { + let release = self.score.release_name.to_string(); + let filter = format!("^{}$", release); + let args = vec!["list", "--namespace", ns, "--filter", &filter, "-o", "json"]; + let output = run_helm_command(topology, &args)?; + if !output.status.success() { + return Err(InterpretError::new(format!( + "helm list 
failed: {}", + String::from_utf8_lossy(&output.stderr) + ))); + } + let entries: Vec = serde_json::from_slice(&output.stdout) + .map_err(|e| InterpretError::new(format!("parse helm list output: {e}")))?; + Ok(entries + .into_iter() + .find(|e| e.name == release) + .map(|e| e.chart)) + } + + fn expected_chart_field(&self) -> Option { + let version = self.score.chart_version.as_ref()?.to_string(); + let short = self + .score + .chart_name + .to_string() + .rsplit('/') + .next() + .unwrap_or("") + .to_string(); + Some(format!( + "{short}-{}", + version.strip_prefix('v').unwrap_or(&version) + )) + } + + fn normalize_chart_field(s: &str) -> String { + // Helm strips a leading `v` from chart versions in the `chart` column + // (normalized to semver). Users often write `v5.22.2` on the score. + // Normalize both sides by dropping a `-v` → `-` before the version. + match s.rfind("-v") { + Some(i) + if s[i + 2..] + .chars() + .next() + .is_some_and(|c| c.is_ascii_digit()) => + { + format!("{}-{}", &s[..i], &s[i + 2..]) + } + _ => s.to_string(), + } + } + fn add_repo(&self, topology: &T) -> Result<(), InterpretError> { let repo = match &self.score.repository { Some(repo) => repo, @@ -142,6 +204,41 @@ impl Interpret for HelmChartInterpret { .as_ref() .unwrap_or_else(|| todo!("Get namespace from active kubernetes cluster")); + let ns_str = ns.to_string(); + if let Some(installed_chart) = self.find_installed_release(topology, &ns_str)? { + return match self.expected_chart_field() { + Some(expected) + if Self::normalize_chart_field(&expected) + == Self::normalize_chart_field(&installed_chart) => + { + warn!( + "Helm release '{}' already installed at desired version ('{}'); skipping.", + self.score.release_name, installed_chart + ); + Ok(Outcome::success(format!( + "Helm Chart {} already at desired version", + self.score.release_name + ))) + } + Some(expected) => Err(InterpretError::new(format!( + "Helm release '{}' already installed as '{}', but score requests '{}'. 
\ + Refusing to upgrade/downgrade; resolve manually.", + self.score.release_name, installed_chart, expected + ))), + None => { + warn!( + "Helm release '{}' already installed as '{}'; score has no pinned \ + chart_version so skipping re-install.", + self.score.release_name, installed_chart + ); + Ok(Outcome::success(format!( + "Helm Chart {} already installed (version not pinned)", + self.score.release_name + ))) + } + }; + } + self.add_repo(topology)?; let mut args = if self.score.install_only { diff --git a/harmony/src/modules/monitoring/cluster_dashboards/ceph_01-ServiceMonitor.yaml b/harmony/src/modules/monitoring/cluster_dashboards/ceph_01-ServiceMonitor.yaml new file mode 100644 index 0000000..b445dff --- /dev/null +++ b/harmony/src/modules/monitoring/cluster_dashboards/ceph_01-ServiceMonitor.yaml @@ -0,0 +1,49 @@ +# These are probably already created by rook-ceph operator, not sure, needs to validate. +# in fact, 100% sure for the second one (rook-ceph-exporter) +# i over-wrote the first one (rook-ceph-mgr) with what is here, it was probably already working +# all what was missing was a label on the rook-ceph namespace to tell prometheus to look for monitors in this namespace +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: rook-ceph-mgr + namespace: rook-ceph + labels: + # This specific label is what tells OKD's Prometheus to pick this up + openshift.io/cluster-monitoring: "true" +spec: + namespaceSelector: + matchNames: + - rook-ceph + selector: + matchLabels: + # This matches your 'rook-ceph-mgr' service + app: rook-ceph-mgr + endpoints: + - port: "" + # The port name in your service is empty/integers, so we use targetPort + targetPort: 9283 + path: /metrics + interval: 30s +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: rook-ceph-exporter + namespace: rook-ceph + labels: + # This label is required for OKD cluster-wide monitoring to pick it up + openshift.io/cluster-monitoring: "true" + 
team: rook +spec: + endpoints: + - honorLabels: true + interval: 10s + path: /metrics + port: ceph-exporter-http-metrics + namespaceSelector: + matchNames: + - rook-ceph + selector: + matchLabels: + app: rook-ceph-exporter + rook_cluster: rook-ceph diff --git a/harmony/src/modules/monitoring/cluster_dashboards/ceph_02-RBAC.yaml b/harmony/src/modules/monitoring/cluster_dashboards/ceph_02-RBAC.yaml new file mode 100644 index 0000000..0564fa8 --- /dev/null +++ b/harmony/src/modules/monitoring/cluster_dashboards/ceph_02-RBAC.yaml @@ -0,0 +1,23 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: rook-ceph-metrics-viewer + namespace: rook-ceph +rules: +- apiGroups: [""] + resources: ["services", "endpoints", "pods"] + verbs: ["get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: rook-ceph-metrics-viewer + namespace: rook-ceph +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: rook-ceph-metrics-viewer +subjects: +- kind: ServiceAccount + name: prometheus-k8s + namespace: openshift-monitoring diff --git a/harmony/src/modules/monitoring/cluster_dashboards/ceph_03-NamespaceLabel.yaml b/harmony/src/modules/monitoring/cluster_dashboards/ceph_03-NamespaceLabel.yaml new file mode 100644 index 0000000..1134ff7 --- /dev/null +++ b/harmony/src/modules/monitoring/cluster_dashboards/ceph_03-NamespaceLabel.yaml @@ -0,0 +1,7 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: rook-ceph + labels: + # This is the critical label that allows OKD Prometheus to see the namespace + openshift.io/cluster-monitoring: "true" diff --git a/harmony/src/modules/monitoring/cluster_dashboards/dashboards/alerts-events-problems.json b/harmony/src/modules/monitoring/cluster_dashboards/dashboards/alerts-events-problems.json new file mode 100644 index 0000000..3132e92 --- /dev/null +++ b/harmony/src/modules/monitoring/cluster_dashboards/dashboards/alerts-events-problems.json @@ -0,0 +1,731 @@ +{ + "title": 
"Alerts & Events — Active Problems", + "uid": "okd-alerts-events", + "schemaVersion": 36, + "version": 1, + "refresh": "30s", + "time": { "from": "now-3h", "to": "now" }, + "tags": ["okd", "alerts", "events"], + "templating": { + "list": [ + { + "name": "severity", + "type": "custom", + "label": "Severity Filter", + "query": "critical,warning,info", + "current": { "selected": true, "text": "All", "value": "$__all" }, + "includeAll": true, + "allValue": "critical|warning|info", + "multi": false, + "options": [ + { "selected": true, "text": "All", "value": "$__all" }, + { "selected": false, "text": "Critical", "value": "critical" }, + { "selected": false, "text": "Warning", "value": "warning" }, + { "selected": false, "text": "Info", "value": "info" } + ] + }, + { + "name": "namespace", + "type": "query", + "label": "Namespace", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "query": { "query": "label_values(ALERTS{alertstate=\"firing\"}, namespace)", "refId": "A" }, + "refresh": 2, + "includeAll": true, + "allValue": ".*", + "multi": true, + "sort": 1, + "current": {}, + "options": [] + } + ] + }, + "panels": [ + + { + "id": 1, "type": "stat", "title": "Critical Alerts Firing", + "description": "Alerting rule instances currently in the firing state with severity=\"critical\". Any non-zero value represents a breached SLO or infrastructure condition requiring immediate on-call response. 
The ALERTS metric is generated by Prometheus directly from your alerting rules — it reflects what Prometheus knows, before Alertmanager routing or silencing.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(ALERTS{alertstate=\"firing\",severity=\"critical\"}) or vector(0)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 1 } + ]}, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 } + }, + + { + "id": 2, "type": "stat", "title": "Warning Alerts Firing", + "description": "Firing alerts at severity=\"warning\". Warnings indicate a degraded or elevated-risk condition that has not yet crossed the critical threshold. 
A sustained or growing warning count often precedes a critical fire — treat them as early-warning signals, not background noise.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(ALERTS{alertstate=\"firing\",severity=\"warning\"}) or vector(0)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 1 }, + { "color": "orange", "value": 5 } + ]}, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 } + }, + + { + "id": 3, "type": "stat", "title": "Info / Unclassified Alerts Firing", + "description": "Firing alerts with severity=\"info\" or no severity label. These are informational and do not normally require immediate action. 
A sudden large jump may reveal noisy alerting rules generating alert fatigue — rules worth reviewing for threshold tuning or adding inhibition rules.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(ALERTS{alertstate=\"firing\",severity!~\"critical|warning\"}) or vector(0)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "blue", "value": 1 }, + { "color": "blue", "value": 25 } + ]}, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 } + }, + + { + "id": 4, "type": "stat", "title": "Alerts Silenced (Suppressed)", + "description": "Alerts currently matched by an active Alertmanager silence rule and therefore not routed to receivers. Silences are intentional during maintenance windows, but a large suppressed count outside of planned maintenance = an overly broad silence masking real problems. 
Zero silences when a maintenance window is active = the silence has expired or was misconfigured.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "sum(alertmanager_alerts{state=\"suppressed\"}) or vector(0)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 1 }, + { "color": "red", "value": 20 } + ]}, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 } + }, + + { + "id": 5, "type": "stat", "title": "CrashLoopBackOff Pods", + "description": "Container instances currently waiting in the CrashLoopBackOff state — the container crashed and Kubernetes is retrying with exponential back-off. Each instance is a pod that cannot stay running. 
Common root causes: OOM kill, bad entrypoint, missing Secret or ConfigMap, an unavailable init dependency, or a broken image layer.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 1 }, + { "color": "red", "value": 3 } + ]}, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 } + }, + + { + "id": 6, "type": "stat", "title": "OOMKilled Containers", + "description": "Containers whose most recent termination reason was OOMKilled. This is a current-state snapshot: a container that was OOMKilled, restarted, and is now Running will still appear here until its next termination occurs for a different reason. 
Non-zero and stable = recurring OOM, likely a workload memory leak or under-provisioned memory limit.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(kube_pod_container_status_last_terminated_reason{reason=\"OOMKilled\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "orange", "value": 1 }, + { "color": "red", "value": 5 } + ]}, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 } + }, + + { + "id": 7, "type": "stat", "title": "NotReady Nodes", + "description": "Nodes where the Ready condition is currently not True (False or Unknown). A NotReady node stops receiving new pod scheduling and, after the node eviction timeout (~5 min default), pods on it will be evicted. Control plane nodes going NotReady simultaneously = potential quorum loss. 
Any non-zero value is a tier-1 incident signal.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"true\"} == 0) or vector(0)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 1 } + ]}, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 } + }, + + { + "id": 8, "type": "stat", "title": "Degraded Cluster Operators (OKD)", + "description": "OKD ClusterOperators currently reporting Degraded=True. Each ClusterOperator owns a core platform component — authentication, networking, image-registry, monitoring, ingress, storage, etc. A degraded operator means its managed component is impaired or unavailable. 
Zero is the only acceptable steady-state value outside of an active upgrade.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(cluster_operator_conditions{condition=\"Degraded\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 1 } + ]}, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 } + }, + + { + "id": 9, "type": "row", "title": "Alert Overview", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 } + }, + + { + "id": 10, "type": "timeseries", "title": "Firing Alert Count by Severity Over Time", + "description": "Instantaneous count of firing ALERTS series grouped by severity over the selected window. A vertical rise = new alerting condition emerged. A horizontal plateau = a persistent, unresolved problem. A step-down = alert resolved or Prometheus rule evaluation stopped matching. 
Use the Severity Filter variable to narrow scope during triage.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "count by(severity)(ALERTS{alertstate=\"firing\",severity=~\"$severity\",namespace=~\"$namespace\"})", + "refId": "A", + "legendFormat": "{{severity}}" + }], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false } + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "critical" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "warning" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "info" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] } + ] + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["max", "lastNotNull"] } + }, + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 5 } + }, + + { + "id": 11, "type": "timeseries", "title": "Alertmanager Notification Rate by Integration", + "description": "Rate of notification delivery attempts from Alertmanager per second, split by integration type (slack, pagerduty, email, webhook, etc.). Solid lines = successful deliveries; dashed red lines = failed deliveries. A drop to zero on all integrations = Alertmanager is not processing or the cluster is completely quiet. 
Persistent failures on one integration = check that receiver's credentials or endpoint availability.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "sum by(integration)(rate(alertmanager_notifications_total[5m]))", "refId": "A", "legendFormat": "✓ {{integration}}" }, + { "expr": "sum by(integration)(rate(alertmanager_notifications_failed_total[5m]))", "refId": "B", "legendFormat": "✗ {{integration}}" } + ], + "fieldConfig": { + "defaults": { + "unit": "reqps", "min": 0, "decimals": 3, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + }, + "overrides": [ + { + "matcher": { "id": "byFrameRefID", "options": "B" }, + "properties": [ + { "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }, + { "id": "custom.lineStyle", "value": { "dash": [6, 4], "fill": "dash" } }, + { "id": "custom.lineWidth", "value": 1 } + ] + } + ] + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 5 } + }, + + { + "id": 12, "type": "bargauge", "title": "Longest-Firing Active Alerts", + "description": "Duration (now - ALERTS_FOR_STATE timestamp) for each currently firing alert, sorted descending. Alerts at the top have been firing longest and are the most likely candidates for known-but-unresolved issues, stale firing conditions, or alerts that should have a silence applied. 
Red bars (> 2 hours) strongly suggest a problem that has been acknowledged but not resolved.", +      "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, +      "targets": [{ +        "expr": "sort_desc(time() - (ALERTS_FOR_STATE{severity=~\"$severity\",namespace=~\"$namespace\"} and ignoring(alertstate) ALERTS{alertstate=\"firing\"}))", +        "refId": "A", +        "legendFormat": "{{alertname}} · {{severity}} · {{namespace}}" +      }], +      "fieldConfig": { +        "defaults": { +          "unit": "s", "min": 0, +          "color": { "mode": "thresholds" }, +          "thresholds": { "mode": "absolute", "steps": [ +            { "color": "green", "value": null }, +            { "color": "yellow", "value": 300 }, +            { "color": "orange", "value": 1800 }, +            { "color": "red", "value": 7200 } +          ]} +        } +      }, +      "options": { +        "orientation": "horizontal", +        "reduceOptions": { "calcs": ["lastNotNull"] }, +        "displayMode": "gradient", +        "showUnfilled": true, +        "valueMode": "color" +      }, +      "gridPos": { "h": 8, "w": 8, "x": 16, "y": 5 } +    }, + +    { +      "id": 13, "type": "row", "title": "Active Firing Alerts — Full Detail", "collapsed": false, +      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 } +    }, + +    { +      "id": 14, "type": "table", "title": "All Firing Alerts", +      "description": "Instant-query table of every currently firing alert visible to Prometheus, filtered by the Namespace and Severity variables above. Each row is one alert instance (unique label combination). The value column is omitted — by definition every row here is firing. Use the built-in column filter (funnel icon) to further narrow to a specific alertname, pod, or node. 
Columns are sparse: labels not defined in a given alert rule will show '—'.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "ALERTS{alertstate=\"firing\",severity=~\"$severity\",namespace=~\"$namespace\"}", + "refId": "A", + "instant": true, + "legendFormat": "" + }], + "transformations": [ + { "id": "labelsToFields", "options": { "mode": "columns" } }, + { + "id": "organize", + "options": { + "excludeByName": { + "alertstate": true, + "__name__": true, + "Value": true, + "Time": true + }, + "renameByName": { + "alertname": "Alert Name", + "severity": "Severity", + "namespace": "Namespace", + "pod": "Pod", + "node": "Node", + "container": "Container", + "job": "Job", + "service": "Service", + "reason": "Reason", + "instance": "Instance" + }, + "indexByName": { + "severity": 0, + "alertname": 1, + "namespace": 2, + "pod": 3, + "node": 4, + "container": 5, + "job": 6, + "service": 7, + "reason": 8, + "instance": 9 + } + } + } + ], + "fieldConfig": { + "defaults": { + "custom": { "align": "left", "filterable": true }, + "noValue": "—" + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Severity" }, + "properties": [ + { "id": "custom.displayMode", "value": "color-background" }, + { "id": "custom.width", "value": 110 }, + { + "id": "mappings", + "value": [{ + "type": "value", + "options": { + "critical": { "text": "CRITICAL", "color": "dark-red", "index": 0 }, + "warning": { "text": "WARNING", "color": "dark-yellow", "index": 1 }, + "info": { "text": "INFO", "color": "dark-blue", "index": 2 } + } + }] + } + ] + }, + { "matcher": { "id": "byName", "options": "Alert Name" }, "properties": [{ "id": "custom.width", "value": 300 }] }, + { "matcher": { "id": "byName", "options": "Namespace" }, "properties": [{ "id": "custom.width", "value": 180 }] }, + { "matcher": { "id": "byName", "options": "Pod" }, "properties": [{ "id": "custom.width", "value": 200 }] }, + { "matcher": { "id": "byName", "options": 
"Node" }, "properties": [{ "id": "custom.width", "value": 200 }] } + ] + }, + "options": { + "sortBy": [{ "desc": false, "displayName": "Severity" }], + "footer": { "show": false } + }, + "gridPos": { "h": 12, "w": 24, "x": 0, "y": 14 } + }, + + { + "id": 15, "type": "row", "title": "Kubernetes Warning Events", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 26 } + }, + + { + "id": 16, "type": "timeseries", "title": "Warning Event Rate by Reason", + "description": "Rate of Kubernetes Warning-type events per second grouped by reason code. BackOff = container is CrashLooping. FailedScheduling = no node satisfies pod constraints. FailedMount = volume attachment or CSI failure. Evicted = kubelet evicted a pod due to memory or disk pressure. NodeNotReady = node lost contact. A spike in a single reason narrows the incident root-cause immediately without needing to read raw event logs. Requires kube-state-metrics with --resources=events.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "topk(10, sum by(reason)(rate(kube_event_count{type=\"Warning\",namespace=~\"$namespace\"}[5m])))", + "refId": "A", + "legendFormat": "{{reason}}" + }], + "fieldConfig": { + "defaults": { + "unit": "reqps", "min": 0, "decimals": 4, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 27 } + }, + + { + "id": 17, "type": "bargauge", "title": "Warning Events — Top Namespaces (Accumulated Count)", + "description": "Total accumulated Warning event count (the count field on the Kubernetes Event object) per namespace, showing the top 15 most active. 
A namespace dominating this chart is generating significantly more abnormal conditions than its peers, useful for identifying noisy tenants, misconfigured deployments, or namespaces experiencing a persistent infrastructure problem. Note this is the raw Event.count field — it resets if the event object is deleted and recreated.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "topk(15, sum by(namespace)(kube_event_count{type=\"Warning\"}))", + "refId": "A", + "legendFormat": "{{namespace}}" + }], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 10 }, + { "color": "orange", "value": 50 }, + { "color": "red", "value": 200 } + ]} + } + }, + "options": { + "orientation": "horizontal", + "reduceOptions": { "calcs": ["lastNotNull"] }, + "displayMode": "gradient", + "showUnfilled": true + }, + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 27 } + }, + + { + "id": 18, "type": "timeseries", "title": "Warning Events — Accumulated Count by Reason Over Time", + "description": "Raw accumulated event count gauge over time, split by reason. Unlike the rate panel this shows total volume and slope simultaneously. A line that climbs steeply = events are occurring frequently right now. A line that plateaus = the condition causing that reason has stopped. 
A line that drops to zero = the event object was deleted and recreated or the condition fully resolved.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "topk(10, sum by(reason)(kube_event_count{type=\"Warning\",namespace=~\"$namespace\"}))", + "refId": "A", + "legendFormat": "{{reason}}" + }], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 8, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] } + }, + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 27 } + }, + + { + "id": 19, "type": "row", "title": "Pod Problems", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 35 } + }, + + { + "id": 20, "type": "timeseries", "title": "CrashLoopBackOff Pods by Namespace", + "description": "Count of container instances in CrashLoopBackOff waiting state over time, broken down by namespace. A sudden rise in one namespace = a workload deployment is failing. A persistent baseline across many namespaces = a shared dependency (Secret, ConfigMap, network policy, or an upstream service) has become unavailable. 
Unlike restart rate, this panel shows the steady-state count of pods currently stuck — not flapping.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum by(namespace)(kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\",namespace=~\"$namespace\"} == 1)", + "refId": "A", + "legendFormat": "{{namespace}}" + }], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { + "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false, + "thresholdsStyle": { "mode": "line" } + }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 1 }, + { "color": "red", "value": 5 } + ]} + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] } + }, + "gridPos": { "h": 7, "w": 8, "x": 0, "y": 36 } + }, + + { + "id": 21, "type": "timeseries", "title": "Container Restart Rate by Namespace", + "description": "Rate of container restarts per second across all reasons (OOMKill, liveness probe failure, process exit) grouped by namespace. A namespace with a rising restart rate that has not yet entered CrashLoopBackOff is in the early failure window before the exponential back-off penalty kicks in. 
Cross-reference with the OOMKilled stat tile and the last-terminated-reason to separate crash types.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "topk(10, sum by(namespace)(rate(kube_pod_container_status_restarts_total{namespace=~\"$namespace\"}[5m])))", + "refId": "A", + "legendFormat": "{{namespace}}" + }], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, "decimals": 4, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 7, "w": 8, "x": 8, "y": 36 } + }, + + { + "id": 22, "type": "timeseries", "title": "Pods by Problem Phase (Failed / Pending / Unknown)", + "description": "Count of pods in Failed, Pending, or Unknown phase over time. Failed = container terminated with a non-zero exit code or was evicted and not rescheduled. Pending for more than a few minutes = scheduler unable to bind the pod (check FailedScheduling events, node capacity, and taint/toleration mismatches). 
Unknown = kubelet is not reporting to the apiserver, typically indicating a node network partition or kubelet crash.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "sum by(phase)(kube_pod_status_phase{phase=~\"Failed|Unknown\",namespace=~\"$namespace\"} == 1)", "refId": "A", "legendFormat": "{{phase}}" }, + { "expr": "sum(kube_pod_status_phase{phase=\"Pending\",namespace=~\"$namespace\"} == 1)", "refId": "B", "legendFormat": "Pending" } + ], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { + "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false, + "thresholdsStyle": { "mode": "line" } + }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 1 } + ]} + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "Failed" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "Pending" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "Unknown" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] } + ] + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] } + }, + "gridPos": { "h": 7, "w": 8, "x": 16, "y": 36 } + }, + + { + "id": 23, "type": "row", "title": "Node & Cluster Operator Conditions", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 43 } + }, + + { + "id": 24, "type": "table", "title": "Node Condition Status Matrix", + "description": "Instant snapshot of every active node condition across all nodes. Each row is one (node, condition, status) triple where value=1, meaning that combination is currently true. 
Ready=true is the normal healthy state; MemoryPressure=true, DiskPressure=true, PIDPressure=true, and NetworkUnavailable=true all indicate problem states that will affect pod scheduling on that node. Use the column filter to show only conditions where status=\"true\" and condition != \"Ready\" to isolate problems quickly.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "kube_node_status_condition == 1", + "refId": "A", + "instant": true, + "legendFormat": "" + }], + "transformations": [ + { "id": "labelsToFields", "options": { "mode": "columns" } }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "Value": true, + "__name__": true, + "endpoint": true, + "job": true, + "service": true, + "instance": true + }, + "renameByName": { + "node": "Node", + "condition": "Condition", + "status": "Status" + }, + "indexByName": { "node": 0, "condition": 1, "status": 2 } + } + } + ], + "fieldConfig": { + "defaults": { + "custom": { "align": "left", "filterable": true }, + "noValue": "—" + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Status" }, + "properties": [ + { "id": "custom.displayMode", "value": "color-background" }, + { "id": "custom.width", "value": 90 }, + { + "id": "mappings", + "value": [{ + "type": "value", + "options": { + "true": { "text": "true", "color": "green", "index": 0 }, + "false": { "text": "false", "color": "dark-red", "index": 1 }, + "unknown": { "text": "unknown", "color": "dark-orange", "index": 2 } + } + }] + } + ] + }, + { + "matcher": { "id": "byName", "options": "Condition" }, + "properties": [ + { "id": "custom.width", "value": 190 }, + { "id": "custom.displayMode", "value": "color-text" }, + { + "id": "mappings", + "value": [{ + "type": "value", + "options": { + "Ready": { "color": "green", "index": 0 }, + "MemoryPressure": { "color": "red", "index": 1 }, + "DiskPressure": { "color": "red", "index": 2 }, + "PIDPressure": { "color": "red", 
"index": 3 }, + "NetworkUnavailable": { "color": "red", "index": 4 } + } + }] + } + ] + }, + { "matcher": { "id": "byName", "options": "Node" }, "properties": [{ "id": "custom.width", "value": 230 }] } + ] + }, + "options": { + "sortBy": [{ "desc": false, "displayName": "Node" }], + "footer": { "show": false } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 44 } + }, + + { + "id": 25, "type": "table", "title": "Cluster Operator Conditions — Degraded & Progressing (OKD)", + "description": "Shows only ClusterOperator conditions that indicate a problem state: Degraded=True (operator has failed to achieve its desired state) or Progressing=True (operator is actively reconciling — normal during upgrades but alarming in steady state). Operators not appearing in this table are healthy. The reason column gives the operator's own explanation for the condition, which maps directly to the relevant operator log stream and OpenShift runbook.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "cluster_operator_conditions{condition=\"Degraded\"} == 1", + "refId": "A", + "instant": true, + "legendFormat": "" + }, + { + "expr": "cluster_operator_conditions{condition=\"Progressing\"} == 1", + "refId": "B", + "instant": true, + "legendFormat": "" + } + ], + "transformations": [ + { "id": "labelsToFields", "options": { "mode": "columns" } }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "Value": true, + "__name__": true, + "endpoint": true, + "job": true, + "service": true, + "instance": true, + "namespace": true + }, + "renameByName": { + "name": "Operator", + "condition": "Condition", + "reason": "Reason" + }, + "indexByName": { "name": 0, "condition": 1, "reason": 2 } + } + } + ], + "fieldConfig": { + "defaults": { + "custom": { "align": "left", "filterable": true }, + "noValue": "—" + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Condition" }, + "properties": [ + { "id": 
"custom.displayMode", "value": "color-background" }, + { "id": "custom.width", "value": 140 }, + { + "id": "mappings", + "value": [{ + "type": "value", + "options": { + "Degraded": { "text": "Degraded", "color": "dark-red", "index": 0 }, + "Progressing": { "text": "Progressing", "color": "dark-yellow", "index": 1 } + } + }] + } + ] + }, + { "matcher": { "id": "byName", "options": "Operator" }, "properties": [{ "id": "custom.width", "value": 240 }] }, + { "matcher": { "id": "byName", "options": "Reason" }, "properties": [{ "id": "custom.width", "value": 220 }] } + ] + }, + "options": { + "sortBy": [{ "desc": false, "displayName": "Condition" }], + "footer": { "show": false } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 44 } + } + + ] +} diff --git a/harmony/src/modules/monitoring/cluster_dashboards/dashboards/cluster-overview.json b/harmony/src/modules/monitoring/cluster_dashboards/dashboards/cluster-overview.json new file mode 100644 index 0000000..43079ce --- /dev/null +++ b/harmony/src/modules/monitoring/cluster_dashboards/dashboards/cluster-overview.json @@ -0,0 +1,739 @@ +{ + "title": "Cluster Overview", + "uid": "okd-cluster-overview", + "schemaVersion": 36, + "version": 2, + "refresh": "30s", + "time": { "from": "now-1h", "to": "now" }, + "tags": ["okd", "cluster", "overview"], + "panels": [ + { + "id": 1, + "type": "stat", + "title": "Ready Nodes", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"true\"} == 1)", + "refId": "A", + "legendFormat": "" + } + ], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ] + }, + "unit": "short", + "noValue": "0" + } + }, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "center", + "reduceOptions": { "calcs": 
["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 } + }, + { + "id": 2, + "type": "stat", + "title": "Not Ready Nodes", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"false\"} == 1) or vector(0)", + "refId": "A", + "legendFormat": "" + } + ], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 1 } + ] + }, + "unit": "short", + "noValue": "0" + } + }, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "center", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 } + }, + { + "id": 3, + "type": "stat", + "title": "Running Pods", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "count(kube_pod_status_phase{phase=\"Running\"} == 1)", + "refId": "A", + "legendFormat": "" + } + ], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ] + }, + "unit": "short", + "noValue": "0" + } + }, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "center", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 } + }, + { + "id": 4, + "type": "stat", + "title": "Pending Pods", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "count(kube_pod_status_phase{phase=\"Pending\"} == 1) or vector(0)", + "refId": "A", + "legendFormat": "" + } + ], + "fieldConfig": { + 
"defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 1 }, + { "color": "red", "value": 5 } + ] + }, + "unit": "short", + "noValue": "0" + } + }, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "center", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 } + }, + { + "id": 5, + "type": "stat", + "title": "Failed Pods", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "count(kube_pod_status_phase{phase=\"Failed\"} == 1) or vector(0)", + "refId": "A", + "legendFormat": "" + } + ], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 1 } + ] + }, + "unit": "short", + "noValue": "0" + } + }, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "center", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 } + }, + { + "id": 6, + "type": "stat", + "title": "CrashLoopBackOff", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "count(kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\"} == 1) or vector(0)", + "refId": "A", + "legendFormat": "" + } + ], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 1 } + ] + }, + "unit": "short", + "noValue": "0" + } + }, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "center", + "reduceOptions": { "calcs": ["lastNotNull"], 
"fields": "", "values": false }, + "textMode": "auto" + }, + "gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 } + }, + { + "id": 7, + "type": "stat", + "title": "Critical Alerts", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "count(ALERTS{alertstate=\"firing\",severity=\"critical\"}) or vector(0)", + "refId": "A", + "legendFormat": "" + } + ], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 1 } + ] + }, + "unit": "short", + "noValue": "0" + } + }, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "center", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 } + }, + { + "id": 8, + "type": "stat", + "title": "Warning Alerts", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "count(ALERTS{alertstate=\"firing\",severity=\"warning\"}) or vector(0)", + "refId": "A", + "legendFormat": "" + } + ], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 1 }, + { "color": "red", "value": 10 } + ] + }, + "unit": "short", + "noValue": "0" + } + }, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "center", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 } + }, + { + "id": 9, + "type": "gauge", + "title": "CPU Usage", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "100 * (1 - avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))", + "refId": "A", + "legendFormat": "CPU" + } + 
], + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 70 }, + { "color": "red", "value": 85 } + ] + } + } + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "orientation": "auto" + }, + "gridPos": { "h": 6, "w": 5, "x": 0, "y": 4 } + }, + { + "id": 10, + "type": "gauge", + "title": "Memory Usage", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "100 * (1 - (sum(node_memory_MemAvailable_bytes) / sum(node_memory_MemTotal_bytes)))", + "refId": "A", + "legendFormat": "Memory" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 75 }, + { "color": "red", "value": 90 } + ] + } + } + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "orientation": "auto" + }, + "gridPos": { "h": 6, "w": 5, "x": 5, "y": 4 } + }, + { + "id": 11, + "type": "gauge", + "title": "Root Disk Usage", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "100 * (1 - (sum(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"})))", + "refId": "A", + "legendFormat": "Disk" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 
70 }, + { "color": "red", "value": 85 } + ] + } + } + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "orientation": "auto" + }, + "gridPos": { "h": 6, "w": 4, "x": 10, "y": 4 } + }, + { + "id": 12, + "type": "stat", + "title": "etcd Has Leader", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "min(etcd_server_has_leader)", + "refId": "A", + "legendFormat": "" + } + ], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ] + }, + "mappings": [ + { + "type": "value", + "options": { + "0": { "text": "NO LEADER", "color": "red" }, + "1": { "text": "LEADER OK", "color": "green" } + } + } + ], + "unit": "short", + "noValue": "?" + } + }, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "center", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "gridPos": { "h": 3, "w": 5, "x": 14, "y": 4 } + }, + { + "id": 13, + "type": "stat", + "title": "API Servers Up", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "sum(up{job=\"apiserver\"})", + "refId": "A", + "legendFormat": "" + } + ], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "yellow", "value": 1 }, + { "color": "green", "value": 2 } + ] + }, + "unit": "short", + "noValue": "0" + } + }, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "center", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "gridPos": { "h": 3, "w": 5, "x": 19, "y": 4 } + }, + { + 
"id": 14, + "type": "stat", + "title": "etcd Members Up", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "sum(up{job=\"etcd\"})", + "refId": "A", + "legendFormat": "" + } + ], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "yellow", "value": 2 }, + { "color": "green", "value": 3 } + ] + }, + "unit": "short", + "noValue": "0" + } + }, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "center", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "gridPos": { "h": 3, "w": 5, "x": 14, "y": 7 } + }, + { + "id": 15, + "type": "stat", + "title": "Operators Degraded", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "count(cluster_operator_conditions{condition=\"Degraded\",status=\"True\"} == 1) or vector(0)", + "refId": "A", + "legendFormat": "" + } + ], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 1 } + ] + }, + "unit": "short", + "noValue": "0" + } + }, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "center", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "gridPos": { "h": 3, "w": 5, "x": 19, "y": 7 } + }, + { + "id": 16, + "type": "timeseries", + "title": "CPU Usage per Node (%)", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "100 * (1 - avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))", + "refId": "A", + "legendFormat": "{{instance}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "color": { 
"mode": "palette-classic" }, + "custom": { + "lineWidth": 2, + "fillOpacity": 10, + "spanNulls": false, + "showPoints": "never" + } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { + "displayMode": "list", + "placement": "bottom", + "calcs": ["mean", "max"] + } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 10 } + }, + { + "id": 17, + "type": "timeseries", + "title": "Memory Usage per Node (%)", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))", + "refId": "A", + "legendFormat": "{{instance}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "color": { "mode": "palette-classic" }, + "custom": { + "lineWidth": 2, + "fillOpacity": 10, + "spanNulls": false, + "showPoints": "never" + } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { + "displayMode": "list", + "placement": "bottom", + "calcs": ["mean", "max"] + } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 10 } + }, + { + "id": 18, + "type": "timeseries", + "title": "Network Traffic — Cluster Total", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo|veth.*|tun.*|ovn.*|br-int|br-ex\"}[5m]))", + "refId": "A", + "legendFormat": "Receive" + }, + { + "expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo|veth.*|tun.*|ovn.*|br-int|br-ex\"}[5m]))", + "refId": "B", + "legendFormat": "Transmit" + } + ], + "fieldConfig": { + "defaults": { + "unit": "Bps", + "color": { "mode": "palette-classic" }, + "custom": { + "lineWidth": 2, + "fillOpacity": 10, + "spanNulls": false, + "showPoints": "never" + } + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Receive" }, + "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } 
}] + }, + { + "matcher": { "id": "byName", "options": "Transmit" }, + "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] + } + ] + }, + "options": { + "tooltip": { "mode": "multi", "sort": "none" }, + "legend": { + "displayMode": "list", + "placement": "bottom", + "calcs": ["mean", "max"] + } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 18 } + }, + { + "id": 19, + "type": "timeseries", + "title": "Pod Phases Over Time", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "count(kube_pod_status_phase{phase=\"Running\"} == 1)", + "refId": "A", + "legendFormat": "Running" + }, + { + "expr": "count(kube_pod_status_phase{phase=\"Pending\"} == 1) or vector(0)", + "refId": "B", + "legendFormat": "Pending" + }, + { + "expr": "count(kube_pod_status_phase{phase=\"Failed\"} == 1) or vector(0)", + "refId": "C", + "legendFormat": "Failed" + }, + { + "expr": "count(kube_pod_status_phase{phase=\"Unknown\"} == 1) or vector(0)", + "refId": "D", + "legendFormat": "Unknown" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short", + "custom": { + "lineWidth": 2, + "fillOpacity": 15, + "spanNulls": false, + "showPoints": "never" + } + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Running" }, + "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] + }, + { + "matcher": { "id": "byName", "options": "Pending" }, + "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] + }, + { + "matcher": { "id": "byName", "options": "Failed" }, + "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] + }, + { + "matcher": { "id": "byName", "options": "Unknown" }, + "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] + } + ] + }, + "options": { + "tooltip": { "mode": "multi", "sort": "none" }, + "legend": { + "displayMode": "list", + "placement": "bottom", + 
"calcs": ["lastNotNull"] + } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 18 } + } + ] +} diff --git a/harmony/src/modules/monitoring/cluster_dashboards/dashboards/control-plane.json b/harmony/src/modules/monitoring/cluster_dashboards/dashboards/control-plane.json new file mode 100644 index 0000000..921085d --- /dev/null +++ b/harmony/src/modules/monitoring/cluster_dashboards/dashboards/control-plane.json @@ -0,0 +1,742 @@ +{ + "title": "Control Plane Health", + "uid": "okd-control-plane", + "schemaVersion": 36, + "version": 1, + "refresh": "30s", + "time": { "from": "now-1h", "to": "now" }, + "tags": ["okd", "control-plane"], + "templating": { + "list": [ + { + "name": "instance", + "type": "query", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "query": { "query": "label_values(apiserver_request_total, instance)", "refId": "A" }, + "refresh": 2, + "includeAll": true, + "multi": true, + "allValue": ".*", + "label": "API Server Instance", + "sort": 1, + "current": {}, + "options": [] + } + ] + }, + "panels": [ + + { + "id": 1, "type": "stat", "title": "API Servers Up", + "description": "Number of kube-apiserver instances currently scraped and up. 
Healthy HA cluster = 3.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(up{job=~\".*apiserver.*\"} == 1)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "red", "value": null }, + { "color": "yellow", "value": 1 }, + { "color": "green", "value": 3 } + ]}, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 } + }, + + { + "id": 2, "type": "stat", "title": "Controller Managers Up", + "description": "kube-controller-manager instances up. In OKD only one holds the leader lease at a time; others are hot standbys.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(up{job=~\".*controller-manager.*\"} == 1)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "red", "value": null }, + { "color": "yellow", "value": 1 }, + { "color": "green", "value": 3 } + ]}, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 } + }, + + { + "id": 3, "type": "stat", "title": "Schedulers Up", + "description": "kube-scheduler instances up. One holds the leader lease; rest are standbys. 
0 = no scheduling of new pods.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(up{job=~\".*scheduler.*\"} == 1)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "red", "value": null }, + { "color": "yellow", "value": 1 }, + { "color": "green", "value": 3 } + ]}, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 } + }, + + { + "id": 4, "type": "stat", "title": "API 5xx Rate", + "description": "Server-side errors (5xx) across all apiserver instances per second. Any sustained non-zero value = apiserver internal fault.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "sum(rate(apiserver_request_total{code=~\"5..\"}[5m]))", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 0.01 }, + { "color": "red", "value": 1 } + ]}, + "unit": "reqps", "noValue": "0", "decimals": 3 + } + }, + "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 } + }, + + { + "id": 5, "type": "stat", "title": "Inflight — Mutating", + "description": "Current in-flight mutating requests (POST/PUT/PATCH/DELETE). Default OKD limit is ~1000. 
Hitting the limit = 429 errors for writes.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "sum(apiserver_current_inflight_requests{request_kind=\"mutating\"})", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 500 }, + { "color": "orange", "value": 750 }, + { "color": "red", "value": 900 } + ]}, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 } + }, + + { + "id": 6, "type": "stat", "title": "Inflight — Read-Only", + "description": "Current in-flight non-mutating requests (GET/LIST/WATCH). Default OKD limit is ~3000. Hitting it = 429 for reads, impacting controllers and kubectl.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "sum(apiserver_current_inflight_requests{request_kind=\"readOnly\"})", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 1500 }, + { "color": "orange", "value": 2200 }, + { "color": "red", "value": 2700 } + ]}, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 } + }, + + { + "id": 7, "type": "stat", "title": "API Request p99 (non-WATCH)", + "description": "Overall p99 latency for all non-streaming verbs. >1s = noticeable kubectl sluggishness. 
>10s = controllers timing out on LIST/GET.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{verb!~\"WATCH|CONNECT\"}[5m])) by (le))", + "refId": "A", "legendFormat": "" + }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 0.5 }, + { "color": "orange", "value": 1 }, + { "color": "red", "value": 5 } + ]}, + "unit": "s", "noValue": "0", "decimals": 3 + } + }, + "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 } + }, + + { + "id": 8, "type": "stat", "title": "APIServer → etcd p99", + "description": "p99 time apiserver spends waiting on etcd calls. Spike here while WAL fsync is healthy = serialization or large object overhead.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "histogram_quantile(0.99, sum(rate(apiserver_storage_request_duration_seconds_bucket[5m])) by (le))", + "refId": "A", "legendFormat": "" + }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 0.05 }, + { "color": "orange", "value": 0.2 }, + { "color": "red", "value": 0.5 } + ]}, + "unit": "s", "noValue": "0", "decimals": 4 + } + }, + "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 } + }, + + { + "id": 9, "type": "row", "title": "API Server — Request Rates & Errors", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 } + }, + + 
{ + "id": 10, "type": "timeseries", "title": "Request Rate by Verb", + "description": "Non-streaming calls per second broken down by verb. GET/LIST = read load from controllers. POST/PUT/PATCH/DELETE = write throughput. A sudden LIST spike = controller cache resync storm.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum by(verb)(rate(apiserver_request_total{instance=~\"$instance\",verb!~\"WATCH|CONNECT\"}[5m]))", + "refId": "A", "legendFormat": "{{verb}}" + }], + "fieldConfig": { + "defaults": { + "unit": "reqps", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 5 } + }, + + { + "id": 11, "type": "timeseries", "title": "Error Rate by HTTP Status Code", + "description": "4xx/5xx responses per second by code. 429 = inflight limit hit (throttling). 422 = admission rejection or invalid object. 
500/503 = internal apiserver fault or etcd unavailability.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum by(code)(rate(apiserver_request_total{instance=~\"$instance\",code=~\"[45]..\"}[5m]))", + "refId": "A", "legendFormat": "HTTP {{code}}" + }], + "fieldConfig": { + "defaults": { + "unit": "reqps", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 5 } + }, + + { + "id": 12, "type": "timeseries", "title": "In-Flight Requests — Mutating vs Read-Only", + "description": "Instantaneous count of requests being actively handled. The two series correspond to the two inflight limit buckets enforced by the apiserver's Priority and Fairness (APF) or legacy inflight settings.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "sum by(request_kind)(apiserver_current_inflight_requests{instance=~\"$instance\"})", "refId": "A", "legendFormat": "{{request_kind}}" } + ], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 20, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 5 } + }, + + { + "id": 13, "type": "row", "title": "API Server — Latency", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 } + }, + + { + "id": 14, "type": "timeseries", "title": "Request Latency — p50 / p95 / p99 (non-WATCH)", + "description": "Aggregated end-to-end request duration 
across all verbs except WATCH/CONNECT (which are unbounded streaming). A rising p99 without a matching rise in etcd latency = CPU saturation, admission webhook slowness, or serialization overhead.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "histogram_quantile(0.50, sum(rate(apiserver_request_duration_seconds_bucket{instance=~\"$instance\",verb!~\"WATCH|CONNECT\"}[5m])) by (le))", "refId": "A", "legendFormat": "p50" }, + { "expr": "histogram_quantile(0.95, sum(rate(apiserver_request_duration_seconds_bucket{instance=~\"$instance\",verb!~\"WATCH|CONNECT\"}[5m])) by (le))", "refId": "B", "legendFormat": "p95" }, + { "expr": "histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{instance=~\"$instance\",verb!~\"WATCH|CONNECT\"}[5m])) by (le))", "refId": "C", "legendFormat": "p99" } + ], + "fieldConfig": { + "defaults": { + "unit": "s", "min": 0, "decimals": 4, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false } + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] } + ] + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 14 } + }, + + { + "id": 15, "type": "timeseries", "title": "Request p99 Latency by Verb", + "description": "p99 latency broken out per verb. LIST is inherently slower than GET due to serializing full collections. 
A POST/PUT spike = heavy admission webhook chain or large object writes. DELETE spikes are usually caused by cascading GC finalizer storms.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "histogram_quantile(0.99, sum by(verb,le)(rate(apiserver_request_duration_seconds_bucket{instance=~\"$instance\",verb!~\"WATCH|CONNECT\"}[5m])))", + "refId": "A", "legendFormat": "{{verb}}" + }], + "fieldConfig": { + "defaults": { + "unit": "s", "min": 0, "decimals": 4, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 14 } + }, + + { + "id": 16, "type": "timeseries", "title": "APIServer → etcd Latency by Operation", + "description": "Time apiserver spends waiting on etcd, split by operation type (get, list, create, update, delete, watch). Elevated get/list = etcd read pressure. 
Elevated create/update = write bottleneck, likely correlated with WAL fsync latency.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "histogram_quantile(0.50, sum by(operation,le)(rate(apiserver_storage_request_duration_seconds_bucket[5m])))", "refId": "A", "legendFormat": "p50 — {{operation}}" }, + { "expr": "histogram_quantile(0.99, sum by(operation,le)(rate(apiserver_storage_request_duration_seconds_bucket[5m])))", "refId": "B", "legendFormat": "p99 — {{operation}}" } + ], + "fieldConfig": { + "defaults": { + "unit": "s", "min": 0, "decimals": 4, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 14 } + }, + + { + "id": 17, "type": "row", "title": "API Server — Watches & Long-Running Requests", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 22 } + }, + + { + "id": 18, "type": "timeseries", "title": "Active Long-Running Requests (Watches) by Resource", + "description": "Instantaneous count of open WATCH streams grouped by resource. Each controller typically holds one WATCH per resource type per apiserver instance. 
A sudden drop = controller restart; a runaway climb = operator creating watches without cleanup.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum by(resource)(apiserver_longrunning_requests{instance=~\"$instance\",verb=\"WATCH\"})", + "refId": "A", "legendFormat": "{{resource}}" + }], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] } + }, + "gridPos": { "h": 7, "w": 8, "x": 0, "y": 23 } + }, + + { + "id": 19, "type": "timeseries", "title": "Watch Events Dispatched Rate by Kind", + "description": "Watch events sent to all active watchers per second, by object kind. Persistent high rate for a specific kind = that resource type is churning heavily, increasing etcd load and controller reconcile frequency.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum by(kind)(rate(apiserver_watch_events_total{instance=~\"$instance\"}[5m]))", + "refId": "A", "legendFormat": "{{kind}}" + }], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 7, "w": 8, "x": 8, "y": 23 } + }, + + { + "id": 20, "type": "timeseries", "title": "Watch Event Size — p50 / p95 / p99 by Kind", + "description": "Size of individual watch events dispatched to clients. Large events (MiB-scale) for Secrets or ConfigMaps = objects being stored with oversized data. 
Contributes to apiserver memory pressure and network saturation.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "histogram_quantile(0.50, sum by(kind,le)(rate(apiserver_watch_events_sizes_bucket{instance=~\"$instance\"}[5m])))", "refId": "A", "legendFormat": "p50 — {{kind}}" }, + { "expr": "histogram_quantile(0.99, sum by(kind,le)(rate(apiserver_watch_events_sizes_bucket{instance=~\"$instance\"}[5m])))", "refId": "B", "legendFormat": "p99 — {{kind}}" } + ], + "fieldConfig": { + "defaults": { + "unit": "bytes", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 7, "w": 8, "x": 16, "y": 23 } + }, + + { + "id": 21, "type": "row", "title": "Admission Webhooks", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 30 } + }, + + { + "id": 22, "type": "timeseries", "title": "Webhook Call Rate by Name", + "description": "Mutating and validating admission webhook invocations per second by webhook name. 
A webhook invoked on every write (e.g., a mutating webhook with no object selector) can be a major source of write latency amplification.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum by(name,type)(rate(apiserver_admission_webhook_request_total{instance=~\"$instance\"}[5m]))", + "refId": "A", "legendFormat": "{{type}} — {{name}}" + }], + "fieldConfig": { + "defaults": { + "unit": "reqps", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 7, "w": 8, "x": 0, "y": 31 } + }, + + { + "id": 23, "type": "timeseries", "title": "Webhook Latency p99 by Name", + "description": "p99 round-trip time per webhook call (network + webhook server processing). Default apiserver timeout is 10s; a webhook consistently near that limit causes cascading write latency for all resources it intercepts.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "histogram_quantile(0.99, sum by(name,le)(rate(apiserver_admission_webhook_admission_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", + "refId": "A", "legendFormat": "{{name}}" + }], + "fieldConfig": { + "defaults": { + "unit": "s", "min": 0, "decimals": 4, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 0.5 }, + { "color": "red", "value": 2.0 } + ]} + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 7, "w": 8, "x": 8, 
"y": 31 } + }, + + { + "id": 24, "type": "timeseries", "title": "Webhook Rejection Rate by Name", + "description": "Rate of admission denials per webhook. A validating webhook rejecting requests is expected behaviour; a sudden surge indicates either a newly enforced policy or a misbehaving webhook rejecting valid objects.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum by(name,error_type)(rate(apiserver_admission_webhook_rejection_count{instance=~\"$instance\"}[5m]))", + "refId": "A", "legendFormat": "{{name}} ({{error_type}})" + }], + "fieldConfig": { + "defaults": { + "unit": "reqps", "min": 0, "decimals": 3, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 7, "w": 8, "x": 16, "y": 31 } + }, + + { + "id": 25, "type": "row", "title": "kube-controller-manager", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 38 } + }, + + { + "id": 26, "type": "timeseries", "title": "Work Queue Depth by Controller", + "description": "Items waiting to be reconciled in each controller's work queue. Persistent non-zero depth = controller cannot keep up with the event rate. 
Identifies which specific controller is the bottleneck during overload incidents.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "topk(15, sum by(name)(workqueue_depth{job=~\".*controller-manager.*\"}))", + "refId": "A", "legendFormat": "{{name}}" + }], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { + "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false, + "thresholdsStyle": { "mode": "line" } + }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 10 }, + { "color": "red", "value": 50 } + ]} + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 39 } + }, + + { + "id": 27, "type": "timeseries", "title": "Work Queue Item Processing Duration p99 by Controller", + "description": "p99 time a work item spends being actively reconciled (inside the reconcile loop, excludes queue wait time). 
A slow reconcile = either the controller is doing expensive API calls or the etcd write path is slow.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "histogram_quantile(0.99, sum by(name,le)(rate(workqueue_work_duration_seconds_bucket{job=~\".*controller-manager.*\"}[5m])))", + "refId": "A", "legendFormat": "{{name}}" + }], + "fieldConfig": { + "defaults": { + "unit": "s", "min": 0, "decimals": 4, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 39 } + }, + + { + "id": 28, "type": "timeseries", "title": "Work Queue Retry Rate by Controller", + "description": "Rate of items being re-queued after a failed reconciliation. A persistently high retry rate for a controller = it is encountering recurring errors on the same objects (e.g., API permission errors, webhook rejections, or resource conflicts).", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "topk(15, sum by(name)(rate(workqueue_retries_total{job=~\".*controller-manager.*\"}[5m])))", + "refId": "A", "legendFormat": "{{name}}" + }], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, "decimals": 3, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 39 } + }, + + { + "id": 29, "type": "row", "title": "kube-scheduler", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 47 } + }, + + { + "id": 30, "type": 
"timeseries", "title": "Scheduling Attempt Rate by Result", + "description": "Outcomes of scheduling cycles per second. scheduled = pod successfully bound to a node. unschedulable = no node met the pod's constraints. error = scheduler internal failure (API error, timeout). Persistent unschedulable = cluster capacity or taints/affinity misconfiguration.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum by(result)(rate(scheduler_schedule_attempts_total[5m]))", + "refId": "A", "legendFormat": "{{result}}" + }], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "scheduled" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "unschedulable" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "error" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] } + ] + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 7, "w": 8, "x": 0, "y": 48 } + }, + + { + "id": 31, "type": "timeseries", "title": "Scheduling Latency — p50 / p95 / p99", + "description": "Time from when a pod enters the active queue to when a binding decision is made (does not include bind API call time). Includes filter, score, and reserve plugin execution time. 
Spike = expensive affinity rules, large number of nodes, or slow extender webhooks.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "histogram_quantile(0.50, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket[5m])) by (le))", "refId": "A", "legendFormat": "p50" }, + { "expr": "histogram_quantile(0.95, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket[5m])) by (le))", "refId": "B", "legendFormat": "p95" }, + { "expr": "histogram_quantile(0.99, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket[5m])) by (le))", "refId": "C", "legendFormat": "p99" } + ], + "fieldConfig": { + "defaults": { + "unit": "s", "min": 0, "decimals": 4, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false } + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] } + ] + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 7, "w": 8, "x": 8, "y": 48 } + }, + + { + "id": 32, "type": "timeseries", "title": "Pending Pods by Queue", + "description": "Pods waiting to be scheduled, split by internal queue. active = ready to be attempted now. backoff = recently failed, in exponential back-off. unschedulable = parked until cluster state changes. 
A growing unschedulable queue = systemic capacity or constraint problem.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum by(queue)(scheduler_pending_pods)", + "refId": "A", "legendFormat": "{{queue}}" + }], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { + "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false, + "thresholdsStyle": { "mode": "line" } + }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 10 }, + { "color": "red", "value": 50 } + ]} + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "unschedulable" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "backoff" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "active" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] } + ] + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] } + }, + "gridPos": { "h": 7, "w": 8, "x": 16, "y": 48 } + }, + + { + "id": 33, "type": "row", "title": "Process Resources — All Control Plane Components", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 55 } + }, + + { + "id": 34, "type": "timeseries", "title": "CPU Usage by Component", + "description": "Rate of CPU seconds consumed by each control plane process. apiserver CPU spike = surge in request volume or list serialization. controller-manager CPU spike = reconcile storm. 
scheduler CPU spike = large node count with complex affinity.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "sum by(job)(rate(process_cpu_seconds_total{job=~\".*apiserver.*\"}[5m]))", "refId": "A", "legendFormat": "apiserver — {{job}}" }, + { "expr": "sum by(job)(rate(process_cpu_seconds_total{job=~\".*controller-manager.*\"}[5m]))", "refId": "B", "legendFormat": "controller-manager — {{job}}" }, + { "expr": "sum by(job)(rate(process_cpu_seconds_total{job=~\".*scheduler.*\"}[5m]))", "refId": "C", "legendFormat": "scheduler — {{job}}" } + ], + "fieldConfig": { + "defaults": { + "unit": "percentunit", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 7, "w": 8, "x": 0, "y": 56 } + }, + + { + "id": 35, "type": "timeseries", "title": "RSS Memory by Component", + "description": "Resident set size of each control plane process. apiserver memory is dominated by the watch cache size and serialisation buffers. controller-manager memory = informer caches. 
Monotonically growing RSS without restarts = memory leak or unbounded cache growth.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "sum by(job)(process_resident_memory_bytes{job=~\".*apiserver.*\"})", "refId": "A", "legendFormat": "apiserver — {{job}}" }, + { "expr": "sum by(job)(process_resident_memory_bytes{job=~\".*controller-manager.*\"})", "refId": "B", "legendFormat": "controller-manager — {{job}}" }, + { "expr": "sum by(job)(process_resident_memory_bytes{job=~\".*scheduler.*\"})", "refId": "C", "legendFormat": "scheduler — {{job}}" } + ], + "fieldConfig": { + "defaults": { + "unit": "bytes", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] } + }, + "gridPos": { "h": 7, "w": 8, "x": 8, "y": 56 } + }, + + { + "id": 36, "type": "timeseries", "title": "Goroutines by Component", + "description": "Number of live goroutines in each control plane process. Gradual upward drift = goroutine leak (often tied to unclosed watch streams or context leaks). A step-down = process restart. 
apiserver typically runs 200–600 goroutines; spikes above 1000 warrant investigation.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "sum by(job)(go_goroutines{job=~\".*apiserver.*\"})", "refId": "A", "legendFormat": "apiserver — {{job}}" }, + { "expr": "sum by(job)(go_goroutines{job=~\".*controller-manager.*\"})", "refId": "B", "legendFormat": "controller-manager — {{job}}" }, + { "expr": "sum by(job)(go_goroutines{job=~\".*scheduler.*\"})", "refId": "C", "legendFormat": "scheduler — {{job}}" } + ], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] } + }, + "gridPos": { "h": 7, "w": 8, "x": 16, "y": 56 } + } + + ] +} diff --git a/harmony/src/modules/monitoring/cluster_dashboards/dashboards/etcd.json b/harmony/src/modules/monitoring/cluster_dashboards/dashboards/etcd.json new file mode 100644 index 0000000..93ac55e --- /dev/null +++ b/harmony/src/modules/monitoring/cluster_dashboards/dashboards/etcd.json @@ -0,0 +1,734 @@ +{ + "title": "etcd", + "uid": "okd-etcd", + "schemaVersion": 36, + "version": 1, + "refresh": "30s", + "time": { "from": "now-1h", "to": "now" }, + "tags": ["okd", "etcd"], + "templating": { + "list": [ + { + "name": "instance", + "type": "query", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "query": { "query": "label_values(etcd_server_has_leader, instance)", "refId": "A" }, + "refresh": 2, + "includeAll": true, + "multi": true, + "allValue": ".*", + "label": "Instance", + "sort": 1, + "current": {}, + "options": [] + } + ] + }, + "panels": [ + + { + "id": 1, "type": "stat", "title": "Cluster Members", + "description": "Total number of etcd members currently 
reporting metrics.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(etcd_server_has_leader)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "red", "value": null }, + { "color": "yellow", "value": 1 }, + { "color": "green", "value": 3 } + ]}, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 } + }, + + { + "id": 2, "type": "stat", "title": "Has Leader", + "description": "min() across all members. 0 = at least one member has no quorum — cluster is degraded.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "min(etcd_server_has_leader)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ]}, + "unit": "short", "noValue": "0", + "mappings": [ + { "type": "value", "options": { + "0": { "text": "NO LEADER", "color": "red" }, + "1": { "text": "OK", "color": "green" } + }} + ] + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 } + }, + + { + "id": 3, "type": "stat", "title": "Leader Changes (1h)", + "description": "Number of leader elections in the last hour. 
≥3 indicates cluster instability.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "sum(changes(etcd_server_leader_changes_seen_total[1h]))", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 1 }, + { "color": "red", "value": 3 } + ]}, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 } + }, + + { + "id": 4, "type": "stat", "title": "DB Size (Max)", + "description": "Largest boltdb file size across all members. Default etcd quota is 8 GiB.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "max(etcd_mvcc_db_total_size_in_bytes)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 2147483648 }, + { "color": "orange", "value": 5368709120 }, + { "color": "red", "value": 7516192768 } + ]}, + "unit": "bytes", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 } + }, + + { + "id": 5, "type": "stat", "title": "DB Fragmentation (Max)", + "description": "% of DB space that is allocated but unused. 
>50% → run etcdctl defrag.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "max((etcd_mvcc_db_total_size_in_bytes - etcd_mvcc_db_total_size_in_use_in_bytes) / etcd_mvcc_db_total_size_in_bytes * 100)", + "refId": "A", "legendFormat": "" + }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 25 }, + { "color": "orange", "value": 50 }, + { "color": "red", "value": 75 } + ]}, + "unit": "percent", "noValue": "0", "decimals": 1 + } + }, + "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 } + }, + + { + "id": 6, "type": "stat", "title": "Failed Proposals/s", + "description": "Rate of rejected Raft proposals. Any sustained non-zero value = cluster health problem.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "sum(rate(etcd_server_proposals_failed_total[5m]))", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 0.001 } + ]}, + "unit": "short", "noValue": "0", "decimals": 3 + } + }, + "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 } + }, + + { + "id": 7, "type": "stat", "title": "WAL Fsync p99", + "description": "99th percentile WAL flush-to-disk time. 
>10ms is concerning; >100ms = serious I/O bottleneck.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) by (le))", + "refId": "A", "legendFormat": "" + }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 0.01 }, + { "color": "orange", "value": 0.1 }, + { "color": "red", "value": 0.5 } + ]}, + "unit": "s", "noValue": "0", "decimals": 4 + } + }, + "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 } + }, + + { + "id": 8, "type": "stat", "title": "Backend Commit p99", + "description": "99th percentile boltdb commit time. >25ms = warning; >100ms = critical backend I/O pressure.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) by (le))", + "refId": "A", "legendFormat": "" + }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 0.025 }, + { "color": "orange", "value": 0.1 }, + { "color": "red", "value": 0.25 } + ]}, + "unit": "s", "noValue": "0", "decimals": 4 + } + }, + "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 } + }, + + { + "id": 9, "type": "row", "title": "Cluster Health", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 } + }, + + { + "id": 10, "type": "timeseries", "title": "Has Leader per 
Instance", + "description": "1 = member has a leader; 0 = member lost quorum. A dip to 0 marks the exact moment of a leader election.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "etcd_server_has_leader{instance=~\"$instance\"}", + "refId": "A", "legendFormat": "{{instance}}" + }], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, "max": 1.1, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 20, "showPoints": "never", "spanNulls": false }, + "mappings": [ + { "type": "value", "options": { + "0": { "text": "0 — no leader" }, + "1": { "text": "1 — ok" } + }} + ] + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "none" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": [] } + }, + "gridPos": { "h": 6, "w": 8, "x": 0, "y": 5 } + }, + + { + "id": 11, "type": "timeseries", "title": "Leader Changes (cumulative)", + "description": "Monotonically increasing counter per member. A step jump = one leader election. Correlated jumps across members = cluster-wide event.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "etcd_server_leader_changes_seen_total{instance=~\"$instance\"}", + "refId": "A", "legendFormat": "{{instance}}" + }], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "auto", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "none" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull"] } + }, + "gridPos": { "h": 6, "w": 8, "x": 8, "y": 5 } + }, + + { + "id": 12, "type": "timeseries", "title": "Slow Operations", + "description": "slow_apply: proposals applied slower than expected. slow_read_index: linearizable reads timing out. 
heartbeat_failures: Raft heartbeat send errors (network partition indicator).", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "rate(etcd_server_slow_apply_total{instance=~\"$instance\"}[5m])", "refId": "A", "legendFormat": "Slow Apply — {{instance}}" }, + { "expr": "rate(etcd_server_slow_read_indexes_total{instance=~\"$instance\"}[5m])", "refId": "B", "legendFormat": "Slow Read Index — {{instance}}" }, + { "expr": "rate(etcd_server_heartbeat_send_failures_total{instance=~\"$instance\"}[5m])", "refId": "C", "legendFormat": "Heartbeat Failures — {{instance}}" } + ], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, "decimals": 3, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 6, "w": 8, "x": 16, "y": 5 } + }, + + { + "id": 13, "type": "row", "title": "gRPC Traffic", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 11 } + }, + + { + "id": 14, "type": "timeseries", "title": "gRPC Request Rate by Method", + "description": "Unary calls/s per RPC method. High Put/Txn = heavy write load. High Range = heavy read load. 
High Watch = many controller watchers.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum by(grpc_method)(rate(grpc_server_started_total{job=~\".*etcd.*\",grpc_type=\"unary\"}[5m]))", + "refId": "A", "legendFormat": "{{grpc_method}}" + }], + "fieldConfig": { + "defaults": { + "unit": "reqps", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 12 } + }, + + { + "id": 15, "type": "timeseries", "title": "gRPC Error Rate by Status Code", + "description": "Non-OK responses by gRPC status code. RESOURCE_EXHAUSTED = overloaded. UNAVAILABLE = leader election. DEADLINE_EXCEEDED = latency spike.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum by(grpc_code)(rate(grpc_server_handled_total{job=~\".*etcd.*\",grpc_code!=\"OK\"}[5m]))", + "refId": "A", "legendFormat": "{{grpc_code}}" + }], + "fieldConfig": { + "defaults": { + "unit": "reqps", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 12 } + }, + + { + "id": 16, "type": "timeseries", "title": "gRPC Request Latency (p50 / p95 / p99)", + "description": "Unary call handling duration. p99 > 100ms for Put/Txn indicates disk or CPU pressure. 
p99 > 500ms will cause kube-apiserver timeouts.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "histogram_quantile(0.50, sum(rate(grpc_server_handling_seconds_bucket{job=~\".*etcd.*\",grpc_type=\"unary\"}[5m])) by (le))", "refId": "A", "legendFormat": "p50" }, + { "expr": "histogram_quantile(0.95, sum(rate(grpc_server_handling_seconds_bucket{job=~\".*etcd.*\",grpc_type=\"unary\"}[5m])) by (le))", "refId": "B", "legendFormat": "p95" }, + { "expr": "histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~\".*etcd.*\",grpc_type=\"unary\"}[5m])) by (le))", "refId": "C", "legendFormat": "p99" } + ], + "fieldConfig": { + "defaults": { + "unit": "s", "min": 0, "decimals": 4, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false } + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] } + ] + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 12 } + }, + + { + "id": 17, "type": "row", "title": "Raft Proposals", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 20 } + }, + + { + "id": 18, "type": "timeseries", "title": "Proposals Committed vs Applied", + "description": "Committed = agreed by Raft quorum. Applied = persisted to boltdb. 
A widening gap between the two = backend apply backlog (disk too slow to keep up).", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "rate(etcd_server_proposals_committed_total{instance=~\"$instance\"}[5m])", "refId": "A", "legendFormat": "Committed — {{instance}}" }, + { "expr": "rate(etcd_server_proposals_applied_total{instance=~\"$instance\"}[5m])", "refId": "B", "legendFormat": "Applied — {{instance}}" } + ], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 7, "w": 8, "x": 0, "y": 21 } + }, + + { + "id": 19, "type": "timeseries", "title": "Proposals Pending", + "description": "In-flight Raft proposals not yet committed. 
Consistently high (>5) = cluster cannot keep up with write throughput.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "etcd_server_proposals_pending{instance=~\"$instance\"}", + "refId": "A", "legendFormat": "{{instance}}" + }], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { + "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false, + "thresholdsStyle": { "mode": "line+area" } + }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 5 }, + { "color": "red", "value": 10 } + ]} + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 7, "w": 8, "x": 8, "y": 21 } + }, + + { + "id": 20, "type": "timeseries", "title": "Failed Proposals Rate", + "description": "Raft proposals that were rejected. 
Root causes: quorum loss, leader timeout, network partition between members.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "rate(etcd_server_proposals_failed_total{instance=~\"$instance\"}[5m])", + "refId": "A", "legendFormat": "{{instance}}" + }], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, "decimals": 3, + "color": { "mode": "palette-classic" }, + "custom": { + "lineWidth": 2, "fillOpacity": 20, "showPoints": "never", "spanNulls": false, + "thresholdsStyle": { "mode": "line" } + }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 0.001 } + ]} + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 7, "w": 8, "x": 16, "y": 21 } + }, + + { + "id": 21, "type": "row", "title": "Disk I/O", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 28 } + }, + + { + "id": 22, "type": "timeseries", "title": "WAL Fsync Duration (p50 / p95 / p99) per Instance", + "description": "Time to flush the write-ahead log to disk. etcd is extremely sensitive to WAL latency. >10ms p99 = storage is the bottleneck. 
Correlates directly with Raft commit latency.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "histogram_quantile(0.50, sum by(le,instance)(rate(etcd_disk_wal_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "A", "legendFormat": "p50 — {{instance}}" }, + { "expr": "histogram_quantile(0.95, sum by(le,instance)(rate(etcd_disk_wal_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "B", "legendFormat": "p95 — {{instance}}" }, + { "expr": "histogram_quantile(0.99, sum by(le,instance)(rate(etcd_disk_wal_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "C", "legendFormat": "p99 — {{instance}}" } + ], + "fieldConfig": { + "defaults": { + "unit": "s", "min": 0, "decimals": 4, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 29 } + }, + + { + "id": 23, "type": "timeseries", "title": "Backend Commit Duration (p50 / p95 / p99) per Instance", + "description": "Time for boltdb to commit a batch transaction. A spike here while WAL is healthy = backend I/O saturation or boltdb lock contention. 
Triggers apply backlog.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "histogram_quantile(0.50, sum by(le,instance)(rate(etcd_disk_backend_commit_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "A", "legendFormat": "p50 — {{instance}}" }, + { "expr": "histogram_quantile(0.95, sum by(le,instance)(rate(etcd_disk_backend_commit_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "B", "legendFormat": "p95 — {{instance}}" }, + { "expr": "histogram_quantile(0.99, sum by(le,instance)(rate(etcd_disk_backend_commit_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "C", "legendFormat": "p99 — {{instance}}" } + ], + "fieldConfig": { + "defaults": { + "unit": "s", "min": 0, "decimals": 4, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 29 } + }, + + { + "id": 24, "type": "row", "title": "Network (Peer & Client)", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 37 } + }, + + { + "id": 25, "type": "timeseries", "title": "Peer RX Rate", + "description": "Bytes received from Raft peers (log replication + heartbeats). 
A burst during a quiet period = large snapshot being streamed to a recovering member.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "rate(etcd_network_peer_received_bytes_total{instance=~\"$instance\"}[5m])", + "refId": "A", "legendFormat": "{{instance}}" + }], + "fieldConfig": { + "defaults": { + "unit": "Bps", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 7, "w": 6, "x": 0, "y": 38 } + }, + + { + "id": 26, "type": "timeseries", "title": "Peer TX Rate", + "description": "Bytes sent to Raft peers. Leader will have higher TX than followers (it replicates entries to all members).", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "rate(etcd_network_peer_sent_bytes_total{instance=~\"$instance\"}[5m])", + "refId": "A", "legendFormat": "{{instance}}" + }], + "fieldConfig": { + "defaults": { + "unit": "Bps", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 7, "w": 6, "x": 6, "y": 38 } + }, + + { + "id": 27, "type": "timeseries", "title": "Client gRPC Received", + "description": "Bytes received from API clients (kube-apiserver, operators). 
Spike = large write burst from controllers or kubectl apply.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "rate(etcd_network_client_grpc_received_bytes_total{instance=~\"$instance\"}[5m])", + "refId": "A", "legendFormat": "{{instance}}" + }], + "fieldConfig": { + "defaults": { + "unit": "Bps", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 7, "w": 6, "x": 12, "y": 38 } + }, + + { + "id": 28, "type": "timeseries", "title": "Client gRPC Sent", + "description": "Bytes sent to API clients (responses + watch events). Persistently high = many active Watch streams or large objects being served.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "rate(etcd_network_client_grpc_sent_bytes_total{instance=~\"$instance\"}[5m])", + "refId": "A", "legendFormat": "{{instance}}" + }], + "fieldConfig": { + "defaults": { + "unit": "Bps", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 7, "w": 6, "x": 18, "y": 38 } + }, + + { + "id": 29, "type": "row", "title": "DB Size & Process Resources", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 45 } + }, + + { + "id": 30, "type": "timeseries", "title": "DB Total vs In-Use Size per Instance", + "description": "Total = allocated boltdb file size. In Use = live key data. The gap between them = fragmentation. 
Steady growth of Total = compaction not keeping up with key churn.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "etcd_mvcc_db_total_size_in_bytes{instance=~\"$instance\"}", "refId": "A", "legendFormat": "Total — {{instance}}" }, + { "expr": "etcd_mvcc_db_total_size_in_use_in_bytes{instance=~\"$instance\"}", "refId": "B", "legendFormat": "In Use — {{instance}}" } + ], + "fieldConfig": { + "defaults": { + "unit": "bytes", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] } + }, + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 46 } + }, + + { + "id": 31, "type": "timeseries", "title": "Process Resident Memory (RSS)", + "description": "Physical RAM consumed by the etcd process. Monotonically growing RSS = memory leak or oversized watch cache. Typical healthy range: 500 MiB–2 GiB depending on cluster size.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "process_resident_memory_bytes{instance=~\"$instance\"}", + "refId": "A", "legendFormat": "{{instance}}" + }], + "fieldConfig": { + "defaults": { + "unit": "bytes", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] } + }, + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 46 } + }, + + { + "id": 32, "type": "timeseries", "title": "Open File Descriptors vs Limit", + "description": "Open FD count (solid) and process FD limit (dashed). 
Approaching the limit will cause WAL file creation and new client connections to fail.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "etcd_process_open_fds{instance=~\"$instance\"}", "refId": "A", "legendFormat": "Open — {{instance}}" }, + { "expr": "etcd_process_max_fds{instance=~\"$instance\"}", "refId": "B", "legendFormat": "Limit — {{instance}}" } + ], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false } + }, + "overrides": [ + { + "matcher": { "id": "byRegexp", "options": "^Limit.*" }, + "properties": [ + { "id": "custom.lineWidth", "value": 1 }, + { "id": "custom.lineStyle", "value": { "fill": "dash", "dash": [6, 4] } }, + { "id": "custom.fillOpacity","value": 0 } + ] + } + ] + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] } + }, + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 46 } + }, + + { + "id": 33, "type": "row", "title": "Snapshots", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 54 } + }, + + { + "id": 34, "type": "timeseries", "title": "Snapshot Save Duration (p50 / p95 / p99)", + "description": "Time to write a full snapshot of the boltdb to disk. 
Slow saves delay Raft log compaction, causing the WAL to grow unboundedly and members to fall further behind.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "histogram_quantile(0.50, sum by(le)(rate(etcd_debugging_snap_save_total_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "A", "legendFormat": "p50" }, + { "expr": "histogram_quantile(0.95, sum by(le)(rate(etcd_debugging_snap_save_total_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "B", "legendFormat": "p95" }, + { "expr": "histogram_quantile(0.99, sum by(le)(rate(etcd_debugging_snap_save_total_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "C", "legendFormat": "p99" } + ], + "fieldConfig": { + "defaults": { + "unit": "s", "min": 0, "decimals": 3, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false } + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] } + ] + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 7, "w": 12, "x": 0, "y": 55 } + }, + + { + "id": 35, "type": "timeseries", "title": "Snapshot DB Fsync Duration (p50 / p95 / p99)", + "description": "Time to fsync the snapshot file itself. 
Distinct from WAL fsync: this is flushing the entire boltdb copy to disk after a snapshot is taken.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "histogram_quantile(0.50, sum by(le)(rate(etcd_snap_db_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "A", "legendFormat": "p50" }, + { "expr": "histogram_quantile(0.95, sum by(le)(rate(etcd_snap_db_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "B", "legendFormat": "p95" }, + { "expr": "histogram_quantile(0.99, sum by(le)(rate(etcd_snap_db_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "C", "legendFormat": "p99" } + ], + "fieldConfig": { + "defaults": { + "unit": "s", "min": 0, "decimals": 3, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false } + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] } + ] + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 7, "w": 12, "x": 12, "y": 55 } + } + + ] +} diff --git a/harmony/src/modules/monitoring/cluster_dashboards/dashboards/networking.json b/harmony/src/modules/monitoring/cluster_dashboards/dashboards/networking.json new file mode 100644 index 0000000..88314d2 --- /dev/null +++ b/harmony/src/modules/monitoring/cluster_dashboards/dashboards/networking.json @@ -0,0 +1,945 @@ +{ + "title": "Networking", + "uid": "okd-networking", + "schemaVersion": 36, + 
"version": 1, + "refresh": "30s", + "time": { "from": "now-1h", "to": "now" }, + "tags": ["okd", "networking"], + "templating": { + "list": [ + { + "name": "namespace", + "type": "query", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "query": { "query": "label_values(kube_pod_info, namespace)", "refId": "A" }, + "refresh": 2, + "includeAll": true, + "multi": true, + "allValue": ".*", + "label": "Namespace", + "sort": 1, + "current": {}, + "options": [] + } + ] + }, + "panels": [ + + { + "id": 1, "type": "stat", "title": "Network RX Rate", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum(rate(container_network_receive_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))", + "refId": "A", "legendFormat": "" + }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] }, + "unit": "Bps", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 } + }, + + { + "id": 2, "type": "stat", "title": "Network TX Rate", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum(rate(container_network_transmit_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))", + "refId": "A", "legendFormat": "" + }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] }, + "unit": "Bps", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 } + }, + + { + "id": 3, "type": "stat", "title": "RX Errors/s", + "datasource": { "type": 
"prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum(rate(container_network_receive_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))", + "refId": "A", "legendFormat": "" + }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] }, + "unit": "pps", "noValue": "0", "decimals": 2 + } + }, + "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 } + }, + + { + "id": 4, "type": "stat", "title": "TX Errors/s", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum(rate(container_network_transmit_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))", + "refId": "A", "legendFormat": "" + }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] }, + "unit": "pps", "noValue": "0", "decimals": 2 + } + }, + "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 } + }, + + { + "id": 5, "type": "stat", "title": "RX Drops/s", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum(rate(container_network_receive_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))", + "refId": "A", "legendFormat": "" + }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "orange", "value": 1 }] }, + "unit": "pps", "noValue": "0", "decimals": 2 + } + }, + "options": { "colorMode": 
"background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 } + }, + + { + "id": 6, "type": "stat", "title": "TX Drops/s", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum(rate(container_network_transmit_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))", + "refId": "A", "legendFormat": "" + }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "orange", "value": 1 }] }, + "unit": "pps", "noValue": "0", "decimals": 2 + } + }, + "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 } + }, + + { + "id": 7, "type": "stat", "title": "DNS Queries/s", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum(rate(coredns_dns_requests_total[5m]))", + "refId": "A", "legendFormat": "" + }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] }, + "unit": "reqps", "noValue": "0", "decimals": 1 + } + }, + "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 } + }, + + { + "id": 8, "type": "stat", "title": "DNS Error %", + "description": "Percentage of DNS responses with non-NOERROR rcode over the last 5 minutes.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum(rate(coredns_dns_responses_total{rcode!=\"NOERROR\"}[5m])) / sum(rate(coredns_dns_responses_total[5m])) * 100", + "refId": "A", 
"legendFormat": "" + }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 1 }, + { "color": "red", "value": 5 } + ]}, + "unit": "percent", "noValue": "0", "decimals": 2 + } + }, + "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 } + }, + + { + "id": 9, "type": "row", "title": "Network I/O", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 } + }, + + { + "id": 10, "type": "timeseries", "title": "Receive Rate by Namespace", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum by(namespace)(rate(container_network_receive_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))", + "refId": "A", "legendFormat": "{{namespace}}" + }], + "fieldConfig": { + "defaults": { + "unit": "Bps", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 5 } + }, + + { + "id": 11, "type": "timeseries", "title": "Transmit Rate by Namespace", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum by(namespace)(rate(container_network_transmit_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))", + "refId": "A", "legendFormat": "{{namespace}}" + }], + "fieldConfig": { + "defaults": { + "unit": "Bps", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": 
"multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 5 } + }, + + { + "id": 12, "type": "row", "title": "Top Pod Consumers", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 } + }, + + { + "id": 13, "type": "timeseries", "title": "Top 10 Pods — RX Rate", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "topk(10, sum by(namespace,pod)(rate(container_network_receive_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m])))", + "refId": "A", "legendFormat": "{{namespace}} / {{pod}}" + }], + "fieldConfig": { + "defaults": { + "unit": "Bps", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "auto", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 14 } + }, + + { + "id": 14, "type": "timeseries", "title": "Top 10 Pods — TX Rate", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "topk(10, sum by(namespace,pod)(rate(container_network_transmit_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m])))", + "refId": "A", "legendFormat": "{{namespace}} / {{pod}}" + }], + "fieldConfig": { + "defaults": { + "unit": "Bps", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "auto", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 14 } + }, + + { + "id": 15, + "type": "table", + "title": "Pod Network I/O Summary", + "description": "Current RX/TX rates, errors and drops per pod. 
Sorted by RX rate descending.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "sum by(namespace,pod)(rate(container_network_receive_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))", + "refId": "A", "instant": true, "format": "table", "legendFormat": "" + }, + { + "expr": "sum by(namespace,pod)(rate(container_network_transmit_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))", + "refId": "B", "instant": true, "format": "table", "legendFormat": "" + }, + { + "expr": "sum by(namespace,pod)(rate(container_network_receive_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))", + "refId": "C", "instant": true, "format": "table", "legendFormat": "" + }, + { + "expr": "sum by(namespace,pod)(rate(container_network_transmit_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))", + "refId": "D", "instant": true, "format": "table", "legendFormat": "" + }, + { + "expr": "sum by(namespace,pod)(rate(container_network_receive_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))", + "refId": "E", "instant": true, "format": "table", "legendFormat": "" + }, + { + "expr": "sum by(namespace,pod)(rate(container_network_transmit_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))", + "refId": "F", "instant": true, "format": "table", "legendFormat": "" + } + ], + "transformations": [ + { + "id": "filterFieldsByName", + "options": { "include": { "names": ["namespace", "pod", "Value"] } } + }, + { + "id": "joinByField", + "options": { "byField": "pod", "mode": "outer" } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "namespace 1": true, + "namespace 2": true, + "namespace 3": true, + "namespace 4": true, + "namespace 5": true + }, + "renameByName": { + "namespace": "Namespace", + "pod": "Pod", + "Value": "RX Rate", + "Value 1": "TX Rate", + "Value 2": "RX Errors/s", + "Value 3": "TX Errors/s", + "Value 4": "RX Drops/s", + "Value 5": "TX Drops/s" + }, + "indexByName": { + 
"namespace": 0, + "pod": 1, + "Value": 2, + "Value 1": 3, + "Value 2": 4, + "Value 3": 5, + "Value 4": 6, + "Value 5": 7 + } + } + }, + { + "id": "sortBy", + "options": { "fields": [{ "displayName": "RX Rate", "desc": true }] } + } + ], + "fieldConfig": { + "defaults": { "custom": { "align": "center", "displayMode": "auto" } }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Namespace" }, + "properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 160 }] + }, + { + "matcher": { "id": "byName", "options": "Pod" }, + "properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 260 }] + }, + { + "matcher": { "id": "byRegexp", "options": "^RX Rate$|^TX Rate$" }, + "properties": [ + { "id": "unit", "value": "Bps" }, + { "id": "custom.displayMode", "value": "color-background-solid" }, + { "id": "thresholds", "value": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 10000000 }, + { "color": "orange", "value": 100000000 }, + { "color": "red", "value": 500000000 } + ]}} + ] + }, + { + "matcher": { "id": "byRegexp", "options": "^RX Errors/s$|^TX Errors/s$" }, + "properties": [ + { "id": "unit", "value": "pps" }, + { "id": "decimals", "value": 3 }, + { "id": "custom.displayMode", "value": "color-background" }, + { "id": "thresholds", "value": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 0.001 } + ]}} + ] + }, + { + "matcher": { "id": "byRegexp", "options": "^RX Drops/s$|^TX Drops/s$" }, + "properties": [ + { "id": "unit", "value": "pps" }, + { "id": "decimals", "value": 3 }, + { "id": "custom.displayMode", "value": "color-background" }, + { "id": "thresholds", "value": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "orange", "value": 0.001 } + ]}} + ] + } + ] + }, + "options": {}, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 22 } + }, + + { + "id": 16, 
"type": "row", "title": "Errors & Packet Loss", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 30 } + }, + + { + "id": 17, "type": "timeseries", "title": "RX Errors by Namespace", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum by(namespace)(rate(container_network_receive_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))", + "refId": "A", "legendFormat": "{{namespace}}" + }], + "fieldConfig": { + "defaults": { + "unit": "pps", "min": 0, "decimals": 3, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 7, "w": 12, "x": 0, "y": 31 } + }, + + { + "id": 18, "type": "timeseries", "title": "TX Errors by Namespace", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum by(namespace)(rate(container_network_transmit_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))", + "refId": "A", "legendFormat": "{{namespace}}" + }], + "fieldConfig": { + "defaults": { + "unit": "pps", "min": 0, "decimals": 3, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 7, "w": 12, "x": 12, "y": 31 } + }, + + { + "id": 19, "type": "timeseries", "title": "RX Packet Drops by Namespace", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum 
by(namespace)(rate(container_network_receive_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))", + "refId": "A", "legendFormat": "{{namespace}}" + }], + "fieldConfig": { + "defaults": { + "unit": "pps", "min": 0, "decimals": 3, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 7, "w": 12, "x": 0, "y": 38 } + }, + + { + "id": 20, "type": "timeseries", "title": "TX Packet Drops by Namespace", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum by(namespace)(rate(container_network_transmit_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))", + "refId": "A", "legendFormat": "{{namespace}}" + }], + "fieldConfig": { + "defaults": { + "unit": "pps", "min": 0, "decimals": 3, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 7, "w": 12, "x": 12, "y": 38 } + }, + + { + "id": 21, "type": "row", "title": "DNS (CoreDNS)", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 45 } + }, + + { + "id": 22, "type": "timeseries", "title": "DNS Request Rate by Query Type", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum by(type)(rate(coredns_dns_requests_total[5m]))", + "refId": "A", "legendFormat": "{{type}}" + }], + "fieldConfig": { + "defaults": { + "unit": "reqps", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } 
+ } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 46 } + }, + + { + "id": 23, "type": "timeseries", "title": "DNS Response Rate by Rcode", + "description": "NOERROR = healthy. NXDOMAIN = name not found. SERVFAIL = upstream error.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum by(rcode)(rate(coredns_dns_responses_total[5m]))", + "refId": "A", "legendFormat": "{{rcode}}" + }], + "fieldConfig": { + "defaults": { + "unit": "reqps", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "NOERROR" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "NXDOMAIN" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "SERVFAIL" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "REFUSED" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] } + ] + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 46 } + }, + + { + "id": 24, "type": "timeseries", "title": "DNS Request Latency (p50 / p95 / p99)", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum(rate(coredns_dns_request_duration_seconds_bucket[5m])) by (le))", + "refId": "A", "legendFormat": "p50" + }, + { + "expr": "histogram_quantile(0.95, 
sum(rate(coredns_dns_request_duration_seconds_bucket[5m])) by (le))", + "refId": "B", "legendFormat": "p95" + }, + { + "expr": "histogram_quantile(0.99, sum(rate(coredns_dns_request_duration_seconds_bucket[5m])) by (le))", + "refId": "C", "legendFormat": "p99" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s", "min": 0, "decimals": 4, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false } + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] } + ] + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 46 } + }, + + { + "id": 25, "type": "timeseries", "title": "DNS Cache Hit Ratio (%)", + "description": "High hit ratio = CoreDNS is serving responses from cache, reducing upstream load.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum(rate(coredns_cache_hits_total[5m])) / (sum(rate(coredns_cache_hits_total[5m])) + sum(rate(coredns_cache_misses_total[5m]))) * 100", + "refId": "A", "legendFormat": "Cache Hit %" + }], + "fieldConfig": { + "defaults": { + "unit": "percent", "min": 0, "max": 100, + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "red", "value": null }, + { "color": "yellow", "value": 50 }, + { "color": "green", "value": 80 } + ]}, + "custom": { "lineWidth": 2, "fillOpacity": 20, "showPoints": "never", "spanNulls": false } + } + }, + 
"options": { + "tooltip": { "mode": "single" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "lastNotNull"] } + }, + "gridPos": { "h": 7, "w": 12, "x": 0, "y": 54 } + }, + + { + "id": 26, "type": "timeseries", "title": "DNS Forward Request Rate", + "description": "Queries CoreDNS is forwarding upstream. Spike here with cache miss spike = upstream DNS pressure.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "sum(rate(coredns_forward_requests_total[5m]))", + "refId": "A", "legendFormat": "Forward Requests/s" + }, + { + "expr": "sum(rate(coredns_forward_responses_duration_seconds_count[5m]))", + "refId": "B", "legendFormat": "Forward Responses/s" + } + ], + "fieldConfig": { + "defaults": { + "unit": "reqps", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 7, "w": 12, "x": 12, "y": 54 } + }, + + { + "id": 27, "type": "row", "title": "Services & Endpoints", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 61 } + }, + + { + "id": 28, "type": "stat", "title": "Total Services", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "count(kube_service_info{namespace=~\"$namespace\"})", + "refId": "A", "legendFormat": "" + }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] }, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 8, "x": 0, "y": 62 } + }, + + { + "id": 29, 
"type": "stat", "title": "Endpoint Addresses Available", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum(kube_endpoint_address_available{namespace=~\"$namespace\"})", + "refId": "A", "legendFormat": "" + }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] }, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 8, "x": 8, "y": 62 } + }, + + { + "id": 30, "type": "stat", "title": "Endpoint Addresses Not Ready", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum(kube_endpoint_address_not_ready{namespace=~\"$namespace\"}) or vector(0)", + "refId": "A", "legendFormat": "" + }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] }, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 8, "x": 16, "y": 62 } + }, + + { + "id": 31, + "type": "table", + "title": "Endpoint Availability", + "description": "Per-endpoint available vs not-ready address counts. 
Red Not Ready = pods backing this service are unhealthy.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "sum by(namespace,endpoint)(kube_endpoint_address_available{namespace=~\"$namespace\"})", + "refId": "A", "instant": true, "format": "table", "legendFormat": "" + }, + { + "expr": "sum by(namespace,endpoint)(kube_endpoint_address_not_ready{namespace=~\"$namespace\"})", + "refId": "B", "instant": true, "format": "table", "legendFormat": "" + } + ], + "transformations": [ + { + "id": "filterFieldsByName", + "options": { "include": { "names": ["namespace", "endpoint", "Value"] } } + }, + { + "id": "joinByField", + "options": { "byField": "endpoint", "mode": "outer" } + }, + { + "id": "organize", + "options": { + "excludeByName": { "namespace 1": true }, + "renameByName": { + "namespace": "Namespace", + "endpoint": "Endpoint", + "Value": "Available", + "Value 1": "Not Ready" + }, + "indexByName": { + "namespace": 0, + "endpoint": 1, + "Value": 2, + "Value 1": 3 + } + } + }, + { + "id": "sortBy", + "options": { "fields": [{ "displayName": "Not Ready", "desc": true }] } + } + ], + "fieldConfig": { + "defaults": { "custom": { "align": "center", "displayMode": "auto" } }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Namespace" }, + "properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 180 }] + }, + { + "matcher": { "id": "byName", "options": "Endpoint" }, + "properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 220 }] + }, + { + "matcher": { "id": "byName", "options": "Available" }, + "properties": [ + { "id": "custom.displayMode", "value": "color-background" }, + { "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] } } + ] + }, + { + "matcher": { "id": "byName", "options": "Not Ready" }, + "properties": [ + { "id": "custom.displayMode", "value": 
"color-background" }, + { "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] } } + ] + } + ] + }, + "options": {}, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 66 } + }, + + { + "id": 32, "type": "row", "title": "OKD Router / Ingress (HAProxy)", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 74 } + }, + + { + "id": 33, "type": "timeseries", "title": "Router HTTP Request Rate by Code", + "description": "Requires HAProxy router metrics to be scraped (port 1936). OKD exposes these via the openshift-ingress ServiceMonitor.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "sum by(code)(rate(haproxy_backend_http_responses_total[5m]))", + "refId": "A", "legendFormat": "HTTP {{code}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "reqps", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "HTTP 2xx" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "HTTP 4xx" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "HTTP 5xx" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] } + ] + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 75 } + }, + + { + "id": 34, "type": "timeseries", "title": "Router 4xx + 5xx Error Rate (%)", + "description": "Client error (4xx) and server error (5xx) rates as a percentage of all requests.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ 
+ { + "expr": "sum(rate(haproxy_backend_http_responses_total{code=\"4xx\"}[5m])) / sum(rate(haproxy_backend_http_responses_total[5m])) * 100", + "refId": "A", "legendFormat": "4xx %" + }, + { + "expr": "sum(rate(haproxy_backend_http_responses_total{code=\"5xx\"}[5m])) / sum(rate(haproxy_backend_http_responses_total[5m])) * 100", + "refId": "B", "legendFormat": "5xx %" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 1 }, + { "color": "red", "value": 5 } + ]} + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "4xx %" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "5xx %" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] } + ] + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 75 } + }, + + { + "id": 35, "type": "timeseries", "title": "Router Bytes In / Out", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "sum(rate(haproxy_frontend_bytes_in_total[5m]))", + "refId": "A", "legendFormat": "Bytes In" + }, + { + "expr": "sum(rate(haproxy_frontend_bytes_out_total[5m]))", + "refId": "B", "legendFormat": "Bytes Out" + } + ], + "fieldConfig": { + "defaults": { + "unit": "Bps", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "Bytes In" }, "properties": [{ "id": "color", "value": { 
"fixedColor": "blue", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "Bytes Out" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] } + ] + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 83 } + }, + + { + "id": 36, + "type": "table", + "title": "Router Backend Server Status", + "description": "HAProxy backend servers (routes). Value 0 = DOWN, 1 = UP.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "haproxy_server_up", + "refId": "A", "instant": true, "format": "table", "legendFormat": "" + } + ], + "transformations": [ + { + "id": "filterFieldsByName", + "options": { "include": { "names": ["proxy", "server", "Value"] } } + }, + { + "id": "organize", + "options": { + "excludeByName": {}, + "renameByName": { + "proxy": "Backend", + "server": "Server", + "Value": "Status" + }, + "indexByName": { "proxy": 0, "server": 1, "Value": 2 } + } + }, + { + "id": "sortBy", + "options": { "fields": [{ "displayName": "Status", "desc": false }] } + } + ], + "fieldConfig": { + "defaults": { "custom": { "align": "center", "displayMode": "auto" } }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Backend" }, + "properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 260 }] + }, + { + "matcher": { "id": "byName", "options": "Server" }, + "properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 180 }] + }, + { + "matcher": { "id": "byName", "options": "Status" }, + "properties": [ + { "id": "custom.displayMode", "value": "color-background" }, + { "id": "mappings", "value": [ + { "type": "value", "options": { "0": { "text": "DOWN", "color": "red" } } }, + { "type": "value", "options": { "1": { "text": "UP", "color": "green" } } } + ]}, + 
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ]}} + ] + } + ] + }, + "options": {}, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 83 } + } + + ] +} diff --git a/harmony/src/modules/monitoring/cluster_dashboards/dashboards/nodes-health.json b/harmony/src/modules/monitoring/cluster_dashboards/dashboards/nodes-health.json new file mode 100644 index 0000000..0b2fe9d --- /dev/null +++ b/harmony/src/modules/monitoring/cluster_dashboards/dashboards/nodes-health.json @@ -0,0 +1,627 @@ +{ + "title": "Node Health", + "uid": "okd-node-health", + "schemaVersion": 36, + "version": 2, + "refresh": "30s", + "time": { "from": "now-1h", "to": "now" }, + "tags": ["okd", "node", "health"], + "templating": { + "list": [ + { + "name": "node", + "type": "query", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "query": { "query": "label_values(kube_node_info, node)", "refId": "A" }, + "refresh": 2, + "includeAll": true, + "multi": true, + "allValue": ".*", + "label": "Node", + "sort": 1, + "current": {}, + "options": [] + } + ] + }, + "panels": [ + + { + "id": 1, + "type": "stat", + "title": "Total Nodes", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(kube_node_info{node=~\"$node\"})", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] }, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 } + }, + + { + "id": 2, + "type": "stat", + "title": "Ready Nodes", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": 
"count(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"$node\"} == 1)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] }, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 } + }, + + { + "id": 3, + "type": "stat", + "title": "Not Ready Nodes", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"false\",node=~\"$node\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] }, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 } + }, + + { + "id": 4, + "type": "stat", + "title": "Memory Pressure", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(kube_node_status_condition{condition=\"MemoryPressure\",status=\"true\",node=~\"$node\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] }, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": 
["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 } + }, + + { + "id": 5, + "type": "stat", + "title": "Disk Pressure", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(kube_node_status_condition{condition=\"DiskPressure\",status=\"true\",node=~\"$node\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] }, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 } + }, + + { + "id": 6, + "type": "stat", + "title": "PID Pressure", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(kube_node_status_condition{condition=\"PIDPressure\",status=\"true\",node=~\"$node\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "orange", "value": 1 }] }, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 } + }, + + { + "id": 7, + "type": "stat", + "title": "Unschedulable", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(kube_node_spec_unschedulable{node=~\"$node\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ 
"color": "green", "value": null }, { "color": "yellow", "value": 1 }] }, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 } + }, + + { + "id": 8, + "type": "stat", + "title": "Kubelet Up", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(up{job=\"kubelet\",metrics_path=\"/metrics\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] }, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 } + }, + + { + "id": 9, + "type": "table", + "title": "Node Conditions", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "sum by(node) (kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"$node\"})", + "refId": "A", + "legendFormat": "{{node}}", + "instant": true + }, + { + "expr": "sum by(node) (kube_node_status_condition{condition=\"MemoryPressure\",status=\"true\",node=~\"$node\"})", + "refId": "B", + "legendFormat": "{{node}}", + "instant": true + }, + { + "expr": "sum by(node) (kube_node_status_condition{condition=\"DiskPressure\",status=\"true\",node=~\"$node\"})", + "refId": "C", + "legendFormat": "{{node}}", + "instant": true + }, + { + "expr": "sum by(node) (kube_node_status_condition{condition=\"PIDPressure\",status=\"true\",node=~\"$node\"})", + "refId": "D", + "legendFormat": "{{node}}", + "instant": true + }, + { + "expr": "sum by(node) 
(kube_node_spec_unschedulable{node=~\"$node\"})", + "refId": "E", + "legendFormat": "{{node}}", + "instant": true + } + ], + "transformations": [ + { + "id": "labelsToFields", + "options": { "mode": "columns" } + }, + { + "id": "joinByField", + "options": { "byField": "node", "mode": "outer" } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "Time 1": true, + "Time 2": true, + "Time 3": true, + "Time 4": true, + "Time 5": true + }, + "renameByName": { + "node": "Node", + "Value #A": "Ready", + "Value #B": "Mem Pressure", + "Value #C": "Disk Pressure", + "Value #D": "PID Pressure", + "Value #E": "Unschedulable" + }, + "indexByName": { + "node": 0, + "Value #A": 1, + "Value #B": 2, + "Value #C": 3, + "Value #D": 4, + "Value #E": 5 + } + } + } + ], + "fieldConfig": { + "defaults": { + "custom": { "displayMode": "color-background", "align": "center" } + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Node" }, + "properties": [ + { "id": "custom.displayMode", "value": "auto" }, + { "id": "custom.align", "value": "left" }, + { "id": "custom.width", "value": 200 } + ] + }, + { + "matcher": { "id": "byName", "options": "Ready" }, + "properties": [ + { + "id": "thresholds", + "value": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] } + }, + { "id": "custom.displayMode", "value": "color-background" }, + { + "id": "mappings", + "value": [ + { + "type": "value", + "options": { + "0": { "text": "✗ Not Ready", "color": "red", "index": 0 }, + "1": { "text": "✓ Ready", "color": "green", "index": 1 } + } + } + ] + } + ] + }, + { + "matcher": { "id": "byRegexp", "options": ".*Pressure" }, + "properties": [ + { + "id": "thresholds", + "value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] } + }, + { "id": "custom.displayMode", "value": "color-background" }, + { + "id": "mappings", + "value": [ + { + "type": "value", + 
"options": { + "0": { "text": "✓ OK", "color": "green", "index": 0 }, + "1": { "text": "⚠ Active", "color": "red", "index": 1 } + } + } + ] + } + ] + }, + { + "matcher": { "id": "byName", "options": "Unschedulable" }, + "properties": [ + { + "id": "thresholds", + "value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 1 }] } + }, + { "id": "custom.displayMode", "value": "color-background" }, + { + "id": "mappings", + "value": [ + { + "type": "value", + "options": { + "0": { "text": "✓ Schedulable", "color": "green", "index": 0 }, + "1": { "text": "⚠ Cordoned", "color": "yellow", "index": 1 } + } + } + ] + } + ] + } + ] + }, + "options": { "sortBy": [{ "displayName": "Node", "desc": false }] }, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 4 } + }, + + { + "id": 10, + "type": "timeseries", + "title": "CPU Usage per Node (%)", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "100 * (1 - avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))", + "refId": "A", + "legendFormat": "{{instance}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", "min": 0, "max": 100, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 85 }] } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 16, "x": 0, "y": 12 } + }, + + { + "id": 11, + "type": "bargauge", + "title": "CPU Usage \u2014 Current", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "100 * (1 - avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))", + "refId": "A", + 
"legendFormat": "{{instance}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", "min": 0, "max": 100, + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 85 }] } + } + }, + "options": { + "orientation": "horizontal", + "displayMode": "gradient", + "showUnfilled": true, + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false } + }, + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 12 } + }, + + { + "id": 12, + "type": "timeseries", + "title": "Memory Usage per Node (%)", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))", + "refId": "A", + "legendFormat": "{{instance}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", "min": 0, "max": 100, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 75 }, { "color": "red", "value": 90 }] } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 16, "x": 0, "y": 20 } + }, + + { + "id": 13, + "type": "bargauge", + "title": "Memory Usage \u2014 Current", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))", + "refId": "A", + "legendFormat": "{{instance}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", "min": 0, "max": 100, + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": 
"yellow", "value": 75 }, { "color": "red", "value": 90 }] } + } + }, + "options": { + "orientation": "horizontal", + "displayMode": "gradient", + "showUnfilled": true, + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false } + }, + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 20 } + }, + + { + "id": 14, + "type": "timeseries", + "title": "Root Disk Usage per Node (%)", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "100 * (1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"}))", + "refId": "A", + "legendFormat": "{{instance}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", "min": 0, "max": 100, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 85 }] } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 16, "x": 0, "y": 28 } + }, + + { + "id": 15, + "type": "bargauge", + "title": "Root Disk Usage \u2014 Current", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "100 * (1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"}))", + "refId": "A", + "legendFormat": "{{instance}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", "min": 0, "max": 100, + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 85 }] } + } + }, + "options": { + "orientation": 
"horizontal", + "displayMode": "gradient", + "showUnfilled": true, + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false } + }, + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 28 } + }, + + { + "id": 16, + "type": "timeseries", + "title": "Network Traffic per Node", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "sum by(instance) (rate(node_network_receive_bytes_total{device!~\"lo|veth.*|tun.*|ovn.*|br.*\"}[5m]))", + "refId": "A", + "legendFormat": "rx {{instance}}" + }, + { + "expr": "sum by(instance) (rate(node_network_transmit_bytes_total{device!~\"lo|veth.*|tun.*|ovn.*|br.*\"}[5m]))", + "refId": "B", + "legendFormat": "tx {{instance}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "Bps", + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 36 } + }, + + { + "id": 17, + "type": "bargauge", + "title": "Pods per Node", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "count by(node) (kube_pod_info{node=~\"$node\"})", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short", + "min": 0, + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 100 }, + { "color": "red", "value": 200 } + ] + } + } + }, + "options": { + "orientation": "horizontal", + "displayMode": "gradient", + "showUnfilled": true, + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 36 } + }, + + { + "id": 18, + "type": "timeseries", + "title": "System 
Load Average (1m / 5m) per Node", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "node_load1", + "refId": "A", + "legendFormat": "1m \u2014 {{instance}}" + }, + { + "expr": "node_load5", + "refId": "B", + "legendFormat": "5m \u2014 {{instance}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short", + "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 44 } + }, + + { + "id": 19, + "type": "bargauge", + "title": "Node Uptime", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "time() - node_boot_time_seconds", + "refId": "A", + "legendFormat": "{{instance}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s", + "min": 0, + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "yellow", "value": 300 }, + { "color": "green", "value": 3600 } + ] + } + } + }, + "options": { + "orientation": "horizontal", + "displayMode": "gradient", + "showUnfilled": false, + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 44 } + } + + ] +} diff --git a/harmony/src/modules/monitoring/cluster_dashboards/dashboards/storage.json b/harmony/src/modules/monitoring/cluster_dashboards/dashboards/storage.json new file mode 100644 index 0000000..3c58184 --- /dev/null +++ b/harmony/src/modules/monitoring/cluster_dashboards/dashboards/storage.json @@ -0,0 +1,596 @@ +{ + "title": "Storage Health", + "uid": "storage-health", + "schemaVersion": 36, + "version": 1, + "refresh": "30s", + "time": { "from": "now-1h", "to": "now" }, + 
"panels": [ + + { + "type": "row", + "id": 1, + "title": "PVC / PV Status", + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 } + }, + + { + "type": "stat", + "id": 2, + "title": "Bound PVCs", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Bound\"}) or vector(0)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + } + } + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"] }, + "colorMode": "background", + "graphMode": "none", + "textMode": "auto" + }, + "gridPos": { "h": 5, "w": 4, "x": 0, "y": 1 } + }, + + { + "type": "stat", + "id": 3, + "title": "Pending PVCs", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Pending\"}) or vector(0)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 1 } + ] + } + } + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"] }, + "colorMode": "background", + "graphMode": "none", + "textMode": "auto" + }, + "gridPos": { "h": 5, "w": 4, "x": 4, "y": 1 } + }, + + { + "type": "stat", + "id": 4, + "title": "Lost PVCs", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Lost\"}) or vector(0)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 1 } + ] + } + } + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"] }, 
+ "colorMode": "background", + "graphMode": "none", + "textMode": "auto" + }, + "gridPos": { "h": 5, "w": 4, "x": 8, "y": 1 } + }, + + { + "type": "stat", + "id": 5, + "title": "Bound PVs / Available PVs", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "sum(kube_persistentvolume_status_phase{phase=\"Bound\"}) or vector(0)", + "refId": "A", + "legendFormat": "Bound" + }, + { + "expr": "sum(kube_persistentvolume_status_phase{phase=\"Available\"}) or vector(0)", + "refId": "B", + "legendFormat": "Available" + } + ], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "blue", "value": null }] + } + } + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"] }, + "colorMode": "background", + "graphMode": "none", + "textMode": "auto" + }, + "gridPos": { "h": 5, "w": 4, "x": 12, "y": 1 } + }, + + { + "type": "stat", + "id": 6, + "title": "Ceph Cluster Health", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "ceph_health_status", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 1 }, + { "color": "red", "value": 2 } + ] + }, + "mappings": [ + { + "type": "value", + "options": { + "0": { "text": "HEALTH_OK", "index": 0 }, + "1": { "text": "HEALTH_WARN", "index": 1 }, + "2": { "text": "HEALTH_ERR", "index": 2 } + } + } + ] + } + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"] }, + "colorMode": "background", + "graphMode": "none", + "textMode": "value" + }, + "gridPos": { "h": 5, "w": 4, "x": 16, "y": 1 } + }, + + { + "type": "stat", + "id": 7, + "title": "OSDs Up / Total", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "sum(ceph_osd_up) or 
vector(0)", + "refId": "A", + "legendFormat": "Up" + }, + { + "expr": "count(ceph_osd_metadata) or vector(0)", + "refId": "B", + "legendFormat": "Total" + } + ], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + } + } + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"] }, + "colorMode": "background", + "graphMode": "none", + "textMode": "auto" + }, + "gridPos": { "h": 5, "w": 4, "x": 20, "y": 1 } + }, + + { + "type": "row", + "id": 8, + "title": "Cluster Capacity", + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 6 } + }, + + { + "type": "gauge", + "id": 9, + "title": "Ceph Cluster Used (%)", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "100 * (ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes) / ceph_cluster_total_bytes", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 70 }, + { "color": "red", "value": 85 } + ] + } + } + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"] }, + "showThresholdLabels": true, + "showThresholdMarkers": true + }, + "gridPos": { "h": 8, "w": 5, "x": 0, "y": 7 } + }, + + { + "type": "stat", + "id": 10, + "title": "Ceph Capacity — Total / Available", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "ceph_cluster_total_bytes", + "refId": "A", + "legendFormat": "Total" + }, + { + "expr": "ceph_cluster_total_bytes - (ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes)", + "refId": "B", + "legendFormat": "Available" + } + ], + "fieldConfig": { + "defaults": { + "unit": "bytes", + "color": { "mode": "thresholds" }, + "thresholds": { + 
"mode": "absolute", + "steps": [{ "color": "blue", "value": null }] + } + } + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"] }, + "colorMode": "value", + "graphMode": "none", + "textMode": "auto", + "orientation": "vertical" + }, + "gridPos": { "h": 8, "w": 4, "x": 5, "y": 7 } + }, + + { + "type": "bargauge", + "id": 11, + "title": "PV Allocated Capacity by Storage Class (Bound)", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "sum by (storageclass) (\n kube_persistentvolume_capacity_bytes\n * on(persistentvolume) group_left(storageclass)\n kube_persistentvolume_status_phase{phase=\"Bound\"}\n)", + "refId": "A", + "legendFormat": "{{storageclass}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "bytes", + "color": { "mode": "palette-classic" }, + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "blue", "value": null }] + } + } + }, + "options": { + "orientation": "horizontal", + "reduceOptions": { "calcs": ["lastNotNull"] }, + "displayMode": "gradient", + "showUnfilled": true + }, + "gridPos": { "h": 8, "w": 7, "x": 9, "y": 7 } + }, + + { + "type": "piechart", + "id": 12, + "title": "PVC Phase Distribution", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Bound\"}) or vector(0)", + "refId": "A", + "legendFormat": "Bound" + }, + { + "expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Pending\"}) or vector(0)", + "refId": "B", + "legendFormat": "Pending" + }, + { + "expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Lost\"}) or vector(0)", + "refId": "C", + "legendFormat": "Lost" + } + ], + "fieldConfig": { + "defaults": { "color": { "mode": "palette-classic" } } + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"] }, + "pieType": "pie", + "legend": { + "displayMode": "table", + "placement": "right", + "values": ["value", "percent"] + } + }, + 
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 7 } + }, + + { + "type": "row", + "id": 13, + "title": "Ceph Performance", + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 15 } + }, + + { + "type": "timeseries", + "id": 14, + "title": "Ceph Pool IOPS (Read / Write)", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "rate(ceph_pool_rd[5m])", + "refId": "A", + "legendFormat": "Read — pool {{pool_id}}" + }, + { + "expr": "rate(ceph_pool_wr[5m])", + "refId": "B", + "legendFormat": "Write — pool {{pool_id}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 8 } + } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 } + }, + + { + "type": "timeseries", + "id": 15, + "title": "Ceph Pool Throughput (Read / Write)", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "rate(ceph_pool_rd_bytes[5m])", + "refId": "A", + "legendFormat": "Read — pool {{pool_id}}" + }, + { + "expr": "rate(ceph_pool_wr_bytes[5m])", + "refId": "B", + "legendFormat": "Write — pool {{pool_id}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "Bps", + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 8 } + } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 } + }, + + { + "type": "row", + "id": 16, + "title": "Ceph OSD & Pool Details", + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 24 } + }, + + { + "type": "timeseries", + "id": 17, + "title": "Ceph Pool Space Used (%)", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "100 * ceph_pool_bytes_used / (ceph_pool_bytes_used + ceph_pool_max_avail)", + "refId": "A", + "legendFormat": "Pool {{pool_id}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "color": { "mode": "palette-classic" }, + 
"thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 70 }, + { "color": "red", "value": 85 } + ] + }, + "custom": { "lineWidth": 2, "fillOpacity": 10 } + } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 25 } + }, + + { + "type": "bargauge", + "id": 18, + "title": "OSD Status per Daemon (green = Up, red = Down)", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "ceph_osd_up", + "refId": "A", + "legendFormat": "{{ceph_daemon}}" + } + ], + "fieldConfig": { + "defaults": { + "min": 0, + "max": 1, + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ] + }, + "mappings": [ + { + "type": "value", + "options": { + "0": { "text": "DOWN", "index": 0 }, + "1": { "text": "UP", "index": 1 } + } + } + ] + } + }, + "options": { + "orientation": "horizontal", + "reduceOptions": { "calcs": ["lastNotNull"] }, + "displayMode": "basic", + "showUnfilled": true + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 25 } + }, + + { + "type": "row", + "id": 19, + "title": "Node Disk Usage", + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 33 } + }, + + { + "type": "timeseries", + "id": 20, + "title": "Node Root Disk Usage Over Time (%)", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} * 100)", + "refId": "A", + "legendFormat": "{{instance}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "color": { "mode": "palette-classic" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 70 }, + { "color": "red", "value": 85 } + ] + }, + "custom": { 
"lineWidth": 2, "fillOpacity": 10 } + } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 34 } + }, + + { + "type": "bargauge", + "id": 21, + "title": "Current Disk Usage — All Nodes & Mountpoints", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "100 - (node_filesystem_avail_bytes{fstype!~\"tmpfs|overlay|squashfs\"} / node_filesystem_size_bytes{fstype!~\"tmpfs|overlay|squashfs\"} * 100)", + "refId": "A", + "legendFormat": "{{instance}} — {{mountpoint}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 70 }, + { "color": "red", "value": 85 } + ] + } + } + }, + "options": { + "orientation": "horizontal", + "reduceOptions": { "calcs": ["lastNotNull"] }, + "displayMode": "gradient", + "showUnfilled": true + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 34 } + } + + ] +} diff --git a/harmony/src/modules/monitoring/cluster_dashboards/dashboards/workloads-health.json b/harmony/src/modules/monitoring/cluster_dashboards/dashboards/workloads-health.json new file mode 100644 index 0000000..60219ae --- /dev/null +++ b/harmony/src/modules/monitoring/cluster_dashboards/dashboards/workloads-health.json @@ -0,0 +1,773 @@ +{ + "title": "Workload Health", + "uid": "okd-workload-health", + "schemaVersion": 36, + "version": 3, + "refresh": "30s", + "time": { "from": "now-1h", "to": "now" }, + "tags": ["okd", "workload", "health"], + "templating": { + "list": [ + { + "name": "namespace", + "type": "query", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "query": { "query": "label_values(kube_pod_info, namespace)", "refId": "A" }, + "refresh": 2, + "includeAll": true, + "multi": true, + "allValue": ".*", + "label": "Namespace", + "sort": 1, + "current": {}, + "options": [] + } + ] + }, + "panels": [ + + { + 
"id": 1, "type": "stat", "title": "Total Pods", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(kube_pod_info{namespace=~\"$namespace\"})", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] }, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 } + }, + + { + "id": 2, "type": "stat", "title": "Running Pods", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(kube_pod_status_phase{phase=\"Running\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] }, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 } + }, + + { + "id": 3, "type": "stat", "title": "Pending Pods", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(kube_pod_status_phase{phase=\"Pending\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 1 }] }, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", 
"reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 } + }, + + { + "id": 4, "type": "stat", "title": "Failed Pods", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(kube_pod_status_phase{phase=\"Failed\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] }, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 } + }, + + { + "id": 5, "type": "stat", "title": "CrashLoopBackOff", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] }, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 } + }, + + { + "id": 6, "type": "stat", "title": "OOMKilled", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(kube_pod_container_status_last_terminated_reason{reason=\"OOMKilled\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + 
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "orange", "value": 1 }] }, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 } + }, + + { + "id": 7, "type": "stat", "title": "Deployments Available", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(kube_deployment_status_condition{condition=\"Available\",status=\"true\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] }, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 } + }, + + { + "id": 8, "type": "stat", "title": "Deployments Degraded", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(kube_deployment_status_replicas_unavailable{namespace=~\"$namespace\"} > 0) or vector(0)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] }, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 } + }, + + { + "id": 9, "type": "row", "title": "Deployments", "collapsed": false, + "gridPos": { "h": 1, "w": 24, 
"x": 0, "y": 4 } + }, + + { + "id": 10, + "type": "table", + "title": "Deployment Status", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "sum by(namespace,deployment)(kube_deployment_spec_replicas{namespace=~\"$namespace\"})", + "refId": "A", + "instant": true, + "format": "table", + "legendFormat": "" + }, + { + "expr": "sum by(namespace,deployment)(kube_deployment_status_replicas_ready{namespace=~\"$namespace\"})", + "refId": "B", + "instant": true, + "format": "table", + "legendFormat": "" + }, + { + "expr": "sum by(namespace,deployment)(kube_deployment_status_replicas_available{namespace=~\"$namespace\"})", + "refId": "C", + "instant": true, + "format": "table", + "legendFormat": "" + }, + { + "expr": "sum by(namespace,deployment)(kube_deployment_status_replicas_unavailable{namespace=~\"$namespace\"})", + "refId": "D", + "instant": true, + "format": "table", + "legendFormat": "" + }, + { + "expr": "sum by(namespace,deployment)(kube_deployment_status_replicas_updated{namespace=~\"$namespace\"})", + "refId": "E", + "instant": true, + "format": "table", + "legendFormat": "" + } + ], + "transformations": [ + { + "id": "filterFieldsByName", + "options": { + "include": { + "names": ["namespace", "deployment", "Value"] + } + } + }, + { + "id": "joinByField", + "options": { + "byField": "deployment", + "mode": "outer" + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "namespace 1": true, + "namespace 2": true, + "namespace 3": true, + "namespace 4": true + }, + "renameByName": { + "namespace": "Namespace", + "deployment": "Deployment", + "Value": "Desired", + "Value 1": "Ready", + "Value 2": "Available", + "Value 3": "Unavailable", + "Value 4": "Up-to-date" + }, + "indexByName": { + "namespace": 0, + "deployment": 1, + "Value": 2, + "Value 1": 3, + "Value 2": 4, + "Value 3": 5, + "Value 4": 6 + } + } + }, + { + "id": "sortBy", + "options": { + "fields": [{ "displayName": "Namespace", "desc": 
false }] + } + } + ], + "fieldConfig": { + "defaults": { "custom": { "align": "center", "displayMode": "auto" } }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Namespace" }, + "properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 200 }] + }, + { + "matcher": { "id": "byName", "options": "Deployment" }, + "properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 220 }] + }, + { + "matcher": { "id": "byName", "options": "Unavailable" }, + "properties": [ + { "id": "custom.displayMode", "value": "color-background" }, + { + "id": "thresholds", + "value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] } + } + ] + }, + { + "matcher": { "id": "byName", "options": "Ready" }, + "properties": [ + { "id": "custom.displayMode", "value": "color-background" }, + { + "id": "thresholds", + "value": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] } + } + ] + } + ] + }, + "options": { "sortBy": [{ "displayName": "Namespace", "desc": false }] }, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 5 } + }, + + { + "id": 11, "type": "row", "title": "StatefulSets & DaemonSets", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 } + }, + + { + "id": 12, + "type": "table", + "title": "StatefulSet Status", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "sum by(namespace,statefulset)(kube_statefulset_replicas{namespace=~\"$namespace\"})", + "refId": "A", + "instant": true, + "format": "table", + "legendFormat": "" + }, + { + "expr": "sum by(namespace,statefulset)(kube_statefulset_status_replicas_ready{namespace=~\"$namespace\"})", + "refId": "B", + "instant": true, + "format": "table", + "legendFormat": "" + }, + { + "expr": "sum by(namespace,statefulset)(kube_statefulset_status_replicas_current{namespace=~\"$namespace\"})", + "refId": 
"C", + "instant": true, + "format": "table", + "legendFormat": "" + }, + { + "expr": "sum by(namespace,statefulset)(kube_statefulset_status_replicas_updated{namespace=~\"$namespace\"})", + "refId": "D", + "instant": true, + "format": "table", + "legendFormat": "" + } + ], + "transformations": [ + { + "id": "filterFieldsByName", + "options": { + "include": { + "names": ["namespace", "statefulset", "Value"] + } + } + }, + { + "id": "joinByField", + "options": { + "byField": "statefulset", + "mode": "outer" + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "namespace 1": true, + "namespace 2": true, + "namespace 3": true + }, + "renameByName": { + "namespace": "Namespace", + "statefulset": "StatefulSet", + "Value": "Desired", + "Value 1": "Ready", + "Value 2": "Current", + "Value 3": "Up-to-date" + }, + "indexByName": { + "namespace": 0, + "statefulset": 1, + "Value": 2, + "Value 1": 3, + "Value 2": 4, + "Value 3": 5 + } + } + }, + { + "id": "sortBy", + "options": { "fields": [{ "displayName": "Namespace", "desc": false }] } + } + ], + "fieldConfig": { + "defaults": { "custom": { "align": "center", "displayMode": "auto" } }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Namespace" }, + "properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 180 }] + }, + { + "matcher": { "id": "byName", "options": "StatefulSet" }, + "properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 200 }] + }, + { + "matcher": { "id": "byName", "options": "Ready" }, + "properties": [ + { "id": "custom.displayMode", "value": "color-background" }, + { "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] } } + ] + } + ] + }, + "options": {}, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 14 } + }, + + { + "id": 13, + "type": "table", + "title": "DaemonSet Status", + "datasource": { "type": "prometheus", "uid": 
"Prometheus-Cluster" }, + "targets": [ + { + "expr": "sum by(namespace,daemonset)(kube_daemonset_status_desired_number_scheduled{namespace=~\"$namespace\"})", + "refId": "A", + "instant": true, + "format": "table", + "legendFormat": "" + }, + { + "expr": "sum by(namespace,daemonset)(kube_daemonset_status_number_ready{namespace=~\"$namespace\"})", + "refId": "B", + "instant": true, + "format": "table", + "legendFormat": "" + }, + { + "expr": "sum by(namespace,daemonset)(kube_daemonset_status_number_unavailable{namespace=~\"$namespace\"})", + "refId": "C", + "instant": true, + "format": "table", + "legendFormat": "" + }, + { + "expr": "sum by(namespace,daemonset)(kube_daemonset_status_number_misscheduled{namespace=~\"$namespace\"})", + "refId": "D", + "instant": true, + "format": "table", + "legendFormat": "" + } + ], + "transformations": [ + { + "id": "filterFieldsByName", + "options": { + "include": { + "names": ["namespace", "daemonset", "Value"] + } + } + }, + { + "id": "joinByField", + "options": { + "byField": "daemonset", + "mode": "outer" + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "namespace 1": true, + "namespace 2": true, + "namespace 3": true + }, + "renameByName": { + "namespace": "Namespace", + "daemonset": "DaemonSet", + "Value": "Desired", + "Value 1": "Ready", + "Value 2": "Unavailable", + "Value 3": "Misscheduled" + }, + "indexByName": { + "namespace": 0, + "daemonset": 1, + "Value": 2, + "Value 1": 3, + "Value 2": 4, + "Value 3": 5 + } + } + }, + { + "id": "sortBy", + "options": { "fields": [{ "displayName": "Namespace", "desc": false }] } + } + ], + "fieldConfig": { + "defaults": { "custom": { "align": "center", "displayMode": "auto" } }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Namespace" }, + "properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 180 }] + }, + { + "matcher": { "id": "byName", "options": "DaemonSet" }, + "properties": [{ "id": "custom.align", 
"value": "left" }, { "id": "custom.width", "value": 200 }] + }, + { + "matcher": { "id": "byName", "options": "Ready" }, + "properties": [ + { "id": "custom.displayMode", "value": "color-background" }, + { "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] } } + ] + }, + { + "matcher": { "id": "byName", "options": "Unavailable" }, + "properties": [ + { "id": "custom.displayMode", "value": "color-background" }, + { "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] } } + ] + }, + { + "matcher": { "id": "byName", "options": "Misscheduled" }, + "properties": [ + { "id": "custom.displayMode", "value": "color-background" }, + { "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "orange", "value": 1 }] } } + ] + } + ] + }, + "options": {}, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 14 } + }, + + { + "id": 14, "type": "row", "title": "Pods", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 22 } + }, + + { + "id": 15, + "type": "timeseries", + "title": "Pod Phase over Time", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "sum by(phase)(kube_pod_status_phase{namespace=~\"$namespace\"})", + "refId": "A", "legendFormat": "{{phase}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "Running" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "Pending" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": 
"Failed" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "Succeeded" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "Unknown" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] } + ] + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull"] } + }, + "gridPos": { "h": 8, "w": 16, "x": 0, "y": 23 } + }, + + { + "id": 16, + "type": "piechart", + "title": "Pod Phase — Now", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "sum by(phase)(kube_pod_status_phase{namespace=~\"$namespace\"})", + "refId": "A", "instant": true, "legendFormat": "{{phase}}" + } + ], + "fieldConfig": { + "defaults": { "unit": "short", "color": { "mode": "palette-classic" } }, + "overrides": [ + { "matcher": { "id": "byName", "options": "Running" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "Pending" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "Failed" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "Succeeded" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "Unknown" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] } + ] + }, + "options": { + "pieType": "donut", + "tooltip": { "mode": "single" }, + "legend": { "displayMode": "table", "placement": "right", "values": ["value", "percent"] } + }, + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 23 } + 
}, + + { + "id": 17, + "type": "timeseries", + "title": "Container Restarts over Time (total counter, top 10)", + "description": "Absolute restart counter — each vertical step = a restart event. Flat line = healthy.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "topk(10,\n sum by(namespace, pod) (\n kube_pod_container_status_restarts_total{namespace=~\"$namespace\"}\n ) > 0\n)", + "refId": "A", + "legendFormat": "{{namespace}} / {{pod}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "auto", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 31 } + }, + + { + "id": 18, + "type": "table", + "title": "Container Total Restarts (non-zero)", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "sum by(namespace, pod, container) (kube_pod_container_status_restarts_total{namespace=~\"$namespace\"}) > 0", + "refId": "A", + "instant": true, + "format": "table", + "legendFormat": "" + } + ], + "transformations": [ + { + "id": "filterFieldsByName", + "options": { + "include": { "names": ["namespace", "pod", "container", "Value"] } + } + }, + { + "id": "organize", + "options": { + "excludeByName": {}, + "renameByName": { + "namespace": "Namespace", + "pod": "Pod", + "container": "Container", + "Value": "Total Restarts" + }, + "indexByName": { "namespace": 0, "pod": 1, "container": 2, "Value": 3 } + } + }, + { + "id": "sortBy", + "options": { "fields": [{ "displayName": "Total Restarts", "desc": true }] } + } + ], + "fieldConfig": { + "defaults": { "custom": { "align": "center", "displayMode": "auto" } }, + "overrides": [ + { "matcher": { "id": "byName", "options": "Namespace" }, 
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 160 }] }, + { "matcher": { "id": "byName", "options": "Pod" }, "properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 260 }] }, + { "matcher": { "id": "byName", "options": "Container" }, "properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 160 }] }, + { + "matcher": { "id": "byName", "options": "Total Restarts" }, + "properties": [ + { "id": "custom.displayMode", "value": "color-background" }, + { "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "yellow", "value": null }, { "color": "orange", "value": 5 }, { "color": "red", "value": 20 }] } } + ] + } + ] + }, + "options": {}, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 31 } + }, + + { + "id": 19, "type": "row", "title": "Resource Usage", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 39 } + }, + + { + "id": 20, + "type": "timeseries", + "title": "CPU Usage by Namespace", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "sum by(namespace)(rate(container_cpu_usage_seconds_total{namespace=~\"$namespace\",container!=\"\",container!=\"POD\"}[5m]))", + "refId": "A", "legendFormat": "{{namespace}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "cores", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 40 } + }, + + { + "id": 21, + "type": "timeseries", + "title": "Memory Usage by Namespace", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "sum 
by(namespace)(container_memory_working_set_bytes{namespace=~\"$namespace\",container!=\"\",container!=\"POD\"})", + "refId": "A", "legendFormat": "{{namespace}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "bytes", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 40 } + }, + + { + "id": 22, + "type": "bargauge", + "title": "CPU — Actual vs Requested (%)", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "sum by(namespace)(rate(container_cpu_usage_seconds_total{namespace=~\"$namespace\",container!=\"\",container!=\"POD\"}[5m]))\n/\nsum by(namespace)(kube_pod_container_resource_requests{resource=\"cpu\",namespace=~\"$namespace\",container!=\"\"})\n* 100", + "refId": "A", "legendFormat": "{{namespace}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", "min": 0, "max": 150, + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 80 }, { "color": "red", "value": 100 }] } + } + }, + "options": { + "orientation": "horizontal", "displayMode": "gradient", "showUnfilled": true, + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 48 } + }, + + { + "id": 23, + "type": "bargauge", + "title": "Memory — Actual vs Requested (%)", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "sum by(namespace)(container_memory_working_set_bytes{namespace=~\"$namespace\",container!=\"\",container!=\"POD\"})\n/\nsum 
by(namespace)(kube_pod_container_resource_requests{resource=\"memory\",namespace=~\"$namespace\",container!=\"\"})\n* 100", + "refId": "A", "legendFormat": "{{namespace}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", "min": 0, "max": 150, + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 80 }, { "color": "red", "value": 100 }] } + } + }, + "options": { + "orientation": "horizontal", "displayMode": "gradient", "showUnfilled": true, + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 48 } + } + + ] +} diff --git a/harmony/src/modules/monitoring/cluster_dashboards/dev_test_yamls/01-namespace.yaml b/harmony/src/modules/monitoring/cluster_dashboards/dev_test_yamls/01-namespace.yaml new file mode 100644 index 0000000..a52fe20 --- /dev/null +++ b/harmony/src/modules/monitoring/cluster_dashboards/dev_test_yamls/01-namespace.yaml @@ -0,0 +1,6 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: observability + labels: + openshift.io/cluster-monitoring: "true" diff --git a/harmony/src/modules/monitoring/cluster_dashboards/dev_test_yamls/02-serviceaccount-rbac.yaml b/harmony/src/modules/monitoring/cluster_dashboards/dev_test_yamls/02-serviceaccount-rbac.yaml new file mode 100644 index 0000000..cfaa8f0 --- /dev/null +++ b/harmony/src/modules/monitoring/cluster_dashboards/dev_test_yamls/02-serviceaccount-rbac.yaml @@ -0,0 +1,43 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: cluster-grafana-sa + namespace: observability +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: grafana-prometheus-api-access +rules: +- apiGroups: + - monitoring.coreos.com + resources: + - prometheuses/api + verbs: + - get +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: grafana-prometheus-api-access-binding +roleRef: 
+ apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: grafana-prometheus-api-access +subjects: +- kind: ServiceAccount + name: cluster-grafana-sa + namespace: observability +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: grafana-cluster-monitoring-view +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: cluster-monitoring-view +subjects: +- kind: ServiceAccount + name: cluster-grafana-sa + namespace: observability diff --git a/harmony/src/modules/monitoring/cluster_dashboards/dev_test_yamls/03-grafana.yaml b/harmony/src/modules/monitoring/cluster_dashboards/dev_test_yamls/03-grafana.yaml new file mode 100644 index 0000000..f98bef1 --- /dev/null +++ b/harmony/src/modules/monitoring/cluster_dashboards/dev_test_yamls/03-grafana.yaml @@ -0,0 +1,43 @@ +apiVersion: grafana.integreatly.org/v1beta1 +kind: Grafana +metadata: + name: cluster-grafana + namespace: observability + labels: + dashboards: "grafana" +spec: + serviceAccountName: cluster-grafana-sa + automountServiceAccountToken: true + + config: + log: + mode: console + + security: + admin_user: admin + admin_password: paul + + users: + viewers_can_edit: "false" + + auth: + disable_login_form: "false" + + auth.anonymous: + enabled: "true" + org_role: Viewer + + deployment: + spec: + replicas: 1 + template: + spec: + containers: + - name: grafana + resources: + requests: + cpu: 500m + memory: 1Gi + limits: + cpu: 1 + memory: 2Gi diff --git a/harmony/src/modules/monitoring/cluster_dashboards/dev_test_yamls/03a-secret-token.yaml b/harmony/src/modules/monitoring/cluster_dashboards/dev_test_yamls/03a-secret-token.yaml new file mode 100644 index 0000000..c57a142 --- /dev/null +++ b/harmony/src/modules/monitoring/cluster_dashboards/dev_test_yamls/03a-secret-token.yaml @@ -0,0 +1,8 @@ +apiVersion: v1 +kind: Secret +metadata: + name: grafana-prometheus-token + namespace: observability + annotations: + kubernetes.io/service-account.name: 
cluster-grafana-sa +type: kubernetes.io/service-account-token diff --git a/harmony/src/modules/monitoring/cluster_dashboards/dev_test_yamls/04-datasource.yaml b/harmony/src/modules/monitoring/cluster_dashboards/dev_test_yamls/04-datasource.yaml new file mode 100644 index 0000000..9a1ce74 --- /dev/null +++ b/harmony/src/modules/monitoring/cluster_dashboards/dev_test_yamls/04-datasource.yaml @@ -0,0 +1,27 @@ +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDatasource +metadata: + name: prometheus-cluster + namespace: observability +spec: + instanceSelector: + matchLabels: + dashboards: "grafana" + valuesFrom: + - targetPath: "secureJsonData.httpHeaderValue1" + valueFrom: + secretKeyRef: + name: grafana-prometheus-token + key: token + datasource: + name: Prometheus-Cluster + type: prometheus + access: proxy + url: https://prometheus-k8s.openshift-monitoring.svc:9091 + isDefault: true + jsonData: + httpHeaderName1: "Authorization" + tlsSkipVerify: true + timeInterval: "30s" + secureJsonData: + httpHeaderValue1: "Bearer ${token}" diff --git a/harmony/src/modules/monitoring/cluster_dashboards/dev_test_yamls/05-route.yaml b/harmony/src/modules/monitoring/cluster_dashboards/dev_test_yamls/05-route.yaml new file mode 100644 index 0000000..9b86b5e --- /dev/null +++ b/harmony/src/modules/monitoring/cluster_dashboards/dev_test_yamls/05-route.yaml @@ -0,0 +1,14 @@ +apiVersion: route.openshift.io/v1 +kind: Route +metadata: + name: grafana + namespace: observability +spec: + to: + kind: Service + name: cluster-grafana-service + port: + targetPort: 3000 + tls: + termination: edge + insecureEdgeTerminationPolicy: Redirect diff --git a/harmony/src/modules/monitoring/cluster_dashboards/dev_test_yamls/06-dashboard-cluster-overview.yaml b/harmony/src/modules/monitoring/cluster_dashboards/dev_test_yamls/06-dashboard-cluster-overview.yaml new file mode 100644 index 0000000..6b55825 --- /dev/null +++ 
b/harmony/src/modules/monitoring/cluster_dashboards/dev_test_yamls/06-dashboard-cluster-overview.yaml @@ -0,0 +1,97 @@ +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDashboard +metadata: + name: cluster-overview + namespace: observability +spec: + instanceSelector: + matchLabels: + dashboards: "grafana" + + json: | + { + "title": "Cluster Overview", + "schemaVersion": 36, + "version": 1, + "refresh": "30s", + "time": { + "from": "now-1h", + "to": "now" + }, + "panels": [ + { + "type": "stat", + "title": "Ready Nodes", + "datasource": { + "type": "prometheus", + "uid": "Prometheus-Cluster" + }, + "targets": [ + { + "expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"true\"})", + "refId": "A" + } + ], + "gridPos": { "h": 6, "w": 6, "x": 0, "y": 0 } + }, + { + "type": "stat", + "title": "Running Pods", + "datasource": { + "type": "prometheus", + "uid": "Prometheus-Cluster" + }, + "targets": [ + { + "expr": "count(kube_pod_status_phase{phase=\"Running\"})", + "refId": "A" + } + ], + "gridPos": { "h": 6, "w": 6, "x": 6, "y": 0 } + }, + { + "type": "timeseries", + "title": "Cluster CPU Usage (%)", + "datasource": { + "type": "prometheus", + "uid": "Prometheus-Cluster" + }, + "targets": [ + { + "expr": "100 * (1 - avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100 + } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 } + }, + { + "type": "timeseries", + "title": "Cluster Memory Usage (%)", + "datasource": { + "type": "prometheus", + "uid": "Prometheus-Cluster" + }, + "targets": [ + { + "expr": "100 * (1 - (sum(node_memory_MemAvailable_bytes) / sum(node_memory_MemTotal_bytes)))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100 + } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 } + } + ] + } diff --git 
a/harmony/src/modules/monitoring/cluster_dashboards/dev_test_yamls/07-dashboard-openshift-metrics.yaml b/harmony/src/modules/monitoring/cluster_dashboards/dev_test_yamls/07-dashboard-openshift-metrics.yaml new file mode 100644 index 0000000..5b31d2e --- /dev/null +++ b/harmony/src/modules/monitoring/cluster_dashboards/dev_test_yamls/07-dashboard-openshift-metrics.yaml @@ -0,0 +1,1015 @@ +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDashboard +metadata: + name: openshift-metrics + namespace: observability +spec: + instanceSelector: + matchLabels: + dashboards: "grafana" + json: | + { + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Metrics Dashboard for CRI-O OpenShift clusters", + "editable": true, + "gnetId": 5273, + "graphTooltip": 0, + "id": null, + "links": [], + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": { + "type": "prometheus", + "uid": "Prometheus-Cluster" + }, + "description": "The number of containers that start or restart over the last ten minutes.", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 2, + "w": 3, + "x": 0, + "y": 0 + }, + "id": 27, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": 
"rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(changes(container_last_seen{image!=\"\"}[10m]))", + "format": "time_series", + "intervalFactor": 1, + "refId": "A" + } + ], + "thresholds": "", + "title": "Container Restarts", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 2 + }, + "id": 19, + "panels": [], + "repeat": null, + "title": "File System Space", + "type": "row" + }, + { + "aliasColors": {}, + "bars": true, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "Prometheus-Cluster" + }, + "description": "Percentage of usage of the root filesystem on each host.", + "fill": 1, + "gridPos": { + "h": 10, + "w": 10, + "x": 0, + "y": 3 + }, + "id": 4, + "legend": { + "alignAsTable": true, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": false + }, + "lines": false, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_filesystem_avail_bytes{mountpoint=\"/\"} / node_filesystem_size_bytes{mountpoint=\"/\"} * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ instance }}", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ + { + "colorMode": "critical", + "fill": true, + "line": true, + "op": "gt", + "value": 80 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Root Filesystem % Used", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + 
"buckets": null, + "mode": "series", + "name": null, + "show": false, + "values": [ + "current" + ] + }, + "yaxes": [ + { + "format": "percent", + "label": "", + "logBase": 1, + "max": "100", + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": {}, + "bars": true, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "Prometheus-Cluster" + }, + "fill": 1, + "gridPos": { + "h": 10, + "w": 10, + "x": 10, + "y": 3 + }, + "id": 5, + "legend": { + "alignAsTable": true, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": false + }, + "lines": false, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_filesystem_avail_bytes{mountpoint=\"/\",device!=\"rootfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",device!=\"rootfs\"} * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ instance }}", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ + { + "colorMode": "critical", + "fill": true, + "line": true, + "op": "gt", + "value": 80 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Root Filesystem % Used (rootfs)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "series", + "name": null, + "show": false, + "values": [ + "current" + ] + }, + "yaxes": [ + { + "format": "percent", + "label": "", + "logBase": 1, + "max": "100", + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "cacheTimeout": 
null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": { + "type": "prometheus", + "uid": "Prometheus-Cluster" + }, + "format": "percentunit", + "gauge": { + "maxValue": 1, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 10, + "w": 4, + "x": 20, + "y": 3 + }, + "id": 18, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "(sum(node_filesystem_size_bytes{device!=\"rootfs\"}) - sum(node_filesystem_avail_bytes{device!=\"rootfs\"})) / sum(node_filesystem_size_bytes{device!=\"rootfs\"})", + "format": "time_series", + "intervalFactor": 2, + "refId": "A", + "step": 60 + } + ], + "thresholds": "0.8,0.9", + "title": "Cluster Disk Usage (non-rootfs)", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 13 + }, + "id": 20, + "panels": [], + "repeat": null, + "title": "Running Pods/Containers", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "Prometheus-Cluster" + }, + "fill": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 14 + }, + "id": 8, + "legend": { + 
"alignAsTable": true, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": true, + "targets": [ + { + "expr": "container_last_seen{namespace!=\"\",container!=\"\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ instance }}", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "CRI-O Containers Running", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "Prometheus-Cluster" + }, + "fill": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 14 + }, + "id": 10, + "legend": { + "alignAsTable": true, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": true, + "targets": [ + { + "expr": "kubelet_running_pod_count", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ instance }}", + "refId": 
"A", + "step": 10 + } + ], + "thresholds": [ + { + "colorMode": "critical", + "fill": true, + "line": true, + "op": "gt", + "value": 35 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Pods Running", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 24 + }, + "id": 22, + "panels": [], + "repeat": null, + "title": "Cluster CPU", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "Prometheus-Cluster" + }, + "fill": 1, + "gridPos": { + "h": 7, + "w": 20, + "x": 0, + "y": 25 + }, + "id": 14, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sort_desc(sum by (kubernetes_io_hostname,type) (rate(container_cpu_usage_seconds_total{id=\"/\"}[5m])))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Cluster CPU", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + 
"yaxes": [ + { + "format": "percent", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": { + "type": "prometheus", + "uid": "Prometheus-Cluster" + }, + "format": "percentunit", + "gauge": { + "maxValue": 1, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 20, + "y": 25 + }, + "id": 25, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(rate(container_cpu_usage_seconds_total{id=\"/\"}[3m])) / sum(machine_cpu_cores)", + "format": "time_series", + "intervalFactor": 1, + "refId": "A" + } + ], + "thresholds": "0.7,0.9", + "title": "Cluster CPU Percentage", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 32 + }, + "id": 23, + "panels": [], + "repeat": null, + "title": "Cluster Memory", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": 
"Prometheus-Cluster" + }, + "fill": 1, + "gridPos": { + "h": 7, + "w": 20, + "x": 0, + "y": 33 + }, + "id": 15, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": true, + "targets": [ + { + "expr": "((sum(node_memory_MemTotal_bytes) - sum(node_memory_MemFree_bytes) - sum(node_memory_Buffers_bytes) - sum(node_memory_Cached_bytes)) / sum(node_memory_MemTotal_bytes)) * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 4 + } + ], + "thresholds": [ + { + "colorMode": "critical", + "fill": true, + "line": true, + "op": "gt", + "value": 90 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Cluster Memory", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": { + "type": "prometheus", + "uid": "Prometheus-Cluster" + }, + "decimals": null, + "format": "percentunit", + "gauge": { + "maxValue": 1, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 20, + "y": 33 + }, + "id": 16, + "interval": null, + "links": [], + "mappingType": 
1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "Value", + "targets": [ + { + "expr": "sum(container_memory_rss) / sum(machine_memory_bytes)", + "format": "time_series", + "hide": false, + "instant": false, + "intervalFactor": 1, + "legendFormat": "", + "refId": "B" + } + ], + "thresholds": "0.75, 0.9", + "title": "Cluster Memory Use Percentage", + "transparent": false, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [], + "valueName": "current" + } + ], + "refresh": "5s", + "schemaVersion": 16, + "style": "dark", + "tags": [ + "kubernetes", + "cri-o", + "openshift" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "OpenShift Metrics (CRI-O)", + "uid": "jmfLePkmz", + "version": 5 + } diff --git a/harmony/src/modules/monitoring/cluster_dashboards/dev_test_yamls/08a-dashboard-cluster-overview.yaml b/harmony/src/modules/monitoring/cluster_dashboards/dev_test_yamls/08a-dashboard-cluster-overview.yaml new file mode 100644 index 0000000..24e5ef7 --- /dev/null +++ b/harmony/src/modules/monitoring/cluster_dashboards/dev_test_yamls/08a-dashboard-cluster-overview.yaml @@ -0,0 +1,769 @@ +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDashboard +metadata: + name: 
okd-cluster-overview + namespace: observability +spec: + instanceSelector: + matchLabels: + dashboards: "grafana" + json: | + { + "title": "Cluster Overview", + "uid": "okd-cluster-overview", + "schemaVersion": 36, + "version": 2, + "refresh": "30s", + "time": { "from": "now-1h", "to": "now" }, + "tags": ["okd", "cluster", "overview"], + "panels": [ + + { + "id": 1, + "type": "stat", + "title": "Ready Nodes", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"true\"} == 1)", + "refId": "A", + "legendFormat": "" + } + ], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ] + }, + "unit": "short", + "noValue": "0" + } + }, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "center", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 } + }, + + { + "id": 2, + "type": "stat", + "title": "Not Ready Nodes", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"false\"} == 1) or vector(0)", + "refId": "A", + "legendFormat": "" + } + ], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 1 } + ] + }, + "unit": "short", + "noValue": "0" + } + }, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "center", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 } + }, + + { + "id": 3, + "type": "stat", + "title": 
"Running Pods", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "count(kube_pod_status_phase{phase=\"Running\"} == 1)", + "refId": "A", + "legendFormat": "" + } + ], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ] + }, + "unit": "short", + "noValue": "0" + } + }, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "center", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 } + }, + + { + "id": 4, + "type": "stat", + "title": "Pending Pods", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "count(kube_pod_status_phase{phase=\"Pending\"} == 1) or vector(0)", + "refId": "A", + "legendFormat": "" + } + ], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 1 }, + { "color": "red", "value": 5 } + ] + }, + "unit": "short", + "noValue": "0" + } + }, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "center", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 } + }, + + { + "id": 5, + "type": "stat", + "title": "Failed Pods", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "count(kube_pod_status_phase{phase=\"Failed\"} == 1) or vector(0)", + "refId": "A", + "legendFormat": "" + } + ], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", 
"value": 1 } + ] + }, + "unit": "short", + "noValue": "0" + } + }, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "center", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 } + }, + + { + "id": 6, + "type": "stat", + "title": "CrashLoopBackOff", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "count(kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\"} == 1) or vector(0)", + "refId": "A", + "legendFormat": "" + } + ], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 1 } + ] + }, + "unit": "short", + "noValue": "0" + } + }, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "center", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 } + }, + + { + "id": 7, + "type": "stat", + "title": "Critical Alerts", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "count(ALERTS{alertstate=\"firing\",severity=\"critical\"}) or vector(0)", + "refId": "A", + "legendFormat": "" + } + ], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 1 } + ] + }, + "unit": "short", + "noValue": "0" + } + }, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "center", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 } + }, + + { + "id": 8, + "type": "stat", + "title": "Warning Alerts", + 
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "count(ALERTS{alertstate=\"firing\",severity=\"warning\"}) or vector(0)", + "refId": "A", + "legendFormat": "" + } + ], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 1 }, + { "color": "red", "value": 10 } + ] + }, + "unit": "short", + "noValue": "0" + } + }, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "center", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 } + }, + + { + "id": 9, + "type": "gauge", + "title": "CPU Usage", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "100 * (1 - avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))", + "refId": "A", + "legendFormat": "CPU" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 70 }, + { "color": "red", "value": 85 } + ] + } + } + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "orientation": "auto" + }, + "gridPos": { "h": 6, "w": 5, "x": 0, "y": 4 } + }, + + { + "id": 10, + "type": "gauge", + "title": "Memory Usage", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "100 * (1 - (sum(node_memory_MemAvailable_bytes) / sum(node_memory_MemTotal_bytes)))", + "refId": "A", + "legendFormat": "Memory" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "color": { "mode": "thresholds" 
}, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 75 }, + { "color": "red", "value": 90 } + ] + } + } + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "orientation": "auto" + }, + "gridPos": { "h": 6, "w": 5, "x": 5, "y": 4 } + }, + + { + "id": 11, + "type": "gauge", + "title": "Root Disk Usage", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "100 * (1 - (sum(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"})))", + "refId": "A", + "legendFormat": "Disk" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 70 }, + { "color": "red", "value": 85 } + ] + } + } + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "orientation": "auto" + }, + "gridPos": { "h": 6, "w": 4, "x": 10, "y": 4 } + }, + + { + "id": 12, + "type": "stat", + "title": "etcd Has Leader", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "min(etcd_server_has_leader)", + "refId": "A", + "legendFormat": "" + } + ], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ] + }, + "mappings": [ + { + "type": "value", + "options": { + "0": { "text": "NO LEADER", "color": "red" }, + "1": { "text": "LEADER OK", "color": "green" } + } + } + ], + "unit": "short", + "noValue": "?" 
+ } + }, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "center", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "gridPos": { "h": 3, "w": 5, "x": 14, "y": 4 } + }, + + { + "id": 13, + "type": "stat", + "title": "API Servers Up", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "sum(up{job=\"apiserver\"})", + "refId": "A", + "legendFormat": "" + } + ], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "yellow", "value": 1 }, + { "color": "green", "value": 2 } + ] + }, + "unit": "short", + "noValue": "0" + } + }, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "center", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "gridPos": { "h": 3, "w": 5, "x": 19, "y": 4 } + }, + + { + "id": 14, + "type": "stat", + "title": "etcd Members Up", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "sum(up{job=\"etcd\"})", + "refId": "A", + "legendFormat": "" + } + ], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "yellow", "value": 2 }, + { "color": "green", "value": 3 } + ] + }, + "unit": "short", + "noValue": "0" + } + }, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "center", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "gridPos": { "h": 3, "w": 5, "x": 14, "y": 7 } + }, + + { + "id": 15, + "type": "stat", + "title": "Operators Degraded", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": 
"count(cluster_operator_conditions{condition=\"Degraded\",status=\"True\"} == 1) or vector(0)", + "refId": "A", + "legendFormat": "" + } + ], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 1 } + ] + }, + "unit": "short", + "noValue": "0" + } + }, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "center", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "gridPos": { "h": 3, "w": 5, "x": 19, "y": 7 } + }, + + { + "id": 16, + "type": "timeseries", + "title": "CPU Usage per Node (%)", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "100 * (1 - avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))", + "refId": "A", + "legendFormat": "{{instance}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "color": { "mode": "palette-classic" }, + "custom": { + "lineWidth": 2, + "fillOpacity": 10, + "spanNulls": false, + "showPoints": "never" + } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { + "displayMode": "list", + "placement": "bottom", + "calcs": ["mean", "max"] + } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 10 } + }, + + { + "id": 17, + "type": "timeseries", + "title": "Memory Usage per Node (%)", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))", + "refId": "A", + "legendFormat": "{{instance}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "color": { "mode": "palette-classic" }, + "custom": { + "lineWidth": 2, + "fillOpacity": 10, + "spanNulls": false, + "showPoints": "never" + } + } + }, + "options": { + 
"tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { + "displayMode": "list", + "placement": "bottom", + "calcs": ["mean", "max"] + } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 10 } + }, + + { + "id": 18, + "type": "timeseries", + "title": "Network Traffic — Cluster Total", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo|veth.*|tun.*|ovn.*|br-int|br-ex\"}[5m]))", + "refId": "A", + "legendFormat": "Receive" + }, + { + "expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo|veth.*|tun.*|ovn.*|br-int|br-ex\"}[5m]))", + "refId": "B", + "legendFormat": "Transmit" + } + ], + "fieldConfig": { + "defaults": { + "unit": "Bps", + "color": { "mode": "palette-classic" }, + "custom": { + "lineWidth": 2, + "fillOpacity": 10, + "spanNulls": false, + "showPoints": "never" + } + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Receive" }, + "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] + }, + { + "matcher": { "id": "byName", "options": "Transmit" }, + "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] + } + ] + }, + "options": { + "tooltip": { "mode": "multi", "sort": "none" }, + "legend": { + "displayMode": "list", + "placement": "bottom", + "calcs": ["mean", "max"] + } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 18 } + }, + + { + "id": 19, + "type": "timeseries", + "title": "Pod Phases Over Time", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "count(kube_pod_status_phase{phase=\"Running\"} == 1)", + "refId": "A", + "legendFormat": "Running" + }, + { + "expr": "count(kube_pod_status_phase{phase=\"Pending\"} == 1) or vector(0)", + "refId": "B", + "legendFormat": "Pending" + }, + { + "expr": "count(kube_pod_status_phase{phase=\"Failed\"} == 1) or vector(0)", + "refId": "C", + "legendFormat": 
"Failed" + }, + { + "expr": "count(kube_pod_status_phase{phase=\"Unknown\"} == 1) or vector(0)", + "refId": "D", + "legendFormat": "Unknown" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short", + "custom": { + "lineWidth": 2, + "fillOpacity": 15, + "spanNulls": false, + "showPoints": "never" + } + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Running" }, + "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] + }, + { + "matcher": { "id": "byName", "options": "Pending" }, + "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] + }, + { + "matcher": { "id": "byName", "options": "Failed" }, + "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] + }, + { + "matcher": { "id": "byName", "options": "Unknown" }, + "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] + } + ] + }, + "options": { + "tooltip": { "mode": "multi", "sort": "none" }, + "legend": { + "displayMode": "list", + "placement": "bottom", + "calcs": ["lastNotNull"] + } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 18 } + } + + ] + } diff --git a/harmony/src/modules/monitoring/cluster_dashboards/dev_test_yamls/08b-dashboard-nodes-health.yaml b/harmony/src/modules/monitoring/cluster_dashboards/dev_test_yamls/08b-dashboard-nodes-health.yaml new file mode 100644 index 0000000..a8cc179 --- /dev/null +++ b/harmony/src/modules/monitoring/cluster_dashboards/dev_test_yamls/08b-dashboard-nodes-health.yaml @@ -0,0 +1,637 @@ +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDashboard +metadata: + name: okd-node-health + namespace: observability +spec: + instanceSelector: + matchLabels: + dashboards: "grafana" + json: | + { + "title": "Node Health", + "uid": "okd-node-health", + "schemaVersion": 36, + "version": 2, + "refresh": "30s", + "time": { "from": "now-1h", "to": "now" }, + "tags": ["okd", "node", "health"], + "templating": { + "list": 
[ + { + "name": "node", + "type": "query", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "query": { "query": "label_values(kube_node_info, node)", "refId": "A" }, + "refresh": 2, + "includeAll": true, + "multi": true, + "allValue": ".*", + "label": "Node", + "sort": 1, + "current": {}, + "options": [] + } + ] + }, + "panels": [ + + { + "id": 1, + "type": "stat", + "title": "Total Nodes", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(kube_node_info{node=~\"$node\"})", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] }, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 } + }, + + { + "id": 2, + "type": "stat", + "title": "Ready Nodes", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"$node\"} == 1)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] }, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 } + }, + + { + "id": 3, + "type": "stat", + "title": "Not Ready Nodes", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"false\",node=~\"$node\"} == 1) or 
vector(0)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] }, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 } + }, + + { + "id": 4, + "type": "stat", + "title": "Memory Pressure", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(kube_node_status_condition{condition=\"MemoryPressure\",status=\"true\",node=~\"$node\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] }, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 } + }, + + { + "id": 5, + "type": "stat", + "title": "Disk Pressure", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(kube_node_status_condition{condition=\"DiskPressure\",status=\"true\",node=~\"$node\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] }, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 12, "y": 
0 } + }, + + { + "id": 6, + "type": "stat", + "title": "PID Pressure", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(kube_node_status_condition{condition=\"PIDPressure\",status=\"true\",node=~\"$node\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "orange", "value": 1 }] }, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 } + }, + + { + "id": 7, + "type": "stat", + "title": "Unschedulable", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(kube_node_spec_unschedulable{node=~\"$node\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 1 }] }, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 } + }, + + { + "id": 8, + "type": "stat", + "title": "Kubelet Up", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(up{job=\"kubelet\",metrics_path=\"/metrics\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] }, + "unit": "short", "noValue": "0" + } + }, + "options": { 
"colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 } + }, + + { + "id": 9, + "type": "table", + "title": "Node Conditions", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "sum by(node) (kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"$node\"})", + "refId": "A", + "legendFormat": "{{node}}", + "instant": true + }, + { + "expr": "sum by(node) (kube_node_status_condition{condition=\"MemoryPressure\",status=\"true\",node=~\"$node\"})", + "refId": "B", + "legendFormat": "{{node}}", + "instant": true + }, + { + "expr": "sum by(node) (kube_node_status_condition{condition=\"DiskPressure\",status=\"true\",node=~\"$node\"})", + "refId": "C", + "legendFormat": "{{node}}", + "instant": true + }, + { + "expr": "sum by(node) (kube_node_status_condition{condition=\"PIDPressure\",status=\"true\",node=~\"$node\"})", + "refId": "D", + "legendFormat": "{{node}}", + "instant": true + }, + { + "expr": "sum by(node) (kube_node_spec_unschedulable{node=~\"$node\"})", + "refId": "E", + "legendFormat": "{{node}}", + "instant": true + } + ], + "transformations": [ + { + "id": "labelsToFields", + "options": { "mode": "columns" } + }, + { + "id": "joinByField", + "options": { "byField": "node", "mode": "outer" } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "Time 1": true, + "Time 2": true, + "Time 3": true, + "Time 4": true, + "Time 5": true + }, + "renameByName": { + "node": "Node", + "Value #A": "Ready", + "Value #B": "Mem Pressure", + "Value #C": "Disk Pressure", + "Value #D": "PID Pressure", + "Value #E": "Unschedulable" + }, + "indexByName": { + "node": 0, + "Value #A": 1, + "Value #B": 2, + "Value #C": 3, + "Value #D": 4, + "Value #E": 5 + } + } + } + ], + "fieldConfig": { + "defaults": { + "custom": { "displayMode": "color-background", 
"align": "center" } + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Node" }, + "properties": [ + { "id": "custom.displayMode", "value": "auto" }, + { "id": "custom.align", "value": "left" }, + { "id": "custom.width", "value": 200 } + ] + }, + { + "matcher": { "id": "byName", "options": "Ready" }, + "properties": [ + { + "id": "thresholds", + "value": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] } + }, + { "id": "custom.displayMode", "value": "color-background" }, + { + "id": "mappings", + "value": [ + { + "type": "value", + "options": { + "0": { "text": "✗ Not Ready", "color": "red", "index": 0 }, + "1": { "text": "✓ Ready", "color": "green", "index": 1 } + } + } + ] + } + ] + }, + { + "matcher": { "id": "byRegexp", "options": ".*Pressure" }, + "properties": [ + { + "id": "thresholds", + "value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] } + }, + { "id": "custom.displayMode", "value": "color-background" }, + { + "id": "mappings", + "value": [ + { + "type": "value", + "options": { + "0": { "text": "✓ OK", "color": "green", "index": 0 }, + "1": { "text": "⚠ Active", "color": "red", "index": 1 } + } + } + ] + } + ] + }, + { + "matcher": { "id": "byName", "options": "Unschedulable" }, + "properties": [ + { + "id": "thresholds", + "value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 1 }] } + }, + { "id": "custom.displayMode", "value": "color-background" }, + { + "id": "mappings", + "value": [ + { + "type": "value", + "options": { + "0": { "text": "✓ Schedulable", "color": "green", "index": 0 }, + "1": { "text": "⚠ Cordoned", "color": "yellow", "index": 1 } + } + } + ] + } + ] + } + ] + }, + "options": { "sortBy": [{ "displayName": "Node", "desc": false }] }, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 4 } + }, + + { + "id": 10, + "type": "timeseries", + "title": "CPU Usage per Node 
(%)", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "100 * (1 - avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))", + "refId": "A", + "legendFormat": "{{instance}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", "min": 0, "max": 100, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 85 }] } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 16, "x": 0, "y": 12 } + }, + + { + "id": 11, + "type": "bargauge", + "title": "CPU Usage \u2014 Current", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "100 * (1 - avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))", + "refId": "A", + "legendFormat": "{{instance}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", "min": 0, "max": 100, + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 85 }] } + } + }, + "options": { + "orientation": "horizontal", + "displayMode": "gradient", + "showUnfilled": true, + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false } + }, + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 12 } + }, + + { + "id": 12, + "type": "timeseries", + "title": "Memory Usage per Node (%)", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))", + "refId": "A", + "legendFormat": "{{instance}}" + } + ], + 
"fieldConfig": { + "defaults": { + "unit": "percent", "min": 0, "max": 100, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 75 }, { "color": "red", "value": 90 }] } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 16, "x": 0, "y": 20 } + }, + + { + "id": 13, + "type": "bargauge", + "title": "Memory Usage \u2014 Current", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))", + "refId": "A", + "legendFormat": "{{instance}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", "min": 0, "max": 100, + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 75 }, { "color": "red", "value": 90 }] } + } + }, + "options": { + "orientation": "horizontal", + "displayMode": "gradient", + "showUnfilled": true, + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false } + }, + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 20 } + }, + + { + "id": 14, + "type": "timeseries", + "title": "Root Disk Usage per Node (%)", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "100 * (1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"}))", + "refId": "A", + "legendFormat": "{{instance}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", "min": 0, "max": 100, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": 
"never", "spanNulls": false }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 85 }] } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 16, "x": 0, "y": 28 } + }, + + { + "id": 15, + "type": "bargauge", + "title": "Root Disk Usage \u2014 Current", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "100 * (1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"}))", + "refId": "A", + "legendFormat": "{{instance}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", "min": 0, "max": 100, + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 85 }] } + } + }, + "options": { + "orientation": "horizontal", + "displayMode": "gradient", + "showUnfilled": true, + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false } + }, + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 28 } + }, + + { + "id": 16, + "type": "timeseries", + "title": "Network Traffic per Node", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "sum by(instance) (rate(node_network_receive_bytes_total{device!~\"lo|veth.*|tun.*|ovn.*|br.*\"}[5m]))", + "refId": "A", + "legendFormat": "rx {{instance}}" + }, + { + "expr": "sum by(instance) (rate(node_network_transmit_bytes_total{device!~\"lo|veth.*|tun.*|ovn.*|br.*\"}[5m]))", + "refId": "B", + "legendFormat": "tx {{instance}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "Bps", + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": 
"never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 36 } + }, + + { + "id": 17, + "type": "bargauge", + "title": "Pods per Node", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "count by(node) (kube_pod_info{node=~\"$node\"})", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short", + "min": 0, + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 100 }, + { "color": "red", "value": 200 } + ] + } + } + }, + "options": { + "orientation": "horizontal", + "displayMode": "gradient", + "showUnfilled": true, + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 36 } + }, + + { + "id": 18, + "type": "timeseries", + "title": "System Load Average (1m) per Node", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "node_load1", + "refId": "A", + "legendFormat": "1m \u2014 {{instance}}" + }, + { + "expr": "node_load5", + "refId": "B", + "legendFormat": "5m \u2014 {{instance}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short", + "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 44 } + }, + + { + "id": 19, + "type": "bargauge", + "title": "Node Uptime", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": 
"time() - node_boot_time_seconds", + "refId": "A", + "legendFormat": "{{instance}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s", + "min": 0, + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "yellow", "value": 300 }, + { "color": "green", "value": 3600 } + ] + } + } + }, + "options": { + "orientation": "horizontal", + "displayMode": "gradient", + "showUnfilled": false, + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 44 } + } + + ] + } diff --git a/harmony/src/modules/monitoring/cluster_dashboards/dev_test_yamls/08c-dashboard-workloads-health.yaml b/harmony/src/modules/monitoring/cluster_dashboards/dev_test_yamls/08c-dashboard-workloads-health.yaml new file mode 100644 index 0000000..871a292 --- /dev/null +++ b/harmony/src/modules/monitoring/cluster_dashboards/dev_test_yamls/08c-dashboard-workloads-health.yaml @@ -0,0 +1,783 @@ +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDashboard +metadata: + name: okd-workload-health + namespace: observability +spec: + instanceSelector: + matchLabels: + dashboards: "grafana" + json: | + { + "title": "Workload Health", + "uid": "okd-workload-health", + "schemaVersion": 36, + "version": 3, + "refresh": "30s", + "time": { "from": "now-1h", "to": "now" }, + "tags": ["okd", "workload", "health"], + "templating": { + "list": [ + { + "name": "namespace", + "type": "query", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "query": { "query": "label_values(kube_pod_info, namespace)", "refId": "A" }, + "refresh": 2, + "includeAll": true, + "multi": true, + "allValue": ".*", + "label": "Namespace", + "sort": 1, + "current": {}, + "options": [] + } + ] + }, + "panels": [ + + { + "id": 1, "type": "stat", "title": "Total Pods", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": 
"count(kube_pod_info{namespace=~\"$namespace\"})", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] }, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 } + }, + + { + "id": 2, "type": "stat", "title": "Running Pods", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(kube_pod_status_phase{phase=\"Running\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] }, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 } + }, + + { + "id": 3, "type": "stat", "title": "Pending Pods", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(kube_pod_status_phase{phase=\"Pending\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 1 }] }, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 } + }, + + { + "id": 4, "type": "stat", 
"title": "Failed Pods", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(kube_pod_status_phase{phase=\"Failed\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] }, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 } + }, + + { + "id": 5, "type": "stat", "title": "CrashLoopBackOff", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] }, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 } + }, + + { + "id": 6, "type": "stat", "title": "OOMKilled", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(kube_pod_container_status_last_terminated_reason{reason=\"OOMKilled\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "orange", "value": 1 }] }, + "unit": "short", "noValue": "0" 
+ } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 } + }, + + { + "id": 7, "type": "stat", "title": "Deployments Available", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(kube_deployment_status_condition{condition=\"Available\",status=\"true\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] }, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 } + }, + + { + "id": 8, "type": "stat", "title": "Deployments Degraded", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(kube_deployment_status_replicas_unavailable{namespace=~\"$namespace\"} > 0) or vector(0)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] }, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 } + }, + + { + "id": 9, "type": "row", "title": "Deployments", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 } + }, + + { + "id": 10, + "type": "table", + "title": "Deployment Status", + "datasource": { "type": "prometheus", "uid": 
"Prometheus-Cluster" }, + "targets": [ + { + "expr": "sum by(namespace,deployment)(kube_deployment_spec_replicas{namespace=~\"$namespace\"})", + "refId": "A", + "instant": true, + "format": "table", + "legendFormat": "" + }, + { + "expr": "sum by(namespace,deployment)(kube_deployment_status_replicas_ready{namespace=~\"$namespace\"})", + "refId": "B", + "instant": true, + "format": "table", + "legendFormat": "" + }, + { + "expr": "sum by(namespace,deployment)(kube_deployment_status_replicas_available{namespace=~\"$namespace\"})", + "refId": "C", + "instant": true, + "format": "table", + "legendFormat": "" + }, + { + "expr": "sum by(namespace,deployment)(kube_deployment_status_replicas_unavailable{namespace=~\"$namespace\"})", + "refId": "D", + "instant": true, + "format": "table", + "legendFormat": "" + }, + { + "expr": "sum by(namespace,deployment)(kube_deployment_status_replicas_updated{namespace=~\"$namespace\"})", + "refId": "E", + "instant": true, + "format": "table", + "legendFormat": "" + } + ], + "transformations": [ + { + "id": "filterFieldsByName", + "options": { + "include": { + "names": ["namespace", "deployment", "Value"] + } + } + }, + { + "id": "joinByField", + "options": { + "byField": "deployment", + "mode": "outer" + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "namespace 1": true, + "namespace 2": true, + "namespace 3": true, + "namespace 4": true + }, + "renameByName": { + "namespace": "Namespace", + "deployment": "Deployment", + "Value": "Desired", + "Value 1": "Ready", + "Value 2": "Available", + "Value 3": "Unavailable", + "Value 4": "Up-to-date" + }, + "indexByName": { + "namespace": 0, + "deployment": 1, + "Value": 2, + "Value 1": 3, + "Value 2": 4, + "Value 3": 5, + "Value 4": 6 + } + } + }, + { + "id": "sortBy", + "options": { + "fields": [{ "displayName": "Namespace", "desc": false }] + } + } + ], + "fieldConfig": { + "defaults": { "custom": { "align": "center", "displayMode": "auto" } }, + "overrides": [ + { + 
"matcher": { "id": "byName", "options": "Namespace" }, + "properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 200 }] + }, + { + "matcher": { "id": "byName", "options": "Deployment" }, + "properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 220 }] + }, + { + "matcher": { "id": "byName", "options": "Unavailable" }, + "properties": [ + { "id": "custom.displayMode", "value": "color-background" }, + { + "id": "thresholds", + "value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] } + } + ] + }, + { + "matcher": { "id": "byName", "options": "Ready" }, + "properties": [ + { "id": "custom.displayMode", "value": "color-background" }, + { + "id": "thresholds", + "value": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] } + } + ] + } + ] + }, + "options": { "sortBy": [{ "displayName": "Namespace", "desc": false }] }, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 5 } + }, + + { + "id": 11, "type": "row", "title": "StatefulSets & DaemonSets", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 } + }, + + { + "id": 12, + "type": "table", + "title": "StatefulSet Status", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "sum by(namespace,statefulset)(kube_statefulset_replicas{namespace=~\"$namespace\"})", + "refId": "A", + "instant": true, + "format": "table", + "legendFormat": "" + }, + { + "expr": "sum by(namespace,statefulset)(kube_statefulset_status_replicas_ready{namespace=~\"$namespace\"})", + "refId": "B", + "instant": true, + "format": "table", + "legendFormat": "" + }, + { + "expr": "sum by(namespace,statefulset)(kube_statefulset_status_replicas_current{namespace=~\"$namespace\"})", + "refId": "C", + "instant": true, + "format": "table", + "legendFormat": "" + }, + { + "expr": "sum 
by(namespace,statefulset)(kube_statefulset_status_replicas_updated{namespace=~\"$namespace\"})", + "refId": "D", + "instant": true, + "format": "table", + "legendFormat": "" + } + ], + "transformations": [ + { + "id": "filterFieldsByName", + "options": { + "include": { + "names": ["namespace", "statefulset", "Value"] + } + } + }, + { + "id": "joinByField", + "options": { + "byField": "statefulset", + "mode": "outer" + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "namespace 1": true, + "namespace 2": true, + "namespace 3": true + }, + "renameByName": { + "namespace": "Namespace", + "statefulset": "StatefulSet", + "Value": "Desired", + "Value 1": "Ready", + "Value 2": "Current", + "Value 3": "Up-to-date" + }, + "indexByName": { + "namespace": 0, + "statefulset": 1, + "Value": 2, + "Value 1": 3, + "Value 2": 4, + "Value 3": 5 + } + } + }, + { + "id": "sortBy", + "options": { "fields": [{ "displayName": "Namespace", "desc": false }] } + } + ], + "fieldConfig": { + "defaults": { "custom": { "align": "center", "displayMode": "auto" } }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Namespace" }, + "properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 180 }] + }, + { + "matcher": { "id": "byName", "options": "StatefulSet" }, + "properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 200 }] + }, + { + "matcher": { "id": "byName", "options": "Ready" }, + "properties": [ + { "id": "custom.displayMode", "value": "color-background" }, + { "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] } } + ] + } + ] + }, + "options": {}, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 14 } + }, + + { + "id": 13, + "type": "table", + "title": "DaemonSet Status", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "sum 
by(namespace,daemonset)(kube_daemonset_status_desired_number_scheduled{namespace=~\"$namespace\"})", + "refId": "A", + "instant": true, + "format": "table", + "legendFormat": "" + }, + { + "expr": "sum by(namespace,daemonset)(kube_daemonset_status_number_ready{namespace=~\"$namespace\"})", + "refId": "B", + "instant": true, + "format": "table", + "legendFormat": "" + }, + { + "expr": "sum by(namespace,daemonset)(kube_daemonset_status_number_unavailable{namespace=~\"$namespace\"})", + "refId": "C", + "instant": true, + "format": "table", + "legendFormat": "" + }, + { + "expr": "sum by(namespace,daemonset)(kube_daemonset_status_number_misscheduled{namespace=~\"$namespace\"})", + "refId": "D", + "instant": true, + "format": "table", + "legendFormat": "" + } + ], + "transformations": [ + { + "id": "filterFieldsByName", + "options": { + "include": { + "names": ["namespace", "daemonset", "Value"] + } + } + }, + { + "id": "joinByField", + "options": { + "byField": "daemonset", + "mode": "outer" + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "namespace 1": true, + "namespace 2": true, + "namespace 3": true + }, + "renameByName": { + "namespace": "Namespace", + "daemonset": "DaemonSet", + "Value": "Desired", + "Value 1": "Ready", + "Value 2": "Unavailable", + "Value 3": "Misscheduled" + }, + "indexByName": { + "namespace": 0, + "daemonset": 1, + "Value": 2, + "Value 1": 3, + "Value 2": 4, + "Value 3": 5 + } + } + }, + { + "id": "sortBy", + "options": { "fields": [{ "displayName": "Namespace", "desc": false }] } + } + ], + "fieldConfig": { + "defaults": { "custom": { "align": "center", "displayMode": "auto" } }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Namespace" }, + "properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 180 }] + }, + { + "matcher": { "id": "byName", "options": "DaemonSet" }, + "properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 200 }] + 
}, + { + "matcher": { "id": "byName", "options": "Ready" }, + "properties": [ + { "id": "custom.displayMode", "value": "color-background" }, + { "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] } } + ] + }, + { + "matcher": { "id": "byName", "options": "Unavailable" }, + "properties": [ + { "id": "custom.displayMode", "value": "color-background" }, + { "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] } } + ] + }, + { + "matcher": { "id": "byName", "options": "Misscheduled" }, + "properties": [ + { "id": "custom.displayMode", "value": "color-background" }, + { "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "orange", "value": 1 }] } } + ] + } + ] + }, + "options": {}, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 14 } + }, + + { + "id": 14, "type": "row", "title": "Pods", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 22 } + }, + + { + "id": 15, + "type": "timeseries", + "title": "Pod Phase over Time", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "sum by(phase)(kube_pod_status_phase{namespace=~\"$namespace\"})", + "refId": "A", "legendFormat": "{{phase}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "Running" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "Pending" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "Failed" }, "properties": [{ "id": "color", "value": { 
"fixedColor": "red", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "Succeeded" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "Unknown" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] } + ] + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull"] } + }, + "gridPos": { "h": 8, "w": 16, "x": 0, "y": 23 } + }, + + { + "id": 16, + "type": "piechart", + "title": "Pod Phase — Now", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "sum by(phase)(kube_pod_status_phase{namespace=~\"$namespace\"})", + "refId": "A", "instant": true, "legendFormat": "{{phase}}" + } + ], + "fieldConfig": { + "defaults": { "unit": "short", "color": { "mode": "palette-classic" } }, + "overrides": [ + { "matcher": { "id": "byName", "options": "Running" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "Pending" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "Failed" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "Succeeded" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "Unknown" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] } + ] + }, + "options": { + "pieType": "donut", + "tooltip": { "mode": "single" }, + "legend": { "displayMode": "table", "placement": "right", "values": ["value", "percent"] } + }, + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 23 } + }, + + { + "id": 17, + "type": "timeseries", + "title": 
"Container Restarts over Time (total counter, top 10)", + "description": "Absolute restart counter — each vertical step = a restart event. Flat line = healthy.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "topk(10,\n sum by(namespace, pod) (\n kube_pod_container_status_restarts_total{namespace=~\"$namespace\"}\n ) > 0\n)", + "refId": "A", + "legendFormat": "{{namespace}} / {{pod}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "auto", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 31 } + }, + + { + "id": 18, + "type": "table", + "title": "Container Total Restarts (non-zero)", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "sum by(namespace, pod, container) (kube_pod_container_status_restarts_total{namespace=~\"$namespace\"}) > 0", + "refId": "A", + "instant": true, + "format": "table", + "legendFormat": "" + } + ], + "transformations": [ + { + "id": "filterFieldsByName", + "options": { + "include": { "names": ["namespace", "pod", "container", "Value"] } + } + }, + { + "id": "organize", + "options": { + "excludeByName": {}, + "renameByName": { + "namespace": "Namespace", + "pod": "Pod", + "container": "Container", + "Value": "Total Restarts" + }, + "indexByName": { "namespace": 0, "pod": 1, "container": 2, "Value": 3 } + } + }, + { + "id": "sortBy", + "options": { "fields": [{ "displayName": "Total Restarts", "desc": true }] } + } + ], + "fieldConfig": { + "defaults": { "custom": { "align": "center", "displayMode": "auto" } }, + "overrides": [ + { "matcher": { "id": "byName", "options": "Namespace" }, "properties": [{ "id": "custom.align", "value": "left" 
}, { "id": "custom.width", "value": 160 }] }, + { "matcher": { "id": "byName", "options": "Pod" }, "properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 260 }] }, + { "matcher": { "id": "byName", "options": "Container" }, "properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 160 }] }, + { + "matcher": { "id": "byName", "options": "Total Restarts" }, + "properties": [ + { "id": "custom.displayMode", "value": "color-background" }, + { "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "yellow", "value": null }, { "color": "orange", "value": 5 }, { "color": "red", "value": 20 }] } } + ] + } + ] + }, + "options": {}, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 31 } + }, + + { + "id": 19, "type": "row", "title": "Resource Usage", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 39 } + }, + + { + "id": 20, + "type": "timeseries", + "title": "CPU Usage by Namespace", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "sum by(namespace)(rate(container_cpu_usage_seconds_total{namespace=~\"$namespace\",container!=\"\",container!=\"POD\"}[5m]))", + "refId": "A", "legendFormat": "{{namespace}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "cores", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 40 } + }, + + { + "id": 21, + "type": "timeseries", + "title": "Memory Usage by Namespace", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "sum by(namespace)(container_memory_working_set_bytes{namespace=~\"$namespace\",container!=\"\",container!=\"POD\"})", + 
"refId": "A", "legendFormat": "{{namespace}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "bytes", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 40 } + }, + + { + "id": 22, + "type": "bargauge", + "title": "CPU — Actual vs Requested (%)", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "sum by(namespace)(rate(container_cpu_usage_seconds_total{namespace=~\"$namespace\",container!=\"\",container!=\"POD\"}[5m]))\n/\nsum by(namespace)(kube_pod_container_resource_requests{resource=\"cpu\",namespace=~\"$namespace\",container!=\"\"})\n* 100", + "refId": "A", "legendFormat": "{{namespace}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", "min": 0, "max": 150, + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 80 }, { "color": "red", "value": 100 }] } + } + }, + "options": { + "orientation": "horizontal", "displayMode": "gradient", "showUnfilled": true, + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 48 } + }, + + { + "id": 23, + "type": "bargauge", + "title": "Memory — Actual vs Requested (%)", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "sum by(namespace)(container_memory_working_set_bytes{namespace=~\"$namespace\",container!=\"\",container!=\"POD\"})\n/\nsum by(namespace)(kube_pod_container_resource_requests{resource=\"memory\",namespace=~\"$namespace\",container!=\"\"})\n* 100", + "refId": "A", "legendFormat": "{{namespace}}" + } + ], + "fieldConfig": { 
+ "defaults": { + "unit": "percent", "min": 0, "max": 150, + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 80 }, { "color": "red", "value": 100 }] } + } + }, + "options": { + "orientation": "horizontal", "displayMode": "gradient", "showUnfilled": true, + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 48 } + } + + ] + } diff --git a/harmony/src/modules/monitoring/cluster_dashboards/dev_test_yamls/08d-dashboard-networking.yaml b/harmony/src/modules/monitoring/cluster_dashboards/dev_test_yamls/08d-dashboard-networking.yaml new file mode 100644 index 0000000..66f8902 --- /dev/null +++ b/harmony/src/modules/monitoring/cluster_dashboards/dev_test_yamls/08d-dashboard-networking.yaml @@ -0,0 +1,955 @@ +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDashboard +metadata: + name: okd-networking + namespace: observability +spec: + instanceSelector: + matchLabels: + dashboards: "grafana" + json: | + { + "title": "Networking", + "uid": "okd-networking", + "schemaVersion": 36, + "version": 1, + "refresh": "30s", + "time": { "from": "now-1h", "to": "now" }, + "tags": ["okd", "networking"], + "templating": { + "list": [ + { + "name": "namespace", + "type": "query", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "query": { "query": "label_values(kube_pod_info, namespace)", "refId": "A" }, + "refresh": 2, + "includeAll": true, + "multi": true, + "allValue": ".*", + "label": "Namespace", + "sort": 1, + "current": {}, + "options": [] + } + ] + }, + "panels": [ + + { + "id": 1, "type": "stat", "title": "Network RX Rate", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum(rate(container_network_receive_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))", + "refId": "A", "legendFormat": "" + }], + "fieldConfig": { + 
"defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] }, + "unit": "Bps", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 } + }, + + { + "id": 2, "type": "stat", "title": "Network TX Rate", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum(rate(container_network_transmit_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))", + "refId": "A", "legendFormat": "" + }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] }, + "unit": "Bps", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 } + }, + + { + "id": 3, "type": "stat", "title": "RX Errors/s", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum(rate(container_network_receive_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))", + "refId": "A", "legendFormat": "" + }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] }, + "unit": "pps", "noValue": "0", "decimals": 2 + } + }, + "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 } + }, + + { + "id": 4, "type": "stat", "title": "TX Errors/s", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": 
"sum(rate(container_network_transmit_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))", + "refId": "A", "legendFormat": "" + }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] }, + "unit": "pps", "noValue": "0", "decimals": 2 + } + }, + "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 } + }, + + { + "id": 5, "type": "stat", "title": "RX Drops/s", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum(rate(container_network_receive_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))", + "refId": "A", "legendFormat": "" + }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "orange", "value": 1 }] }, + "unit": "pps", "noValue": "0", "decimals": 2 + } + }, + "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 } + }, + + { + "id": 6, "type": "stat", "title": "TX Drops/s", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum(rate(container_network_transmit_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))", + "refId": "A", "legendFormat": "" + }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "orange", "value": 1 }] }, + "unit": "pps", "noValue": "0", "decimals": 2 + } + }, + "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", 
"reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 } + }, + + { + "id": 7, "type": "stat", "title": "DNS Queries/s", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum(rate(coredns_dns_requests_total[5m]))", + "refId": "A", "legendFormat": "" + }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] }, + "unit": "reqps", "noValue": "0", "decimals": 1 + } + }, + "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 } + }, + + { + "id": 8, "type": "stat", "title": "DNS Error %", + "description": "Percentage of DNS responses with non-NOERROR rcode over the last 5 minutes.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum(rate(coredns_dns_responses_total{rcode!=\"NOERROR\"}[5m])) / sum(rate(coredns_dns_responses_total[5m])) * 100", + "refId": "A", "legendFormat": "" + }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 1 }, + { "color": "red", "value": 5 } + ]}, + "unit": "percent", "noValue": "0", "decimals": 2 + } + }, + "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 } + }, + + { + "id": 9, "type": "row", "title": "Network I/O", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 } + }, + + { + "id": 10, "type": "timeseries", "title": "Receive Rate by Namespace", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": 
[{ + "expr": "sum by(namespace)(rate(container_network_receive_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))", + "refId": "A", "legendFormat": "{{namespace}}" + }], + "fieldConfig": { + "defaults": { + "unit": "Bps", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 5 } + }, + + { + "id": 11, "type": "timeseries", "title": "Transmit Rate by Namespace", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum by(namespace)(rate(container_network_transmit_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))", + "refId": "A", "legendFormat": "{{namespace}}" + }], + "fieldConfig": { + "defaults": { + "unit": "Bps", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 5 } + }, + + { + "id": 12, "type": "row", "title": "Top Pod Consumers", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 } + }, + + { + "id": 13, "type": "timeseries", "title": "Top 10 Pods — RX Rate", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "topk(10, sum by(namespace,pod)(rate(container_network_receive_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m])))", + "refId": "A", "legendFormat": "{{namespace}} / {{pod}}" + }], + "fieldConfig": { + "defaults": { + "unit": "Bps", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 5, 
"showPoints": "auto", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 14 } + }, + + { + "id": 14, "type": "timeseries", "title": "Top 10 Pods — TX Rate", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "topk(10, sum by(namespace,pod)(rate(container_network_transmit_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m])))", + "refId": "A", "legendFormat": "{{namespace}} / {{pod}}" + }], + "fieldConfig": { + "defaults": { + "unit": "Bps", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "auto", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 14 } + }, + + { + "id": 15, + "type": "table", + "title": "Pod Network I/O Summary", + "description": "Current RX/TX rates, errors and drops per pod. 
Sorted by RX rate descending.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "sum by(namespace,pod)(rate(container_network_receive_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))", + "refId": "A", "instant": true, "format": "table", "legendFormat": "" + }, + { + "expr": "sum by(namespace,pod)(rate(container_network_transmit_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))", + "refId": "B", "instant": true, "format": "table", "legendFormat": "" + }, + { + "expr": "sum by(namespace,pod)(rate(container_network_receive_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))", + "refId": "C", "instant": true, "format": "table", "legendFormat": "" + }, + { + "expr": "sum by(namespace,pod)(rate(container_network_transmit_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))", + "refId": "D", "instant": true, "format": "table", "legendFormat": "" + }, + { + "expr": "sum by(namespace,pod)(rate(container_network_receive_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))", + "refId": "E", "instant": true, "format": "table", "legendFormat": "" + }, + { + "expr": "sum by(namespace,pod)(rate(container_network_transmit_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))", + "refId": "F", "instant": true, "format": "table", "legendFormat": "" + } + ], + "transformations": [ + { + "id": "filterFieldsByName", + "options": { "include": { "names": ["namespace", "pod", "Value"] } } + }, + { + "id": "joinByField", + "options": { "byField": "pod", "mode": "outer" } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "namespace 1": true, + "namespace 2": true, + "namespace 3": true, + "namespace 4": true, + "namespace 5": true + }, + "renameByName": { + "namespace": "Namespace", + "pod": "Pod", + "Value": "RX Rate", + "Value 1": "TX Rate", + "Value 2": "RX Errors/s", + "Value 3": "TX Errors/s", + "Value 4": "RX Drops/s", + "Value 5": "TX Drops/s" + }, + "indexByName": { + 
"namespace": 0, + "pod": 1, + "Value": 2, + "Value 1": 3, + "Value 2": 4, + "Value 3": 5, + "Value 4": 6, + "Value 5": 7 + } + } + }, + { + "id": "sortBy", + "options": { "fields": [{ "displayName": "RX Rate", "desc": true }] } + } + ], + "fieldConfig": { + "defaults": { "custom": { "align": "center", "displayMode": "auto" } }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Namespace" }, + "properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 160 }] + }, + { + "matcher": { "id": "byName", "options": "Pod" }, + "properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 260 }] + }, + { + "matcher": { "id": "byRegexp", "options": "^RX Rate$|^TX Rate$" }, + "properties": [ + { "id": "unit", "value": "Bps" }, + { "id": "custom.displayMode", "value": "color-background-solid" }, + { "id": "thresholds", "value": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 10000000 }, + { "color": "orange", "value": 100000000 }, + { "color": "red", "value": 500000000 } + ]}} + ] + }, + { + "matcher": { "id": "byRegexp", "options": "^RX Errors/s$|^TX Errors/s$" }, + "properties": [ + { "id": "unit", "value": "pps" }, + { "id": "decimals", "value": 3 }, + { "id": "custom.displayMode", "value": "color-background" }, + { "id": "thresholds", "value": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 0.001 } + ]}} + ] + }, + { + "matcher": { "id": "byRegexp", "options": "^RX Drops/s$|^TX Drops/s$" }, + "properties": [ + { "id": "unit", "value": "pps" }, + { "id": "decimals", "value": 3 }, + { "id": "custom.displayMode", "value": "color-background" }, + { "id": "thresholds", "value": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "orange", "value": 0.001 } + ]}} + ] + } + ] + }, + "options": {}, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 22 } + }, + + { + "id": 16, 
"type": "row", "title": "Errors & Packet Loss", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 30 } + }, + + { + "id": 17, "type": "timeseries", "title": "RX Errors by Namespace", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum by(namespace)(rate(container_network_receive_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))", + "refId": "A", "legendFormat": "{{namespace}}" + }], + "fieldConfig": { + "defaults": { + "unit": "pps", "min": 0, "decimals": 3, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 7, "w": 12, "x": 0, "y": 31 } + }, + + { + "id": 18, "type": "timeseries", "title": "TX Errors by Namespace", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum by(namespace)(rate(container_network_transmit_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))", + "refId": "A", "legendFormat": "{{namespace}}" + }], + "fieldConfig": { + "defaults": { + "unit": "pps", "min": 0, "decimals": 3, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 7, "w": 12, "x": 12, "y": 31 } + }, + + { + "id": 19, "type": "timeseries", "title": "RX Packet Drops by Namespace", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum 
by(namespace)(rate(container_network_receive_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))", + "refId": "A", "legendFormat": "{{namespace}}" + }], + "fieldConfig": { + "defaults": { + "unit": "pps", "min": 0, "decimals": 3, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 7, "w": 12, "x": 0, "y": 38 } + }, + + { + "id": 20, "type": "timeseries", "title": "TX Packet Drops by Namespace", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum by(namespace)(rate(container_network_transmit_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))", + "refId": "A", "legendFormat": "{{namespace}}" + }], + "fieldConfig": { + "defaults": { + "unit": "pps", "min": 0, "decimals": 3, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 7, "w": 12, "x": 12, "y": 38 } + }, + + { + "id": 21, "type": "row", "title": "DNS (CoreDNS)", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 45 } + }, + + { + "id": 22, "type": "timeseries", "title": "DNS Request Rate by Query Type", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum by(type)(rate(coredns_dns_requests_total[5m]))", + "refId": "A", "legendFormat": "{{type}}" + }], + "fieldConfig": { + "defaults": { + "unit": "reqps", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } 
+ } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 46 } + }, + + { + "id": 23, "type": "timeseries", "title": "DNS Response Rate by Rcode", + "description": "NOERROR = healthy. NXDOMAIN = name not found. SERVFAIL = upstream error.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum by(rcode)(rate(coredns_dns_responses_total[5m]))", + "refId": "A", "legendFormat": "{{rcode}}" + }], + "fieldConfig": { + "defaults": { + "unit": "reqps", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "NOERROR" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "NXDOMAIN" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "SERVFAIL" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "REFUSED" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] } + ] + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 46 } + }, + + { + "id": 24, "type": "timeseries", "title": "DNS Request Latency (p50 / p95 / p99)", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum(rate(coredns_dns_request_duration_seconds_bucket[5m])) by (le))", + "refId": "A", "legendFormat": "p50" + }, + { + "expr": "histogram_quantile(0.95, 
sum(rate(coredns_dns_request_duration_seconds_bucket[5m])) by (le))", + "refId": "B", "legendFormat": "p95" + }, + { + "expr": "histogram_quantile(0.99, sum(rate(coredns_dns_request_duration_seconds_bucket[5m])) by (le))", + "refId": "C", "legendFormat": "p99" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s", "min": 0, "decimals": 4, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false } + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] } + ] + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 46 } + }, + + { + "id": 25, "type": "timeseries", "title": "DNS Cache Hit Ratio (%)", + "description": "High hit ratio = CoreDNS is serving responses from cache, reducing upstream load.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum(rate(coredns_cache_hits_total[5m])) / (sum(rate(coredns_cache_hits_total[5m])) + sum(rate(coredns_cache_misses_total[5m]))) * 100", + "refId": "A", "legendFormat": "Cache Hit %" + }], + "fieldConfig": { + "defaults": { + "unit": "percent", "min": 0, "max": 100, + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "red", "value": null }, + { "color": "yellow", "value": 50 }, + { "color": "green", "value": 80 } + ]}, + "custom": { "lineWidth": 2, "fillOpacity": 20, "showPoints": "never", "spanNulls": false } + } + }, + 
"options": { + "tooltip": { "mode": "single" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "lastNotNull"] } + }, + "gridPos": { "h": 7, "w": 12, "x": 0, "y": 54 } + }, + + { + "id": 26, "type": "timeseries", "title": "DNS Forward Request Rate", + "description": "Queries CoreDNS is forwarding upstream. Spike here with cache miss spike = upstream DNS pressure.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "sum(rate(coredns_forward_requests_total[5m]))", + "refId": "A", "legendFormat": "Forward Requests/s" + }, + { + "expr": "sum(rate(coredns_forward_responses_duration_seconds_count[5m]))", + "refId": "B", "legendFormat": "Forward Responses/s" + } + ], + "fieldConfig": { + "defaults": { + "unit": "reqps", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 7, "w": 12, "x": 12, "y": 54 } + }, + + { + "id": 27, "type": "row", "title": "Services & Endpoints", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 61 } + }, + + { + "id": 28, "type": "stat", "title": "Total Services", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "count(kube_service_info{namespace=~\"$namespace\"})", + "refId": "A", "legendFormat": "" + }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] }, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 8, "x": 0, "y": 62 } + }, + + { + "id": 29, 
"type": "stat", "title": "Endpoint Addresses Available", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum(kube_endpoint_address_available{namespace=~\"$namespace\"})", + "refId": "A", "legendFormat": "" + }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] }, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 8, "x": 8, "y": 62 } + }, + + { + "id": 30, "type": "stat", "title": "Endpoint Addresses Not Ready", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum(kube_endpoint_address_not_ready{namespace=~\"$namespace\"}) or vector(0)", + "refId": "A", "legendFormat": "" + }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] }, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 8, "x": 16, "y": 62 } + }, + + { + "id": 31, + "type": "table", + "title": "Endpoint Availability", + "description": "Per-endpoint available vs not-ready address counts. 
Red Not Ready = pods backing this service are unhealthy.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "sum by(namespace,endpoint)(kube_endpoint_address_available{namespace=~\"$namespace\"})", + "refId": "A", "instant": true, "format": "table", "legendFormat": "" + }, + { + "expr": "sum by(namespace,endpoint)(kube_endpoint_address_not_ready{namespace=~\"$namespace\"})", + "refId": "B", "instant": true, "format": "table", "legendFormat": "" + } + ], + "transformations": [ + { + "id": "filterFieldsByName", + "options": { "include": { "names": ["namespace", "endpoint", "Value"] } } + }, + { + "id": "joinByField", + "options": { "byField": "endpoint", "mode": "outer" } + }, + { + "id": "organize", + "options": { + "excludeByName": { "namespace 1": true }, + "renameByName": { + "namespace": "Namespace", + "endpoint": "Endpoint", + "Value": "Available", + "Value 1": "Not Ready" + }, + "indexByName": { + "namespace": 0, + "endpoint": 1, + "Value": 2, + "Value 1": 3 + } + } + }, + { + "id": "sortBy", + "options": { "fields": [{ "displayName": "Not Ready", "desc": true }] } + } + ], + "fieldConfig": { + "defaults": { "custom": { "align": "center", "displayMode": "auto" } }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Namespace" }, + "properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 180 }] + }, + { + "matcher": { "id": "byName", "options": "Endpoint" }, + "properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 220 }] + }, + { + "matcher": { "id": "byName", "options": "Available" }, + "properties": [ + { "id": "custom.displayMode", "value": "color-background" }, + { "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] } } + ] + }, + { + "matcher": { "id": "byName", "options": "Not Ready" }, + "properties": [ + { "id": "custom.displayMode", "value": 
"color-background" }, + { "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] } } + ] + } + ] + }, + "options": {}, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 66 } + }, + + { + "id": 32, "type": "row", "title": "OKD Router / Ingress (HAProxy)", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 74 } + }, + + { + "id": 33, "type": "timeseries", "title": "Router HTTP Request Rate by Code", + "description": "Requires HAProxy router metrics to be scraped (port 1936). OKD exposes these via the openshift-ingress ServiceMonitor.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "sum by(code)(rate(haproxy_backend_http_responses_total[5m]))", + "refId": "A", "legendFormat": "HTTP {{code}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "reqps", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "HTTP 2xx" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "HTTP 4xx" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "HTTP 5xx" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] } + ] + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 75 } + }, + + { + "id": 34, "type": "timeseries", "title": "Router 4xx + 5xx Error Rate (%)", + "description": "Client error (4xx) and server error (5xx) rates as a percentage of all requests.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ 
+ { + "expr": "sum(rate(haproxy_backend_http_responses_total{code=\"4xx\"}[5m])) / sum(rate(haproxy_backend_http_responses_total[5m])) * 100", + "refId": "A", "legendFormat": "4xx %" + }, + { + "expr": "sum(rate(haproxy_backend_http_responses_total{code=\"5xx\"}[5m])) / sum(rate(haproxy_backend_http_responses_total[5m])) * 100", + "refId": "B", "legendFormat": "5xx %" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 1 }, + { "color": "red", "value": 5 } + ]} + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "4xx %" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "5xx %" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] } + ] + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 75 } + }, + + { + "id": 35, "type": "timeseries", "title": "Router Bytes In / Out", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "sum(rate(haproxy_frontend_bytes_in_total[5m]))", + "refId": "A", "legendFormat": "Bytes In" + }, + { + "expr": "sum(rate(haproxy_frontend_bytes_out_total[5m]))", + "refId": "B", "legendFormat": "Bytes Out" + } + ], + "fieldConfig": { + "defaults": { + "unit": "Bps", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "Bytes In" }, "properties": [{ "id": "color", "value": { 
"fixedColor": "blue", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "Bytes Out" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] } + ] + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 83 } + }, + + { + "id": 36, + "type": "table", + "title": "Router Backend Server Status", + "description": "HAProxy backend servers (routes). Value 0 = DOWN, 1 = UP.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "haproxy_server_up", + "refId": "A", "instant": true, "format": "table", "legendFormat": "" + } + ], + "transformations": [ + { + "id": "filterFieldsByName", + "options": { "include": { "names": ["proxy", "server", "Value"] } } + }, + { + "id": "organize", + "options": { + "excludeByName": {}, + "renameByName": { + "proxy": "Backend", + "server": "Server", + "Value": "Status" + }, + "indexByName": { "proxy": 0, "server": 1, "Value": 2 } + } + }, + { + "id": "sortBy", + "options": { "fields": [{ "displayName": "Status", "desc": false }] } + } + ], + "fieldConfig": { + "defaults": { "custom": { "align": "center", "displayMode": "auto" } }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Backend" }, + "properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 260 }] + }, + { + "matcher": { "id": "byName", "options": "Server" }, + "properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 180 }] + }, + { + "matcher": { "id": "byName", "options": "Status" }, + "properties": [ + { "id": "custom.displayMode", "value": "color-background" }, + { "id": "mappings", "value": [ + { "type": "value", "options": { "0": { "text": "DOWN", "color": "red" } } }, + { "type": "value", "options": { "1": { "text": "UP", "color": "green" } } } + ]}, + 
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ]}} + ] + } + ] + }, + "options": {}, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 83 } + } + + ] + } diff --git a/harmony/src/modules/monitoring/cluster_dashboards/dev_test_yamls/08e-dashboard-storage.yaml b/harmony/src/modules/monitoring/cluster_dashboards/dev_test_yamls/08e-dashboard-storage.yaml new file mode 100644 index 0000000..5ae552f --- /dev/null +++ b/harmony/src/modules/monitoring/cluster_dashboards/dev_test_yamls/08e-dashboard-storage.yaml @@ -0,0 +1,607 @@ +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDashboard +metadata: + name: storage-health + namespace: observability +spec: + instanceSelector: + matchLabels: + dashboards: "grafana" + + json: | + { + "title": "Storage Health", + "uid": "storage-health", + "schemaVersion": 36, + "version": 1, + "refresh": "30s", + "time": { "from": "now-1h", "to": "now" }, + "panels": [ + + { + "type": "row", + "id": 1, + "title": "PVC / PV Status", + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 } + }, + + { + "type": "stat", + "id": 2, + "title": "Bound PVCs", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Bound\"}) or vector(0)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + } + } + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"] }, + "colorMode": "background", + "graphMode": "none", + "textMode": "auto" + }, + "gridPos": { "h": 5, "w": 4, "x": 0, "y": 1 } + }, + + { + "type": "stat", + "id": 3, + "title": "Pending PVCs", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Pending\"}) 
or vector(0)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 1 } + ] + } + } + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"] }, + "colorMode": "background", + "graphMode": "none", + "textMode": "auto" + }, + "gridPos": { "h": 5, "w": 4, "x": 4, "y": 1 } + }, + + { + "type": "stat", + "id": 4, + "title": "Lost PVCs", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Lost\"}) or vector(0)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 1 } + ] + } + } + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"] }, + "colorMode": "background", + "graphMode": "none", + "textMode": "auto" + }, + "gridPos": { "h": 5, "w": 4, "x": 8, "y": 1 } + }, + + { + "type": "stat", + "id": 5, + "title": "Bound PVs / Available PVs", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "sum(kube_persistentvolume_status_phase{phase=\"Bound\"}) or vector(0)", + "refId": "A", + "legendFormat": "Bound" + }, + { + "expr": "sum(kube_persistentvolume_status_phase{phase=\"Available\"}) or vector(0)", + "refId": "B", + "legendFormat": "Available" + } + ], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "blue", "value": null }] + } + } + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"] }, + "colorMode": "background", + "graphMode": "none", + "textMode": "auto" + }, + "gridPos": { "h": 5, "w": 4, "x": 12, "y": 1 } + }, + + { + "type": "stat", + "id": 6, + "title": "Ceph Cluster Health", 
+ "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "ceph_health_status", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 1 }, + { "color": "red", "value": 2 } + ] + }, + "mappings": [ + { + "type": "value", + "options": { + "0": { "text": "HEALTH_OK", "index": 0 }, + "1": { "text": "HEALTH_WARN", "index": 1 }, + "2": { "text": "HEALTH_ERR", "index": 2 } + } + } + ] + } + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"] }, + "colorMode": "background", + "graphMode": "none", + "textMode": "value" + }, + "gridPos": { "h": 5, "w": 4, "x": 16, "y": 1 } + }, + + { + "type": "stat", + "id": 7, + "title": "OSDs Up / Total", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "sum(ceph_osd_up) or vector(0)", + "refId": "A", + "legendFormat": "Up" + }, + { + "expr": "count(ceph_osd_metadata) or vector(0)", + "refId": "B", + "legendFormat": "Total" + } + ], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + } + } + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"] }, + "colorMode": "background", + "graphMode": "none", + "textMode": "auto" + }, + "gridPos": { "h": 5, "w": 4, "x": 20, "y": 1 } + }, + + { + "type": "row", + "id": 8, + "title": "Cluster Capacity", + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 6 } + }, + + { + "type": "gauge", + "id": 9, + "title": "Ceph Cluster Used (%)", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "100 * (ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes) / ceph_cluster_total_bytes", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": 
"percent", + "min": 0, + "max": 100, + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 70 }, + { "color": "red", "value": 85 } + ] + } + } + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"] }, + "showThresholdLabels": true, + "showThresholdMarkers": true + }, + "gridPos": { "h": 8, "w": 5, "x": 0, "y": 7 } + }, + + { + "type": "stat", + "id": 10, + "title": "Ceph Capacity — Total / Available", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "ceph_cluster_total_bytes", + "refId": "A", + "legendFormat": "Total" + }, + { + "expr": "ceph_cluster_total_bytes - (ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes)", + "refId": "B", + "legendFormat": "Available" + } + ], + "fieldConfig": { + "defaults": { + "unit": "bytes", + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "blue", "value": null }] + } + } + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"] }, + "colorMode": "value", + "graphMode": "none", + "textMode": "auto", + "orientation": "vertical" + }, + "gridPos": { "h": 8, "w": 4, "x": 5, "y": 7 } + }, + + { + "type": "bargauge", + "id": 11, + "title": "PV Allocated Capacity by Storage Class (Bound)", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "sum by (storageclass) (\n kube_persistentvolume_capacity_bytes\n * on(persistentvolume) group_left(storageclass)\n kube_persistentvolume_status_phase{phase=\"Bound\"}\n)", + "refId": "A", + "legendFormat": "{{storageclass}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "bytes", + "color": { "mode": "palette-classic" }, + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "blue", "value": null }] + } + } + }, + "options": { + "orientation": "horizontal", + "reduceOptions": { "calcs": 
["lastNotNull"] }, + "displayMode": "gradient", + "showUnfilled": true + }, + "gridPos": { "h": 8, "w": 7, "x": 9, "y": 7 } + }, + + { + "type": "piechart", + "id": 12, + "title": "PVC Phase Distribution", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Bound\"}) or vector(0)", + "refId": "A", + "legendFormat": "Bound" + }, + { + "expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Pending\"}) or vector(0)", + "refId": "B", + "legendFormat": "Pending" + }, + { + "expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Lost\"}) or vector(0)", + "refId": "C", + "legendFormat": "Lost" + } + ], + "fieldConfig": { + "defaults": { "color": { "mode": "palette-classic" } } + }, + "options": { + "reduceOptions": { "calcs": ["lastNotNull"] }, + "pieType": "pie", + "legend": { + "displayMode": "table", + "placement": "right", + "values": ["value", "percent"] + } + }, + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 7 } + }, + + { + "type": "row", + "id": 13, + "title": "Ceph Performance", + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 15 } + }, + + { + "type": "timeseries", + "id": 14, + "title": "Ceph Pool IOPS (Read / Write)", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "rate(ceph_pool_rd[5m])", + "refId": "A", + "legendFormat": "Read — pool {{pool_id}}" + }, + { + "expr": "rate(ceph_pool_wr[5m])", + "refId": "B", + "legendFormat": "Write — pool {{pool_id}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 8 } + } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 } + }, + + { + "type": "timeseries", + "id": 15, + "title": "Ceph Pool Throughput (Read / Write)", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": 
"rate(ceph_pool_rd_bytes[5m])", + "refId": "A", + "legendFormat": "Read — pool {{pool_id}}" + }, + { + "expr": "rate(ceph_pool_wr_bytes[5m])", + "refId": "B", + "legendFormat": "Write — pool {{pool_id}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "Bps", + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 8 } + } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 } + }, + + { + "type": "row", + "id": 16, + "title": "Ceph OSD & Pool Details", + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 24 } + }, + + { + "type": "timeseries", + "id": 17, + "title": "Ceph Pool Space Used (%)", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "100 * ceph_pool_bytes_used / (ceph_pool_bytes_used + ceph_pool_max_avail)", + "refId": "A", + "legendFormat": "Pool {{pool_id}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "color": { "mode": "palette-classic" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 70 }, + { "color": "red", "value": 85 } + ] + }, + "custom": { "lineWidth": 2, "fillOpacity": 10 } + } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 25 } + }, + + { + "type": "bargauge", + "id": 18, + "title": "OSD Status per Daemon (green = Up, red = Down)", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "ceph_osd_up", + "refId": "A", + "legendFormat": "{{ceph_daemon}}" + } + ], + "fieldConfig": { + "defaults": { + "min": 0, + "max": 1, + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ] + }, + "mappings": [ + { + "type": "value", + "options": { + "0": { "text": "DOWN", "index": 0 }, + "1": { "text": "UP", "index": 1 } + } + } + ] + } + }, + "options": { + "orientation": 
"horizontal", + "reduceOptions": { "calcs": ["lastNotNull"] }, + "displayMode": "basic", + "showUnfilled": true + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 25 } + }, + + { + "type": "row", + "id": 19, + "title": "Node Disk Usage", + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 33 } + }, + + { + "type": "timeseries", + "id": 20, + "title": "Node Root Disk Usage Over Time (%)", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} * 100)", + "refId": "A", + "legendFormat": "{{instance}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "color": { "mode": "palette-classic" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 70 }, + { "color": "red", "value": 85 } + ] + }, + "custom": { "lineWidth": 2, "fillOpacity": 10 } + } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 34 } + }, + + { + "type": "bargauge", + "id": 21, + "title": "Current Disk Usage — All Nodes & Mountpoints", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "100 - (node_filesystem_avail_bytes{fstype!~\"tmpfs|overlay|squashfs\"} / node_filesystem_size_bytes{fstype!~\"tmpfs|overlay|squashfs\"} * 100)", + "refId": "A", + "legendFormat": "{{instance}} — {{mountpoint}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 70 }, + { "color": "red", "value": 85 } + ] + } + } + }, + "options": { + "orientation": "horizontal", + "reduceOptions": { "calcs": ["lastNotNull"] }, + "displayMode": "gradient", + "showUnfilled": true + }, + 
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 34 } + } + + ] + } diff --git a/harmony/src/modules/monitoring/cluster_dashboards/dev_test_yamls/08f-dashboard-etcd.yaml b/harmony/src/modules/monitoring/cluster_dashboards/dev_test_yamls/08f-dashboard-etcd.yaml new file mode 100644 index 0000000..6325e47 --- /dev/null +++ b/harmony/src/modules/monitoring/cluster_dashboards/dev_test_yamls/08f-dashboard-etcd.yaml @@ -0,0 +1,744 @@ +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDashboard +metadata: + name: okd-etcd + namespace: observability +spec: + instanceSelector: + matchLabels: + dashboards: "grafana" + json: | + { + "title": "etcd", + "uid": "okd-etcd", + "schemaVersion": 36, + "version": 1, + "refresh": "30s", + "time": { "from": "now-1h", "to": "now" }, + "tags": ["okd", "etcd"], + "templating": { + "list": [ + { + "name": "instance", + "type": "query", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "query": { "query": "label_values(etcd_server_has_leader, instance)", "refId": "A" }, + "refresh": 2, + "includeAll": true, + "multi": true, + "allValue": ".*", + "label": "Instance", + "sort": 1, + "current": {}, + "options": [] + } + ] + }, + "panels": [ + + { + "id": 1, "type": "stat", "title": "Cluster Members", + "description": "Total number of etcd members currently reporting metrics.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(etcd_server_has_leader)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "red", "value": null }, + { "color": "yellow", "value": 1 }, + { "color": "green", "value": 3 } + ]}, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 } + }, 
+ + { + "id": 2, "type": "stat", "title": "Has Leader", + "description": "min() across all members. 0 = at least one member has no quorum — cluster is degraded.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "min(etcd_server_has_leader)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ]}, + "unit": "short", "noValue": "0", + "mappings": [ + { "type": "value", "options": { + "0": { "text": "NO LEADER", "color": "red" }, + "1": { "text": "OK", "color": "green" } + }} + ] + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 } + }, + + { + "id": 3, "type": "stat", "title": "Leader Changes (1h)", + "description": "Number of leader elections in the last hour. ≥3 indicates cluster instability.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "sum(changes(etcd_server_leader_changes_seen_total[1h]))", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 1 }, + { "color": "red", "value": 3 } + ]}, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 } + }, + + { + "id": 4, "type": "stat", "title": "DB Size (Max)", + "description": "Largest boltdb file size across all members. 
Default etcd quota is 8 GiB.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "max(etcd_mvcc_db_total_size_in_bytes)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 2147483648 }, + { "color": "orange", "value": 5368709120 }, + { "color": "red", "value": 7516192768 } + ]}, + "unit": "bytes", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 } + }, + + { + "id": 5, "type": "stat", "title": "DB Fragmentation (Max)", + "description": "% of DB space that is allocated but unused. >50% → run etcdctl defrag.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "max((etcd_mvcc_db_total_size_in_bytes - etcd_mvcc_db_total_size_in_use_in_bytes) / etcd_mvcc_db_total_size_in_bytes * 100)", + "refId": "A", "legendFormat": "" + }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 25 }, + { "color": "orange", "value": 50 }, + { "color": "red", "value": 75 } + ]}, + "unit": "percent", "noValue": "0", "decimals": 1 + } + }, + "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 } + }, + + { + "id": 6, "type": "stat", "title": "Failed Proposals/s", + "description": "Rate of rejected Raft proposals. 
Any sustained non-zero value = cluster health problem.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "sum(rate(etcd_server_proposals_failed_total[5m]))", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 0.001 } + ]}, + "unit": "short", "noValue": "0", "decimals": 3 + } + }, + "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 } + }, + + { + "id": 7, "type": "stat", "title": "WAL Fsync p99", + "description": "99th percentile WAL flush-to-disk time. >10ms is concerning; >100ms = serious I/O bottleneck.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) by (le))", + "refId": "A", "legendFormat": "" + }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 0.01 }, + { "color": "orange", "value": 0.1 }, + { "color": "red", "value": 0.5 } + ]}, + "unit": "s", "noValue": "0", "decimals": 4 + } + }, + "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 } + }, + + { + "id": 8, "type": "stat", "title": "Backend Commit p99", + "description": "99th percentile boltdb commit time. 
>25ms = warning; >100ms = critical backend I/O pressure.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) by (le))", + "refId": "A", "legendFormat": "" + }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 0.025 }, + { "color": "orange", "value": 0.1 }, + { "color": "red", "value": 0.25 } + ]}, + "unit": "s", "noValue": "0", "decimals": 4 + } + }, + "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 } + }, + + { + "id": 9, "type": "row", "title": "Cluster Health", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 } + }, + + { + "id": 10, "type": "timeseries", "title": "Has Leader per Instance", + "description": "1 = member has a leader; 0 = member lost quorum. 
A dip to 0 marks the exact moment of a leader election.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "etcd_server_has_leader{instance=~\"$instance\"}", + "refId": "A", "legendFormat": "{{instance}}" + }], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, "max": 1.1, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 20, "showPoints": "never", "spanNulls": false }, + "mappings": [ + { "type": "value", "options": { + "0": { "text": "0 — no leader" }, + "1": { "text": "1 — ok" } + }} + ] + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "none" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": [] } + }, + "gridPos": { "h": 6, "w": 8, "x": 0, "y": 5 } + }, + + { + "id": 11, "type": "timeseries", "title": "Leader Changes (cumulative)", + "description": "Monotonically increasing counter per member. A step jump = one leader election. Correlated jumps across members = cluster-wide event.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "etcd_server_leader_changes_seen_total{instance=~\"$instance\"}", + "refId": "A", "legendFormat": "{{instance}}" + }], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "auto", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "none" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull"] } + }, + "gridPos": { "h": 6, "w": 8, "x": 8, "y": 5 } + }, + + { + "id": 12, "type": "timeseries", "title": "Slow Operations", + "description": "slow_apply: proposals applied slower than expected. slow_read_index: linearizable reads timing out. 
heartbeat_failures: Raft heartbeat send errors (network partition indicator).", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "rate(etcd_server_slow_apply_total{instance=~\"$instance\"}[5m])", "refId": "A", "legendFormat": "Slow Apply — {{instance}}" }, + { "expr": "rate(etcd_server_slow_read_indexes_total{instance=~\"$instance\"}[5m])", "refId": "B", "legendFormat": "Slow Read Index — {{instance}}" }, + { "expr": "rate(etcd_server_heartbeat_send_failures_total{instance=~\"$instance\"}[5m])", "refId": "C", "legendFormat": "Heartbeat Failures — {{instance}}" } + ], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, "decimals": 3, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 6, "w": 8, "x": 16, "y": 5 } + }, + + { + "id": 13, "type": "row", "title": "gRPC Traffic", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 11 } + }, + + { + "id": 14, "type": "timeseries", "title": "gRPC Request Rate by Method", + "description": "Unary calls/s per RPC method. High Put/Txn = heavy write load. High Range = heavy read load. 
 Watch is a streaming RPC and is excluded by the unary filter here.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum by(grpc_method)(rate(grpc_server_started_total{job=~\".*etcd.*\",grpc_type=\"unary\"}[5m]))", + "refId": "A", "legendFormat": "{{grpc_method}}" + }], + "fieldConfig": { + "defaults": { + "unit": "reqps", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 12 } + }, + + { + "id": 15, "type": "timeseries", "title": "gRPC Error Rate by Status Code", + "description": "Non-OK responses by gRPC status code. RESOURCE_EXHAUSTED = overloaded. UNAVAILABLE = leader election. DEADLINE_EXCEEDED = latency spike.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum by(grpc_code)(rate(grpc_server_handled_total{job=~\".*etcd.*\",grpc_code!=\"OK\"}[5m]))", + "refId": "A", "legendFormat": "{{grpc_code}}" + }], + "fieldConfig": { + "defaults": { + "unit": "reqps", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 12 } + }, + + { + "id": 16, "type": "timeseries", "title": "gRPC Request Latency (p50 / p95 / p99)", + "description": "Unary call handling duration. p99 > 100ms for Put/Txn indicates disk or CPU pressure. 
p99 > 500ms will cause kube-apiserver timeouts.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "histogram_quantile(0.50, sum(rate(grpc_server_handling_seconds_bucket{job=~\".*etcd.*\",grpc_type=\"unary\"}[5m])) by (le))", "refId": "A", "legendFormat": "p50" }, + { "expr": "histogram_quantile(0.95, sum(rate(grpc_server_handling_seconds_bucket{job=~\".*etcd.*\",grpc_type=\"unary\"}[5m])) by (le))", "refId": "B", "legendFormat": "p95" }, + { "expr": "histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~\".*etcd.*\",grpc_type=\"unary\"}[5m])) by (le))", "refId": "C", "legendFormat": "p99" } + ], + "fieldConfig": { + "defaults": { + "unit": "s", "min": 0, "decimals": 4, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false } + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] } + ] + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 12 } + }, + + { + "id": 17, "type": "row", "title": "Raft Proposals", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 20 } + }, + + { + "id": 18, "type": "timeseries", "title": "Proposals Committed vs Applied", + "description": "Committed = agreed by Raft quorum. Applied = persisted to boltdb. 
A widening gap between the two = backend apply backlog (disk too slow to keep up).", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "rate(etcd_server_proposals_committed_total{instance=~\"$instance\"}[5m])", "refId": "A", "legendFormat": "Committed — {{instance}}" }, + { "expr": "rate(etcd_server_proposals_applied_total{instance=~\"$instance\"}[5m])", "refId": "B", "legendFormat": "Applied — {{instance}}" } + ], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 7, "w": 8, "x": 0, "y": 21 } + }, + + { + "id": 19, "type": "timeseries", "title": "Proposals Pending", + "description": "In-flight Raft proposals not yet committed. 
Consistently high (>5) = cluster cannot keep up with write throughput.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "etcd_server_proposals_pending{instance=~\"$instance\"}", + "refId": "A", "legendFormat": "{{instance}}" + }], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { + "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false, + "thresholdsStyle": { "mode": "line+area" } + }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 5 }, + { "color": "red", "value": 10 } + ]} + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 7, "w": 8, "x": 8, "y": 21 } + }, + + { + "id": 20, "type": "timeseries", "title": "Failed Proposals Rate", + "description": "Raft proposals that were rejected. 
Root causes: quorum loss, leader timeout, network partition between members.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "rate(etcd_server_proposals_failed_total{instance=~\"$instance\"}[5m])", + "refId": "A", "legendFormat": "{{instance}}" + }], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, "decimals": 3, + "color": { "mode": "palette-classic" }, + "custom": { + "lineWidth": 2, "fillOpacity": 20, "showPoints": "never", "spanNulls": false, + "thresholdsStyle": { "mode": "line" } + }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 0.001 } + ]} + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 7, "w": 8, "x": 16, "y": 21 } + }, + + { + "id": 21, "type": "row", "title": "Disk I/O", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 28 } + }, + + { + "id": 22, "type": "timeseries", "title": "WAL Fsync Duration (p50 / p95 / p99) per Instance", + "description": "Time to flush the write-ahead log to disk. etcd is extremely sensitive to WAL latency. >10ms p99 = storage is the bottleneck. 
Correlates directly with Raft commit latency.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "histogram_quantile(0.50, sum by(le,instance)(rate(etcd_disk_wal_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "A", "legendFormat": "p50 — {{instance}}" }, + { "expr": "histogram_quantile(0.95, sum by(le,instance)(rate(etcd_disk_wal_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "B", "legendFormat": "p95 — {{instance}}" }, + { "expr": "histogram_quantile(0.99, sum by(le,instance)(rate(etcd_disk_wal_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "C", "legendFormat": "p99 — {{instance}}" } + ], + "fieldConfig": { + "defaults": { + "unit": "s", "min": 0, "decimals": 4, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 29 } + }, + + { + "id": 23, "type": "timeseries", "title": "Backend Commit Duration (p50 / p95 / p99) per Instance", + "description": "Time for boltdb to commit a batch transaction. A spike here while WAL is healthy = backend I/O saturation or boltdb lock contention. 
Triggers apply backlog.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "histogram_quantile(0.50, sum by(le,instance)(rate(etcd_disk_backend_commit_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "A", "legendFormat": "p50 — {{instance}}" }, + { "expr": "histogram_quantile(0.95, sum by(le,instance)(rate(etcd_disk_backend_commit_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "B", "legendFormat": "p95 — {{instance}}" }, + { "expr": "histogram_quantile(0.99, sum by(le,instance)(rate(etcd_disk_backend_commit_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "C", "legendFormat": "p99 — {{instance}}" } + ], + "fieldConfig": { + "defaults": { + "unit": "s", "min": 0, "decimals": 4, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 29 } + }, + + { + "id": 24, "type": "row", "title": "Network (Peer & Client)", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 37 } + }, + + { + "id": 25, "type": "timeseries", "title": "Peer RX Rate", + "description": "Bytes received from Raft peers (log replication + heartbeats). 
A burst during a quiet period = large snapshot being streamed to a recovering member.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "rate(etcd_network_peer_received_bytes_total{instance=~\"$instance\"}[5m])", + "refId": "A", "legendFormat": "{{instance}}" + }], + "fieldConfig": { + "defaults": { + "unit": "Bps", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 7, "w": 6, "x": 0, "y": 38 } + }, + + { + "id": 26, "type": "timeseries", "title": "Peer TX Rate", + "description": "Bytes sent to Raft peers. Leader will have higher TX than followers (it replicates entries to all members).", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "rate(etcd_network_peer_sent_bytes_total{instance=~\"$instance\"}[5m])", + "refId": "A", "legendFormat": "{{instance}}" + }], + "fieldConfig": { + "defaults": { + "unit": "Bps", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 7, "w": 6, "x": 6, "y": 38 } + }, + + { + "id": 27, "type": "timeseries", "title": "Client gRPC Received", + "description": "Bytes received from API clients (kube-apiserver, operators). 
Spike = large write burst from controllers or kubectl apply.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "rate(etcd_network_client_grpc_received_bytes_total{instance=~\"$instance\"}[5m])", + "refId": "A", "legendFormat": "{{instance}}" + }], + "fieldConfig": { + "defaults": { + "unit": "Bps", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 7, "w": 6, "x": 12, "y": 38 } + }, + + { + "id": 28, "type": "timeseries", "title": "Client gRPC Sent", + "description": "Bytes sent to API clients (responses + watch events). Persistently high = many active Watch streams or large objects being served.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "rate(etcd_network_client_grpc_sent_bytes_total{instance=~\"$instance\"}[5m])", + "refId": "A", "legendFormat": "{{instance}}" + }], + "fieldConfig": { + "defaults": { + "unit": "Bps", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 7, "w": 6, "x": 18, "y": 38 } + }, + + { + "id": 29, "type": "row", "title": "DB Size & Process Resources", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 45 } + }, + + { + "id": 30, "type": "timeseries", "title": "DB Total vs In-Use Size per Instance", + "description": "Total = allocated boltdb file size. In Use = live key data. The gap between them = fragmentation. 
Steady growth of Total = compaction not keeping up with key churn.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "etcd_mvcc_db_total_size_in_bytes{instance=~\"$instance\"}", "refId": "A", "legendFormat": "Total — {{instance}}" }, + { "expr": "etcd_mvcc_db_total_size_in_use_in_bytes{instance=~\"$instance\"}", "refId": "B", "legendFormat": "In Use — {{instance}}" } + ], + "fieldConfig": { + "defaults": { + "unit": "bytes", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] } + }, + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 46 } + }, + + { + "id": 31, "type": "timeseries", "title": "Process Resident Memory (RSS)", + "description": "Physical RAM consumed by the etcd process. Monotonically growing RSS = memory leak or oversized watch cache. Typical healthy range: 500 MiB–2 GiB depending on cluster size.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "etcd_process_resident_memory_bytes{instance=~\"$instance\"}", + "refId": "A", "legendFormat": "{{instance}}" + }], + "fieldConfig": { + "defaults": { + "unit": "bytes", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] } + }, + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 46 } + }, + + { + "id": 32, "type": "timeseries", "title": "Open File Descriptors vs Limit", + "description": "Open FD count (solid) and process FD limit (dashed). 
Approaching the limit will cause WAL file creation and new client connections to fail.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "etcd_process_open_fds{instance=~\"$instance\"}", "refId": "A", "legendFormat": "Open — {{instance}}" }, + { "expr": "etcd_process_max_fds{instance=~\"$instance\"}", "refId": "B", "legendFormat": "Limit — {{instance}}" } + ], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false } + }, + "overrides": [ + { + "matcher": { "id": "byRegexp", "options": "^Limit.*" }, + "properties": [ + { "id": "custom.lineWidth", "value": 1 }, + { "id": "custom.lineStyle", "value": { "fill": "dash", "dash": [6, 4] } }, + { "id": "custom.fillOpacity","value": 0 } + ] + } + ] + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] } + }, + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 46 } + }, + + { + "id": 33, "type": "row", "title": "Snapshots", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 54 } + }, + + { + "id": 34, "type": "timeseries", "title": "Snapshot Save Duration (p50 / p95 / p99)", + "description": "Time to write a full snapshot of the boltdb to disk. 
Slow saves delay Raft log compaction, causing the WAL to grow unboundedly and members to fall further behind.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "histogram_quantile(0.50, sum by(le)(rate(etcd_debugging_snap_save_total_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "A", "legendFormat": "p50" }, + { "expr": "histogram_quantile(0.95, sum by(le)(rate(etcd_debugging_snap_save_total_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "B", "legendFormat": "p95" }, + { "expr": "histogram_quantile(0.99, sum by(le)(rate(etcd_debugging_snap_save_total_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "C", "legendFormat": "p99" } + ], + "fieldConfig": { + "defaults": { + "unit": "s", "min": 0, "decimals": 3, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false } + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] } + ] + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 7, "w": 12, "x": 0, "y": 55 } + }, + + { + "id": 35, "type": "timeseries", "title": "Snapshot DB Fsync Duration (p50 / p95 / p99)", + "description": "Time to fsync the snapshot file itself. 
Distinct from WAL fsync: this is flushing the entire boltdb copy to disk after a snapshot is taken.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "histogram_quantile(0.50, sum by(le)(rate(etcd_snap_db_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "A", "legendFormat": "p50" }, + { "expr": "histogram_quantile(0.95, sum by(le)(rate(etcd_snap_db_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "B", "legendFormat": "p95" }, + { "expr": "histogram_quantile(0.99, sum by(le)(rate(etcd_snap_db_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "C", "legendFormat": "p99" } + ], + "fieldConfig": { + "defaults": { + "unit": "s", "min": 0, "decimals": 3, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false } + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] } + ] + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 7, "w": 12, "x": 12, "y": 55 } + } + + ] + } diff --git a/harmony/src/modules/monitoring/cluster_dashboards/dev_test_yamls/08g-dashboard-control-plane.yaml b/harmony/src/modules/monitoring/cluster_dashboards/dev_test_yamls/08g-dashboard-control-plane.yaml new file mode 100644 index 0000000..94c826e --- /dev/null +++ b/harmony/src/modules/monitoring/cluster_dashboards/dev_test_yamls/08g-dashboard-control-plane.yaml @@ -0,0 +1,752 @@ +apiVersion: 
grafana.integreatly.org/v1beta1 +kind: GrafanaDashboard +metadata: + name: okd-control-plane-health + namespace: observability +spec: + instanceSelector: + matchLabels: + dashboards: "grafana" + json: | + { + "title": "Control Plane Health", + "uid": "okd-control-plane", + "schemaVersion": 36, + "version": 1, + "refresh": "30s", + "time": { "from": "now-1h", "to": "now" }, + "tags": ["okd", "control-plane"], + "templating": { + "list": [ + { + "name": "instance", + "type": "query", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "query": { "query": "label_values(apiserver_request_total, instance)", "refId": "A" }, + "refresh": 2, + "includeAll": true, + "multi": true, + "allValue": ".*", + "label": "API Server Instance", + "sort": 1, + "current": {}, + "options": [] + } + ] + }, + "panels": [ + + { + "id": 1, "type": "stat", "title": "API Servers Up", + "description": "Number of kube-apiserver instances currently scraped and up. Healthy HA cluster = 3.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(up{job=~\".*apiserver.*\"} == 1)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "red", "value": null }, + { "color": "yellow", "value": 1 }, + { "color": "green", "value": 3 } + ]}, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 } + }, + + { + "id": 2, "type": "stat", "title": "Controller Managers Up", + "description": "kube-controller-manager instances up. 
In OKD only one holds the leader lease at a time; others are hot standbys.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(up{job=~\".*controller-manager.*\"} == 1)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "red", "value": null }, + { "color": "yellow", "value": 1 }, + { "color": "green", "value": 3 } + ]}, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 } + }, + + { + "id": 3, "type": "stat", "title": "Schedulers Up", + "description": "kube-scheduler instances up. One holds the leader lease; rest are standbys. 0 = no scheduling of new pods.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(up{job=~\".*scheduler.*\"} == 1)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "red", "value": null }, + { "color": "yellow", "value": 1 }, + { "color": "green", "value": 3 } + ]}, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 } + }, + + { + "id": 4, "type": "stat", "title": "API 5xx Rate", + "description": "Server-side errors (5xx) across all apiserver instances per second. 
Any sustained non-zero value = apiserver internal fault.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "sum(rate(apiserver_request_total{code=~\"5..\"}[5m]))", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 0.01 }, + { "color": "red", "value": 1 } + ]}, + "unit": "reqps", "noValue": "0", "decimals": 3 + } + }, + "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 } + }, + + { + "id": 5, "type": "stat", "title": "Inflight — Mutating", + "description": "Current in-flight mutating requests (POST/PUT/PATCH/DELETE). Default OKD limit is ~1000. Hitting the limit = 429 errors for writes.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "sum(apiserver_current_inflight_requests{request_kind=\"mutating\"})", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 500 }, + { "color": "orange", "value": 750 }, + { "color": "red", "value": 900 } + ]}, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 } + }, + + { + "id": 6, "type": "stat", "title": "Inflight — Read-Only", + "description": "Current in-flight non-mutating requests (GET/LIST/WATCH). Default OKD limit is ~3000. 
Hitting it = 429 for reads, impacting controllers and kubectl.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "sum(apiserver_current_inflight_requests{request_kind=\"readOnly\"})", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 1500 }, + { "color": "orange", "value": 2200 }, + { "color": "red", "value": 2700 } + ]}, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 } + }, + + { + "id": 7, "type": "stat", "title": "API Request p99 (non-WATCH)", + "description": "Overall p99 latency for all non-streaming verbs. >1s = noticeable kubectl sluggishness. >10s = controllers timing out on LIST/GET.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{verb!~\"WATCH|CONNECT\"}[5m])) by (le))", + "refId": "A", "legendFormat": "" + }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 0.5 }, + { "color": "orange", "value": 1 }, + { "color": "red", "value": 5 } + ]}, + "unit": "s", "noValue": "0", "decimals": 3 + } + }, + "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 } + }, + + { + "id": 8, "type": "stat", "title": "APIServer → etcd p99", + "description": "p99 time apiserver spends waiting on etcd calls. 
Spike here while WAL fsync is healthy = serialization or large object overhead.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "histogram_quantile(0.99, sum(rate(apiserver_storage_request_duration_seconds_bucket[5m])) by (le))", + "refId": "A", "legendFormat": "" + }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 0.05 }, + { "color": "orange", "value": 0.2 }, + { "color": "red", "value": 0.5 } + ]}, + "unit": "s", "noValue": "0", "decimals": 4 + } + }, + "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 } + }, + + { + "id": 9, "type": "row", "title": "API Server — Request Rates & Errors", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 } + }, + + { + "id": 10, "type": "timeseries", "title": "Request Rate by Verb", + "description": "Non-streaming calls per second broken down by verb. GET/LIST = read load from controllers. POST/PUT/PATCH/DELETE = write throughput. 
A sudden LIST spike = controller cache resync storm.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum by(verb)(rate(apiserver_request_total{instance=~\"$instance\",verb!~\"WATCH|CONNECT\"}[5m]))", + "refId": "A", "legendFormat": "{{verb}}" + }], + "fieldConfig": { + "defaults": { + "unit": "reqps", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 5 } + }, + + { + "id": 11, "type": "timeseries", "title": "Error Rate by HTTP Status Code", + "description": "4xx/5xx responses per second by code. 429 = inflight limit hit (throttling). 422 = admission rejection or invalid object. 500/503 = internal apiserver fault or etcd unavailability.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum by(code)(rate(apiserver_request_total{instance=~\"$instance\",code=~\"[45]..\"}[5m]))", + "refId": "A", "legendFormat": "HTTP {{code}}" + }], + "fieldConfig": { + "defaults": { + "unit": "reqps", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 5 } + }, + + { + "id": 12, "type": "timeseries", "title": "In-Flight Requests — Mutating vs Read-Only", + "description": "Instantaneous count of requests being actively handled. 
The two series correspond to the two inflight limit buckets enforced by the apiserver's Priority and Fairness (APF) or legacy inflight settings.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "sum by(request_kind)(apiserver_current_inflight_requests{instance=~\"$instance\"})", "refId": "A", "legendFormat": "{{request_kind}}" } + ], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 20, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 5 } + }, + + { + "id": 13, "type": "row", "title": "API Server — Latency", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 } + }, + + { + "id": 14, "type": "timeseries", "title": "Request Latency — p50 / p95 / p99 (non-WATCH)", + "description": "Aggregated end-to-end request duration across all verbs except WATCH/CONNECT (which are unbounded streaming). 
A rising p99 without a matching rise in etcd latency = CPU saturation, admission webhook slowness, or serialization overhead.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "histogram_quantile(0.50, sum(rate(apiserver_request_duration_seconds_bucket{instance=~\"$instance\",verb!~\"WATCH|CONNECT\"}[5m])) by (le))", "refId": "A", "legendFormat": "p50" }, + { "expr": "histogram_quantile(0.95, sum(rate(apiserver_request_duration_seconds_bucket{instance=~\"$instance\",verb!~\"WATCH|CONNECT\"}[5m])) by (le))", "refId": "B", "legendFormat": "p95" }, + { "expr": "histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{instance=~\"$instance\",verb!~\"WATCH|CONNECT\"}[5m])) by (le))", "refId": "C", "legendFormat": "p99" } + ], + "fieldConfig": { + "defaults": { + "unit": "s", "min": 0, "decimals": 4, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false } + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] } + ] + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 14 } + }, + + { + "id": 15, "type": "timeseries", "title": "Request p99 Latency by Verb", + "description": "p99 latency broken out per verb. LIST is inherently slower than GET due to serializing full collections. A POST/PUT spike = heavy admission webhook chain or large object writes. 
DELETE spikes are usually caused by cascading GC finalizer storms.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "histogram_quantile(0.99, sum by(verb,le)(rate(apiserver_request_duration_seconds_bucket{instance=~\"$instance\",verb!~\"WATCH|CONNECT\"}[5m])))", + "refId": "A", "legendFormat": "{{verb}}" + }], + "fieldConfig": { + "defaults": { + "unit": "s", "min": 0, "decimals": 4, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 14 } + }, + + { + "id": 16, "type": "timeseries", "title": "APIServer → etcd Latency by Operation", + "description": "Time apiserver spends waiting on etcd, split by operation type (get, list, create, update, delete, watch). Elevated get/list = etcd read pressure. 
Elevated create/update = write bottleneck, likely correlated with WAL fsync latency.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "histogram_quantile(0.50, sum by(operation,le)(rate(apiserver_storage_request_duration_seconds_bucket[5m])))", "refId": "A", "legendFormat": "p50 — {{operation}}" }, + { "expr": "histogram_quantile(0.99, sum by(operation,le)(rate(apiserver_storage_request_duration_seconds_bucket[5m])))", "refId": "B", "legendFormat": "p99 — {{operation}}" } + ], + "fieldConfig": { + "defaults": { + "unit": "s", "min": 0, "decimals": 4, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 14 } + }, + + { + "id": 17, "type": "row", "title": "API Server — Watches & Long-Running Requests", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 22 } + }, + + { + "id": 18, "type": "timeseries", "title": "Active Long-Running Requests (Watches) by Resource", + "description": "Instantaneous count of open WATCH streams grouped by resource. Each controller typically holds one WATCH per resource type per apiserver instance. 
A sudden drop = controller restart; a runaway climb = operator creating watches without cleanup.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum by(resource)(apiserver_longrunning_requests{instance=~\"$instance\",verb=\"WATCH\"})", + "refId": "A", "legendFormat": "{{resource}}" + }], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] } + }, + "gridPos": { "h": 7, "w": 8, "x": 0, "y": 23 } + }, + + { + "id": 19, "type": "timeseries", "title": "Watch Events Dispatched Rate by Kind", + "description": "Watch events sent to all active watchers per second, by object kind. Persistent high rate for a specific kind = that resource type is churning heavily, increasing etcd load and controller reconcile frequency.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum by(kind)(rate(apiserver_watch_events_total{instance=~\"$instance\"}[5m]))", + "refId": "A", "legendFormat": "{{kind}}" + }], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 7, "w": 8, "x": 8, "y": 23 } + }, + + { + "id": 20, "type": "timeseries", "title": "Watch Event Size — p50 / p95 / p99 by Kind", + "description": "Size of individual watch events dispatched to clients. Large events (MiB-scale) for Secrets or ConfigMaps = objects being stored with oversized data. 
Contributes to apiserver memory pressure and network saturation.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "histogram_quantile(0.50, sum by(kind,le)(rate(apiserver_watch_events_sizes_bucket{instance=~\"$instance\"}[5m])))", "refId": "A", "legendFormat": "p50 — {{kind}}" }, + { "expr": "histogram_quantile(0.99, sum by(kind,le)(rate(apiserver_watch_events_sizes_bucket{instance=~\"$instance\"}[5m])))", "refId": "B", "legendFormat": "p99 — {{kind}}" } + ], + "fieldConfig": { + "defaults": { + "unit": "bytes", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 7, "w": 8, "x": 16, "y": 23 } + }, + + { + "id": 21, "type": "row", "title": "Admission Webhooks", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 30 } + }, + + { + "id": 22, "type": "timeseries", "title": "Webhook Call Rate by Name", + "description": "Mutating and validating admission webhook invocations per second by webhook name. 
A webhook invoked on every write (e.g., a mutating webhook with no object selector) can be a major source of write latency amplification.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum by(name,type)(rate(apiserver_admission_webhook_request_total{instance=~\"$instance\"}[5m]))", + "refId": "A", "legendFormat": "{{type}} — {{name}}" + }], + "fieldConfig": { + "defaults": { + "unit": "reqps", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 7, "w": 8, "x": 0, "y": 31 } + }, + + { + "id": 23, "type": "timeseries", "title": "Webhook Latency p99 by Name", + "description": "p99 round-trip time per webhook call (network + webhook server processing). Default apiserver timeout is 10s; a webhook consistently near that limit causes cascading write latency for all resources it intercepts.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "histogram_quantile(0.99, sum by(name,le)(rate(apiserver_admission_webhook_admission_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", + "refId": "A", "legendFormat": "{{name}}" + }], + "fieldConfig": { + "defaults": { + "unit": "s", "min": 0, "decimals": 4, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 0.5 }, + { "color": "red", "value": 2.0 } + ]} + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 7, "w": 8, "x": 8, 
"y": 31 } + }, + + { + "id": 24, "type": "timeseries", "title": "Webhook Rejection Rate by Name", + "description": "Rate of admission denials per webhook. A validating webhook rejecting requests is expected behaviour; a sudden surge indicates either a newly enforced policy or a misbehaving webhook rejecting valid objects.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum by(name,error_type)(rate(apiserver_admission_webhook_rejection_count{instance=~\"$instance\"}[5m]))", + "refId": "A", "legendFormat": "{{name}} ({{error_type}})" + }], + "fieldConfig": { + "defaults": { + "unit": "reqps", "min": 0, "decimals": 3, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 7, "w": 8, "x": 16, "y": 31 } + }, + + { + "id": 25, "type": "row", "title": "kube-controller-manager", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 38 } + }, + + { + "id": 26, "type": "timeseries", "title": "Work Queue Depth by Controller", + "description": "Items waiting to be reconciled in each controller's work queue. Persistent non-zero depth = controller cannot keep up with the event rate. 
Identifies which specific controller is the bottleneck during overload incidents.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "topk(15, sum by(name)(workqueue_depth{job=~\".*controller-manager.*\"}))", + "refId": "A", "legendFormat": "{{name}}" + }], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { + "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false, + "thresholdsStyle": { "mode": "line" } + }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 10 }, + { "color": "red", "value": 50 } + ]} + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 39 } + }, + + { + "id": 27, "type": "timeseries", "title": "Work Queue Item Processing Duration p99 by Controller", + "description": "p99 time a work item spends being actively reconciled (inside the reconcile loop, excludes queue wait time). 
A slow reconcile = either the controller is doing expensive API calls or the etcd write path is slow.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "histogram_quantile(0.99, sum by(name,le)(rate(workqueue_work_duration_seconds_bucket{job=~\".*controller-manager.*\"}[5m])))", + "refId": "A", "legendFormat": "{{name}}" + }], + "fieldConfig": { + "defaults": { + "unit": "s", "min": 0, "decimals": 4, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 39 } + }, + + { + "id": 28, "type": "timeseries", "title": "Work Queue Retry Rate by Controller", + "description": "Rate of items being re-queued after a failed reconciliation. A persistently high retry rate for a controller = it is encountering recurring errors on the same objects (e.g., API permission errors, webhook rejections, or resource conflicts).", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "topk(15, sum by(name)(rate(workqueue_retries_total{job=~\".*controller-manager.*\"}[5m])))", + "refId": "A", "legendFormat": "{{name}}" + }], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, "decimals": 3, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 39 } + }, + + { + "id": 29, "type": "row", "title": "kube-scheduler", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 47 } + }, + + { + "id": 30, "type": 
"timeseries", "title": "Scheduling Attempt Rate by Result", + "description": "Outcomes of scheduling cycles per second. scheduled = pod successfully bound to a node. unschedulable = no node met the pod's constraints. error = scheduler internal failure (API error, timeout). Persistent unschedulable = cluster capacity or taints/affinity misconfiguration.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum by(result)(rate(scheduler_schedule_attempts_total[5m]))", + "refId": "A", "legendFormat": "{{result}}" + }], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "scheduled" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "unschedulable" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "error" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] } + ] + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 7, "w": 8, "x": 0, "y": 48 } + }, + + { + "id": 31, "type": "timeseries", "title": "Scheduling Latency — p50 / p95 / p99", + "description": "Time from when a pod enters the active queue to when a binding decision is made (does not include bind API call time). Includes filter, score, and reserve plugin execution time. 
Spike = expensive affinity rules, large number of nodes, or slow extender webhooks.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "histogram_quantile(0.50, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket[5m])) by (le))", "refId": "A", "legendFormat": "p50" }, + { "expr": "histogram_quantile(0.95, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket[5m])) by (le))", "refId": "B", "legendFormat": "p95" }, + { "expr": "histogram_quantile(0.99, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket[5m])) by (le))", "refId": "C", "legendFormat": "p99" } + ], + "fieldConfig": { + "defaults": { + "unit": "s", "min": 0, "decimals": 4, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false } + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] } + ] + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 7, "w": 8, "x": 8, "y": 48 } + }, + + { + "id": 32, "type": "timeseries", "title": "Pending Pods by Queue", + "description": "Pods waiting to be scheduled, split by internal queue. active = ready to be attempted now. backoff = recently failed, in exponential back-off. unschedulable = parked until cluster state changes. 
A growing unschedulable queue = systemic capacity or constraint problem.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum by(queue)(scheduler_pending_pods)", + "refId": "A", "legendFormat": "{{queue}}" + }], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { + "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false, + "thresholdsStyle": { "mode": "line" } + }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 10 }, + { "color": "red", "value": 50 } + ]} + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "unschedulable" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "backoff" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "active" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] } + ] + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] } + }, + "gridPos": { "h": 7, "w": 8, "x": 16, "y": 48 } + }, + + { + "id": 33, "type": "row", "title": "Process Resources — All Control Plane Components", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 55 } + }, + + { + "id": 34, "type": "timeseries", "title": "CPU Usage by Component", + "description": "Rate of CPU seconds consumed by each control plane process. apiserver CPU spike = surge in request volume or list serialization. controller-manager CPU spike = reconcile storm. 
scheduler CPU spike = large node count with complex affinity.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "sum by(job)(rate(process_cpu_seconds_total{job=~\".*apiserver.*\"}[5m]))", "refId": "A", "legendFormat": "apiserver — {{job}}" }, + { "expr": "sum by(job)(rate(process_cpu_seconds_total{job=~\".*controller-manager.*\"}[5m]))", "refId": "B", "legendFormat": "controller-manager — {{job}}" }, + { "expr": "sum by(job)(rate(process_cpu_seconds_total{job=~\".*scheduler.*\"}[5m]))", "refId": "C", "legendFormat": "scheduler — {{job}}" } + ], + "fieldConfig": { + "defaults": { + "unit": "percentunit", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 7, "w": 8, "x": 0, "y": 56 } + }, + + { + "id": 35, "type": "timeseries", "title": "RSS Memory by Component", + "description": "Resident set size of each control plane process. apiserver memory is dominated by the watch cache size and serialisation buffers. controller-manager memory = informer caches. 
Monotonically growing RSS without restarts = memory leak or unbounded cache growth.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "sum by(job)(process_resident_memory_bytes{job=~\".*apiserver.*\"})", "refId": "A", "legendFormat": "apiserver — {{job}}" }, + { "expr": "sum by(job)(process_resident_memory_bytes{job=~\".*controller-manager.*\"})", "refId": "B", "legendFormat": "controller-manager — {{job}}" }, + { "expr": "sum by(job)(process_resident_memory_bytes{job=~\".*scheduler.*\"})", "refId": "C", "legendFormat": "scheduler — {{job}}" } + ], + "fieldConfig": { + "defaults": { + "unit": "bytes", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] } + }, + "gridPos": { "h": 7, "w": 8, "x": 8, "y": 56 } + }, + + { + "id": 36, "type": "timeseries", "title": "Goroutines by Component", + "description": "Number of live goroutines in each control plane process. Gradual upward drift = goroutine leak (often tied to unclosed watch streams or context leaks). A step-down = process restart. 
apiserver typically runs 200–600 goroutines; spikes above 1000 warrant investigation.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "sum by(job)(go_goroutines{job=~\".*apiserver.*\"})", "refId": "A", "legendFormat": "apiserver — {{job}}" }, + { "expr": "sum by(job)(go_goroutines{job=~\".*controller-manager.*\"})", "refId": "B", "legendFormat": "controller-manager — {{job}}" }, + { "expr": "sum by(job)(go_goroutines{job=~\".*scheduler.*\"})", "refId": "C", "legendFormat": "scheduler — {{job}}" } + ], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] } + }, + "gridPos": { "h": 7, "w": 8, "x": 16, "y": 56 } + } + + ] + } diff --git a/harmony/src/modules/monitoring/cluster_dashboards/dev_test_yamls/08h-dashboard-alerts-events-problems.yaml b/harmony/src/modules/monitoring/cluster_dashboards/dev_test_yamls/08h-dashboard-alerts-events-problems.yaml new file mode 100644 index 0000000..5c9bee4 --- /dev/null +++ b/harmony/src/modules/monitoring/cluster_dashboards/dev_test_yamls/08h-dashboard-alerts-events-problems.yaml @@ -0,0 +1,741 @@ +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDashboard +metadata: + name: okd-alerts-events + namespace: observability +spec: + instanceSelector: + matchLabels: + dashboards: "grafana" + json: | + { + "title": "Alerts & Events — Active Problems", + "uid": "okd-alerts-events", + "schemaVersion": 36, + "version": 1, + "refresh": "30s", + "time": { "from": "now-3h", "to": "now" }, + "tags": ["okd", "alerts", "events"], + "templating": { + "list": [ + { + "name": "severity", + "type": "custom", + "label": "Severity Filter", + "query": "critical,warning,info", + 
"current": { "selected": true, "text": "All", "value": "$__all" }, + "includeAll": true, + "allValue": "critical|warning|info", + "multi": false, + "options": [ + { "selected": true, "text": "All", "value": "$__all" }, + { "selected": false, "text": "Critical", "value": "critical" }, + { "selected": false, "text": "Warning", "value": "warning" }, + { "selected": false, "text": "Info", "value": "info" } + ] + }, + { + "name": "namespace", + "type": "query", + "label": "Namespace", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "query": { "query": "label_values(ALERTS{alertstate=\"firing\"}, namespace)", "refId": "A" }, + "refresh": 2, + "includeAll": true, + "allValue": ".*", + "multi": true, + "sort": 1, + "current": {}, + "options": [] + } + ] + }, + "panels": [ + + { + "id": 1, "type": "stat", "title": "Critical Alerts Firing", + "description": "Alerting rule instances currently in the firing state with severity=\"critical\". Any non-zero value represents a breached SLO or infrastructure condition requiring immediate on-call response. 
The ALERTS metric is generated by Prometheus directly from your alerting rules — it reflects what Prometheus knows, before Alertmanager routing or silencing.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(ALERTS{alertstate=\"firing\",severity=\"critical\"}) or vector(0)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 1 } + ]}, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 } + }, + + { + "id": 2, "type": "stat", "title": "Warning Alerts Firing", + "description": "Firing alerts at severity=\"warning\". Warnings indicate a degraded or elevated-risk condition that has not yet crossed the critical threshold. 
A sustained or growing warning count often precedes a critical fire — treat them as early-warning signals, not background noise.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(ALERTS{alertstate=\"firing\",severity=\"warning\"}) or vector(0)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 1 }, + { "color": "orange", "value": 5 } + ]}, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 } + }, + + { + "id": 3, "type": "stat", "title": "Info / Unclassified Alerts Firing", + "description": "Firing alerts with severity=\"info\" or no severity label. These are informational and do not normally require immediate action. 
A sudden large jump may reveal noisy alerting rules generating alert fatigue — rules worth reviewing for threshold tuning or adding inhibition rules.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(ALERTS{alertstate=\"firing\",severity!~\"critical|warning\"}) or vector(0)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "blue", "value": 1 }, + { "color": "blue", "value": 25 } + ]}, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 } + }, + + { + "id": 4, "type": "stat", "title": "Alerts Silenced (Suppressed)", + "description": "Alerts currently matched by an active Alertmanager silence rule and therefore not routed to receivers. Silences are intentional during maintenance windows, but a large suppressed count outside of planned maintenance = an overly broad silence masking real problems. 
Zero silences when a maintenance window is active = the silence has expired or was misconfigured.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "sum(alertmanager_alerts{state=\"suppressed\"}) or vector(0)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 1 }, + { "color": "red", "value": 20 } + ]}, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 } + }, + + { + "id": 5, "type": "stat", "title": "CrashLoopBackOff Pods", + "description": "Container instances currently waiting in the CrashLoopBackOff state — the container crashed and Kubernetes is retrying with exponential back-off. Each instance is a pod that cannot stay running. 
Common root causes: OOM kill, bad entrypoint, missing Secret or ConfigMap, an unavailable init dependency, or a broken image layer.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 1 }, + { "color": "red", "value": 3 } + ]}, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 } + }, + + { + "id": 6, "type": "stat", "title": "OOMKilled Containers", + "description": "Containers whose most recent termination reason was OOMKilled. This is a current-state snapshot: a container that was OOMKilled, restarted, and is now Running will still appear here until its next termination occurs for a different reason. 
Non-zero and stable = recurring OOM, likely a workload memory leak or under-provisioned memory limit.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(kube_pod_container_status_last_terminated_reason{reason=\"OOMKilled\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "orange", "value": 1 }, + { "color": "red", "value": 5 } + ]}, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 } + }, + + { + "id": 7, "type": "stat", "title": "NotReady Nodes", + "description": "Nodes where the Ready condition is currently not True (False or Unknown). A NotReady node stops receiving new pod scheduling and, after the node eviction timeout (~5 min default), pods on it will be evicted. Control plane nodes going NotReady simultaneously = potential quorum loss. 
Any non-zero value is a tier-1 incident signal.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"true\"} == 0) or vector(0)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 1 } + ]}, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 } + }, + + { + "id": 8, "type": "stat", "title": "Degraded Cluster Operators (OKD)", + "description": "OKD ClusterOperators currently reporting Degraded=True. Each ClusterOperator owns a core platform component — authentication, networking, image-registry, monitoring, ingress, storage, etc. A degraded operator means its managed component is impaired or unavailable. 
Zero is the only acceptable steady-state value outside of an active upgrade.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ "expr": "count(cluster_operator_conditions{condition=\"Degraded\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }], + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 1 } + ]}, + "unit": "short", "noValue": "0" + } + }, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" }, + "gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 } + }, + + { + "id": 9, "type": "row", "title": "Alert Overview", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 } + }, + + { + "id": 10, "type": "timeseries", "title": "Firing Alert Count by Severity Over Time", + "description": "Instantaneous count of firing ALERTS series grouped by severity over the selected window. A vertical rise = new alerting condition emerged. A horizontal plateau = a persistent, unresolved problem. A step-down = alert resolved or Prometheus rule evaluation stopped matching. 
Use the Severity Filter variable to narrow scope during triage.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "count by(severity)(ALERTS{alertstate=\"firing\",severity=~\"$severity\",namespace=~\"$namespace\"})", + "refId": "A", + "legendFormat": "{{severity}}" + }], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false } + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "critical" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "warning" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "info" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] } + ] + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["max", "lastNotNull"] } + }, + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 5 } + }, + + { + "id": 11, "type": "timeseries", "title": "Alertmanager Notification Rate by Integration", + "description": "Rate of notification delivery attempts from Alertmanager per second, split by integration type (slack, pagerduty, email, webhook, etc.). Solid lines = successful deliveries; dashed red lines = failed deliveries. A drop to zero on all integrations = Alertmanager is not processing or the cluster is completely quiet. 
Persistent failures on one integration = check that receiver's credentials or endpoint availability.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "sum by(integration)(rate(alertmanager_notifications_total[5m]))", "refId": "A", "legendFormat": "✓ {{integration}}" }, + { "expr": "sum by(integration)(rate(alertmanager_notifications_failed_total[5m]))", "refId": "B", "legendFormat": "✗ {{integration}}" } + ], + "fieldConfig": { + "defaults": { + "unit": "reqps", "min": 0, "decimals": 3, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + }, + "overrides": [ + { + "matcher": { "id": "byFrameRefID", "options": "B" }, + "properties": [ + { "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }, + { "id": "custom.lineStyle", "value": { "dash": [6, 4], "fill": "dash" } }, + { "id": "custom.lineWidth", "value": 1 } + ] + } + ] + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 5 } + }, + + { + "id": 12, "type": "bargauge", "title": "Longest-Firing Active Alerts", + "description": "Duration (now - ALERTS_FOR_STATE timestamp) for each currently firing alert, sorted descending. Alerts at the top have been firing longest and are the most likely candidates for known-but-unresolved issues, stale firing conditions, or alerts that should have a silence applied. 
"Red bars (> 2 hours) strongly suggest a problem that has been acknowledged but not resolved.", +        "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, +        "targets": [{ +          "expr": "sort_desc(time() - (ALERTS_FOR_STATE{severity=~\"$severity\",namespace=~\"$namespace\"} and ignoring(alertstate) ALERTS{alertstate=\"firing\"}))", +          "refId": "A", +          "legendFormat": "{{alertname}} · {{severity}} · {{namespace}}" +        }], +        "fieldConfig": { +          "defaults": { +            "unit": "s", "min": 0, +            "color": { "mode": "thresholds" }, +            "thresholds": { "mode": "absolute", "steps": [ +              { "color": "green", "value": null }, +              { "color": "yellow", "value": 300 }, +              { "color": "orange", "value": 1800 }, +              { "color": "red", "value": 7200 } +            ]} +          } +        }, +        "options": { +          "orientation": "horizontal", +          "reduceOptions": { "calcs": ["lastNotNull"] }, +          "displayMode": "gradient", +          "showUnfilled": true, +          "valueMode": "color" +        }, +        "gridPos": { "h": 8, "w": 8, "x": 16, "y": 5 } +      }, + +      { +        "id": 13, "type": "row", "title": "Active Firing Alerts — Full Detail", "collapsed": false, +        "gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 } +      }, + +      { +        "id": 14, "type": "table", "title": "All Firing Alerts", +        "description": "Instant-query table of every currently firing alert visible to Prometheus, filtered by the Namespace and Severity variables above. Each row is one alert instance (unique label combination). The value column is omitted — by definition every row here is firing. Use the built-in column filter (funnel icon) to further narrow to a specific alertname, pod, or node. 
Columns are sparse: labels not defined in a given alert rule will show '—'.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "ALERTS{alertstate=\"firing\",severity=~\"$severity\",namespace=~\"$namespace\"}", + "refId": "A", + "instant": true, + "legendFormat": "" + }], + "transformations": [ + { "id": "labelsToFields", "options": { "mode": "columns" } }, + { + "id": "organize", + "options": { + "excludeByName": { + "alertstate": true, + "__name__": true, + "Value": true, + "Time": true + }, + "renameByName": { + "alertname": "Alert Name", + "severity": "Severity", + "namespace": "Namespace", + "pod": "Pod", + "node": "Node", + "container": "Container", + "job": "Job", + "service": "Service", + "reason": "Reason", + "instance": "Instance" + }, + "indexByName": { + "severity": 0, + "alertname": 1, + "namespace": 2, + "pod": 3, + "node": 4, + "container": 5, + "job": 6, + "service": 7, + "reason": 8, + "instance": 9 + } + } + } + ], + "fieldConfig": { + "defaults": { + "custom": { "align": "left", "filterable": true }, + "noValue": "—" + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Severity" }, + "properties": [ + { "id": "custom.displayMode", "value": "color-background" }, + { "id": "custom.width", "value": 110 }, + { + "id": "mappings", + "value": [{ + "type": "value", + "options": { + "critical": { "text": "CRITICAL", "color": "dark-red", "index": 0 }, + "warning": { "text": "WARNING", "color": "dark-yellow", "index": 1 }, + "info": { "text": "INFO", "color": "dark-blue", "index": 2 } + } + }] + } + ] + }, + { "matcher": { "id": "byName", "options": "Alert Name" }, "properties": [{ "id": "custom.width", "value": 300 }] }, + { "matcher": { "id": "byName", "options": "Namespace" }, "properties": [{ "id": "custom.width", "value": 180 }] }, + { "matcher": { "id": "byName", "options": "Pod" }, "properties": [{ "id": "custom.width", "value": 200 }] }, + { "matcher": { "id": "byName", "options": 
"Node" }, "properties": [{ "id": "custom.width", "value": 200 }] } + ] + }, + "options": { + "sortBy": [{ "desc": false, "displayName": "Severity" }], + "footer": { "show": false } + }, + "gridPos": { "h": 12, "w": 24, "x": 0, "y": 14 } + }, + + { + "id": 15, "type": "row", "title": "Kubernetes Warning Events", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 26 } + }, + + { + "id": 16, "type": "timeseries", "title": "Warning Event Rate by Reason", + "description": "Rate of Kubernetes Warning-type events per second grouped by reason code. BackOff = container is CrashLooping. FailedScheduling = no node satisfies pod constraints. FailedMount = volume attachment or CSI failure. Evicted = kubelet evicted a pod due to memory or disk pressure. NodeNotReady = node lost contact. A spike in a single reason narrows the incident root-cause immediately without needing to read raw event logs. Requires kube-state-metrics with --resources=events.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "topk(10, sum by(reason)(rate(kube_event_count{type=\"Warning\",namespace=~\"$namespace\"}[5m])))", + "refId": "A", + "legendFormat": "{{reason}}" + }], + "fieldConfig": { + "defaults": { + "unit": "reqps", "min": 0, "decimals": 4, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 27 } + }, + + { + "id": 17, "type": "bargauge", "title": "Warning Events — Top Namespaces (Accumulated Count)", + "description": "Total accumulated Warning event count (the count field on the Kubernetes Event object) per namespace, showing the top 15 most active. 
A namespace dominating this chart is generating significantly more abnormal conditions than its peers, useful for identifying noisy tenants, misconfigured deployments, or namespaces experiencing a persistent infrastructure problem. Note this is the raw Event.count field — it resets if the event object is deleted and recreated.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "topk(15, sum by(namespace)(kube_event_count{type=\"Warning\"}))", + "refId": "A", + "legendFormat": "{{namespace}}" + }], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 10 }, + { "color": "orange", "value": 50 }, + { "color": "red", "value": 200 } + ]} + } + }, + "options": { + "orientation": "horizontal", + "reduceOptions": { "calcs": ["lastNotNull"] }, + "displayMode": "gradient", + "showUnfilled": true + }, + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 27 } + }, + + { + "id": 18, "type": "timeseries", "title": "Warning Events — Accumulated Count by Reason Over Time", + "description": "Raw accumulated event count gauge over time, split by reason. Unlike the rate panel this shows total volume and slope simultaneously. A line that climbs steeply = events are occurring frequently right now. A line that plateaus = the condition causing that reason has stopped. 
A line that drops to zero = the event object was deleted and recreated or the condition fully resolved.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "topk(10, sum by(reason)(kube_event_count{type=\"Warning\",namespace=~\"$namespace\"}))", + "refId": "A", + "legendFormat": "{{reason}}" + }], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 8, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] } + }, + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 27 } + }, + + { + "id": 19, "type": "row", "title": "Pod Problems", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 35 } + }, + + { + "id": 20, "type": "timeseries", "title": "CrashLoopBackOff Pods by Namespace", + "description": "Count of container instances in CrashLoopBackOff waiting state over time, broken down by namespace. A sudden rise in one namespace = a workload deployment is failing. A persistent baseline across many namespaces = a shared dependency (Secret, ConfigMap, network policy, or an upstream service) has become unavailable. 
Unlike restart rate, this panel shows the steady-state count of pods currently stuck — not flapping.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "sum by(namespace)(kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\",namespace=~\"$namespace\"} == 1)", + "refId": "A", + "legendFormat": "{{namespace}}" + }], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { + "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false, + "thresholdsStyle": { "mode": "line" } + }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 1 }, + { "color": "red", "value": 5 } + ]} + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] } + }, + "gridPos": { "h": 7, "w": 8, "x": 0, "y": 36 } + }, + + { + "id": 21, "type": "timeseries", "title": "Container Restart Rate by Namespace", + "description": "Rate of container restarts per second across all reasons (OOMKill, liveness probe failure, process exit) grouped by namespace. A namespace with a rising restart rate that has not yet entered CrashLoopBackOff is in the early failure window before the exponential back-off penalty kicks in. 
Cross-reference with the OOMKilled stat tile and the last-terminated-reason to separate crash types.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "topk(10, sum by(namespace)(rate(kube_pod_container_status_restarts_total{namespace=~\"$namespace\"}[5m])))", + "refId": "A", + "legendFormat": "{{namespace}}" + }], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, "decimals": 4, + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false } + } + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } + }, + "gridPos": { "h": 7, "w": 8, "x": 8, "y": 36 } + }, + + { + "id": 22, "type": "timeseries", "title": "Pods by Problem Phase (Failed / Pending / Unknown)", + "description": "Count of pods in Failed, Pending, or Unknown phase over time. Failed = container terminated with a non-zero exit code or was evicted and not rescheduled. Pending for more than a few minutes = scheduler unable to bind the pod (check FailedScheduling events, node capacity, and taint/toleration mismatches). 
Unknown = kubelet is not reporting to the apiserver, typically indicating a node network partition or kubelet crash.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { "expr": "sum by(phase)(kube_pod_status_phase{phase=~\"Failed|Unknown\",namespace=~\"$namespace\"} == 1)", "refId": "A", "legendFormat": "{{phase}}" }, + { "expr": "sum(kube_pod_status_phase{phase=\"Pending\",namespace=~\"$namespace\"} == 1)", "refId": "B", "legendFormat": "Pending" } + ], + "fieldConfig": { + "defaults": { + "unit": "short", "min": 0, + "color": { "mode": "palette-classic" }, + "custom": { + "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false, + "thresholdsStyle": { "mode": "line" } + }, + "thresholds": { "mode": "absolute", "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 1 } + ]} + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "Failed" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "Pending" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "Unknown" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] } + ] + }, + "options": { + "tooltip": { "mode": "multi", "sort": "desc" }, + "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] } + }, + "gridPos": { "h": 7, "w": 8, "x": 16, "y": 36 } + }, + + { + "id": 23, "type": "row", "title": "Node & Cluster Operator Conditions", "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 43 } + }, + + { + "id": 24, "type": "table", "title": "Node Condition Status Matrix", + "description": "Instant snapshot of every active node condition across all nodes. Each row is one (node, condition, status) triple where value=1, meaning that combination is currently true. 
Ready=true is the normal healthy state; MemoryPressure=true, DiskPressure=true, PIDPressure=true, and NetworkUnavailable=true all indicate problem states that will affect pod scheduling on that node. Use the column filter to show only conditions where status=\"true\" and condition != \"Ready\" to isolate problems quickly.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [{ + "expr": "kube_node_status_condition == 1", + "refId": "A", + "instant": true, + "legendFormat": "" + }], + "transformations": [ + { "id": "labelsToFields", "options": { "mode": "columns" } }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "Value": true, + "__name__": true, + "endpoint": true, + "job": true, + "service": true, + "instance": true + }, + "renameByName": { + "node": "Node", + "condition": "Condition", + "status": "Status" + }, + "indexByName": { "node": 0, "condition": 1, "status": 2 } + } + } + ], + "fieldConfig": { + "defaults": { + "custom": { "align": "left", "filterable": true }, + "noValue": "—" + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Status" }, + "properties": [ + { "id": "custom.displayMode", "value": "color-background" }, + { "id": "custom.width", "value": 90 }, + { + "id": "mappings", + "value": [{ + "type": "value", + "options": { + "true": { "text": "true", "color": "green", "index": 0 }, + "false": { "text": "false", "color": "dark-red", "index": 1 }, + "unknown": { "text": "unknown", "color": "dark-orange", "index": 2 } + } + }] + } + ] + }, + { + "matcher": { "id": "byName", "options": "Condition" }, + "properties": [ + { "id": "custom.width", "value": 190 }, + { "id": "custom.displayMode", "value": "color-text" }, + { + "id": "mappings", + "value": [{ + "type": "value", + "options": { + "Ready": { "color": "green", "index": 0 }, + "MemoryPressure": { "color": "red", "index": 1 }, + "DiskPressure": { "color": "red", "index": 2 }, + "PIDPressure": { "color": "red", 
"index": 3 }, + "NetworkUnavailable": { "color": "red", "index": 4 } + } + }] + } + ] + }, + { "matcher": { "id": "byName", "options": "Node" }, "properties": [{ "id": "custom.width", "value": 230 }] } + ] + }, + "options": { + "sortBy": [{ "desc": false, "displayName": "Node" }], + "footer": { "show": false } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 44 } + }, + + { + "id": 25, "type": "table", "title": "Cluster Operator Conditions — Degraded & Progressing (OKD)", + "description": "Shows only ClusterOperator conditions that indicate a problem state: Degraded=True (operator has failed to achieve its desired state) or Progressing=True (operator is actively reconciling — normal during upgrades but alarming in steady state). Operators not appearing in this table are healthy. The reason column gives the operator's own explanation for the condition, which maps directly to the relevant operator log stream and OpenShift runbook.", + "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" }, + "targets": [ + { + "expr": "cluster_operator_conditions{condition=\"Degraded\"} == 1", + "refId": "A", + "instant": true, + "legendFormat": "" + }, + { + "expr": "cluster_operator_conditions{condition=\"Progressing\"} == 1", + "refId": "B", + "instant": true, + "legendFormat": "" + } + ], + "transformations": [ + { "id": "labelsToFields", "options": { "mode": "columns" } }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "Value": true, + "__name__": true, + "endpoint": true, + "job": true, + "service": true, + "instance": true, + "namespace": true + }, + "renameByName": { + "name": "Operator", + "condition": "Condition", + "reason": "Reason" + }, + "indexByName": { "name": 0, "condition": 1, "reason": 2 } + } + } + ], + "fieldConfig": { + "defaults": { + "custom": { "align": "left", "filterable": true }, + "noValue": "—" + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Condition" }, + "properties": [ + { "id": 
"custom.displayMode", "value": "color-background" }, + { "id": "custom.width", "value": 140 }, + { + "id": "mappings", + "value": [{ + "type": "value", + "options": { + "Degraded": { "text": "Degraded", "color": "dark-red", "index": 0 }, + "Progressing": { "text": "Progressing", "color": "dark-yellow", "index": 1 } + } + }] + } + ] + }, + { "matcher": { "id": "byName", "options": "Operator" }, "properties": [{ "id": "custom.width", "value": 240 }] }, + { "matcher": { "id": "byName", "options": "Reason" }, "properties": [{ "id": "custom.width", "value": 220 }] } + ] + }, + "options": { + "sortBy": [{ "desc": false, "displayName": "Condition" }], + "footer": { "show": false } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 44 } + } + + ] + } diff --git a/harmony/src/modules/monitoring/cluster_dashboards/mod.rs b/harmony/src/modules/monitoring/cluster_dashboards/mod.rs new file mode 100644 index 0000000..ec14778 --- /dev/null +++ b/harmony/src/modules/monitoring/cluster_dashboards/mod.rs @@ -0,0 +1,2 @@ +mod score; +pub use score::ClusterDashboardsScore; diff --git a/harmony/src/modules/monitoring/cluster_dashboards/score.rs b/harmony/src/modules/monitoring/cluster_dashboards/score.rs new file mode 100644 index 0000000..22f916d --- /dev/null +++ b/harmony/src/modules/monitoring/cluster_dashboards/score.rs @@ -0,0 +1,557 @@ +use async_trait::async_trait; +use harmony_types::id::Id; +use k8s_openapi::api::core::v1::{Namespace, Secret}; +use kube::api::ObjectMeta; +use serde::{Deserialize, Serialize}; +use std::collections::BTreeMap; + +use harmony_k8s::KubernetesDistribution; +use log::debug; + +use crate::{ + data::Version, + interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome}, + inventory::Inventory, + modules::k8s::resource::K8sResourceScore, + modules::monitoring::kube_prometheus::crd::crd_grafana::{ + Grafana, GrafanaContainer, GrafanaDashboard, GrafanaDashboardSpec, GrafanaDatasource, + GrafanaDatasourceConfig, 
GrafanaDatasourceJsonData, GrafanaDatasourceSecureJsonData, + GrafanaDatasourceSpec, GrafanaDeployment, GrafanaDeploymentSpec, GrafanaIngress, + GrafanaIngressBackend, GrafanaIngressBackendService, GrafanaIngressPath, + GrafanaIngressRule, GrafanaIngressRuleHttp, GrafanaIngressServicePort, GrafanaIngressSpec, + GrafanaPodSpec, GrafanaPodTemplate, GrafanaRoute, GrafanaRoutePort, GrafanaRouteSpec, + GrafanaRouteTarget, GrafanaRouteTls, GrafanaSecretKeyRef, GrafanaSpec, GrafanaValueFrom, + GrafanaValueSource, ResourceRequirements, + }, + modules::monitoring::kube_prometheus::crd::crd_prometheuses::LabelSelector, + score::Score, + topology::{K8sclient, Topology}, +}; + +#[derive(Clone, Debug, Serialize)] +pub struct ClusterDashboardsScore { + pub namespace: String, + pub grafana_admin_user: String, + pub grafana_admin_password: String, +} + +impl Default for ClusterDashboardsScore { + fn default() -> Self { + Self { + namespace: "harmony-observability".to_string(), + grafana_admin_user: "admin".to_string(), + grafana_admin_password: "password".to_string(), + } + } +} + +impl ClusterDashboardsScore { + pub fn new(namespace: &str) -> Self { + Self { + namespace: namespace.to_string(), + grafana_admin_user: "admin".to_string(), + grafana_admin_password: "password".to_string(), + } + } + + pub fn with_credentials(namespace: &str, admin_user: &str, admin_password: &str) -> Self { + Self { + namespace: namespace.to_string(), + grafana_admin_user: admin_user.to_string(), + grafana_admin_password: admin_password.to_string(), + } + } +} + +impl Score for ClusterDashboardsScore { + fn name(&self) -> String { + format!("ClusterDashboardsScore({})", self.namespace) + } + + #[doc(hidden)] + fn create_interpret(&self) -> Box> { + Box::new(ClusterDashboardsInterpret { + namespace: self.namespace.clone(), + grafana_admin_user: self.grafana_admin_user.clone(), + grafana_admin_password: self.grafana_admin_password.clone(), + }) + } +} + +#[derive(Debug, Clone)] +pub struct 
ClusterDashboardsInterpret { + namespace: String, + grafana_admin_user: String, + grafana_admin_password: String, +} + +#[async_trait] +impl Interpret for ClusterDashboardsInterpret { + async fn execute( + &self, + inventory: &Inventory, + topology: &T, + ) -> Result { + self.create_namespace(inventory, topology).await?; + self.create_rbac_resources(inventory, topology).await?; + self.create_secret(inventory, topology).await?; + self.create_grafana(inventory, topology).await?; + self.create_datasource(inventory, topology).await?; + self.create_dashboards(inventory, topology).await?; + + Ok(Outcome::success(format!( + "Cluster dashboards resources in namespace '{}' with {} dashboards successfully created", + self.namespace, 8 + ))) + } + + fn get_name(&self) -> InterpretName { + InterpretName::Custom("ClusterDashboards") + } + + fn get_version(&self) -> Version { + todo!() + } + + fn get_status(&self) -> InterpretStatus { + todo!() + } + + fn get_children(&self) -> Vec { + todo!() + } +} + +impl ClusterDashboardsInterpret { + async fn create_namespace( + &self, + inventory: &Inventory, + topology: &(impl Topology + K8sclient), + ) -> Result<(), InterpretError> { + let mut labels = BTreeMap::new(); + labels.insert( + "openshift.io/cluster-monitoring".to_string(), + "true".to_string(), + ); + + let namespace = Namespace { + metadata: ObjectMeta { + name: Some(self.namespace.clone()), + labels: Some(labels), + ..ObjectMeta::default() + }, + ..Namespace::default() + }; + + K8sResourceScore::single(namespace, None) + .interpret(inventory, topology) + .await?; + + Ok(()) + } + + async fn create_rbac_resources( + &self, + inventory: &Inventory, + topology: &(impl Topology + K8sclient), + ) -> Result<(), InterpretError> { + let service_account_name = "grafana-prometheus-datasource-sa".to_string(); + let rbac_namespace = self.namespace.clone(); + + let service_account = { + use k8s_openapi::api::core::v1::ServiceAccount; + ServiceAccount { + metadata: ObjectMeta { + name: 
Some(service_account_name.clone()), + namespace: Some(rbac_namespace.clone()), + ..ObjectMeta::default() + }, + ..ServiceAccount::default() + } + }; + + let cluster_role = { + use k8s_openapi::api::rbac::v1::{ClusterRole, PolicyRule}; + ClusterRole { + metadata: ObjectMeta { + name: Some("grafana-prometheus-api-access".to_string()), + ..ObjectMeta::default() + }, + rules: Some(vec![PolicyRule { + api_groups: Some(vec!["monitoring.coreos.com".to_string()]), + resources: Some(vec!["prometheuses/api".to_string()]), + verbs: vec!["get".to_string()], + ..PolicyRule::default() + }]), + ..ClusterRole::default() + } + }; + + let cluster_role_binding = { + use k8s_openapi::api::rbac::v1::{ClusterRoleBinding, RoleRef, Subject}; + ClusterRoleBinding { + metadata: ObjectMeta { + name: Some("grafana-prometheus-api-access-binding".to_string()), + ..ObjectMeta::default() + }, + subjects: Some(vec![Subject { + kind: "ServiceAccount".to_string(), + name: service_account_name.clone(), + namespace: Some(rbac_namespace.clone()), + ..Subject::default() + }]), + role_ref: RoleRef { + api_group: "rbac.authorization.k8s.io".to_string(), + kind: "ClusterRole".to_string(), + name: "grafana-prometheus-api-access".to_string(), + }, + } + }; + + let cluster_role_binding_cluster_monitoring = { + use k8s_openapi::api::rbac::v1::{ClusterRoleBinding, RoleRef, Subject}; + ClusterRoleBinding { + metadata: ObjectMeta { + name: Some("grafana-cluster-monitoring-view".to_string()), + ..ObjectMeta::default() + }, + subjects: Some(vec![Subject { + kind: "ServiceAccount".to_string(), + name: service_account_name.clone(), + namespace: Some(rbac_namespace.clone()), + ..Subject::default() + }]), + role_ref: RoleRef { + api_group: "rbac.authorization.k8s.io".to_string(), + kind: "ClusterRole".to_string(), + name: "cluster-monitoring-view".to_string(), + }, + } + }; + + K8sResourceScore::single(service_account, Some(rbac_namespace.clone())) + .interpret(inventory, topology) + .await?; + 
K8sResourceScore::single(cluster_role, None) + .interpret(inventory, topology) + .await?; + K8sResourceScore::single(cluster_role_binding, None) + .interpret(inventory, topology) + .await?; + K8sResourceScore::single(cluster_role_binding_cluster_monitoring, None) + .interpret(inventory, topology) + .await?; + + Ok(()) + } + + async fn create_secret( + &self, + inventory: &Inventory, + topology: &(impl Topology + K8sclient), + ) -> Result<(), InterpretError> { + let service_account_name = "grafana-prometheus-datasource-sa".to_string(); + let secret_name = "grafana-prometheus-token".to_string(); + let secret_namespace = self.namespace.clone(); + + let secret = Secret { + metadata: ObjectMeta { + name: Some(secret_name), + namespace: Some(secret_namespace), + annotations: Some({ + let mut ann = BTreeMap::new(); + ann.insert( + "kubernetes.io/service-account.name".to_string(), + service_account_name, + ); + ann + }), + ..ObjectMeta::default() + }, + type_: Some("kubernetes.io/service-account-token".to_string()), + ..Secret::default() + }; + + K8sResourceScore::single(secret, Some(self.namespace.clone())) + .interpret(inventory, topology) + .await?; + + Ok(()) + } + + async fn create_grafana( + &self, + inventory: &Inventory, + topology: &(impl Topology + K8sclient), + ) -> Result<(), InterpretError> { + let labels: BTreeMap = + [("dashboards".to_string(), "grafana".to_string())].into(); + + let mut config: BTreeMap> = BTreeMap::new(); + config.insert("log".into(), [("mode".into(), "console".into())].into()); + config.insert( + "security".into(), + [ + ("admin_user".into(), self.grafana_admin_user.clone()), + ("admin_password".into(), self.grafana_admin_password.clone()), + ] + .into(), + ); + config.insert( + "users".into(), + [("viewers_can_edit".into(), "false".into())].into(), + ); + config.insert( + "auth".into(), + [("disable_login_form".into(), "false".into())].into(), + ); + config.insert( + "auth.anonymous".into(), + [ + ("enabled".into(), "true".into()), + 
("org_role".into(), "Viewer".into()), + ] + .into(), + ); + + let resources = ResourceRequirements { + requests: [ + ("cpu".into(), "500m".into()), + ("memory".into(), "1Gi".into()), + ] + .into(), + limits: [("cpu".into(), "1".into()), ("memory".into(), "2Gi".into())].into(), + }; + + let client = topology + .k8s_client() + .await + .map_err(|e| InterpretError::new(format!("Failed to get k8s client: {e}")))?; + let distribution = client + .get_k8s_distribution() + .await + .map_err(|e| InterpretError::new(format!("Failed to detect k8s distribution: {e}")))?; + + // OpenShift → Route (operator-managed); plain k8s → Ingress (operator-managed). + let (route, ingress) = if matches!(distribution, KubernetesDistribution::OpenshiftFamily) { + debug!("OpenShift detected; Grafana CR will use .spec.route"); + let route = GrafanaRoute { + spec: Some(GrafanaRouteSpec { + port: Some(GrafanaRoutePort { target_port: 3000 }), + tls: Some(GrafanaRouteTls { + termination: Some("edge".to_string()), + insecure_edge_termination_policy: Some("Redirect".to_string()), + }), + to: Some(GrafanaRouteTarget { + kind: "Service".to_string(), + name: "cluster-grafana-service".to_string(), + weight: Some(100), + }), + }), + }; + (Some(route), None) + } else { + let hostname = client + .get_domain("cluster-grafana") + .await + .map_err(|e| InterpretError::new(format!("Failed to resolve domain: {e}")))?; + debug!("Non-OpenShift detected; Grafana CR will use .spec.ingress (host: {hostname})"); + let ingress = GrafanaIngress { + spec: Some(GrafanaIngressSpec { + ingress_class_name: None, + rules: Some(vec![GrafanaIngressRule { + host: Some(hostname), + http: Some(GrafanaIngressRuleHttp { + paths: vec![GrafanaIngressPath { + path: "/".to_string(), + path_type: "Prefix".to_string(), + backend: GrafanaIngressBackend { + service: GrafanaIngressBackendService { + name: "cluster-grafana-service".to_string(), + port: GrafanaIngressServicePort { number: 3000 }, + }, + }, + }], + }), + }]), + }), + }; + 
(None, Some(ingress)) + }; + + let grafana = Grafana { + metadata: ObjectMeta { + name: Some("cluster-grafana".to_string()), + namespace: Some(self.namespace.clone()), + labels: Some(labels), + ..ObjectMeta::default() + }, + spec: GrafanaSpec { + config: Some(config), + deployment: Some(GrafanaDeployment { + spec: Some(GrafanaDeploymentSpec { + replicas: Some(1), + template: Some(GrafanaPodTemplate { + spec: Some(GrafanaPodSpec { + containers: vec![GrafanaContainer { + name: "grafana".to_string(), + resources: Some(resources), + }], + }), + }), + }), + }), + route, + ingress, + }, + }; + + K8sResourceScore::single(grafana, Some(self.namespace.clone())) + .interpret(inventory, topology) + .await?; + + Ok(()) + } + + async fn create_datasource( + &self, + inventory: &Inventory, + topology: &(impl Topology + K8sclient), + ) -> Result<(), InterpretError> { + let labels: BTreeMap = + [("datasource".to_string(), "prometheus".to_string())].into(); + + let instance_selector = LabelSelector { + match_labels: [("dashboards".to_string(), "grafana".to_string())].into(), + match_expressions: vec![], + }; + + let datasource = GrafanaDatasource { + metadata: ObjectMeta { + name: Some("prometheus-cluster".to_string()), + namespace: Some(self.namespace.clone()), + labels: Some(labels), + ..ObjectMeta::default() + }, + spec: GrafanaDatasourceSpec { + instance_selector, + allow_cross_namespace_import: None, + datasource: GrafanaDatasourceConfig { + name: "Prometheus-Cluster".to_string(), + r#type: "prometheus".to_string(), + access: "proxy".to_string(), + url: "https://prometheus-k8s.openshift-monitoring.svc:9091".to_string(), + database: None, + is_default: Some(true), + editable: None, + json_data: Some(GrafanaDatasourceJsonData { + http_header_name1: Some("Authorization".to_string()), + tls_skip_verify: Some(true), + time_interval: Some("30s".to_string()), + oauth_pass_thru: None, + }), + secure_json_data: Some(GrafanaDatasourceSecureJsonData { + // Placeholder; real value comes 
from `values_from` at + // reconcile time (see below). + http_header_value1: Some("Bearer ${token}".to_string()), + }), + }, + values_from: Some(vec![GrafanaValueFrom { + target_path: "secureJsonData.httpHeaderValue1".to_string(), + value_from: GrafanaValueSource { + secret_key_ref: GrafanaSecretKeyRef { + name: "grafana-prometheus-token".to_string(), + key: "token".to_string(), + }, + }, + }]), + }, + }; + + K8sResourceScore::single(datasource, Some(self.namespace.clone())) + .interpret(inventory, topology) + .await?; + + Ok(()) + } + + async fn create_dashboards( + &self, + inventory: &Inventory, + topology: &(impl Topology + K8sclient), + ) -> Result<(), InterpretError> { + let dashboards: &[(&str, &str)] = &[ + ( + "okd-cluster-overview", + include_str!("dashboards/cluster-overview.json"), + ), + ( + "okd-node-health", + include_str!("dashboards/nodes-health.json"), + ), + ( + "okd-workload-health", + include_str!("dashboards/workloads-health.json"), + ), + ("okd-networking", include_str!("dashboards/networking.json")), + ("storage-health", include_str!("dashboards/storage.json")), + ("okd-etcd", include_str!("dashboards/etcd.json")), + ( + "okd-control-plane", + include_str!("dashboards/control-plane.json"), + ), + ( + "okd-alerts-events", + include_str!("dashboards/alerts-events-problems.json"), + ), + ]; + + for (dashboard_name, json_content) in dashboards { + let labels: BTreeMap = + [("dashboard".to_string(), dashboard_name.to_string())].into(); + + let instance_selector = LabelSelector { + match_labels: [("dashboards".to_string(), "grafana".to_string())].into(), + match_expressions: vec![], + }; + + let dashboard = GrafanaDashboard { + metadata: ObjectMeta { + name: Some(dashboard_name.to_string()), + namespace: Some(self.namespace.clone()), + labels: Some(labels), + ..ObjectMeta::default() + }, + spec: GrafanaDashboardSpec { + instance_selector, + json: Some(json_content.to_string()), + resync_period: None, + datasources: None, + grafana_com: None, + }, 
+ }; + + K8sResourceScore::single(dashboard, Some(self.namespace.clone())) + .interpret(inventory, topology) + .await?; + } + + Ok(()) + } + + fn get_name(&self) -> InterpretName { + InterpretName::Custom("ClusterDashboards") + } + + fn get_version(&self) -> Version { + todo!() + } + + fn get_status(&self) -> InterpretStatus { + todo!() + } + + fn get_children(&self) -> Vec { + todo!() + } +} diff --git a/harmony/src/modules/monitoring/grafana/helm/helm_grafana.rs b/harmony/src/modules/monitoring/grafana/helm/helm_grafana.rs index c9ccacb..4c26851 100644 --- a/harmony/src/modules/monitoring/grafana/helm/helm_grafana.rs +++ b/harmony/src/modules/monitoring/grafana/helm/helm_grafana.rs @@ -1,10 +1,29 @@ +use async_trait::async_trait; +use harmony_k8s::KubernetesDistribution; use harmony_macros::hurl; +use harmony_types::id::Id; +use k8s_openapi::api::rbac::v1::{ClusterRole, ClusterRoleBinding, PolicyRule, RoleRef, Subject}; +use k8s_openapi::apimachinery::pkg::apis::meta::v1::ObjectMeta; +use log::debug; use non_blank_string_rs::NonBlankString; +use serde::Serialize; use std::{collections::HashMap, str::FromStr}; -use crate::modules::helm::chart::{HelmChartScore, HelmRepository}; +use crate::{ + data::Version, + interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome}, + inventory::Inventory, + modules::helm::chart::{HelmChartScore, HelmRepository}, + modules::k8s::resource::K8sResourceScore, + score::Score, + topology::{HelmCommand, K8sclient, Topology}, +}; -pub fn grafana_helm_chart_score(ns: &str, namespace_scope: bool) -> HelmChartScore { +pub fn grafana_helm_chart_score( + ns: &str, + namespace_scope: bool, + chart_version: Option<&str>, +) -> HelmChartScore { let mut values_overrides = HashMap::new(); values_overrides.insert( NonBlankString::from_str("namespaceScope").unwrap(), @@ -14,7 +33,7 @@ pub fn grafana_helm_chart_score(ns: &str, namespace_scope: bool) -> HelmChartSco namespace: Some(NonBlankString::from_str(ns).unwrap()), 
release_name: NonBlankString::from_str("grafana-operator").unwrap(), chart_name: NonBlankString::from_str("grafana/grafana-operator").unwrap(), - chart_version: None, + chart_version: chart_version.map(|v| NonBlankString::from_str(v).unwrap()), values_overrides: Some(values_overrides), values_yaml: None, create_namespace: true, @@ -26,3 +45,173 @@ pub fn grafana_helm_chart_score(ns: &str, namespace_scope: bool) -> HelmChartSco )), } } + +/// Cluster-scoped RBAC so grafana-operator can watch `route.openshift.io/v1.Route`. +/// The upstream chart's ClusterRole doesn't include these verbs and the chart +/// exposes no values key to extend it, so we apply them separately. +/// +/// Safe on non-OpenShift clusters: Kubernetes accepts a `ClusterRole` +/// referencing a missing API group — the rule is simply never matched — but +/// `GrafanaOperatorScore` only applies these on detected OpenShift clusters. +pub fn grafana_operator_openshift_route_rbac_scores( + ns: &str, +) -> ( + K8sResourceScore, + K8sResourceScore, +) { + let cluster_role_name = "harmony-grafana-operator-openshift-routes".to_string(); + let cluster_role_binding_name = "harmony-grafana-operator-openshift-routes-binding".to_string(); + let operator_sa_name = "grafana-operator".to_string(); + + let cluster_role = ClusterRole { + metadata: ObjectMeta { + name: Some(cluster_role_name.clone()), + ..ObjectMeta::default() + }, + rules: Some(vec![PolicyRule { + api_groups: Some(vec!["route.openshift.io".to_string()]), + resources: Some(vec!["routes".to_string(), "routes/custom-host".to_string()]), + verbs: vec![ + "get".to_string(), + "list".to_string(), + "watch".to_string(), + "create".to_string(), + "update".to_string(), + "patch".to_string(), + "delete".to_string(), + ], + ..PolicyRule::default() + }]), + ..ClusterRole::default() + }; + + let cluster_role_binding = ClusterRoleBinding { + metadata: ObjectMeta { + name: Some(cluster_role_binding_name), + ..ObjectMeta::default() + }, + subjects: 
Some(vec![Subject { + kind: "ServiceAccount".to_string(), + name: operator_sa_name, + namespace: Some(ns.to_string()), + ..Subject::default() + }]), + role_ref: RoleRef { + api_group: "rbac.authorization.k8s.io".to_string(), + kind: "ClusterRole".to_string(), + name: cluster_role_name, + }, + }; + + ( + K8sResourceScore::single(cluster_role, None), + K8sResourceScore::single(cluster_role_binding, None), + ) +} + +/// Composite score: installs grafana-operator via Helm, and on OpenShift-family +/// clusters also applies the `route.openshift.io` RBAC the operator needs to +/// reconcile Routes. Distribution is detected at interpret time via the +/// cluster's API discovery — no flag needed at call time. +#[derive(Debug, Clone, Serialize)] +pub struct GrafanaOperatorScore { + pub namespace: String, + pub namespace_scope: bool, + pub chart_version: Option, +} + +impl GrafanaOperatorScore { + pub fn new(namespace: &str, chart_version: Option<&str>) -> Self { + Self { + namespace: namespace.to_string(), + namespace_scope: false, + chart_version: chart_version.map(|v| v.to_string()), + } + } +} + +impl Score for GrafanaOperatorScore { + fn create_interpret(&self) -> Box> { + Box::new(GrafanaOperatorInterpret { + namespace: self.namespace.clone(), + namespace_scope: self.namespace_scope, + chart_version: self.chart_version.clone(), + }) + } + + fn name(&self) -> String { + format!("GrafanaOperatorScore({})", self.namespace) + } +} + +#[derive(Debug, Clone)] +struct GrafanaOperatorInterpret { + namespace: String, + namespace_scope: bool, + chart_version: Option, +} + +#[async_trait] +impl Interpret for GrafanaOperatorInterpret { + async fn execute( + &self, + inventory: &Inventory, + topology: &T, + ) -> Result { + let client = topology + .k8s_client() + .await + .map_err(|e| InterpretError::new(format!("Failed to get k8s client: {e}")))?; + + let distribution = client + .get_k8s_distribution() + .await + .map_err(|e| InterpretError::new(format!("Failed to detect k8s 
distribution: {e}")))?; + + if matches!(distribution, KubernetesDistribution::OpenshiftFamily) { + debug!( + "OpenShift detected; applying grafana-operator Route RBAC in namespace {}", + self.namespace + ); + let (cr, crb) = grafana_operator_openshift_route_rbac_scores(&self.namespace); + cr.create_interpret().execute(inventory, topology).await?; + crb.create_interpret().execute(inventory, topology).await?; + } else { + debug!( + "Non-OpenShift distribution ({:?}); skipping Route RBAC", + distribution + ); + } + + let helm_score = grafana_helm_chart_score( + &self.namespace, + self.namespace_scope, + self.chart_version.as_deref(), + ); + helm_score + .create_interpret() + .execute(inventory, topology) + .await?; + + Ok(Outcome::success(format!( + "grafana-operator installed in namespace '{}' (distribution: {:?})", + self.namespace, distribution + ))) + } + + fn get_name(&self) -> InterpretName { + InterpretName::Custom("GrafanaOperator") + } + + fn get_version(&self) -> Version { + todo!() + } + + fn get_status(&self) -> InterpretStatus { + todo!() + } + + fn get_children(&self) -> Vec { + todo!() + } +} diff --git a/harmony/src/modules/monitoring/kube_prometheus/crd/crd_grafana.rs b/harmony/src/modules/monitoring/kube_prometheus/crd/crd_grafana.rs index 386890e..1b197ce 100644 --- a/harmony/src/modules/monitoring/kube_prometheus/crd/crd_grafana.rs +++ b/harmony/src/modules/monitoring/kube_prometheus/crd/crd_grafana.rs @@ -6,7 +6,13 @@ use serde::{Deserialize, Serialize}; use super::crd_prometheuses::LabelSelector; -#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)] +/// `Grafana` CR binding — audited against +/// `grafanas.grafana.integreatly.org/v1beta1` on grafana-operator v5.22. +/// Only the fields actively consumed by harmony callers are modeled. 
+/// `.spec.config` is `map[string]map[string]string` upstream (grafana.ini +/// sections); it is modeled as a nested `BTreeMap` rather than a struct to +/// avoid losing sections like `auth.anonymous` (dotted keys). +#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, Default, JsonSchema)] #[kube( group = "grafana.integreatly.org", version = "v1beta1", @@ -16,81 +22,177 @@ use super::crd_prometheuses::LabelSelector; )] #[serde(rename_all = "camelCase")] pub struct GrafanaSpec { + /// `grafana.ini` content. Outer map key = section name (e.g. `security`, + /// `auth.anonymous`); inner map = key/value pairs in that section. #[serde(default, skip_serializing_if = "Option::is_none")] - pub config: Option, + pub config: Option>>, #[serde(default, skip_serializing_if = "Option::is_none")] - pub admin_user: Option, + pub deployment: Option, + /// OpenShift-only: reconciled by grafana-operator when the + /// `route.openshift.io` CRD is present. #[serde(default, skip_serializing_if = "Option::is_none")] - pub admin_password: Option, + pub route: Option, + /// Standard k8s Ingress: reconciled by grafana-operator on non-OpenShift + /// clusters. Mutually exclusive with `route` in practice. 
#[serde(default, skip_serializing_if = "Option::is_none")] pub ingress: Option, +} + +#[derive(Serialize, Deserialize, Debug, Clone, Default, JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct GrafanaDeployment { + #[serde(default, skip_serializing_if = "Option::is_none")] + pub spec: Option, +} + +#[derive(Serialize, Deserialize, Debug, Clone, Default, JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct GrafanaDeploymentSpec { + #[serde(default, skip_serializing_if = "Option::is_none")] + pub replicas: Option, #[serde(default, skip_serializing_if = "Option::is_none")] - pub persistence: Option, + pub template: Option, +} + +#[derive(Serialize, Deserialize, Debug, Clone, Default, JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct GrafanaPodTemplate { + #[serde(default, skip_serializing_if = "Option::is_none")] + pub spec: Option, +} + +#[derive(Serialize, Deserialize, Debug, Clone, Default, JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct GrafanaPodSpec { + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub containers: Vec, +} + +#[derive(Serialize, Deserialize, Debug, Clone, Default, JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct GrafanaContainer { + pub name: String, #[serde(default, skip_serializing_if = "Option::is_none")] pub resources: Option, } -#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)] +#[derive(Serialize, Deserialize, Debug, Clone, Default, JsonSchema)] #[serde(rename_all = "camelCase")] -pub struct GrafanaConfig { +pub struct GrafanaRoute { #[serde(default, skip_serializing_if = "Option::is_none")] - pub log: Option, + pub spec: Option, +} + +#[derive(Serialize, Deserialize, Debug, Clone, Default, JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct GrafanaRouteSpec { + #[serde(default, skip_serializing_if = "Option::is_none")] + pub port: Option, #[serde(default, skip_serializing_if = "Option::is_none")] - pub security: Option, + pub tls: Option, + + 
#[serde(default, skip_serializing_if = "Option::is_none")] + pub to: Option, } #[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)] #[serde(rename_all = "camelCase")] -pub struct GrafanaLogConfig { - #[serde(default, skip_serializing_if = "Option::is_none")] - pub mode: Option, - - #[serde(default, skip_serializing_if = "Option::is_none")] - pub level: Option, +pub struct GrafanaRoutePort { + /// Upstream schema is int-or-string; we only use integer. + pub target_port: i32, } -#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)] +#[derive(Serialize, Deserialize, Debug, Clone, Default, JsonSchema)] #[serde(rename_all = "camelCase")] -pub struct GrafanaSecurityConfig { +pub struct GrafanaRouteTls { #[serde(default, skip_serializing_if = "Option::is_none")] - pub admin_user: Option, + pub termination: Option, #[serde(default, skip_serializing_if = "Option::is_none")] - pub admin_password: Option, + pub insecure_edge_termination_policy: Option, } -#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)] +#[derive(Serialize, Deserialize, Debug, Clone, Default, JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct GrafanaRouteTarget { + pub kind: String, + pub name: String, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub weight: Option, +} + +// ---- Ingress types (mirrors standard k8s IngressSpec, narrow subset) ---- + +#[derive(Serialize, Deserialize, Debug, Clone, Default, JsonSchema)] #[serde(rename_all = "camelCase")] pub struct GrafanaIngress { #[serde(default, skip_serializing_if = "Option::is_none")] - pub enabled: Option, - - #[serde(default, skip_serializing_if = "Option::is_none")] - pub hosts: Option>, + pub spec: Option, } -#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)] +#[derive(Serialize, Deserialize, Debug, Clone, Default, JsonSchema)] #[serde(rename_all = "camelCase")] -pub struct GrafanaPersistence { +pub struct GrafanaIngressSpec { #[serde(default, skip_serializing_if = "Option::is_none")] - pub 
enabled: Option, + pub ingress_class_name: Option, #[serde(default, skip_serializing_if = "Option::is_none")] - pub storage_class_name: Option, + pub rules: Option>, +} + +#[derive(Serialize, Deserialize, Debug, Clone, Default, JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct GrafanaIngressRule { + #[serde(default, skip_serializing_if = "Option::is_none")] + pub host: Option, #[serde(default, skip_serializing_if = "Option::is_none")] - pub size: Option, + pub http: Option, +} + +#[derive(Serialize, Deserialize, Debug, Clone, Default, JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct GrafanaIngressRuleHttp { + pub paths: Vec, +} + +#[derive(Serialize, Deserialize, Debug, Clone, Default, JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct GrafanaIngressPath { + pub path: String, + pub path_type: String, + pub backend: GrafanaIngressBackend, +} + +#[derive(Serialize, Deserialize, Debug, Clone, Default, JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct GrafanaIngressBackend { + pub service: GrafanaIngressBackendService, +} + +#[derive(Serialize, Deserialize, Debug, Clone, Default, JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct GrafanaIngressBackendService { + pub name: String, + pub port: GrafanaIngressServicePort, +} + +#[derive(Serialize, Deserialize, Debug, Clone, Default, JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct GrafanaIngressServicePort { + pub number: i32, } // ------------------------------------------------------------------------------------------------ -#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)] +#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, Default, JsonSchema)] #[kube( group = "grafana.integreatly.org", version = "v1beta1", @@ -135,7 +237,7 @@ pub struct GrafanaCom { // ------------------------------------------------------------------------------------------------ -#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, 
JsonSchema)] +#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, Default, JsonSchema)] #[kube( group = "grafana.integreatly.org", version = "v1beta1", @@ -176,7 +278,7 @@ pub struct GrafanaSecretKeyRef { pub key: String, } -#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)] +#[derive(Serialize, Deserialize, Debug, Clone, Default, JsonSchema)] #[serde(rename_all = "camelCase")] pub struct GrafanaDatasourceConfig { pub access: String, @@ -235,3 +337,23 @@ pub struct ResourceRequirements { #[serde(default, skip_serializing_if = "BTreeMap::is_empty")] pub requests: BTreeMap, } + +// `Default` impls on the `CustomResource`-generated wrappers so they satisfy +// the `K: Default` bound on `K8sResourceScore`. +impl Default for Grafana { + fn default() -> Self { + Grafana::new("", GrafanaSpec::default()) + } +} + +impl Default for GrafanaDashboard { + fn default() -> Self { + GrafanaDashboard::new("", GrafanaDashboardSpec::default()) + } +} + +impl Default for GrafanaDatasource { + fn default() -> Self { + GrafanaDatasource::new("", GrafanaDatasourceSpec::default()) + } +} diff --git a/harmony/src/modules/monitoring/kube_prometheus/crd/rhob_grafana.rs b/harmony/src/modules/monitoring/kube_prometheus/crd/rhob_grafana.rs index 65efab9..8020950 100644 --- a/harmony/src/modules/monitoring/kube_prometheus/crd/rhob_grafana.rs +++ b/harmony/src/modules/monitoring/kube_prometheus/crd/rhob_grafana.rs @@ -1,3 +1,34 @@ +//! ⚠️ **STALE DUPLICATE — DO NOT COPY FROM** +//! +//! This file is a near-identical duplicate of `crd_grafana.rs` from before +//! that file was audited against the upstream +//! `grafanas.grafana.integreatly.org/v1beta1` schema (grafana-operator +//! v5.22). Fields defined below are known to be **wrong** relative to +//! upstream, in particular: +//! +//! - `GrafanaSpec.admin_user` / `admin_password` — do not exist at +//! `.spec` top-level upstream; the real location is +//! `.spec.config.security.admin_user/admin_password`. +//! 
- `GrafanaSpec.persistence` — upstream key is `persistentVolumeClaim`, +//! so writes here are silently dropped. +//! - `GrafanaSpec.resources` — there is no `.spec.resources` upstream at +//! all (container resources belong under +//! `.spec.deployment.spec.template.spec.containers[].resources`). +//! - `GrafanaSpec.ingress` — upstream `ingress` is `{ metadata, spec }`, +//! not `{ enabled, hosts }` as modeled here. +//! - `GrafanaConfig` as a typed struct — upstream `.spec.config` is +//! `map[string]map[string]string` (grafana.ini sections). The struct +//! form here cannot express sections like `auth.anonymous` (dotted +//! keys) and loses anything beyond `log`/`security`. +//! +//! This file is kept only because `rhob_alerting_score.rs` still builds +//! against it, and that caller happens to construct `GrafanaSpec` with +//! every field set to `None` — so the bugs are latent, not active. +//! +//! If you need a correct binding, use `crd_grafana.rs`. If you extend this +//! file, port the changes to `crd_grafana.rs` first, then dedupe — don't +//! spread the rot. 
+ use std::collections::BTreeMap; use kube::CustomResource; diff --git a/harmony/src/modules/monitoring/mod.rs b/harmony/src/modules/monitoring/mod.rs index 7f07d5a..aa08e7a 100644 --- a/harmony/src/modules/monitoring/mod.rs +++ b/harmony/src/modules/monitoring/mod.rs @@ -1,6 +1,7 @@ pub mod alert_channel; pub mod alert_rule; pub mod application_monitoring; +pub mod cluster_dashboards; pub mod grafana; pub mod kube_prometheus; pub mod ntfy; diff --git a/harmony/src/modules/monitoring/prometheus/prometheus.rs b/harmony/src/modules/monitoring/prometheus/prometheus.rs index 2fe0d06..4904b4d 100644 --- a/harmony/src/modules/monitoring/prometheus/prometheus.rs +++ b/harmony/src/modules/monitoring/prometheus/prometheus.rs @@ -114,7 +114,7 @@ impl Prometheus { }; if let Some(ns) = namespace.as_deref() { - grafana_helm_chart_score(ns, false) + grafana_helm_chart_score(ns, false, None) .interpret(inventory, topology) .await } else { diff --git a/harmony/src/modules/prometheus/k8s_prometheus_alerting_score.rs b/harmony/src/modules/prometheus/k8s_prometheus_alerting_score.rs index 586029b..136e1a2 100644 --- a/harmony/src/modules/prometheus/k8s_prometheus_alerting_score.rs +++ b/harmony/src/modules/prometheus/k8s_prometheus_alerting_score.rs @@ -542,14 +542,7 @@ impl K8sPrometheusCRDAlertingInterpret { labels: Some(label.clone()), ..Default::default() }, - spec: GrafanaSpec { - config: None, - admin_user: None, - admin_password: None, - ingress: None, - persistence: None, - resources: None, - }, + spec: GrafanaSpec::default(), }; client .apply(&grafana, Some(&self.sender.namespace.clone())) diff --git a/harmony/src/modules/prometheus/rhob_alerting_score.rs b/harmony/src/modules/prometheus/rhob_alerting_score.rs index 8a85d1b..1d31a71 100644 --- a/harmony/src/modules/prometheus/rhob_alerting_score.rs +++ b/harmony/src/modules/prometheus/rhob_alerting_score.rs @@ -12,6 +12,9 @@ use std::process::Command; use crate::modules::k8s::ingress::{K8sIngressScore, PathType}; use 
crate::modules::monitoring::kube_prometheus::crd::grafana_default_dashboard::build_default_dashboard; use crate::modules::monitoring::kube_prometheus::crd::rhob_alertmanager_config::RHOBObservability; +// NOTE: `rhob_grafana` is a stale, incorrect duplicate of `crd_grafana`. +// See the warning at the top of `rhob_grafana.rs`. Prefer `crd_grafana` +// for any new work. use crate::modules::monitoring::kube_prometheus::crd::rhob_grafana::{ Grafana, GrafanaDashboard, GrafanaDashboardSpec, GrafanaDatasource, GrafanaDatasourceConfig, GrafanaDatasourceSpec, GrafanaSpec,