feat/cluster-dashboard #250
39
Cargo.lock
generated
39
Cargo.lock
generated
@@ -1262,22 +1262,6 @@ dependencies = [
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "brocade-switch-oricom-configuration"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"brocade",
|
||||
"env_logger",
|
||||
"harmony",
|
||||
"harmony_cli",
|
||||
"harmony_macros",
|
||||
"harmony_types",
|
||||
"log",
|
||||
"serde",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "brotli"
|
||||
version = "8.0.2"
|
||||
@@ -2650,6 +2634,29 @@ dependencies = [
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "example-cluster-dashboards"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"env_logger",
|
||||
"harmony",
|
||||
"harmony_cli",
|
||||
"harmony_types",
|
||||
"log",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "example-grafana"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"harmony",
|
||||
"harmony_cli",
|
||||
"harmony_types",
|
||||
"log",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "example-harmony-sso"
|
||||
version = "0.1.0"
|
||||
|
||||
14
examples/cluster_dashboards/Cargo.toml
Normal file
14
examples/cluster_dashboards/Cargo.toml
Normal file
@@ -0,0 +1,14 @@
|
||||
[package]
|
||||
name = "example-cluster-dashboards"
|
||||
edition = "2021"
|
||||
version = "0.1.0"
|
||||
license = "GNU AGPL v3"
|
||||
publish = false
|
||||
|
||||
[dependencies]
|
||||
harmony = { path = "../../harmony" }
|
||||
harmony_cli = { path = "../../harmony_cli" }
|
||||
harmony_types = { path = "../../harmony_types" }
|
||||
tokio = { version = "1.40", features = ["macros", "rt-multi-thread"] }
|
||||
log = "0.4"
|
||||
env_logger = "0.11"
|
||||
20
examples/cluster_dashboards/src/main.rs
Normal file
20
examples/cluster_dashboards/src/main.rs
Normal file
@@ -0,0 +1,20 @@
|
||||
use harmony::{
|
||||
inventory::Inventory, modules::monitoring::cluster_dashboards::ClusterDashboardsScore,
|
||||
topology::K8sAnywhereTopology,
|
||||
};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() {
|
||||
harmony_cli::cli_logger::init();
|
||||
|
||||
let cluster_dashboards_score = ClusterDashboardsScore::default();
|
||||
|
||||
harmony_cli::run(
|
||||
Inventory::autoload(),
|
||||
K8sAnywhereTopology::from_env(),
|
||||
vec![Box::new(cluster_dashboards_score)],
|
||||
None,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
13
examples/grafana/Cargo.toml
Normal file
13
examples/grafana/Cargo.toml
Normal file
@@ -0,0 +1,13 @@
|
||||
[package]
|
||||
name = "example-grafana"
|
||||
edition = "2021"
|
||||
version = "0.1.0"
|
||||
license = "GNU AGPL v3"
|
||||
publish = false
|
||||
|
||||
[dependencies]
|
||||
harmony = { path = "../../harmony" }
|
||||
harmony_cli = { path = "../../harmony_cli" }
|
||||
harmony_types = { path = "../../harmony_types" }
|
||||
tokio = { version = "1.40", features = ["macros", "rt-multi-thread"] }
|
||||
log = "0.4"
|
||||
5
examples/grafana/env.sh
Normal file
5
examples/grafana/env.sh
Normal file
@@ -0,0 +1,5 @@
|
||||
export HARMONY_SECRET_NAMESPACE=example-grafana
|
||||
export HARMONY_SECRET_STORE=file
|
||||
export HARMONY_DATABASE_URL=sqlite://harmony_grafana.sqlite
|
||||
export RUST_LOG=harmony=debug
|
||||
export HARMONY_USE_LOCAL_K3D=false
|
||||
31
examples/grafana/src/main.rs
Normal file
31
examples/grafana/src/main.rs
Normal file
@@ -0,0 +1,31 @@
|
||||
use harmony::{
|
||||
inventory::Inventory,
|
||||
modules::monitoring::{
|
||||
cluster_dashboards::ClusterDashboardsScore,
|
||||
grafana::helm::helm_grafana::GrafanaOperatorScore,
|
||||
},
|
||||
topology::K8sAnywhereTopology,
|
||||
};
|
||||
|
||||
const GRAFANA_OPERATOR_CHART_VERSION: &str = "v5.22.2";
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() {
|
||||
harmony_cli::cli_logger::init();
|
||||
|
||||
let grafana_operator =
|
||||
GrafanaOperatorScore::new("grafana", Some(GRAFANA_OPERATOR_CHART_VERSION));
|
||||
let cluster_dashboards_score = ClusterDashboardsScore::default();
|
||||
|
||||
harmony_cli::run(
|
||||
Inventory::autoload(),
|
||||
K8sAnywhereTopology::from_env(),
|
||||
vec![
|
||||
Box::new(grafana_operator),
|
||||
Box::new(cluster_dashboards_score),
|
||||
],
|
||||
None,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
117
harmony-k8s/src/domain.rs
Normal file
117
harmony-k8s/src/domain.rs
Normal file
@@ -0,0 +1,117 @@
|
||||
use kube::Error;
|
||||
use kube::api::GroupVersionKind;
|
||||
use log::{debug, trace, warn};
|
||||
|
||||
use crate::client::K8sClient;
|
||||
use crate::types::KubernetesDistribution;
|
||||
|
||||
impl K8sClient {
|
||||
/// Resolve an external hostname for the given service name by querying the
|
||||
/// cluster's ingress infrastructure.
|
||||
///
|
||||
/// Detection order:
|
||||
/// 1. **OpenShift** — reads `status.domain` from the default
|
||||
/// `IngressController` in `openshift-ingress-operator`.
|
||||
/// 2. **NGINX Ingress Controller** — looks for well-known Services in
|
||||
/// common namespaces and extracts the LoadBalancer hostname.
|
||||
/// 3. **Fallback** — returns internal cluster DNS
|
||||
/// (`{service}.default.svc.cluster.local`).
|
||||
pub async fn get_domain(&self, service: &str) -> Result<String, Error> {
|
||||
let distribution = self.get_k8s_distribution().await?;
|
||||
|
||||
if matches!(distribution, KubernetesDistribution::OpenshiftFamily) {
|
||||
if let Some(domain) = self.try_openshift_ingress_domain().await? {
|
||||
return Ok(format!("{service}.{domain}"));
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(domain) = self.try_nginx_lb_domain().await? {
|
||||
return Ok(format!("{service}.{domain}"));
|
||||
}
|
||||
|
||||
warn!("Could not determine external ingress domain; falling back to internal-only DNS");
|
||||
Ok(format!("{service}.default.svc.cluster.local"))
|
||||
}
|
||||
|
||||
async fn try_openshift_ingress_domain(&self) -> Result<Option<String>, Error> {
|
||||
let gvk = GroupVersionKind {
|
||||
group: "operator.openshift.io".into(),
|
||||
version: "v1".into(),
|
||||
kind: "IngressController".into(),
|
||||
};
|
||||
|
||||
let ic = match self
|
||||
.get_resource_json_value("default", Some("openshift-ingress-operator"), &gvk)
|
||||
.await
|
||||
{
|
||||
Ok(ic) => ic,
|
||||
Err(e) => {
|
||||
debug!("Could not fetch OpenShift IngressController: {e}");
|
||||
return Ok(None);
|
||||
}
|
||||
};
|
||||
|
||||
let replicas = ic.data["status"]["availableReplicas"].as_i64().unwrap_or(0);
|
||||
if replicas < 1 {
|
||||
debug!("OpenShift IngressController present but no available replicas");
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
if let Some(domain) = ic.data["status"]["domain"].as_str() {
|
||||
trace!("OpenShift IngressController domain: {domain}");
|
||||
return Ok(Some(domain.to_string()));
|
||||
}
|
||||
|
||||
warn!("OpenShift IngressController present but no status.domain set");
|
||||
Ok(None)
|
||||
}
|
||||
|
||||
async fn try_nginx_lb_domain(&self) -> Result<Option<String>, Error> {
|
||||
let svc_gvk = GroupVersionKind {
|
||||
group: "".into(),
|
||||
version: "v1".into(),
|
||||
kind: "Service".into(),
|
||||
};
|
||||
|
||||
let candidates = [
|
||||
("ingress-nginx", "ingress-nginx-controller"),
|
||||
("ingress-nginx", "ingress-nginx-controller-internal"),
|
||||
("ingress-nginx", "ingress-nginx"),
|
||||
("kube-system", "ingress-nginx-controller"),
|
||||
];
|
||||
|
||||
for (ns, name) in candidates {
|
||||
trace!("Checking NGINX Service {ns}/{name} for LoadBalancer hostname");
|
||||
if let Ok(svc) = self.get_resource_json_value(name, Some(ns), &svc_gvk).await {
|
||||
let lb_hosts = svc.data["status"]["loadBalancer"]["ingress"]
|
||||
.as_array()
|
||||
.cloned()
|
||||
.unwrap_or_default();
|
||||
for entry in lb_hosts {
|
||||
if let Some(host) = entry.get("hostname").and_then(|v| v.as_str()) {
|
||||
debug!("Found NGINX LB hostname: {host}");
|
||||
if let Some(domain) = extract_base_domain(host) {
|
||||
return Ok(Some(domain));
|
||||
} else {
|
||||
return Ok(Some(host.to_string()));
|
||||
}
|
||||
}
|
||||
if let Some(ip) = entry.get("ip").and_then(|v| v.as_str()) {
|
||||
debug!("NGINX LB exposes IP {ip} (no hostname); skipping");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
fn extract_base_domain(host: &str) -> Option<String> {
|
||||
let parts: Vec<&str> = host.split('.').collect();
|
||||
if parts.len() >= 2 {
|
||||
Some(parts[parts.len() - 2..].join("."))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
@@ -3,6 +3,7 @@ pub mod bundle;
|
||||
pub mod client;
|
||||
pub mod config;
|
||||
pub mod discovery;
|
||||
pub mod domain;
|
||||
pub mod helper;
|
||||
pub mod node;
|
||||
pub mod pod;
|
||||
|
||||
@@ -742,18 +742,17 @@ impl K8sAnywhereTopology {
|
||||
labels: Some(labels.clone()),
|
||||
..Default::default()
|
||||
},
|
||||
spec: GrafanaSpec {
|
||||
config: None,
|
||||
admin_user: None,
|
||||
admin_password: None,
|
||||
ingress: None,
|
||||
persistence: None,
|
||||
resources: None,
|
||||
},
|
||||
spec: GrafanaSpec::default(),
|
||||
};
|
||||
grafana
|
||||
}
|
||||
|
||||
// NOTE: This creates a harmony-owned Ingress resource, separate from the
|
||||
// grafana-operator. The newer pattern (used in `ClusterDashboardsScore`)
|
||||
// delegates Ingress creation to grafana-operator via `.spec.ingress` on
|
||||
// the Grafana CR, using `K8sClient::get_domain()` for hostname
|
||||
// resolution. This method is kept for backward compatibility with the
|
||||
// `install_grafana()` flow.
|
||||
async fn build_grafana_ingress(&self, ns: &str) -> K8sIngressScore {
|
||||
let domain = self.get_domain(&format!("grafana-{}", ns)).await.unwrap();
|
||||
let name = format!("{}-grafana", ns);
|
||||
@@ -1083,7 +1082,7 @@ impl K8sAnywhereTopology {
|
||||
if tenant.is_some() {
|
||||
namespace_scope = true;
|
||||
}
|
||||
let _grafana_operator_score = grafana_helm_chart_score(namespace, namespace_scope)
|
||||
let _grafana_operator_score = grafana_helm_chart_score(namespace, namespace_scope, None)
|
||||
.interpret(inventory, self)
|
||||
.await
|
||||
.map_err(|e| PreparationError::new(e.to_string()));
|
||||
@@ -1317,134 +1316,18 @@ impl TenantManager for K8sAnywhereTopology {
|
||||
#[async_trait]
|
||||
impl Ingress for K8sAnywhereTopology {
|
||||
async fn get_domain(&self, service: &str) -> Result<String, PreparationError> {
|
||||
use log::{trace, warn};
|
||||
|
||||
let client = self.k8s_client().await?;
|
||||
|
||||
// k3d local-dev shortcut (topology-specific state not available on K8sClient)
|
||||
if let Some(Some(k8s_state)) = self.k8s_state.get() {
|
||||
match k8s_state.source {
|
||||
K8sSource::LocalK3d => {
|
||||
// Local developer UX
|
||||
if matches!(k8s_state.source, K8sSource::LocalK3d) {
|
||||
return Ok(format!("{service}.local.k3d"));
|
||||
}
|
||||
K8sSource::Kubeconfig => {
|
||||
trace!("K8sSource is kubeconfig; attempting to detect domain");
|
||||
}
|
||||
|
||||
// 1) Try OpenShift IngressController domain (backward compatible)
|
||||
if self.openshift_ingress_operator_available().await.is_ok() {
|
||||
trace!("OpenShift ingress operator detected; using IngressController");
|
||||
let gvk = GroupVersionKind {
|
||||
group: "operator.openshift.io".into(),
|
||||
version: "v1".into(),
|
||||
kind: "IngressController".into(),
|
||||
};
|
||||
let ic = client
|
||||
.get_resource_json_value(
|
||||
"default",
|
||||
Some("openshift-ingress-operator"),
|
||||
&gvk,
|
||||
)
|
||||
let client = self.k8s_client().await?;
|
||||
client
|
||||
.get_domain(service)
|
||||
.await
|
||||
.map_err(|_| {
|
||||
PreparationError::new(
|
||||
"Failed to fetch IngressController".to_string(),
|
||||
)
|
||||
})?;
|
||||
|
||||
if let Some(domain) = ic.data["status"]["domain"].as_str() {
|
||||
return Ok(format!("{service}.{domain}"));
|
||||
} else {
|
||||
warn!("OpenShift IngressController present but no status.domain set");
|
||||
}
|
||||
} else {
|
||||
trace!(
|
||||
"OpenShift ingress operator not detected; trying generic Kubernetes"
|
||||
);
|
||||
}
|
||||
|
||||
// 2) Try NGINX Ingress Controller common setups
|
||||
// 2.a) Well-known namespace/name for the controller Service
|
||||
// - upstream default: namespace "ingress-nginx", service "ingress-nginx-controller"
|
||||
// - some distros: "ingress-nginx-controller" svc in "ingress-nginx" ns
|
||||
// If found with LoadBalancer ingress hostname, use its base domain.
|
||||
if let Some(domain) = try_nginx_lb_domain(&client).await? {
|
||||
return Ok(format!("{service}.{domain}"));
|
||||
}
|
||||
|
||||
// 3) Fallback: internal cluster DNS suffix (service.namespace.svc.cluster.local)
|
||||
// We don't have tenant namespace here, so we fallback to 'default' with a warning.
|
||||
warn!(
|
||||
"Could not determine external ingress domain; falling back to internal-only DNS"
|
||||
);
|
||||
let internal = format!("{service}.default.svc.cluster.local");
|
||||
Ok(internal)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
Err(PreparationError::new(
|
||||
"Cannot get domain: unable to detect K8s state".to_string(),
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn try_nginx_lb_domain(client: &K8sClient) -> Result<Option<String>, PreparationError> {
|
||||
use log::{debug, trace};
|
||||
|
||||
// Try common service path: svc/ingress-nginx-controller in ns/ingress-nginx
|
||||
let svc_gvk = GroupVersionKind {
|
||||
group: "".into(), // core
|
||||
version: "v1".into(),
|
||||
kind: "Service".into(),
|
||||
};
|
||||
|
||||
let candidates = [
|
||||
("ingress-nginx", "ingress-nginx-controller"),
|
||||
("ingress-nginx", "ingress-nginx-controller-internal"),
|
||||
("ingress-nginx", "ingress-nginx"), // some charts name the svc like this
|
||||
("kube-system", "ingress-nginx-controller"), // less common but seen
|
||||
];
|
||||
|
||||
for (ns, name) in candidates {
|
||||
trace!("Checking NGINX Service {ns}/{name} for LoadBalancer hostname");
|
||||
if let Ok(svc) = client
|
||||
.get_resource_json_value(ns, Some(name), &svc_gvk)
|
||||
.await
|
||||
{
|
||||
let lb_hosts = svc.data["status"]["loadBalancer"]["ingress"]
|
||||
.as_array()
|
||||
.cloned()
|
||||
.unwrap_or_default();
|
||||
for entry in lb_hosts {
|
||||
if let Some(host) = entry.get("hostname").and_then(|v| v.as_str()) {
|
||||
debug!("Found NGINX LB hostname: {host}");
|
||||
if let Some(domain) = extract_base_domain(host) {
|
||||
return Ok(Some(domain.to_string()));
|
||||
} else {
|
||||
return Ok(Some(host.to_string())); // already a domain
|
||||
}
|
||||
}
|
||||
if let Some(ip) = entry.get("ip").and_then(|v| v.as_str()) {
|
||||
// If only an IP is exposed, we can't create a hostname; return None to keep searching
|
||||
debug!("NGINX LB exposes IP {ip} (no hostname); skipping");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(None)
|
||||
}
|
||||
|
||||
fn extract_base_domain(host: &str) -> Option<String> {
|
||||
// For a host like a1b2c3d4e5f6abcdef.elb.amazonaws.com -> base domain elb.amazonaws.com
|
||||
// For a managed DNS like xyz.example.com -> base domain example.com (keep 2+ labels)
|
||||
// Heuristic: keep last 2 labels by default; special-case known multi-label TLDs if needed.
|
||||
let parts: Vec<&str> = host.split('.').collect();
|
||||
if parts.len() >= 2 {
|
||||
// Very conservative: last 2 labels
|
||||
Some(parts[parts.len() - 2..].join("."))
|
||||
} else {
|
||||
None
|
||||
.map_err(|e| PreparationError::new(e.to_string()))
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -60,7 +60,69 @@ impl<T: Topology + HelmCommand> Score<T> for HelmChartScore {
|
||||
pub struct HelmChartInterpret {
|
||||
pub score: HelmChartScore,
|
||||
}
|
||||
#[derive(serde::Deserialize)]
|
||||
struct HelmListEntry {
|
||||
name: String,
|
||||
chart: String,
|
||||
}
|
||||
|
||||
impl HelmChartInterpret {
|
||||
fn find_installed_release<T: HelmCommand>(
|
||||
&self,
|
||||
topology: &T,
|
||||
ns: &str,
|
||||
) -> Result<Option<String>, InterpretError> {
|
||||
let release = self.score.release_name.to_string();
|
||||
let filter = format!("^{}$", release);
|
||||
let args = vec!["list", "--namespace", ns, "--filter", &filter, "-o", "json"];
|
||||
let output = run_helm_command(topology, &args)?;
|
||||
if !output.status.success() {
|
||||
return Err(InterpretError::new(format!(
|
||||
"helm list failed: {}",
|
||||
String::from_utf8_lossy(&output.stderr)
|
||||
)));
|
||||
}
|
||||
let entries: Vec<HelmListEntry> = serde_json::from_slice(&output.stdout)
|
||||
.map_err(|e| InterpretError::new(format!("parse helm list output: {e}")))?;
|
||||
Ok(entries
|
||||
.into_iter()
|
||||
.find(|e| e.name == release)
|
||||
.map(|e| e.chart))
|
||||
}
|
||||
|
||||
fn expected_chart_field(&self) -> Option<String> {
|
||||
let version = self.score.chart_version.as_ref()?.to_string();
|
||||
let short = self
|
||||
.score
|
||||
.chart_name
|
||||
.to_string()
|
||||
.rsplit('/')
|
||||
.next()
|
||||
.unwrap_or("")
|
||||
.to_string();
|
||||
Some(format!(
|
||||
"{short}-{}",
|
||||
version.strip_prefix('v').unwrap_or(&version)
|
||||
))
|
||||
}
|
||||
|
||||
fn normalize_chart_field(s: &str) -> String {
|
||||
// Helm strips a leading `v` from chart versions in the `chart` column
|
||||
// (normalized to semver). Users often write `v5.22.2` on the score.
|
||||
// Normalize both sides by dropping a `-v` → `-` before the version.
|
||||
match s.rfind("-v") {
|
||||
Some(i)
|
||||
if s[i + 2..]
|
||||
.chars()
|
||||
.next()
|
||||
.is_some_and(|c| c.is_ascii_digit()) =>
|
||||
{
|
||||
format!("{}-{}", &s[..i], &s[i + 2..])
|
||||
}
|
||||
_ => s.to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
fn add_repo<T: HelmCommand>(&self, topology: &T) -> Result<(), InterpretError> {
|
||||
let repo = match &self.score.repository {
|
||||
Some(repo) => repo,
|
||||
@@ -142,6 +204,41 @@ impl<T: Topology + HelmCommand> Interpret<T> for HelmChartInterpret {
|
||||
.as_ref()
|
||||
.unwrap_or_else(|| todo!("Get namespace from active kubernetes cluster"));
|
||||
|
||||
let ns_str = ns.to_string();
|
||||
if let Some(installed_chart) = self.find_installed_release(topology, &ns_str)? {
|
||||
return match self.expected_chart_field() {
|
||||
Some(expected)
|
||||
if Self::normalize_chart_field(&expected)
|
||||
== Self::normalize_chart_field(&installed_chart) =>
|
||||
{
|
||||
warn!(
|
||||
"Helm release '{}' already installed at desired version ('{}'); skipping.",
|
||||
self.score.release_name, installed_chart
|
||||
);
|
||||
Ok(Outcome::success(format!(
|
||||
"Helm Chart {} already at desired version",
|
||||
self.score.release_name
|
||||
)))
|
||||
}
|
||||
Some(expected) => Err(InterpretError::new(format!(
|
||||
"Helm release '{}' already installed as '{}', but score requests '{}'. \
|
||||
Refusing to upgrade/downgrade; resolve manually.",
|
||||
self.score.release_name, installed_chart, expected
|
||||
))),
|
||||
None => {
|
||||
warn!(
|
||||
"Helm release '{}' already installed as '{}'; score has no pinned \
|
||||
chart_version so skipping re-install.",
|
||||
self.score.release_name, installed_chart
|
||||
);
|
||||
Ok(Outcome::success(format!(
|
||||
"Helm Chart {} already installed (version not pinned)",
|
||||
self.score.release_name
|
||||
)))
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
self.add_repo(topology)?;
|
||||
|
||||
let mut args = if self.score.install_only {
|
||||
|
||||
@@ -0,0 +1,49 @@
|
||||
# These are probably already created by rook-ceph operator, not sure, needs to validate.
|
||||
# in fact, 100% sure for the second one (rook-ceph-exporter)
|
||||
# i over-wrote the first one (rook-ceph-mgr) with what is here, it was probably already working
|
||||
# all what was missing was a label on the rook-ceph namespace to tell prometheus to look for monitors in this namespace
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: ServiceMonitor
|
||||
metadata:
|
||||
name: rook-ceph-mgr
|
||||
namespace: rook-ceph
|
||||
labels:
|
||||
# This specific label is what tells OKD's Prometheus to pick this up
|
||||
openshift.io/cluster-monitoring: "true"
|
||||
spec:
|
||||
namespaceSelector:
|
||||
matchNames:
|
||||
- rook-ceph
|
||||
selector:
|
||||
matchLabels:
|
||||
# This matches your 'rook-ceph-mgr' service
|
||||
app: rook-ceph-mgr
|
||||
endpoints:
|
||||
- port: ""
|
||||
# The port name in your service is empty/integers, so we use targetPort
|
||||
targetPort: 9283
|
||||
path: /metrics
|
||||
interval: 30s
|
||||
---
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: ServiceMonitor
|
||||
metadata:
|
||||
name: rook-ceph-exporter
|
||||
namespace: rook-ceph
|
||||
labels:
|
||||
# This label is required for OKD cluster-wide monitoring to pick it up
|
||||
openshift.io/cluster-monitoring: "true"
|
||||
team: rook
|
||||
spec:
|
||||
endpoints:
|
||||
- honorLabels: true
|
||||
interval: 10s
|
||||
path: /metrics
|
||||
port: ceph-exporter-http-metrics
|
||||
namespaceSelector:
|
||||
matchNames:
|
||||
- rook-ceph
|
||||
selector:
|
||||
matchLabels:
|
||||
app: rook-ceph-exporter
|
||||
rook_cluster: rook-ceph
|
||||
@@ -0,0 +1,23 @@
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: Role
|
||||
metadata:
|
||||
name: rook-ceph-metrics-viewer
|
||||
namespace: rook-ceph
|
||||
rules:
|
||||
- apiGroups: [""]
|
||||
resources: ["services", "endpoints", "pods"]
|
||||
verbs: ["get", "list", "watch"]
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: RoleBinding
|
||||
metadata:
|
||||
name: rook-ceph-metrics-viewer
|
||||
namespace: rook-ceph
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: Role
|
||||
name: rook-ceph-metrics-viewer
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: prometheus-k8s
|
||||
namespace: openshift-monitoring
|
||||
@@ -0,0 +1,7 @@
|
||||
apiVersion: v1
|
||||
kind: Namespace
|
||||
metadata:
|
||||
name: rook-ceph
|
||||
labels:
|
||||
# This is the critical label that allows OKD Prometheus to see the namespace
|
||||
openshift.io/cluster-monitoring: "true"
|
||||
@@ -0,0 +1,731 @@
|
||||
{
|
||||
"title": "Alerts & Events — Active Problems",
|
||||
"uid": "okd-alerts-events",
|
||||
"schemaVersion": 36,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-3h", "to": "now" },
|
||||
"tags": ["okd", "alerts", "events"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "severity",
|
||||
"type": "custom",
|
||||
"label": "Severity Filter",
|
||||
"query": "critical,warning,info",
|
||||
"current": { "selected": true, "text": "All", "value": "$__all" },
|
||||
"includeAll": true,
|
||||
"allValue": "critical|warning|info",
|
||||
"multi": false,
|
||||
"options": [
|
||||
{ "selected": true, "text": "All", "value": "$__all" },
|
||||
{ "selected": false, "text": "Critical", "value": "critical" },
|
||||
{ "selected": false, "text": "Warning", "value": "warning" },
|
||||
{ "selected": false, "text": "Info", "value": "info" }
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "namespace",
|
||||
"type": "query",
|
||||
"label": "Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"query": { "query": "label_values(ALERTS{alertstate=\"firing\"}, namespace)", "refId": "A" },
|
||||
"refresh": 2,
|
||||
"includeAll": true,
|
||||
"allValue": ".*",
|
||||
"multi": true,
|
||||
"sort": 1,
|
||||
"current": {},
|
||||
"options": []
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
|
||||
{
|
||||
"id": 1, "type": "stat", "title": "Critical Alerts Firing",
|
||||
"description": "Alerting rule instances currently in the firing state with severity=\"critical\". Any non-zero value represents a breached SLO or infrastructure condition requiring immediate on-call response. The ALERTS metric is generated by Prometheus directly from your alerting rules — it reflects what Prometheus knows, before Alertmanager routing or silencing.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(ALERTS{alertstate=\"firing\",severity=\"critical\"}) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 2, "type": "stat", "title": "Warning Alerts Firing",
|
||||
"description": "Firing alerts at severity=\"warning\". Warnings indicate a degraded or elevated-risk condition that has not yet crossed the critical threshold. A sustained or growing warning count often precedes a critical fire — treat them as early-warning signals, not background noise.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(ALERTS{alertstate=\"firing\",severity=\"warning\"}) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "orange", "value": 5 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 3, "type": "stat", "title": "Info / Unclassified Alerts Firing",
|
||||
"description": "Firing alerts with severity=\"info\" or no severity label. These are informational and do not normally require immediate action. A sudden large jump may reveal noisy alerting rules generating alert fatigue — rules worth reviewing for threshold tuning or adding inhibition rules.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(ALERTS{alertstate=\"firing\",severity!~\"critical|warning\"}) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "blue", "value": 1 },
|
||||
{ "color": "blue", "value": 25 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 4, "type": "stat", "title": "Alerts Silenced (Suppressed)",
|
||||
"description": "Alerts currently matched by an active Alertmanager silence rule and therefore not routed to receivers. Silences are intentional during maintenance windows, but a large suppressed count outside of planned maintenance = an overly broad silence masking real problems. Zero silences when a maintenance window is active = the silence has expired or was misconfigured.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "sum(alertmanager_alerts{state=\"suppressed\"}) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 20 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 5, "type": "stat", "title": "CrashLoopBackOff Pods",
|
||||
"description": "Container instances currently waiting in the CrashLoopBackOff state — the container crashed and Kubernetes is retrying with exponential back-off. Each instance is a pod that cannot stay running. Common root causes: OOM kill, bad entrypoint, missing Secret or ConfigMap, an unavailable init dependency, or a broken image layer.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 3 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 6, "type": "stat", "title": "OOMKilled Containers",
|
||||
"description": "Containers whose most recent termination reason was OOMKilled. This is a current-state snapshot: a container that was OOMKilled, restarted, and is now Running will still appear here until its next termination occurs for a different reason. Non-zero and stable = recurring OOM, likely a workload memory leak or under-provisioned memory limit.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_pod_container_status_last_terminated_reason{reason=\"OOMKilled\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "orange", "value": 1 },
|
||||
{ "color": "red", "value": 5 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 7, "type": "stat", "title": "NotReady Nodes",
|
||||
"description": "Nodes where the Ready condition is currently not True (False or Unknown). A NotReady node stops receiving new pod scheduling and, after the node eviction timeout (~5 min default), pods on it will be evicted. Control plane nodes going NotReady simultaneously = potential quorum loss. Any non-zero value is a tier-1 incident signal.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"true\"} == 0) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 8, "type": "stat", "title": "Degraded Cluster Operators (OKD)",
|
||||
"description": "OKD ClusterOperators currently reporting Degraded=True. Each ClusterOperator owns a core platform component — authentication, networking, image-registry, monitoring, ingress, storage, etc. A degraded operator means its managed component is impaired or unavailable. Zero is the only acceptable steady-state value outside of an active upgrade.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(cluster_operator_conditions{condition=\"Degraded\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 9, "type": "row", "title": "Alert Overview", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 10, "type": "timeseries", "title": "Firing Alert Count by Severity Over Time",
|
||||
"description": "Instantaneous count of firing ALERTS series grouped by severity over the selected window. A vertical rise = new alerting condition emerged. A horizontal plateau = a persistent, unresolved problem. A step-down = alert resolved or Prometheus rule evaluation stopped matching. Use the Severity Filter variable to narrow scope during triage.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "count by(severity)(ALERTS{alertstate=\"firing\",severity=~\"$severity\",namespace=~\"$namespace\"})",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{severity}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "critical" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "warning" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "info" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["max", "lastNotNull"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 11, "type": "timeseries", "title": "Alertmanager Notification Rate by Integration",
|
||||
"description": "Rate of notification delivery attempts from Alertmanager per second, split by integration type (slack, pagerduty, email, webhook, etc.). Solid lines = successful deliveries; dashed red lines = failed deliveries. A drop to zero on all integrations = Alertmanager is not processing or the cluster is completely quiet. Persistent failures on one integration = check that receiver's credentials or endpoint availability.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "sum by(integration)(rate(alertmanager_notifications_total[5m]))", "refId": "A", "legendFormat": "✓ {{integration}}" },
|
||||
{ "expr": "sum by(integration)(rate(alertmanager_notifications_failed_total[5m]))", "refId": "B", "legendFormat": "✗ {{integration}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byFrameRefID", "options": "B" },
|
||||
"properties": [
|
||||
{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } },
|
||||
{ "id": "custom.lineStyle", "value": { "dash": [6, 4], "fill": "dash" } },
|
||||
{ "id": "custom.lineWidth", "value": 1 }
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 12, "type": "bargauge", "title": "Longest-Firing Active Alerts",
|
||||
"description": "Duration (now - ALERTS_FOR_STATE timestamp) for each currently firing alert, sorted descending. Alerts at the top have been firing longest and are the most likely candidates for known-but-unresolved issues, stale firing conditions, or alerts that should have a silence applied. Red bars (> 2 hours) strongly suggest a problem that has been acknowledged but not resolved.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sort_desc(time() - ALERTS_FOR_STATE{alertstate=\"firing\",severity=~\"$severity\",namespace=~\"$namespace\"})",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{alertname}} · {{severity}} · {{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 300 },
|
||||
{ "color": "orange", "value": 1800 },
|
||||
{ "color": "red", "value": 7200 }
|
||||
]}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true,
|
||||
"valueMode": "color"
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 13, "type": "row", "title": "Active Firing Alerts — Full Detail", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 14, "type": "table", "title": "All Firing Alerts",
|
||||
"description": "Instant-query table of every currently firing alert visible to Prometheus, filtered by the Namespace and Severity variables above. Each row is one alert instance (unique label combination). The value column is omitted — by definition every row here is firing. Use the built-in column filter (funnel icon) to further narrow to a specific alertname, pod, or node. Columns are sparse: labels not defined in a given alert rule will show '—'.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "ALERTS{alertstate=\"firing\",severity=~\"$severity\",namespace=~\"$namespace\"}",
|
||||
"refId": "A",
|
||||
"instant": true,
|
||||
"legendFormat": ""
|
||||
}],
|
||||
"transformations": [
|
||||
{ "id": "labelsToFields", "options": { "mode": "columns" } },
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"alertstate": true,
|
||||
"__name__": true,
|
||||
"Value": true,
|
||||
"Time": true
|
||||
},
|
||||
"renameByName": {
|
||||
"alertname": "Alert Name",
|
||||
"severity": "Severity",
|
||||
"namespace": "Namespace",
|
||||
"pod": "Pod",
|
||||
"node": "Node",
|
||||
"container": "Container",
|
||||
"job": "Job",
|
||||
"service": "Service",
|
||||
"reason": "Reason",
|
||||
"instance": "Instance"
|
||||
},
|
||||
"indexByName": {
|
||||
"severity": 0,
|
||||
"alertname": 1,
|
||||
"namespace": 2,
|
||||
"pod": 3,
|
||||
"node": 4,
|
||||
"container": 5,
|
||||
"job": 6,
|
||||
"service": 7,
|
||||
"reason": 8,
|
||||
"instance": 9
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": { "align": "left", "filterable": true },
|
||||
"noValue": "—"
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Severity" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "custom.width", "value": 110 },
|
||||
{
|
||||
"id": "mappings",
|
||||
"value": [{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"critical": { "text": "CRITICAL", "color": "dark-red", "index": 0 },
|
||||
"warning": { "text": "WARNING", "color": "dark-yellow", "index": 1 },
|
||||
"info": { "text": "INFO", "color": "dark-blue", "index": 2 }
|
||||
}
|
||||
}]
|
||||
}
|
||||
]
|
||||
},
|
||||
{ "matcher": { "id": "byName", "options": "Alert Name" }, "properties": [{ "id": "custom.width", "value": 300 }] },
|
||||
{ "matcher": { "id": "byName", "options": "Namespace" }, "properties": [{ "id": "custom.width", "value": 180 }] },
|
||||
{ "matcher": { "id": "byName", "options": "Pod" }, "properties": [{ "id": "custom.width", "value": 200 }] },
|
||||
{ "matcher": { "id": "byName", "options": "Node" }, "properties": [{ "id": "custom.width", "value": 200 }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"sortBy": [{ "desc": false, "displayName": "Severity" }],
|
||||
"footer": { "show": false }
|
||||
},
|
||||
"gridPos": { "h": 12, "w": 24, "x": 0, "y": 14 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 15, "type": "row", "title": "Kubernetes Warning Events", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 26 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 16, "type": "timeseries", "title": "Warning Event Rate by Reason",
|
||||
"description": "Rate of Kubernetes Warning-type events per second grouped by reason code. BackOff = container is CrashLooping. FailedScheduling = no node satisfies pod constraints. FailedMount = volume attachment or CSI failure. Evicted = kubelet evicted a pod due to memory or disk pressure. NodeNotReady = node lost contact. A spike in a single reason narrows the incident root-cause immediately without needing to read raw event logs. Requires kube-state-metrics with --resources=events.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "topk(10, sum by(reason)(rate(kube_event_count{type=\"Warning\",namespace=~\"$namespace\"}[5m])))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{reason}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 27 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 17, "type": "bargauge", "title": "Warning Events — Top Namespaces (Accumulated Count)",
|
||||
"description": "Total accumulated Warning event count (the count field on the Kubernetes Event object) per namespace, showing the top 15 most active. A namespace dominating this chart is generating significantly more abnormal conditions than its peers, useful for identifying noisy tenants, misconfigured deployments, or namespaces experiencing a persistent infrastructure problem. Note this is the raw Event.count field — it resets if the event object is deleted and recreated.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "topk(15, sum by(namespace)(kube_event_count{type=\"Warning\"}))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 10 },
|
||||
{ "color": "orange", "value": 50 },
|
||||
{ "color": "red", "value": 200 }
|
||||
]}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 27 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 18, "type": "timeseries", "title": "Warning Events — Accumulated Count by Reason Over Time",
|
||||
"description": "Raw accumulated event count gauge over time, split by reason. Unlike the rate panel this shows total volume and slope simultaneously. A line that climbs steeply = events are occurring frequently right now. A line that plateaus = the condition causing that reason has stopped. A line that drops to zero = the event object was deleted and recreated or the condition fully resolved.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "topk(10, sum by(reason)(kube_event_count{type=\"Warning\",namespace=~\"$namespace\"}))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{reason}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 8, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 27 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 19, "type": "row", "title": "Pod Problems", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 35 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 20, "type": "timeseries", "title": "CrashLoopBackOff Pods by Namespace",
|
||||
"description": "Count of container instances in CrashLoopBackOff waiting state over time, broken down by namespace. A sudden rise in one namespace = a workload deployment is failing. A persistent baseline across many namespaces = a shared dependency (Secret, ConfigMap, network policy, or an upstream service) has become unavailable. Unlike restart rate, this panel shows the steady-state count of pods currently stuck — not flapping.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(namespace)(kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\",namespace=~\"$namespace\"} == 1)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false,
|
||||
"thresholdsStyle": { "mode": "line" }
|
||||
},
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 5 }
|
||||
]}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 0, "y": 36 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 21, "type": "timeseries", "title": "Container Restart Rate by Namespace",
|
||||
"description": "Rate of container restarts per second across all reasons (OOMKill, liveness probe failure, process exit) grouped by namespace. A namespace with a rising restart rate that has not yet entered CrashLoopBackOff is in the early failure window before the exponential back-off penalty kicks in. Cross-reference with the OOMKilled stat tile and the last-terminated-reason to separate crash types.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "topk(10, sum by(namespace)(rate(kube_pod_container_status_restarts_total{namespace=~\"$namespace\"}[5m])))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 8, "y": 36 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 22, "type": "timeseries", "title": "Pods by Problem Phase (Failed / Pending / Unknown)",
|
||||
"description": "Count of pods in Failed, Pending, or Unknown phase over time. Failed = container terminated with a non-zero exit code or was evicted and not rescheduled. Pending for more than a few minutes = scheduler unable to bind the pod (check FailedScheduling events, node capacity, and taint/toleration mismatches). Unknown = kubelet is not reporting to the apiserver, typically indicating a node network partition or kubelet crash.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "sum by(phase)(kube_pod_status_phase{phase=~\"Failed|Unknown\",namespace=~\"$namespace\"} == 1)", "refId": "A", "legendFormat": "{{phase}}" },
|
||||
{ "expr": "sum(kube_pod_status_phase{phase=\"Pending\",namespace=~\"$namespace\"} == 1)", "refId": "B", "legendFormat": "Pending" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false,
|
||||
"thresholdsStyle": { "mode": "line" }
|
||||
},
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]}
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "Failed" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Pending" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Unknown" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 16, "y": 36 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 23, "type": "row", "title": "Node & Cluster Operator Conditions", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 43 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 24, "type": "table", "title": "Node Condition Status Matrix",
|
||||
"description": "Instant snapshot of every active node condition across all nodes. Each row is one (node, condition, status) triple where value=1, meaning that combination is currently true. Ready=true is the normal healthy state; MemoryPressure=true, DiskPressure=true, PIDPressure=true, and NetworkUnavailable=true all indicate problem states that will affect pod scheduling on that node. Use the column filter to show only conditions where status=\"true\" and condition != \"Ready\" to isolate problems quickly.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "kube_node_status_condition == 1",
|
||||
"refId": "A",
|
||||
"instant": true,
|
||||
"legendFormat": ""
|
||||
}],
|
||||
"transformations": [
|
||||
{ "id": "labelsToFields", "options": { "mode": "columns" } },
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"Time": true,
|
||||
"Value": true,
|
||||
"__name__": true,
|
||||
"endpoint": true,
|
||||
"job": true,
|
||||
"service": true,
|
||||
"instance": true
|
||||
},
|
||||
"renameByName": {
|
||||
"node": "Node",
|
||||
"condition": "Condition",
|
||||
"status": "Status"
|
||||
},
|
||||
"indexByName": { "node": 0, "condition": 1, "status": 2 }
|
||||
}
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": { "align": "left", "filterable": true },
|
||||
"noValue": "—"
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Status" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "custom.width", "value": 90 },
|
||||
{
|
||||
"id": "mappings",
|
||||
"value": [{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"true": { "text": "true", "color": "green", "index": 0 },
|
||||
"false": { "text": "false", "color": "dark-red", "index": 1 },
|
||||
"unknown": { "text": "unknown", "color": "dark-orange", "index": 2 }
|
||||
}
|
||||
}]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Condition" },
|
||||
"properties": [
|
||||
{ "id": "custom.width", "value": 190 },
|
||||
{ "id": "custom.displayMode", "value": "color-text" },
|
||||
{
|
||||
"id": "mappings",
|
||||
"value": [{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"Ready": { "color": "green", "index": 0 },
|
||||
"MemoryPressure": { "color": "red", "index": 1 },
|
||||
"DiskPressure": { "color": "red", "index": 2 },
|
||||
"PIDPressure": { "color": "red", "index": 3 },
|
||||
"NetworkUnavailable": { "color": "red", "index": 4 }
|
||||
}
|
||||
}]
|
||||
}
|
||||
]
|
||||
},
|
||||
{ "matcher": { "id": "byName", "options": "Node" }, "properties": [{ "id": "custom.width", "value": 230 }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"sortBy": [{ "desc": false, "displayName": "Node" }],
|
||||
"footer": { "show": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 44 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 25, "type": "table", "title": "Cluster Operator Conditions — Degraded & Progressing (OKD)",
|
||||
"description": "Shows only ClusterOperator conditions that indicate a problem state: Degraded=True (operator has failed to achieve its desired state) or Progressing=True (operator is actively reconciling — normal during upgrades but alarming in steady state). Operators not appearing in this table are healthy. The reason column gives the operator's own explanation for the condition, which maps directly to the relevant operator log stream and OpenShift runbook.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "cluster_operator_conditions{condition=\"Degraded\"} == 1",
|
||||
"refId": "A",
|
||||
"instant": true,
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "cluster_operator_conditions{condition=\"Progressing\"} == 1",
|
||||
"refId": "B",
|
||||
"instant": true,
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{ "id": "labelsToFields", "options": { "mode": "columns" } },
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"Time": true,
|
||||
"Value": true,
|
||||
"__name__": true,
|
||||
"endpoint": true,
|
||||
"job": true,
|
||||
"service": true,
|
||||
"instance": true,
|
||||
"namespace": true
|
||||
},
|
||||
"renameByName": {
|
||||
"name": "Operator",
|
||||
"condition": "Condition",
|
||||
"reason": "Reason"
|
||||
},
|
||||
"indexByName": { "name": 0, "condition": 1, "reason": 2 }
|
||||
}
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": { "align": "left", "filterable": true },
|
||||
"noValue": "—"
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Condition" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "custom.width", "value": 140 },
|
||||
{
|
||||
"id": "mappings",
|
||||
"value": [{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"Degraded": { "text": "Degraded", "color": "dark-red", "index": 0 },
|
||||
"Progressing": { "text": "Progressing", "color": "dark-yellow", "index": 1 }
|
||||
}
|
||||
}]
|
||||
}
|
||||
]
|
||||
},
|
||||
{ "matcher": { "id": "byName", "options": "Operator" }, "properties": [{ "id": "custom.width", "value": 240 }] },
|
||||
{ "matcher": { "id": "byName", "options": "Reason" }, "properties": [{ "id": "custom.width", "value": 220 }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"sortBy": [{ "desc": false, "displayName": "Condition" }],
|
||||
"footer": { "show": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 44 }
|
||||
}
|
||||
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,739 @@
|
||||
{
|
||||
"title": "Cluster Overview",
|
||||
"uid": "okd-cluster-overview",
|
||||
"schemaVersion": 36,
|
||||
"version": 2,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"tags": ["okd", "cluster", "overview"],
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"type": "stat",
|
||||
"title": "Ready Nodes",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"true\"} == 1)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "green", "value": 1 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"type": "stat",
|
||||
"title": "Not Ready Nodes",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"false\"} == 1) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"type": "stat",
|
||||
"title": "Running Pods",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(kube_pod_status_phase{phase=\"Running\"} == 1)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "green", "value": 1 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"type": "stat",
|
||||
"title": "Pending Pods",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(kube_pod_status_phase{phase=\"Pending\"} == 1) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 5 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"type": "stat",
|
||||
"title": "Failed Pods",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(kube_pod_status_phase{phase=\"Failed\"} == 1) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"type": "stat",
|
||||
"title": "CrashLoopBackOff",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\"} == 1) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"type": "stat",
|
||||
"title": "Critical Alerts",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(ALERTS{alertstate=\"firing\",severity=\"critical\"}) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"type": "stat",
|
||||
"title": "Warning Alerts",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(ALERTS{alertstate=\"firing\",severity=\"warning\"}) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 10 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||
},
|
||||
{
|
||||
"id": 9,
|
||||
"type": "gauge",
|
||||
"title": "CPU Usage",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))",
|
||||
"refId": "A",
|
||||
"legendFormat": "CPU"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 70 },
|
||||
{ "color": "red", "value": 85 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"showThresholdLabels": false,
|
||||
"showThresholdMarkers": true,
|
||||
"orientation": "auto"
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 5, "x": 0, "y": 4 }
|
||||
},
|
||||
{
|
||||
"id": 10,
|
||||
"type": "gauge",
|
||||
"title": "Memory Usage",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (sum(node_memory_MemAvailable_bytes) / sum(node_memory_MemTotal_bytes)))",
|
||||
"refId": "A",
|
||||
"legendFormat": "Memory"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 75 },
|
||||
{ "color": "red", "value": 90 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"showThresholdLabels": false,
|
||||
"showThresholdMarkers": true,
|
||||
"orientation": "auto"
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 5, "x": 5, "y": 4 }
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"type": "gauge",
|
||||
"title": "Root Disk Usage",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (sum(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"})))",
|
||||
"refId": "A",
|
||||
"legendFormat": "Disk"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 70 },
|
||||
{ "color": "red", "value": 85 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"showThresholdLabels": false,
|
||||
"showThresholdMarkers": true,
|
||||
"orientation": "auto"
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 4, "x": 10, "y": 4 }
|
||||
},
|
||||
{
|
||||
"id": 12,
|
||||
"type": "stat",
|
||||
"title": "etcd Has Leader",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "min(etcd_server_has_leader)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "green", "value": 1 }
|
||||
]
|
||||
},
|
||||
"mappings": [
|
||||
{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"0": { "text": "NO LEADER", "color": "red" },
|
||||
"1": { "text": "LEADER OK", "color": "green" }
|
||||
}
|
||||
}
|
||||
],
|
||||
"unit": "short",
|
||||
"noValue": "?"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 3, "w": 5, "x": 14, "y": 4 }
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"type": "stat",
|
||||
"title": "API Servers Up",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(up{job=\"apiserver\"})",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "green", "value": 2 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 3, "w": 5, "x": 19, "y": 4 }
|
||||
},
|
||||
{
|
||||
"id": 14,
|
||||
"type": "stat",
|
||||
"title": "etcd Members Up",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(up{job=\"etcd\"})",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 2 },
|
||||
{ "color": "green", "value": 3 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 3, "w": 5, "x": 14, "y": 7 }
|
||||
},
|
||||
{
|
||||
"id": 15,
|
||||
"type": "stat",
|
||||
"title": "Operators Degraded",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(cluster_operator_conditions{condition=\"Degraded\",status=\"True\"} == 1) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 3, "w": 5, "x": 19, "y": 7 }
|
||||
},
|
||||
{
|
||||
"id": 16,
|
||||
"type": "timeseries",
|
||||
"title": "CPU Usage per Node (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 10,
|
||||
"spanNulls": false,
|
||||
"showPoints": "never"
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": {
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"calcs": ["mean", "max"]
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 10 }
|
||||
},
|
||||
{
|
||||
"id": 17,
|
||||
"type": "timeseries",
|
||||
"title": "Memory Usage per Node (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 10,
|
||||
"spanNulls": false,
|
||||
"showPoints": "never"
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": {
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"calcs": ["mean", "max"]
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 10 }
|
||||
},
|
||||
{
|
||||
"id": 18,
|
||||
"type": "timeseries",
|
||||
"title": "Network Traffic — Cluster Total",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo|veth.*|tun.*|ovn.*|br-int|br-ex\"}[5m]))",
|
||||
"refId": "A",
|
||||
"legendFormat": "Receive"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo|veth.*|tun.*|ovn.*|br-int|br-ex\"}[5m]))",
|
||||
"refId": "B",
|
||||
"legendFormat": "Transmit"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps",
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 10,
|
||||
"spanNulls": false,
|
||||
"showPoints": "never"
|
||||
}
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Receive" },
|
||||
"properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Transmit" },
|
||||
"properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "none" },
|
||||
"legend": {
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"calcs": ["mean", "max"]
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 18 }
|
||||
},
|
||||
{
|
||||
"id": 19,
|
||||
"type": "timeseries",
|
||||
"title": "Pod Phases Over Time",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(kube_pod_status_phase{phase=\"Running\"} == 1)",
|
||||
"refId": "A",
|
||||
"legendFormat": "Running"
|
||||
},
|
||||
{
|
||||
"expr": "count(kube_pod_status_phase{phase=\"Pending\"} == 1) or vector(0)",
|
||||
"refId": "B",
|
||||
"legendFormat": "Pending"
|
||||
},
|
||||
{
|
||||
"expr": "count(kube_pod_status_phase{phase=\"Failed\"} == 1) or vector(0)",
|
||||
"refId": "C",
|
||||
"legendFormat": "Failed"
|
||||
},
|
||||
{
|
||||
"expr": "count(kube_pod_status_phase{phase=\"Unknown\"} == 1) or vector(0)",
|
||||
"refId": "D",
|
||||
"legendFormat": "Unknown"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short",
|
||||
"custom": {
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 15,
|
||||
"spanNulls": false,
|
||||
"showPoints": "never"
|
||||
}
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Running" },
|
||||
"properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Pending" },
|
||||
"properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Failed" },
|
||||
"properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Unknown" },
|
||||
"properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "none" },
|
||||
"legend": {
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"calcs": ["lastNotNull"]
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 18 }
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,742 @@
|
||||
{
|
||||
"title": "Control Plane Health",
|
||||
"uid": "okd-control-plane",
|
||||
"schemaVersion": 36,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"tags": ["okd", "control-plane"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "instance",
|
||||
"type": "query",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"query": { "query": "label_values(apiserver_request_total, instance)", "refId": "A" },
|
||||
"refresh": 2,
|
||||
"includeAll": true,
|
||||
"multi": true,
|
||||
"allValue": ".*",
|
||||
"label": "API Server Instance",
|
||||
"sort": 1,
|
||||
"current": {},
|
||||
"options": []
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
|
||||
{
|
||||
"id": 1, "type": "stat", "title": "API Servers Up",
|
||||
"description": "Number of kube-apiserver instances currently scraped and up. Healthy HA cluster = 3.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(up{job=~\".*apiserver.*\"} == 1)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "green", "value": 3 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 2, "type": "stat", "title": "Controller Managers Up",
|
||||
"description": "kube-controller-manager instances up. In OKD only one holds the leader lease at a time; others are hot standbys.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(up{job=~\".*controller-manager.*\"} == 1)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "green", "value": 3 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 3, "type": "stat", "title": "Schedulers Up",
|
||||
"description": "kube-scheduler instances up. One holds the leader lease; rest are standbys. 0 = no scheduling of new pods.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(up{job=~\".*scheduler.*\"} == 1)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "green", "value": 3 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 4, "type": "stat", "title": "API 5xx Rate",
|
||||
"description": "Server-side errors (5xx) across all apiserver instances per second. Any sustained non-zero value = apiserver internal fault.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "sum(rate(apiserver_request_total{code=~\"5..\"}[5m]))", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 0.01 },
|
||||
{ "color": "red", "value": 1 }
|
||||
]},
|
||||
"unit": "reqps", "noValue": "0", "decimals": 3
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 5, "type": "stat", "title": "Inflight — Mutating",
|
||||
"description": "Current in-flight mutating requests (POST/PUT/PATCH/DELETE). Default OKD limit is ~1000. Hitting the limit = 429 errors for writes.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "sum(apiserver_current_inflight_requests{request_kind=\"mutating\"})", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 500 },
|
||||
{ "color": "orange", "value": 750 },
|
||||
{ "color": "red", "value": 900 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 6, "type": "stat", "title": "Inflight — Read-Only",
|
||||
"description": "Current in-flight non-mutating requests (GET/LIST/WATCH). Default OKD limit is ~3000. Hitting it = 429 for reads, impacting controllers and kubectl.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "sum(apiserver_current_inflight_requests{request_kind=\"readOnly\"})", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1500 },
|
||||
{ "color": "orange", "value": 2200 },
|
||||
{ "color": "red", "value": 2700 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 7, "type": "stat", "title": "API Request p99 (non-WATCH)",
|
||||
"description": "Overall p99 latency for all non-streaming verbs. >1s = noticeable kubectl sluggishness. >10s = controllers timing out on LIST/GET.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{verb!~\"WATCH|CONNECT\"}[5m])) by (le))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 0.5 },
|
||||
{ "color": "orange", "value": 1 },
|
||||
{ "color": "red", "value": 5 }
|
||||
]},
|
||||
"unit": "s", "noValue": "0", "decimals": 3
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 8, "type": "stat", "title": "APIServer → etcd p99",
|
||||
"description": "p99 time apiserver spends waiting on etcd calls. Spike here while WAL fsync is healthy = serialization or large object overhead.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(apiserver_storage_request_duration_seconds_bucket[5m])) by (le))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 0.05 },
|
||||
{ "color": "orange", "value": 0.2 },
|
||||
{ "color": "red", "value": 0.5 }
|
||||
]},
|
||||
"unit": "s", "noValue": "0", "decimals": 4
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 9, "type": "row", "title": "API Server — Request Rates & Errors", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 10, "type": "timeseries", "title": "Request Rate by Verb",
|
||||
"description": "Non-streaming calls per second broken down by verb. GET/LIST = read load from controllers. POST/PUT/PATCH/DELETE = write throughput. A sudden LIST spike = controller cache resync storm.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(verb)(rate(apiserver_request_total{instance=~\"$instance\",verb!~\"WATCH|CONNECT\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{verb}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 11, "type": "timeseries", "title": "Error Rate by HTTP Status Code",
|
||||
"description": "4xx/5xx responses per second by code. 429 = inflight limit hit (throttling). 422 = admission rejection or invalid object. 500/503 = internal apiserver fault or etcd unavailability.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(code)(rate(apiserver_request_total{instance=~\"$instance\",code=~\"[45]..\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "HTTP {{code}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 12, "type": "timeseries", "title": "In-Flight Requests — Mutating vs Read-Only",
|
||||
"description": "Instantaneous count of requests being actively handled. The two series correspond to the two inflight limit buckets enforced by the apiserver's Priority and Fairness (APF) or legacy inflight settings.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "sum by(request_kind)(apiserver_current_inflight_requests{instance=~\"$instance\"})", "refId": "A", "legendFormat": "{{request_kind}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 20, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 13, "type": "row", "title": "API Server — Latency", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 14, "type": "timeseries", "title": "Request Latency — p50 / p95 / p99 (non-WATCH)",
|
||||
"description": "Aggregated end-to-end request duration across all verbs except WATCH/CONNECT (which are unbounded streaming). A rising p99 without a matching rise in etcd latency = CPU saturation, admission webhook slowness, or serialization overhead.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum(rate(apiserver_request_duration_seconds_bucket{instance=~\"$instance\",verb!~\"WATCH|CONNECT\"}[5m])) by (le))", "refId": "A", "legendFormat": "p50" },
|
||||
{ "expr": "histogram_quantile(0.95, sum(rate(apiserver_request_duration_seconds_bucket{instance=~\"$instance\",verb!~\"WATCH|CONNECT\"}[5m])) by (le))", "refId": "B", "legendFormat": "p95" },
|
||||
{ "expr": "histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{instance=~\"$instance\",verb!~\"WATCH|CONNECT\"}[5m])) by (le))", "refId": "C", "legendFormat": "p99" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 14 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 15, "type": "timeseries", "title": "Request p99 Latency by Verb",
|
||||
"description": "p99 latency broken out per verb. LIST is inherently slower than GET due to serializing full collections. A POST/PUT spike = heavy admission webhook chain or large object writes. DELETE spikes are usually caused by cascading GC finalizer storms.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "histogram_quantile(0.99, sum by(verb,le)(rate(apiserver_request_duration_seconds_bucket{instance=~\"$instance\",verb!~\"WATCH|CONNECT\"}[5m])))",
|
||||
"refId": "A", "legendFormat": "{{verb}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 14 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 16, "type": "timeseries", "title": "APIServer → etcd Latency by Operation",
|
||||
"description": "Time apiserver spends waiting on etcd, split by operation type (get, list, create, update, delete, watch). Elevated get/list = etcd read pressure. Elevated create/update = write bottleneck, likely correlated with WAL fsync latency.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum by(operation,le)(rate(apiserver_storage_request_duration_seconds_bucket[5m])))", "refId": "A", "legendFormat": "p50 — {{operation}}" },
|
||||
{ "expr": "histogram_quantile(0.99, sum by(operation,le)(rate(apiserver_storage_request_duration_seconds_bucket[5m])))", "refId": "B", "legendFormat": "p99 — {{operation}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 14 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 17, "type": "row", "title": "API Server — Watches & Long-Running Requests", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 22 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 18, "type": "timeseries", "title": "Active Long-Running Requests (Watches) by Resource",
|
||||
"description": "Instantaneous count of open WATCH streams grouped by resource. Each controller typically holds one WATCH per resource type per apiserver instance. A sudden drop = controller restart; a runaway climb = operator creating watches without cleanup.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(resource)(apiserver_longrunning_requests{instance=~\"$instance\",verb=\"WATCH\"})",
|
||||
"refId": "A", "legendFormat": "{{resource}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 0, "y": 23 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 19, "type": "timeseries", "title": "Watch Events Dispatched Rate by Kind",
|
||||
"description": "Watch events sent to all active watchers per second, by object kind. Persistent high rate for a specific kind = that resource type is churning heavily, increasing etcd load and controller reconcile frequency.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(kind)(rate(apiserver_watch_events_total{instance=~\"$instance\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{kind}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 8, "y": 23 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 20, "type": "timeseries", "title": "Watch Event Size — p50 / p95 / p99 by Kind",
|
||||
"description": "Size of individual watch events dispatched to clients. Large events (MiB-scale) for Secrets or ConfigMaps = objects being stored with oversized data. Contributes to apiserver memory pressure and network saturation.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum by(kind,le)(rate(apiserver_watch_events_sizes_bucket{instance=~\"$instance\"}[5m])))", "refId": "A", "legendFormat": "p50 — {{kind}}" },
|
||||
{ "expr": "histogram_quantile(0.99, sum by(kind,le)(rate(apiserver_watch_events_sizes_bucket{instance=~\"$instance\"}[5m])))", "refId": "B", "legendFormat": "p99 — {{kind}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 16, "y": 23 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 21, "type": "row", "title": "Admission Webhooks", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 30 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 22, "type": "timeseries", "title": "Webhook Call Rate by Name",
|
||||
"description": "Mutating and validating admission webhook invocations per second by webhook name. A webhook invoked on every write (e.g., a mutating webhook with no object selector) can be a major source of write latency amplification.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(name,type)(rate(apiserver_admission_webhook_request_total{instance=~\"$instance\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{type}} — {{name}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 0, "y": 31 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 23, "type": "timeseries", "title": "Webhook Latency p99 by Name",
|
||||
"description": "p99 round-trip time per webhook call (network + webhook server processing). Default apiserver timeout is 10s; a webhook consistently near that limit causes cascading write latency for all resources it intercepts.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "histogram_quantile(0.99, sum by(name,le)(rate(apiserver_admission_webhook_admission_duration_seconds_bucket{instance=~\"$instance\"}[5m])))",
|
||||
"refId": "A", "legendFormat": "{{name}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 0.5 },
|
||||
{ "color": "red", "value": 2.0 }
|
||||
]}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 8, "y": 31 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 24, "type": "timeseries", "title": "Webhook Rejection Rate by Name",
|
||||
"description": "Rate of admission denials per webhook. A validating webhook rejecting requests is expected behaviour; a sudden surge indicates either a newly enforced policy or a misbehaving webhook rejecting valid objects.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(name,error_type)(rate(apiserver_admission_webhook_rejection_count{instance=~\"$instance\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{name}} ({{error_type}})"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 16, "y": 31 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 25, "type": "row", "title": "kube-controller-manager", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 38 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 26, "type": "timeseries", "title": "Work Queue Depth by Controller",
|
||||
"description": "Items waiting to be reconciled in each controller's work queue. Persistent non-zero depth = controller cannot keep up with the event rate. Identifies which specific controller is the bottleneck during overload incidents.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "topk(15, sum by(name)(workqueue_depth{job=~\".*controller-manager.*\"}))",
|
||||
"refId": "A", "legendFormat": "{{name}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false,
|
||||
"thresholdsStyle": { "mode": "line" }
|
||||
},
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 10 },
|
||||
{ "color": "red", "value": 50 }
|
||||
]}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 39 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 27, "type": "timeseries", "title": "Work Queue Item Processing Duration p99 by Controller",
|
||||
"description": "p99 time a work item spends being actively reconciled (inside the reconcile loop, excludes queue wait time). A slow reconcile = either the controller is doing expensive API calls or the etcd write path is slow.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "histogram_quantile(0.99, sum by(name,le)(rate(workqueue_work_duration_seconds_bucket{job=~\".*controller-manager.*\"}[5m])))",
|
||||
"refId": "A", "legendFormat": "{{name}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 39 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 28, "type": "timeseries", "title": "Work Queue Retry Rate by Controller",
|
||||
"description": "Rate of items being re-queued after a failed reconciliation. A persistently high retry rate for a controller = it is encountering recurring errors on the same objects (e.g., API permission errors, webhook rejections, or resource conflicts).",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "topk(15, sum by(name)(rate(workqueue_retries_total{job=~\".*controller-manager.*\"}[5m])))",
|
||||
"refId": "A", "legendFormat": "{{name}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 39 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 29, "type": "row", "title": "kube-scheduler", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 47 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 30, "type": "timeseries", "title": "Scheduling Attempt Rate by Result",
|
||||
"description": "Outcomes of scheduling cycles per second. scheduled = pod successfully bound to a node. unschedulable = no node met the pod's constraints. error = scheduler internal failure (API error, timeout). Persistent unschedulable = cluster capacity or taints/affinity misconfiguration.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(result)(rate(scheduler_schedule_attempts_total[5m]))",
|
||||
"refId": "A", "legendFormat": "{{result}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "scheduled" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "unschedulable" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "error" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 0, "y": 48 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 31, "type": "timeseries", "title": "Scheduling Latency — p50 / p95 / p99",
|
||||
"description": "Time from when a pod enters the active queue to when a binding decision is made (does not include bind API call time). Includes filter, score, and reserve plugin execution time. Spike = expensive affinity rules, large number of nodes, or slow extender webhooks.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket[5m])) by (le))", "refId": "A", "legendFormat": "p50" },
|
||||
{ "expr": "histogram_quantile(0.95, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket[5m])) by (le))", "refId": "B", "legendFormat": "p95" },
|
||||
{ "expr": "histogram_quantile(0.99, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket[5m])) by (le))", "refId": "C", "legendFormat": "p99" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 8, "y": 48 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 32, "type": "timeseries", "title": "Pending Pods by Queue",
|
||||
"description": "Pods waiting to be scheduled, split by internal queue. active = ready to be attempted now. backoff = recently failed, in exponential back-off. unschedulable = parked until cluster state changes. A growing unschedulable queue = systemic capacity or constraint problem.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(queue)(scheduler_pending_pods)",
|
||||
"refId": "A", "legendFormat": "{{queue}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false,
|
||||
"thresholdsStyle": { "mode": "line" }
|
||||
},
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 10 },
|
||||
{ "color": "red", "value": 50 }
|
||||
]}
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "unschedulable" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "backoff" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "active" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 16, "y": 48 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 33, "type": "row", "title": "Process Resources — All Control Plane Components", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 55 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 34, "type": "timeseries", "title": "CPU Usage by Component",
|
||||
"description": "Rate of CPU seconds consumed by each control plane process. apiserver CPU spike = surge in request volume or list serialization. controller-manager CPU spike = reconcile storm. scheduler CPU spike = large node count with complex affinity.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "sum by(job)(rate(process_cpu_seconds_total{job=~\".*apiserver.*\"}[5m]))", "refId": "A", "legendFormat": "apiserver — {{job}}" },
|
||||
{ "expr": "sum by(job)(rate(process_cpu_seconds_total{job=~\".*controller-manager.*\"}[5m]))", "refId": "B", "legendFormat": "controller-manager — {{job}}" },
|
||||
{ "expr": "sum by(job)(rate(process_cpu_seconds_total{job=~\".*scheduler.*\"}[5m]))", "refId": "C", "legendFormat": "scheduler — {{job}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percentunit", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 0, "y": 56 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 35, "type": "timeseries", "title": "RSS Memory by Component",
|
||||
"description": "Resident set size of each control plane process. apiserver memory is dominated by the watch cache size and serialisation buffers. controller-manager memory = informer caches. Monotonically growing RSS without restarts = memory leak or unbounded cache growth.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "sum by(job)(process_resident_memory_bytes{job=~\".*apiserver.*\"})", "refId": "A", "legendFormat": "apiserver — {{job}}" },
|
||||
{ "expr": "sum by(job)(process_resident_memory_bytes{job=~\".*controller-manager.*\"})", "refId": "B", "legendFormat": "controller-manager — {{job}}" },
|
||||
{ "expr": "sum by(job)(process_resident_memory_bytes{job=~\".*scheduler.*\"})", "refId": "C", "legendFormat": "scheduler — {{job}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 8, "y": 56 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 36, "type": "timeseries", "title": "Goroutines by Component",
|
||||
"description": "Number of live goroutines in each control plane process. Gradual upward drift = goroutine leak (often tied to unclosed watch streams or context leaks). A step-down = process restart. apiserver typically runs 200–600 goroutines; spikes above 1000 warrant investigation.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "sum by(job)(go_goroutines{job=~\".*apiserver.*\"})", "refId": "A", "legendFormat": "apiserver — {{job}}" },
|
||||
{ "expr": "sum by(job)(go_goroutines{job=~\".*controller-manager.*\"})", "refId": "B", "legendFormat": "controller-manager — {{job}}" },
|
||||
{ "expr": "sum by(job)(go_goroutines{job=~\".*scheduler.*\"})", "refId": "C", "legendFormat": "scheduler — {{job}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 16, "y": 56 }
|
||||
}
|
||||
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,734 @@
|
||||
{
|
||||
"title": "etcd",
|
||||
"uid": "okd-etcd",
|
||||
"schemaVersion": 36,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"tags": ["okd", "etcd"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "instance",
|
||||
"type": "query",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"query": { "query": "label_values(etcd_server_has_leader, instance)", "refId": "A" },
|
||||
"refresh": 2,
|
||||
"includeAll": true,
|
||||
"multi": true,
|
||||
"allValue": ".*",
|
||||
"label": "Instance",
|
||||
"sort": 1,
|
||||
"current": {},
|
||||
"options": []
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
|
||||
{
|
||||
"id": 1, "type": "stat", "title": "Cluster Members",
|
||||
"description": "Total number of etcd members currently reporting metrics.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(etcd_server_has_leader)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "green", "value": 3 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 2, "type": "stat", "title": "Has Leader",
|
||||
"description": "min() across all members. 0 = at least one member has no quorum — cluster is degraded.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "min(etcd_server_has_leader)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "green", "value": 1 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0",
|
||||
"mappings": [
|
||||
{ "type": "value", "options": {
|
||||
"0": { "text": "NO LEADER", "color": "red" },
|
||||
"1": { "text": "OK", "color": "green" }
|
||||
}}
|
||||
]
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 3, "type": "stat", "title": "Leader Changes (1h)",
|
||||
"description": "Number of leader elections in the last hour. ≥3 indicates cluster instability.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "sum(changes(etcd_server_leader_changes_seen_total[1h]))", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 3 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 4, "type": "stat", "title": "DB Size (Max)",
|
||||
"description": "Largest boltdb file size across all members. Default etcd quota is 8 GiB.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "max(etcd_mvcc_db_total_size_in_bytes)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 2147483648 },
|
||||
{ "color": "orange", "value": 5368709120 },
|
||||
{ "color": "red", "value": 7516192768 }
|
||||
]},
|
||||
"unit": "bytes", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 5, "type": "stat", "title": "DB Fragmentation (Max)",
|
||||
"description": "% of DB space that is allocated but unused. >50% → run etcdctl defrag.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "max((etcd_mvcc_db_total_size_in_bytes - etcd_mvcc_db_total_size_in_use_in_bytes) / etcd_mvcc_db_total_size_in_bytes * 100)",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 25 },
|
||||
{ "color": "orange", "value": 50 },
|
||||
{ "color": "red", "value": 75 }
|
||||
]},
|
||||
"unit": "percent", "noValue": "0", "decimals": 1
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 6, "type": "stat", "title": "Failed Proposals/s",
|
||||
"description": "Rate of rejected Raft proposals. Any sustained non-zero value = cluster health problem.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "sum(rate(etcd_server_proposals_failed_total[5m]))", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 0.001 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0", "decimals": 3
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 7, "type": "stat", "title": "WAL Fsync p99",
|
||||
"description": "99th percentile WAL flush-to-disk time. >10ms is concerning; >100ms = serious I/O bottleneck.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) by (le))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 0.01 },
|
||||
{ "color": "orange", "value": 0.1 },
|
||||
{ "color": "red", "value": 0.5 }
|
||||
]},
|
||||
"unit": "s", "noValue": "0", "decimals": 4
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 8, "type": "stat", "title": "Backend Commit p99",
|
||||
"description": "99th percentile boltdb commit time. >25ms = warning; >100ms = critical backend I/O pressure.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) by (le))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 0.025 },
|
||||
{ "color": "orange", "value": 0.1 },
|
||||
{ "color": "red", "value": 0.25 }
|
||||
]},
|
||||
"unit": "s", "noValue": "0", "decimals": 4
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 9, "type": "row", "title": "Cluster Health", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 10, "type": "timeseries", "title": "Has Leader per Instance",
|
||||
"description": "1 = member has a leader; 0 = member lost quorum. A dip to 0 marks the exact moment of a leader election.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "etcd_server_has_leader{instance=~\"$instance\"}",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0, "max": 1.1,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 20, "showPoints": "never", "spanNulls": false },
|
||||
"mappings": [
|
||||
{ "type": "value", "options": {
|
||||
"0": { "text": "0 — no leader" },
|
||||
"1": { "text": "1 — ok" }
|
||||
}}
|
||||
]
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "none" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": [] }
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 8, "x": 0, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 11, "type": "timeseries", "title": "Leader Changes (cumulative)",
|
||||
"description": "Monotonically increasing counter per member. A step jump = one leader election. Correlated jumps across members = cluster-wide event.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "etcd_server_leader_changes_seen_total{instance=~\"$instance\"}",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "auto", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "none" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull"] }
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 8, "x": 8, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 12, "type": "timeseries", "title": "Slow Operations",
|
||||
"description": "slow_apply: proposals applied slower than expected. slow_read_index: linearizable reads timing out. heartbeat_failures: Raft heartbeat send errors (network partition indicator).",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "rate(etcd_server_slow_apply_total{instance=~\"$instance\"}[5m])", "refId": "A", "legendFormat": "Slow Apply — {{instance}}" },
|
||||
{ "expr": "rate(etcd_server_slow_read_indexes_total{instance=~\"$instance\"}[5m])", "refId": "B", "legendFormat": "Slow Read Index — {{instance}}" },
|
||||
{ "expr": "rate(etcd_server_heartbeat_send_failures_total{instance=~\"$instance\"}[5m])", "refId": "C", "legendFormat": "Heartbeat Failures — {{instance}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 8, "x": 16, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 13, "type": "row", "title": "gRPC Traffic", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 11 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 14, "type": "timeseries", "title": "gRPC Request Rate by Method",
|
||||
"description": "Unary calls/s per RPC method. High Put/Txn = heavy write load. High Range = heavy read load. High Watch = many controller watchers.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(grpc_method)(rate(grpc_server_started_total{job=~\".*etcd.*\",grpc_type=\"unary\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{grpc_method}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 12 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 15, "type": "timeseries", "title": "gRPC Error Rate by Status Code",
|
||||
"description": "Non-OK responses by gRPC status code. RESOURCE_EXHAUSTED = overloaded. UNAVAILABLE = leader election. DEADLINE_EXCEEDED = latency spike.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(grpc_code)(rate(grpc_server_handled_total{job=~\".*etcd.*\",grpc_code!=\"OK\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{grpc_code}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 12 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 16, "type": "timeseries", "title": "gRPC Request Latency (p50 / p95 / p99)",
|
||||
"description": "Unary call handling duration. p99 > 100ms for Put/Txn indicates disk or CPU pressure. p99 > 500ms will cause kube-apiserver timeouts.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum(rate(grpc_server_handling_seconds_bucket{job=~\".*etcd.*\",grpc_type=\"unary\"}[5m])) by (le))", "refId": "A", "legendFormat": "p50" },
|
||||
{ "expr": "histogram_quantile(0.95, sum(rate(grpc_server_handling_seconds_bucket{job=~\".*etcd.*\",grpc_type=\"unary\"}[5m])) by (le))", "refId": "B", "legendFormat": "p95" },
|
||||
{ "expr": "histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~\".*etcd.*\",grpc_type=\"unary\"}[5m])) by (le))", "refId": "C", "legendFormat": "p99" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 12 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 17, "type": "row", "title": "Raft Proposals", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 20 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 18, "type": "timeseries", "title": "Proposals Committed vs Applied",
|
||||
"description": "Committed = agreed by Raft quorum. Applied = persisted to boltdb. A widening gap between the two = backend apply backlog (disk too slow to keep up).",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "rate(etcd_server_proposals_committed_total{instance=~\"$instance\"}[5m])", "refId": "A", "legendFormat": "Committed — {{instance}}" },
|
||||
{ "expr": "rate(etcd_server_proposals_applied_total{instance=~\"$instance\"}[5m])", "refId": "B", "legendFormat": "Applied — {{instance}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 0, "y": 21 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 19, "type": "timeseries", "title": "Proposals Pending",
|
||||
"description": "In-flight Raft proposals not yet committed. Consistently high (>5) = cluster cannot keep up with write throughput.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "etcd_server_proposals_pending{instance=~\"$instance\"}",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false,
|
||||
"thresholdsStyle": { "mode": "line+area" }
|
||||
},
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 5 },
|
||||
{ "color": "red", "value": 10 }
|
||||
]}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 8, "y": 21 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 20, "type": "timeseries", "title": "Failed Proposals Rate",
|
||||
"description": "Raft proposals that were rejected. Root causes: quorum loss, leader timeout, network partition between members.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "rate(etcd_server_proposals_failed_total{instance=~\"$instance\"}[5m])",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2, "fillOpacity": 20, "showPoints": "never", "spanNulls": false,
|
||||
"thresholdsStyle": { "mode": "line" }
|
||||
},
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 0.001 }
|
||||
]}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 16, "y": 21 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 21, "type": "row", "title": "Disk I/O", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 28 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 22, "type": "timeseries", "title": "WAL Fsync Duration (p50 / p95 / p99) per Instance",
|
||||
"description": "Time to flush the write-ahead log to disk. etcd is extremely sensitive to WAL latency. >10ms p99 = storage is the bottleneck. Correlates directly with Raft commit latency.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum by(le,instance)(rate(etcd_disk_wal_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "A", "legendFormat": "p50 — {{instance}}" },
|
||||
{ "expr": "histogram_quantile(0.95, sum by(le,instance)(rate(etcd_disk_wal_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "B", "legendFormat": "p95 — {{instance}}" },
|
||||
{ "expr": "histogram_quantile(0.99, sum by(le,instance)(rate(etcd_disk_wal_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "C", "legendFormat": "p99 — {{instance}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 29 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 23, "type": "timeseries", "title": "Backend Commit Duration (p50 / p95 / p99) per Instance",
|
||||
"description": "Time for boltdb to commit a batch transaction. A spike here while WAL is healthy = backend I/O saturation or boltdb lock contention. Triggers apply backlog.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum by(le,instance)(rate(etcd_disk_backend_commit_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "A", "legendFormat": "p50 — {{instance}}" },
|
||||
{ "expr": "histogram_quantile(0.95, sum by(le,instance)(rate(etcd_disk_backend_commit_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "B", "legendFormat": "p95 — {{instance}}" },
|
||||
{ "expr": "histogram_quantile(0.99, sum by(le,instance)(rate(etcd_disk_backend_commit_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "C", "legendFormat": "p99 — {{instance}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 29 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 24, "type": "row", "title": "Network (Peer & Client)", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 37 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 25, "type": "timeseries", "title": "Peer RX Rate",
|
||||
"description": "Bytes received from Raft peers (log replication + heartbeats). A burst during a quiet period = large snapshot being streamed to a recovering member.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "rate(etcd_network_peer_received_bytes_total{instance=~\"$instance\"}[5m])",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 6, "x": 0, "y": 38 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 26, "type": "timeseries", "title": "Peer TX Rate",
|
||||
"description": "Bytes sent to Raft peers. Leader will have higher TX than followers (it replicates entries to all members).",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "rate(etcd_network_peer_sent_bytes_total{instance=~\"$instance\"}[5m])",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 6, "x": 6, "y": 38 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 27, "type": "timeseries", "title": "Client gRPC Received",
|
||||
"description": "Bytes received from API clients (kube-apiserver, operators). Spike = large write burst from controllers or kubectl apply.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "rate(etcd_network_client_grpc_received_bytes_total{instance=~\"$instance\"}[5m])",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 6, "x": 12, "y": 38 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 28, "type": "timeseries", "title": "Client gRPC Sent",
|
||||
"description": "Bytes sent to API clients (responses + watch events). Persistently high = many active Watch streams or large objects being served.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "rate(etcd_network_client_grpc_sent_bytes_total{instance=~\"$instance\"}[5m])",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 6, "x": 18, "y": 38 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 29, "type": "row", "title": "DB Size & Process Resources", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 45 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 30, "type": "timeseries", "title": "DB Total vs In-Use Size per Instance",
|
||||
"description": "Total = allocated boltdb file size. In Use = live key data. The gap between them = fragmentation. Steady growth of Total = compaction not keeping up with key churn.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "etcd_mvcc_db_total_size_in_bytes{instance=~\"$instance\"}", "refId": "A", "legendFormat": "Total — {{instance}}" },
|
||||
{ "expr": "etcd_mvcc_db_total_size_in_use_in_bytes{instance=~\"$instance\"}", "refId": "B", "legendFormat": "In Use — {{instance}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 46 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 31, "type": "timeseries", "title": "Process Resident Memory (RSS)",
|
||||
"description": "Physical RAM consumed by the etcd process. Monotonically growing RSS = memory leak or oversized watch cache. Typical healthy range: 500 MiB–2 GiB depending on cluster size.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "etcd_process_resident_memory_bytes{instance=~\"$instance\"}",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 46 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 32, "type": "timeseries", "title": "Open File Descriptors vs Limit",
|
||||
"description": "Open FD count (solid) and process FD limit (dashed). Approaching the limit will cause WAL file creation and new client connections to fail.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "etcd_process_open_fds{instance=~\"$instance\"}", "refId": "A", "legendFormat": "Open — {{instance}}" },
|
||||
{ "expr": "etcd_process_max_fds{instance=~\"$instance\"}", "refId": "B", "legendFormat": "Limit — {{instance}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byRegexp", "options": "^Limit.*" },
|
||||
"properties": [
|
||||
{ "id": "custom.lineWidth", "value": 1 },
|
||||
{ "id": "custom.lineStyle", "value": { "fill": "dash", "dash": [6, 4] } },
|
||||
{ "id": "custom.fillOpacity","value": 0 }
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 46 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 33, "type": "row", "title": "Snapshots", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 54 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 34, "type": "timeseries", "title": "Snapshot Save Duration (p50 / p95 / p99)",
|
||||
"description": "Time to write a full snapshot of the boltdb to disk. Slow saves delay Raft log compaction, causing the WAL to grow unboundedly and members to fall further behind.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum by(le)(rate(etcd_debugging_snap_save_total_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "A", "legendFormat": "p50" },
|
||||
{ "expr": "histogram_quantile(0.95, sum by(le)(rate(etcd_debugging_snap_save_total_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "B", "legendFormat": "p95" },
|
||||
{ "expr": "histogram_quantile(0.99, sum by(le)(rate(etcd_debugging_snap_save_total_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "C", "legendFormat": "p99" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 12, "x": 0, "y": 55 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 35, "type": "timeseries", "title": "Snapshot DB Fsync Duration (p50 / p95 / p99)",
|
||||
"description": "Time to fsync the snapshot file itself. Distinct from WAL fsync: this is flushing the entire boltdb copy to disk after a snapshot is taken.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum by(le)(rate(etcd_snap_db_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "A", "legendFormat": "p50" },
|
||||
{ "expr": "histogram_quantile(0.95, sum by(le)(rate(etcd_snap_db_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "B", "legendFormat": "p95" },
|
||||
{ "expr": "histogram_quantile(0.99, sum by(le)(rate(etcd_snap_db_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "C", "legendFormat": "p99" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 12, "x": 12, "y": 55 }
|
||||
}
|
||||
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,945 @@
|
||||
{
|
||||
"title": "Networking",
|
||||
"uid": "okd-networking",
|
||||
"schemaVersion": 36,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"tags": ["okd", "networking"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "namespace",
|
||||
"type": "query",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"query": { "query": "label_values(kube_pod_info, namespace)", "refId": "A" },
|
||||
"refresh": 2,
|
||||
"includeAll": true,
|
||||
"multi": true,
|
||||
"allValue": ".*",
|
||||
"label": "Namespace",
|
||||
"sort": 1,
|
||||
"current": {},
|
||||
"options": []
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
|
||||
{
|
||||
"id": 1, "type": "stat", "title": "Network RX Rate",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(container_network_receive_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] },
|
||||
"unit": "Bps", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 2, "type": "stat", "title": "Network TX Rate",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(container_network_transmit_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] },
|
||||
"unit": "Bps", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 3, "type": "stat", "title": "RX Errors/s",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(container_network_receive_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "pps", "noValue": "0", "decimals": 2
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 4, "type": "stat", "title": "TX Errors/s",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(container_network_transmit_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "pps", "noValue": "0", "decimals": 2
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 5, "type": "stat", "title": "RX Drops/s",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(container_network_receive_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "orange", "value": 1 }] },
|
||||
"unit": "pps", "noValue": "0", "decimals": 2
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 6, "type": "stat", "title": "TX Drops/s",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(container_network_transmit_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "orange", "value": 1 }] },
|
||||
"unit": "pps", "noValue": "0", "decimals": 2
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 7, "type": "stat", "title": "DNS Queries/s",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(coredns_dns_requests_total[5m]))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] },
|
||||
"unit": "reqps", "noValue": "0", "decimals": 1
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 8, "type": "stat", "title": "DNS Error %",
|
||||
"description": "Percentage of DNS responses with non-NOERROR rcode over the last 5 minutes.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(coredns_dns_responses_total{rcode!=\"NOERROR\"}[5m])) / sum(rate(coredns_dns_responses_total[5m])) * 100",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 5 }
|
||||
]},
|
||||
"unit": "percent", "noValue": "0", "decimals": 2
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 9, "type": "row", "title": "Network I/O", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 10, "type": "timeseries", "title": "Receive Rate by Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(namespace)(rate(container_network_receive_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 11, "type": "timeseries", "title": "Transmit Rate by Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(namespace)(rate(container_network_transmit_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 12, "type": "row", "title": "Top Pod Consumers", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 13, "type": "timeseries", "title": "Top 10 Pods — RX Rate",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "topk(10, sum by(namespace,pod)(rate(container_network_receive_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m])))",
|
||||
"refId": "A", "legendFormat": "{{namespace}} / {{pod}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "auto", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 14 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 14, "type": "timeseries", "title": "Top 10 Pods — TX Rate",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "topk(10, sum by(namespace,pod)(rate(container_network_transmit_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m])))",
|
||||
"refId": "A", "legendFormat": "{{namespace}} / {{pod}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "auto", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 14 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 15,
|
||||
"type": "table",
|
||||
"title": "Pod Network I/O Summary",
|
||||
"description": "Current RX/TX rates, errors and drops per pod. Sorted by RX rate descending.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace,pod)(rate(container_network_receive_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "instant": true, "format": "table", "legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,pod)(rate(container_network_transmit_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "B", "instant": true, "format": "table", "legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,pod)(rate(container_network_receive_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "C", "instant": true, "format": "table", "legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,pod)(rate(container_network_transmit_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "D", "instant": true, "format": "table", "legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,pod)(rate(container_network_receive_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "E", "instant": true, "format": "table", "legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,pod)(rate(container_network_transmit_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "F", "instant": true, "format": "table", "legendFormat": ""
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "filterFieldsByName",
|
||||
"options": { "include": { "names": ["namespace", "pod", "Value"] } }
|
||||
},
|
||||
{
|
||||
"id": "joinByField",
|
||||
"options": { "byField": "pod", "mode": "outer" }
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"namespace 1": true,
|
||||
"namespace 2": true,
|
||||
"namespace 3": true,
|
||||
"namespace 4": true,
|
||||
"namespace 5": true
|
||||
},
|
||||
"renameByName": {
|
||||
"namespace": "Namespace",
|
||||
"pod": "Pod",
|
||||
"Value": "RX Rate",
|
||||
"Value 1": "TX Rate",
|
||||
"Value 2": "RX Errors/s",
|
||||
"Value 3": "TX Errors/s",
|
||||
"Value 4": "RX Drops/s",
|
||||
"Value 5": "TX Drops/s"
|
||||
},
|
||||
"indexByName": {
|
||||
"namespace": 0,
|
||||
"pod": 1,
|
||||
"Value": 2,
|
||||
"Value 1": 3,
|
||||
"Value 2": 4,
|
||||
"Value 3": 5,
|
||||
"Value 4": 6,
|
||||
"Value 5": 7
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "sortBy",
|
||||
"options": { "fields": [{ "displayName": "RX Rate", "desc": true }] }
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Namespace" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 160 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Pod" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 260 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byRegexp", "options": "^RX Rate$|^TX Rate$" },
|
||||
"properties": [
|
||||
{ "id": "unit", "value": "Bps" },
|
||||
{ "id": "custom.displayMode", "value": "color-background-solid" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 10000000 },
|
||||
{ "color": "orange", "value": 100000000 },
|
||||
{ "color": "red", "value": 500000000 }
|
||||
]}}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byRegexp", "options": "^RX Errors/s$|^TX Errors/s$" },
|
||||
"properties": [
|
||||
{ "id": "unit", "value": "pps" },
|
||||
{ "id": "decimals", "value": 3 },
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 0.001 }
|
||||
]}}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byRegexp", "options": "^RX Drops/s$|^TX Drops/s$" },
|
||||
"properties": [
|
||||
{ "id": "unit", "value": "pps" },
|
||||
{ "id": "decimals", "value": 3 },
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "orange", "value": 0.001 }
|
||||
]}}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {},
|
||||
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 22 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 16, "type": "row", "title": "Errors & Packet Loss", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 30 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 17, "type": "timeseries", "title": "RX Errors by Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(namespace)(rate(container_network_receive_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "pps", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 12, "x": 0, "y": 31 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 18, "type": "timeseries", "title": "TX Errors by Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(namespace)(rate(container_network_transmit_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "pps", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 12, "x": 12, "y": 31 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 19, "type": "timeseries", "title": "RX Packet Drops by Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(namespace)(rate(container_network_receive_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "pps", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 12, "x": 0, "y": 38 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 20, "type": "timeseries", "title": "TX Packet Drops by Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(namespace)(rate(container_network_transmit_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "pps", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 12, "x": 12, "y": 38 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 21, "type": "row", "title": "DNS (CoreDNS)", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 45 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 22, "type": "timeseries", "title": "DNS Request Rate by Query Type",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(type)(rate(coredns_dns_requests_total[5m]))",
|
||||
"refId": "A", "legendFormat": "{{type}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 46 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 23, "type": "timeseries", "title": "DNS Response Rate by Rcode",
|
||||
"description": "NOERROR = healthy. NXDOMAIN = name not found. SERVFAIL = upstream error.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(rcode)(rate(coredns_dns_responses_total[5m]))",
|
||||
"refId": "A", "legendFormat": "{{rcode}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "NOERROR" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "NXDOMAIN" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "SERVFAIL" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "REFUSED" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 46 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 24, "type": "timeseries", "title": "DNS Request Latency (p50 / p95 / p99)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.50, sum(rate(coredns_dns_request_duration_seconds_bucket[5m])) by (le))",
|
||||
"refId": "A", "legendFormat": "p50"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, sum(rate(coredns_dns_request_duration_seconds_bucket[5m])) by (le))",
|
||||
"refId": "B", "legendFormat": "p95"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(coredns_dns_request_duration_seconds_bucket[5m])) by (le))",
|
||||
"refId": "C", "legendFormat": "p99"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 46 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 25, "type": "timeseries", "title": "DNS Cache Hit Ratio (%)",
|
||||
"description": "High hit ratio = CoreDNS is serving responses from cache, reducing upstream load.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(coredns_cache_hits_total[5m])) / (sum(rate(coredns_cache_hits_total[5m])) + sum(rate(coredns_cache_misses_total[5m]))) * 100",
|
||||
"refId": "A", "legendFormat": "Cache Hit %"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 50 },
|
||||
{ "color": "green", "value": 80 }
|
||||
]},
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 20, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "single" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "lastNotNull"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 12, "x": 0, "y": 54 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 26, "type": "timeseries", "title": "DNS Forward Request Rate",
|
||||
"description": "Queries CoreDNS is forwarding upstream. Spike here with cache miss spike = upstream DNS pressure.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(coredns_forward_requests_total[5m]))",
|
||||
"refId": "A", "legendFormat": "Forward Requests/s"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(coredns_forward_responses_duration_seconds_count[5m]))",
|
||||
"refId": "B", "legendFormat": "Forward Responses/s"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 12, "x": 12, "y": 54 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 27, "type": "row", "title": "Services & Endpoints", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 61 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 28, "type": "stat", "title": "Total Services",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "count(kube_service_info{namespace=~\"$namespace\"})",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 8, "x": 0, "y": 62 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 29, "type": "stat", "title": "Endpoint Addresses Available",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(kube_endpoint_address_available{namespace=~\"$namespace\"})",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 8, "x": 8, "y": 62 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 30, "type": "stat", "title": "Endpoint Addresses Not Ready",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(kube_endpoint_address_not_ready{namespace=~\"$namespace\"}) or vector(0)",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 8, "x": 16, "y": 62 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 31,
|
||||
"type": "table",
|
||||
"title": "Endpoint Availability",
|
||||
"description": "Per-endpoint available vs not-ready address counts. Red Not Ready = pods backing this service are unhealthy.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace,endpoint)(kube_endpoint_address_available{namespace=~\"$namespace\"})",
|
||||
"refId": "A", "instant": true, "format": "table", "legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,endpoint)(kube_endpoint_address_not_ready{namespace=~\"$namespace\"})",
|
||||
"refId": "B", "instant": true, "format": "table", "legendFormat": ""
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "filterFieldsByName",
|
||||
"options": { "include": { "names": ["namespace", "endpoint", "Value"] } }
|
||||
},
|
||||
{
|
||||
"id": "joinByField",
|
||||
"options": { "byField": "endpoint", "mode": "outer" }
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": { "namespace 1": true },
|
||||
"renameByName": {
|
||||
"namespace": "Namespace",
|
||||
"endpoint": "Endpoint",
|
||||
"Value": "Available",
|
||||
"Value 1": "Not Ready"
|
||||
},
|
||||
"indexByName": {
|
||||
"namespace": 0,
|
||||
"endpoint": 1,
|
||||
"Value": 2,
|
||||
"Value 1": 3
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "sortBy",
|
||||
"options": { "fields": [{ "displayName": "Not Ready", "desc": true }] }
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Namespace" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 180 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Endpoint" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 220 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Available" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] } }
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Not Ready" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] } }
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {},
|
||||
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 66 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 32, "type": "row", "title": "OKD Router / Ingress (HAProxy)", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 74 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 33, "type": "timeseries", "title": "Router HTTP Request Rate by Code",
|
||||
"description": "Requires HAProxy router metrics to be scraped (port 1936). OKD exposes these via the openshift-ingress ServiceMonitor.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(code)(rate(haproxy_backend_http_responses_total[5m]))",
|
||||
"refId": "A", "legendFormat": "HTTP {{code}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "HTTP 2xx" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "HTTP 4xx" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "HTTP 5xx" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 75 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 34, "type": "timeseries", "title": "Router 4xx + 5xx Error Rate (%)",
|
||||
"description": "Client error (4xx) and server error (5xx) rates as a percentage of all requests.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(haproxy_backend_http_responses_total{code=\"4xx\"}[5m])) / sum(rate(haproxy_backend_http_responses_total[5m])) * 100",
|
||||
"refId": "A", "legendFormat": "4xx %"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(haproxy_backend_http_responses_total{code=\"5xx\"}[5m])) / sum(rate(haproxy_backend_http_responses_total[5m])) * 100",
|
||||
"refId": "B", "legendFormat": "5xx %"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 5 }
|
||||
]}
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "4xx %" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "5xx %" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 75 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 35, "type": "timeseries", "title": "Router Bytes In / Out",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(haproxy_frontend_bytes_in_total[5m]))",
|
||||
"refId": "A", "legendFormat": "Bytes In"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(haproxy_frontend_bytes_out_total[5m]))",
|
||||
"refId": "B", "legendFormat": "Bytes Out"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "Bytes In" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Bytes Out" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 83 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 36,
|
||||
"type": "table",
|
||||
"title": "Router Backend Server Status",
|
||||
"description": "HAProxy backend servers (routes). Value 0 = DOWN, 1 = UP.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "haproxy_server_up",
|
||||
"refId": "A", "instant": true, "format": "table", "legendFormat": ""
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "filterFieldsByName",
|
||||
"options": { "include": { "names": ["proxy", "server", "Value"] } }
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {},
|
||||
"renameByName": {
|
||||
"proxy": "Backend",
|
||||
"server": "Server",
|
||||
"Value": "Status"
|
||||
},
|
||||
"indexByName": { "proxy": 0, "server": 1, "Value": 2 }
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "sortBy",
|
||||
"options": { "fields": [{ "displayName": "Status", "desc": false }] }
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Backend" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 260 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Server" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 180 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Status" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "mappings", "value": [
|
||||
{ "type": "value", "options": { "0": { "text": "DOWN", "color": "red" } } },
|
||||
{ "type": "value", "options": { "1": { "text": "UP", "color": "green" } } }
|
||||
]},
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "green", "value": 1 }
|
||||
]}}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 83 }
|
||||
}
|
||||
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,627 @@
|
||||
{
|
||||
"title": "Node Health",
|
||||
"uid": "okd-node-health",
|
||||
"schemaVersion": 36,
|
||||
"version": 2,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"tags": ["okd", "node", "health"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "node",
|
||||
"type": "query",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"query": { "query": "label_values(kube_node_info, node)", "refId": "A" },
|
||||
"refresh": 2,
|
||||
"includeAll": true,
|
||||
"multi": true,
|
||||
"allValue": ".*",
|
||||
"label": "Node",
|
||||
"sort": 1,
|
||||
"current": {},
|
||||
"options": []
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
|
||||
{
|
||||
"id": 1,
|
||||
"type": "stat",
|
||||
"title": "Total Nodes",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_node_info{node=~\"$node\"})", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 2,
|
||||
"type": "stat",
|
||||
"title": "Ready Nodes",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"$node\"} == 1)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 3,
|
||||
"type": "stat",
|
||||
"title": "Not Ready Nodes",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"false\",node=~\"$node\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 4,
|
||||
"type": "stat",
|
||||
"title": "Memory Pressure",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"MemoryPressure\",status=\"true\",node=~\"$node\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 5,
|
||||
"type": "stat",
|
||||
"title": "Disk Pressure",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"DiskPressure\",status=\"true\",node=~\"$node\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 6,
|
||||
"type": "stat",
|
||||
"title": "PID Pressure",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"PIDPressure\",status=\"true\",node=~\"$node\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "orange", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 7,
|
||||
"type": "stat",
|
||||
"title": "Unschedulable",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_node_spec_unschedulable{node=~\"$node\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 8,
|
||||
"type": "stat",
|
||||
"title": "Kubelet Up",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(up{job=\"kubelet\",metrics_path=\"/metrics\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 9,
|
||||
"type": "table",
|
||||
"title": "Node Conditions",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(node) (kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"$node\"})",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{node}}",
|
||||
"instant": true
|
||||
},
|
||||
{
|
||||
"expr": "sum by(node) (kube_node_status_condition{condition=\"MemoryPressure\",status=\"true\",node=~\"$node\"})",
|
||||
"refId": "B",
|
||||
"legendFormat": "{{node}}",
|
||||
"instant": true
|
||||
},
|
||||
{
|
||||
"expr": "sum by(node) (kube_node_status_condition{condition=\"DiskPressure\",status=\"true\",node=~\"$node\"})",
|
||||
"refId": "C",
|
||||
"legendFormat": "{{node}}",
|
||||
"instant": true
|
||||
},
|
||||
{
|
||||
"expr": "sum by(node) (kube_node_status_condition{condition=\"PIDPressure\",status=\"true\",node=~\"$node\"})",
|
||||
"refId": "D",
|
||||
"legendFormat": "{{node}}",
|
||||
"instant": true
|
||||
},
|
||||
{
|
||||
"expr": "sum by(node) (kube_node_spec_unschedulable{node=~\"$node\"})",
|
||||
"refId": "E",
|
||||
"legendFormat": "{{node}}",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "labelsToFields",
|
||||
"options": { "mode": "columns" }
|
||||
},
|
||||
{
|
||||
"id": "joinByField",
|
||||
"options": { "byField": "node", "mode": "outer" }
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"Time": true,
|
||||
"Time 1": true,
|
||||
"Time 2": true,
|
||||
"Time 3": true,
|
||||
"Time 4": true,
|
||||
"Time 5": true
|
||||
},
|
||||
"renameByName": {
|
||||
"node": "Node",
|
||||
"Value #A": "Ready",
|
||||
"Value #B": "Mem Pressure",
|
||||
"Value #C": "Disk Pressure",
|
||||
"Value #D": "PID Pressure",
|
||||
"Value #E": "Unschedulable"
|
||||
},
|
||||
"indexByName": {
|
||||
"node": 0,
|
||||
"Value #A": 1,
|
||||
"Value #B": 2,
|
||||
"Value #C": 3,
|
||||
"Value #D": 4,
|
||||
"Value #E": 5
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": { "displayMode": "color-background", "align": "center" }
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Node" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "auto" },
|
||||
{ "id": "custom.align", "value": "left" },
|
||||
{ "id": "custom.width", "value": 200 }
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Ready" },
|
||||
"properties": [
|
||||
{
|
||||
"id": "thresholds",
|
||||
"value": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] }
|
||||
},
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{
|
||||
"id": "mappings",
|
||||
"value": [
|
||||
{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"0": { "text": "✗ Not Ready", "color": "red", "index": 0 },
|
||||
"1": { "text": "✓ Ready", "color": "green", "index": 1 }
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byRegexp", "options": ".*Pressure" },
|
||||
"properties": [
|
||||
{
|
||||
"id": "thresholds",
|
||||
"value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] }
|
||||
},
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{
|
||||
"id": "mappings",
|
||||
"value": [
|
||||
{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"0": { "text": "✓ OK", "color": "green", "index": 0 },
|
||||
"1": { "text": "⚠ Active", "color": "red", "index": 1 }
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Unschedulable" },
|
||||
"properties": [
|
||||
{
|
||||
"id": "thresholds",
|
||||
"value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 1 }] }
|
||||
},
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{
|
||||
"id": "mappings",
|
||||
"value": [
|
||||
{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"0": { "text": "✓ Schedulable", "color": "green", "index": 0 },
|
||||
"1": { "text": "⚠ Cordoned", "color": "yellow", "index": 1 }
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": { "sortBy": [{ "displayName": "Node", "desc": false }] },
|
||||
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 10,
|
||||
"type": "timeseries",
|
||||
"title": "CPU Usage per Node (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 100,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 85 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 16, "x": 0, "y": 12 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 11,
|
||||
"type": "bargauge",
|
||||
"title": "CPU Usage \u2014 Current",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 85 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true,
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 12 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 12,
|
||||
"type": "timeseries",
|
||||
"title": "Memory Usage per Node (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 100,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 75 }, { "color": "red", "value": 90 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 16, "x": 0, "y": 20 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 13,
|
||||
"type": "bargauge",
|
||||
"title": "Memory Usage \u2014 Current",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 75 }, { "color": "red", "value": 90 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true,
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 20 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 14,
|
||||
"type": "timeseries",
|
||||
"title": "Root Disk Usage per Node (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"}))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 100,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 85 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 16, "x": 0, "y": 28 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 15,
|
||||
"type": "bargauge",
|
||||
"title": "Root Disk Usage \u2014 Current",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"}))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 85 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true,
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 28 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 16,
|
||||
"type": "timeseries",
|
||||
"title": "Network Traffic per Node",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(instance) (rate(node_network_receive_bytes_total{device!~\"lo|veth.*|tun.*|ovn.*|br.*\"}[5m]))",
|
||||
"refId": "A",
|
||||
"legendFormat": "rx {{instance}}"
|
||||
},
|
||||
{
|
||||
"expr": "sum by(instance) (rate(node_network_transmit_bytes_total{device!~\"lo|veth.*|tun.*|ovn.*|br.*\"}[5m]))",
|
||||
"refId": "B",
|
||||
"legendFormat": "tx {{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps",
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 36 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 17,
|
||||
"type": "bargauge",
|
||||
"title": "Pods per Node",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count by(node) (kube_pod_info{node=~\"$node\"})",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{node}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short",
|
||||
"min": 0,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 100 },
|
||||
{ "color": "red", "value": 200 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true,
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 36 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 18,
|
||||
"type": "timeseries",
|
||||
"title": "System Load Average (1m) per Node",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "node_load1",
|
||||
"refId": "A",
|
||||
"legendFormat": "1m \u2014 {{instance}}"
|
||||
},
|
||||
{
|
||||
"expr": "node_load5",
|
||||
"refId": "B",
|
||||
"legendFormat": "5m \u2014 {{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short",
|
||||
"min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 44 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 19,
|
||||
"type": "bargauge",
|
||||
"title": "Node Uptime",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "time() - node_boot_time_seconds",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s",
|
||||
"min": 0,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 300 },
|
||||
{ "color": "green", "value": 3600 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": false,
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 44 }
|
||||
}
|
||||
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,596 @@
|
||||
{
|
||||
"title": "Storage Health",
|
||||
"uid": "storage-health",
|
||||
"schemaVersion": 36,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"panels": [
|
||||
|
||||
{
|
||||
"type": "row",
|
||||
"id": 1,
|
||||
"title": "PVC / PV Status",
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 2,
|
||||
"title": "Bound PVCs",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Bound\"}) or vector(0)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 4, "x": 0, "y": 1 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 3,
|
||||
"title": "Pending PVCs",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Pending\"}) or vector(0)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 4, "x": 4, "y": 1 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 4,
|
||||
"title": "Lost PVCs",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Lost\"}) or vector(0)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 4, "x": 8, "y": 1 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 5,
|
||||
"title": "Bound PVs / Available PVs",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(kube_persistentvolume_status_phase{phase=\"Bound\"}) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": "Bound"
|
||||
},
|
||||
{
|
||||
"expr": "sum(kube_persistentvolume_status_phase{phase=\"Available\"}) or vector(0)",
|
||||
"refId": "B",
|
||||
"legendFormat": "Available"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "blue", "value": null }]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 4, "x": 12, "y": 1 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 6,
|
||||
"title": "Ceph Cluster Health",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "ceph_health_status",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 2 }
|
||||
]
|
||||
},
|
||||
"mappings": [
|
||||
{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"0": { "text": "HEALTH_OK", "index": 0 },
|
||||
"1": { "text": "HEALTH_WARN", "index": 1 },
|
||||
"2": { "text": "HEALTH_ERR", "index": 2 }
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"textMode": "value"
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 4, "x": 16, "y": 1 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 7,
|
||||
"title": "OSDs Up / Total",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(ceph_osd_up) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": "Up"
|
||||
},
|
||||
{
|
||||
"expr": "count(ceph_osd_metadata) or vector(0)",
|
||||
"refId": "B",
|
||||
"legendFormat": "Total"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 4, "x": 20, "y": 1 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "row",
|
||||
"id": 8,
|
||||
"title": "Cluster Capacity",
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 6 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "gauge",
|
||||
"id": 9,
|
||||
"title": "Ceph Cluster Used (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes) / ceph_cluster_total_bytes",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 70 },
|
||||
{ "color": "red", "value": 85 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"showThresholdLabels": true,
|
||||
"showThresholdMarkers": true
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 5, "x": 0, "y": 7 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 10,
|
||||
"title": "Ceph Capacity — Total / Available",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "ceph_cluster_total_bytes",
|
||||
"refId": "A",
|
||||
"legendFormat": "Total"
|
||||
},
|
||||
{
|
||||
"expr": "ceph_cluster_total_bytes - (ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes)",
|
||||
"refId": "B",
|
||||
"legendFormat": "Available"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes",
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "blue", "value": null }]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "value",
|
||||
"graphMode": "none",
|
||||
"textMode": "auto",
|
||||
"orientation": "vertical"
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 4, "x": 5, "y": 7 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "bargauge",
|
||||
"id": 11,
|
||||
"title": "PV Allocated Capacity by Storage Class (Bound)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by (storageclass) (\n kube_persistentvolume_capacity_bytes\n * on(persistentvolume) group_left(storageclass)\n kube_persistentvolume_status_phase{phase=\"Bound\"}\n)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{storageclass}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes",
|
||||
"color": { "mode": "palette-classic" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "blue", "value": null }]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 7, "x": 9, "y": 7 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "piechart",
|
||||
"id": 12,
|
||||
"title": "PVC Phase Distribution",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Bound\"}) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": "Bound"
|
||||
},
|
||||
{
|
||||
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Pending\"}) or vector(0)",
|
||||
"refId": "B",
|
||||
"legendFormat": "Pending"
|
||||
},
|
||||
{
|
||||
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Lost\"}) or vector(0)",
|
||||
"refId": "C",
|
||||
"legendFormat": "Lost"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "color": { "mode": "palette-classic" } }
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"pieType": "pie",
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"values": ["value", "percent"]
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 7 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "row",
|
||||
"id": 13,
|
||||
"title": "Ceph Performance",
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 15 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "timeseries",
|
||||
"id": 14,
|
||||
"title": "Ceph Pool IOPS (Read / Write)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(ceph_pool_rd[5m])",
|
||||
"refId": "A",
|
||||
"legendFormat": "Read — pool {{pool_id}}"
|
||||
},
|
||||
{
|
||||
"expr": "rate(ceph_pool_wr[5m])",
|
||||
"refId": "B",
|
||||
"legendFormat": "Write — pool {{pool_id}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ops",
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 8 }
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "timeseries",
|
||||
"id": 15,
|
||||
"title": "Ceph Pool Throughput (Read / Write)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(ceph_pool_rd_bytes[5m])",
|
||||
"refId": "A",
|
||||
"legendFormat": "Read — pool {{pool_id}}"
|
||||
},
|
||||
{
|
||||
"expr": "rate(ceph_pool_wr_bytes[5m])",
|
||||
"refId": "B",
|
||||
"legendFormat": "Write — pool {{pool_id}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps",
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 8 }
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "row",
|
||||
"id": 16,
|
||||
"title": "Ceph OSD & Pool Details",
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 24 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "timeseries",
|
||||
"id": 17,
|
||||
"title": "Ceph Pool Space Used (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * ceph_pool_bytes_used / (ceph_pool_bytes_used + ceph_pool_max_avail)",
|
||||
"refId": "A",
|
||||
"legendFormat": "Pool {{pool_id}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 70 },
|
||||
{ "color": "red", "value": 85 }
|
||||
]
|
||||
},
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10 }
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 25 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "bargauge",
|
||||
"id": 18,
|
||||
"title": "OSD Status per Daemon (green = Up, red = Down)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "ceph_osd_up",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{ceph_daemon}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"min": 0,
|
||||
"max": 1,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "green", "value": 1 }
|
||||
]
|
||||
},
|
||||
"mappings": [
|
||||
{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"0": { "text": "DOWN", "index": 0 },
|
||||
"1": { "text": "UP", "index": 1 }
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"displayMode": "basic",
|
||||
"showUnfilled": true
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 25 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "row",
|
||||
"id": 19,
|
||||
"title": "Node Disk Usage",
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 33 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "timeseries",
|
||||
"id": 20,
|
||||
"title": "Node Root Disk Usage Over Time (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} * 100)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 70 },
|
||||
{ "color": "red", "value": 85 }
|
||||
]
|
||||
},
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10 }
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 34 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "bargauge",
|
||||
"id": 21,
|
||||
"title": "Current Disk Usage — All Nodes & Mountpoints",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 - (node_filesystem_avail_bytes{fstype!~\"tmpfs|overlay|squashfs\"} / node_filesystem_size_bytes{fstype!~\"tmpfs|overlay|squashfs\"} * 100)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}} — {{mountpoint}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 70 },
|
||||
{ "color": "red", "value": 85 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 34 }
|
||||
}
|
||||
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,773 @@
|
||||
{
|
||||
"title": "Workload Health",
|
||||
"uid": "okd-workload-health",
|
||||
"schemaVersion": 36,
|
||||
"version": 3,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"tags": ["okd", "workload", "health"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "namespace",
|
||||
"type": "query",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"query": { "query": "label_values(kube_pod_info, namespace)", "refId": "A" },
|
||||
"refresh": 2,
|
||||
"includeAll": true,
|
||||
"multi": true,
|
||||
"allValue": ".*",
|
||||
"label": "Namespace",
|
||||
"sort": 1,
|
||||
"current": {},
|
||||
"options": []
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
|
||||
{
|
||||
"id": 1, "type": "stat", "title": "Total Pods",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_pod_info{namespace=~\"$namespace\"})", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 2, "type": "stat", "title": "Running Pods",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_pod_status_phase{phase=\"Running\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 3, "type": "stat", "title": "Pending Pods",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_pod_status_phase{phase=\"Pending\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 4, "type": "stat", "title": "Failed Pods",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_pod_status_phase{phase=\"Failed\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 5, "type": "stat", "title": "CrashLoopBackOff",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 6, "type": "stat", "title": "OOMKilled",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_pod_container_status_last_terminated_reason{reason=\"OOMKilled\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "orange", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 7, "type": "stat", "title": "Deployments Available",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_deployment_status_condition{condition=\"Available\",status=\"true\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 8, "type": "stat", "title": "Deployments Degraded",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_deployment_status_replicas_unavailable{namespace=~\"$namespace\"} > 0) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 9, "type": "row", "title": "Deployments", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 10,
|
||||
"type": "table",
|
||||
"title": "Deployment Status",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace,deployment)(kube_deployment_spec_replicas{namespace=~\"$namespace\"})",
|
||||
"refId": "A",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,deployment)(kube_deployment_status_replicas_ready{namespace=~\"$namespace\"})",
|
||||
"refId": "B",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,deployment)(kube_deployment_status_replicas_available{namespace=~\"$namespace\"})",
|
||||
"refId": "C",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,deployment)(kube_deployment_status_replicas_unavailable{namespace=~\"$namespace\"})",
|
||||
"refId": "D",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,deployment)(kube_deployment_status_replicas_updated{namespace=~\"$namespace\"})",
|
||||
"refId": "E",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "filterFieldsByName",
|
||||
"options": {
|
||||
"include": {
|
||||
"names": ["namespace", "deployment", "Value"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "joinByField",
|
||||
"options": {
|
||||
"byField": "deployment",
|
||||
"mode": "outer"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"namespace 1": true,
|
||||
"namespace 2": true,
|
||||
"namespace 3": true,
|
||||
"namespace 4": true
|
||||
},
|
||||
"renameByName": {
|
||||
"namespace": "Namespace",
|
||||
"deployment": "Deployment",
|
||||
"Value": "Desired",
|
||||
"Value 1": "Ready",
|
||||
"Value 2": "Available",
|
||||
"Value 3": "Unavailable",
|
||||
"Value 4": "Up-to-date"
|
||||
},
|
||||
"indexByName": {
|
||||
"namespace": 0,
|
||||
"deployment": 1,
|
||||
"Value": 2,
|
||||
"Value 1": 3,
|
||||
"Value 2": 4,
|
||||
"Value 3": 5,
|
||||
"Value 4": 6
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "sortBy",
|
||||
"options": {
|
||||
"fields": [{ "displayName": "Namespace", "desc": false }]
|
||||
}
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Namespace" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 200 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Deployment" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 220 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Unavailable" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{
|
||||
"id": "thresholds",
|
||||
"value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] }
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Ready" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{
|
||||
"id": "thresholds",
|
||||
"value": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] }
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": { "sortBy": [{ "displayName": "Namespace", "desc": false }] },
|
||||
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 11, "type": "row", "title": "StatefulSets & DaemonSets", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 12,
|
||||
"type": "table",
|
||||
"title": "StatefulSet Status",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace,statefulset)(kube_statefulset_replicas{namespace=~\"$namespace\"})",
|
||||
"refId": "A",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,statefulset)(kube_statefulset_status_replicas_ready{namespace=~\"$namespace\"})",
|
||||
"refId": "B",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,statefulset)(kube_statefulset_status_replicas_current{namespace=~\"$namespace\"})",
|
||||
"refId": "C",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,statefulset)(kube_statefulset_status_replicas_updated{namespace=~\"$namespace\"})",
|
||||
"refId": "D",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "filterFieldsByName",
|
||||
"options": {
|
||||
"include": {
|
||||
"names": ["namespace", "statefulset", "Value"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "joinByField",
|
||||
"options": {
|
||||
"byField": "statefulset",
|
||||
"mode": "outer"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"namespace 1": true,
|
||||
"namespace 2": true,
|
||||
"namespace 3": true
|
||||
},
|
||||
"renameByName": {
|
||||
"namespace": "Namespace",
|
||||
"statefulset": "StatefulSet",
|
||||
"Value": "Desired",
|
||||
"Value 1": "Ready",
|
||||
"Value 2": "Current",
|
||||
"Value 3": "Up-to-date"
|
||||
},
|
||||
"indexByName": {
|
||||
"namespace": 0,
|
||||
"statefulset": 1,
|
||||
"Value": 2,
|
||||
"Value 1": 3,
|
||||
"Value 2": 4,
|
||||
"Value 3": 5
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "sortBy",
|
||||
"options": { "fields": [{ "displayName": "Namespace", "desc": false }] }
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Namespace" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 180 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "StatefulSet" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 200 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Ready" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] } }
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 14 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 13,
|
||||
"type": "table",
|
||||
"title": "DaemonSet Status",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace,daemonset)(kube_daemonset_status_desired_number_scheduled{namespace=~\"$namespace\"})",
|
||||
"refId": "A",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,daemonset)(kube_daemonset_status_number_ready{namespace=~\"$namespace\"})",
|
||||
"refId": "B",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,daemonset)(kube_daemonset_status_number_unavailable{namespace=~\"$namespace\"})",
|
||||
"refId": "C",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,daemonset)(kube_daemonset_status_number_misscheduled{namespace=~\"$namespace\"})",
|
||||
"refId": "D",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "filterFieldsByName",
|
||||
"options": {
|
||||
"include": {
|
||||
"names": ["namespace", "daemonset", "Value"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "joinByField",
|
||||
"options": {
|
||||
"byField": "daemonset",
|
||||
"mode": "outer"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"namespace 1": true,
|
||||
"namespace 2": true,
|
||||
"namespace 3": true
|
||||
},
|
||||
"renameByName": {
|
||||
"namespace": "Namespace",
|
||||
"daemonset": "DaemonSet",
|
||||
"Value": "Desired",
|
||||
"Value 1": "Ready",
|
||||
"Value 2": "Unavailable",
|
||||
"Value 3": "Misscheduled"
|
||||
},
|
||||
"indexByName": {
|
||||
"namespace": 0,
|
||||
"daemonset": 1,
|
||||
"Value": 2,
|
||||
"Value 1": 3,
|
||||
"Value 2": 4,
|
||||
"Value 3": 5
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "sortBy",
|
||||
"options": { "fields": [{ "displayName": "Namespace", "desc": false }] }
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Namespace" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 180 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "DaemonSet" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 200 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Ready" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] } }
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Unavailable" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] } }
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Misscheduled" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "orange", "value": 1 }] } }
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 14 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 14, "type": "row", "title": "Pods", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 22 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 15,
|
||||
"type": "timeseries",
|
||||
"title": "Pod Phase over Time",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(phase)(kube_pod_status_phase{namespace=~\"$namespace\"})",
|
||||
"refId": "A", "legendFormat": "{{phase}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "Running" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Pending" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Failed" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Succeeded" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Unknown" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 16, "x": 0, "y": 23 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 16,
|
||||
"type": "piechart",
|
||||
"title": "Pod Phase — Now",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(phase)(kube_pod_status_phase{namespace=~\"$namespace\"})",
|
||||
"refId": "A", "instant": true, "legendFormat": "{{phase}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "unit": "short", "color": { "mode": "palette-classic" } },
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "Running" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Pending" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Failed" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Succeeded" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Unknown" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"pieType": "donut",
|
||||
"tooltip": { "mode": "single" },
|
||||
"legend": { "displayMode": "table", "placement": "right", "values": ["value", "percent"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 23 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 17,
|
||||
"type": "timeseries",
|
||||
"title": "Container Restarts over Time (total counter, top 10)",
|
||||
"description": "Absolute restart counter — each vertical step = a restart event. Flat line = healthy.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "topk(10,\n sum by(namespace, pod) (\n kube_pod_container_status_restarts_total{namespace=~\"$namespace\"}\n ) > 0\n)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}} / {{pod}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "auto", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 31 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 18,
|
||||
"type": "table",
|
||||
"title": "Container Total Restarts (non-zero)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace, pod, container) (kube_pod_container_status_restarts_total{namespace=~\"$namespace\"}) > 0",
|
||||
"refId": "A",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "filterFieldsByName",
|
||||
"options": {
|
||||
"include": { "names": ["namespace", "pod", "container", "Value"] }
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {},
|
||||
"renameByName": {
|
||||
"namespace": "Namespace",
|
||||
"pod": "Pod",
|
||||
"container": "Container",
|
||||
"Value": "Total Restarts"
|
||||
},
|
||||
"indexByName": { "namespace": 0, "pod": 1, "container": 2, "Value": 3 }
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "sortBy",
|
||||
"options": { "fields": [{ "displayName": "Total Restarts", "desc": true }] }
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "Namespace" }, "properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 160 }] },
|
||||
{ "matcher": { "id": "byName", "options": "Pod" }, "properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 260 }] },
|
||||
{ "matcher": { "id": "byName", "options": "Container" }, "properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 160 }] },
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Total Restarts" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "yellow", "value": null }, { "color": "orange", "value": 5 }, { "color": "red", "value": 20 }] } }
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 31 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 19, "type": "row", "title": "Resource Usage", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 39 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 20,
|
||||
"type": "timeseries",
|
||||
"title": "CPU Usage by Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace)(rate(container_cpu_usage_seconds_total{namespace=~\"$namespace\",container!=\"\",container!=\"POD\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "cores", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 40 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 21,
|
||||
"type": "timeseries",
|
||||
"title": "Memory Usage by Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace)(container_memory_working_set_bytes{namespace=~\"$namespace\",container!=\"\",container!=\"POD\"})",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 40 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 22,
|
||||
"type": "bargauge",
|
||||
"title": "CPU — Actual vs Requested (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace)(rate(container_cpu_usage_seconds_total{namespace=~\"$namespace\",container!=\"\",container!=\"POD\"}[5m]))\n/\nsum by(namespace)(kube_pod_container_resource_requests{resource=\"cpu\",namespace=~\"$namespace\",container!=\"\"})\n* 100",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 150,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 80 }, { "color": "red", "value": 100 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal", "displayMode": "gradient", "showUnfilled": true,
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 48 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 23,
|
||||
"type": "bargauge",
|
||||
"title": "Memory — Actual vs Requested (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace)(container_memory_working_set_bytes{namespace=~\"$namespace\",container!=\"\",container!=\"POD\"})\n/\nsum by(namespace)(kube_pod_container_resource_requests{resource=\"memory\",namespace=~\"$namespace\",container!=\"\"})\n* 100",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 150,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 80 }, { "color": "red", "value": 100 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal", "displayMode": "gradient", "showUnfilled": true,
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 48 }
|
||||
}
|
||||
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,6 @@
|
||||
apiVersion: v1
|
||||
kind: Namespace
|
||||
metadata:
|
||||
name: observability
|
||||
labels:
|
||||
openshift.io/cluster-monitoring: "true"
|
||||
@@ -0,0 +1,43 @@
|
||||
apiVersion: v1
|
||||
kind: ServiceAccount
|
||||
metadata:
|
||||
name: cluster-grafana-sa
|
||||
namespace: observability
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRole
|
||||
metadata:
|
||||
name: grafana-prometheus-api-access
|
||||
rules:
|
||||
- apiGroups:
|
||||
- monitoring.coreos.com
|
||||
resources:
|
||||
- prometheuses/api
|
||||
verbs:
|
||||
- get
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRoleBinding
|
||||
metadata:
|
||||
name: grafana-prometheus-api-access-binding
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: ClusterRole
|
||||
name: grafana-prometheus-api-access
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: cluster-grafana-sa
|
||||
namespace: observability
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRoleBinding
|
||||
metadata:
|
||||
name: grafana-cluster-monitoring-view
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: ClusterRole
|
||||
name: cluster-monitoring-view
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: cluster-grafana-sa
|
||||
namespace: observability
|
||||
@@ -0,0 +1,43 @@
|
||||
apiVersion: grafana.integreatly.org/v1beta1
|
||||
kind: Grafana
|
||||
metadata:
|
||||
name: cluster-grafana
|
||||
namespace: observability
|
||||
labels:
|
||||
dashboards: "grafana"
|
||||
spec:
|
||||
serviceAccountName: cluster-grafana-sa
|
||||
automountServiceAccountToken: true
|
||||
|
||||
config:
|
||||
log:
|
||||
mode: console
|
||||
|
||||
security:
|
||||
admin_user: admin
|
||||
admin_password: paul
|
||||
|
||||
users:
|
||||
viewers_can_edit: "false"
|
||||
|
||||
auth:
|
||||
disable_login_form: "false"
|
||||
|
||||
auth.anonymous:
|
||||
enabled: "true"
|
||||
org_role: Viewer
|
||||
|
||||
deployment:
|
||||
spec:
|
||||
replicas: 1
|
||||
template:
|
||||
spec:
|
||||
containers:
|
||||
- name: grafana
|
||||
resources:
|
||||
requests:
|
||||
cpu: 500m
|
||||
memory: 1Gi
|
||||
limits:
|
||||
cpu: 1
|
||||
memory: 2Gi
|
||||
@@ -0,0 +1,8 @@
|
||||
apiVersion: v1
|
||||
kind: Secret
|
||||
metadata:
|
||||
name: grafana-prometheus-token
|
||||
namespace: observability
|
||||
annotations:
|
||||
kubernetes.io/service-account.name: cluster-grafana-sa
|
||||
type: kubernetes.io/service-account-token
|
||||
@@ -0,0 +1,27 @@
|
||||
apiVersion: grafana.integreatly.org/v1beta1
|
||||
kind: GrafanaDatasource
|
||||
metadata:
|
||||
name: prometheus-cluster
|
||||
namespace: observability
|
||||
spec:
|
||||
instanceSelector:
|
||||
matchLabels:
|
||||
dashboards: "grafana"
|
||||
valuesFrom:
|
||||
- targetPath: "secureJsonData.httpHeaderValue1"
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: grafana-prometheus-token
|
||||
key: token
|
||||
datasource:
|
||||
name: Prometheus-Cluster
|
||||
type: prometheus
|
||||
access: proxy
|
||||
url: https://prometheus-k8s.openshift-monitoring.svc:9091
|
||||
isDefault: true
|
||||
jsonData:
|
||||
httpHeaderName1: "Authorization"
|
||||
tlsSkipVerify: true
|
||||
timeInterval: "30s"
|
||||
secureJsonData:
|
||||
httpHeaderValue1: "Bearer ${token}"
|
||||
@@ -0,0 +1,14 @@
|
||||
apiVersion: route.openshift.io/v1
|
||||
kind: Route
|
||||
metadata:
|
||||
name: grafana
|
||||
namespace: observability
|
||||
spec:
|
||||
to:
|
||||
kind: Service
|
||||
name: cluster-grafana-service
|
||||
port:
|
||||
targetPort: 3000
|
||||
tls:
|
||||
termination: edge
|
||||
insecureEdgeTerminationPolicy: Redirect
|
||||
@@ -0,0 +1,97 @@
|
||||
apiVersion: grafana.integreatly.org/v1beta1
|
||||
kind: GrafanaDashboard
|
||||
metadata:
|
||||
name: cluster-overview
|
||||
namespace: observability
|
||||
spec:
|
||||
instanceSelector:
|
||||
matchLabels:
|
||||
dashboards: "grafana"
|
||||
|
||||
json: |
|
||||
{
|
||||
"title": "Cluster Overview",
|
||||
"schemaVersion": 36,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"time": {
|
||||
"from": "now-1h",
|
||||
"to": "now"
|
||||
},
|
||||
"panels": [
|
||||
{
|
||||
"type": "stat",
|
||||
"title": "Ready Nodes",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "Prometheus-Cluster"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"true\"})",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"gridPos": { "h": 6, "w": 6, "x": 0, "y": 0 }
|
||||
},
|
||||
{
|
||||
"type": "stat",
|
||||
"title": "Running Pods",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "Prometheus-Cluster"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(kube_pod_status_phase{phase=\"Running\"})",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"gridPos": { "h": 6, "w": 6, "x": 6, "y": 0 }
|
||||
},
|
||||
{
|
||||
"type": "timeseries",
|
||||
"title": "Cluster CPU Usage (%)",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "Prometheus-Cluster"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 }
|
||||
},
|
||||
{
|
||||
"type": "timeseries",
|
||||
"title": "Cluster Memory Usage (%)",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "Prometheus-Cluster"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (sum(node_memory_MemAvailable_bytes) / sum(node_memory_MemTotal_bytes)))",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 }
|
||||
}
|
||||
]
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,769 @@
|
||||
apiVersion: grafana.integreatly.org/v1beta1
|
||||
kind: GrafanaDashboard
|
||||
metadata:
|
||||
name: okd-cluster-overview
|
||||
namespace: observability
|
||||
spec:
|
||||
instanceSelector:
|
||||
matchLabels:
|
||||
dashboards: "grafana"
|
||||
json: |
|
||||
{
|
||||
"title": "Cluster Overview",
|
||||
"uid": "okd-cluster-overview",
|
||||
"schemaVersion": 36,
|
||||
"version": 2,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"tags": ["okd", "cluster", "overview"],
|
||||
"panels": [
|
||||
|
||||
{
|
||||
"id": 1,
|
||||
"type": "stat",
|
||||
"title": "Ready Nodes",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"true\"} == 1)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "green", "value": 1 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 2,
|
||||
"type": "stat",
|
||||
"title": "Not Ready Nodes",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"false\"} == 1) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 3,
|
||||
"type": "stat",
|
||||
"title": "Running Pods",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(kube_pod_status_phase{phase=\"Running\"} == 1)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "green", "value": 1 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 4,
|
||||
"type": "stat",
|
||||
"title": "Pending Pods",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(kube_pod_status_phase{phase=\"Pending\"} == 1) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 5 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 5,
|
||||
"type": "stat",
|
||||
"title": "Failed Pods",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(kube_pod_status_phase{phase=\"Failed\"} == 1) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 6,
|
||||
"type": "stat",
|
||||
"title": "CrashLoopBackOff",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\"} == 1) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 7,
|
||||
"type": "stat",
|
||||
"title": "Critical Alerts",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(ALERTS{alertstate=\"firing\",severity=\"critical\"}) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 8,
|
||||
"type": "stat",
|
||||
"title": "Warning Alerts",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(ALERTS{alertstate=\"firing\",severity=\"warning\"}) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 10 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 9,
|
||||
"type": "gauge",
|
||||
"title": "CPU Usage",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))",
|
||||
"refId": "A",
|
||||
"legendFormat": "CPU"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 70 },
|
||||
{ "color": "red", "value": 85 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"showThresholdLabels": false,
|
||||
"showThresholdMarkers": true,
|
||||
"orientation": "auto"
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 5, "x": 0, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 10,
|
||||
"type": "gauge",
|
||||
"title": "Memory Usage",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (sum(node_memory_MemAvailable_bytes) / sum(node_memory_MemTotal_bytes)))",
|
||||
"refId": "A",
|
||||
"legendFormat": "Memory"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 75 },
|
||||
{ "color": "red", "value": 90 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"showThresholdLabels": false,
|
||||
"showThresholdMarkers": true,
|
||||
"orientation": "auto"
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 5, "x": 5, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 11,
|
||||
"type": "gauge",
|
||||
"title": "Root Disk Usage",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (sum(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"})))",
|
||||
"refId": "A",
|
||||
"legendFormat": "Disk"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 70 },
|
||||
{ "color": "red", "value": 85 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"showThresholdLabels": false,
|
||||
"showThresholdMarkers": true,
|
||||
"orientation": "auto"
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 4, "x": 10, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 12,
|
||||
"type": "stat",
|
||||
"title": "etcd Has Leader",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "min(etcd_server_has_leader)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "green", "value": 1 }
|
||||
]
|
||||
},
|
||||
"mappings": [
|
||||
{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"0": { "text": "NO LEADER", "color": "red" },
|
||||
"1": { "text": "LEADER OK", "color": "green" }
|
||||
}
|
||||
}
|
||||
],
|
||||
"unit": "short",
|
||||
"noValue": "?"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 3, "w": 5, "x": 14, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 13,
|
||||
"type": "stat",
|
||||
"title": "API Servers Up",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(up{job=\"apiserver\"})",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "green", "value": 2 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 3, "w": 5, "x": 19, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 14,
|
||||
"type": "stat",
|
||||
"title": "etcd Members Up",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(up{job=\"etcd\"})",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 2 },
|
||||
{ "color": "green", "value": 3 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 3, "w": 5, "x": 14, "y": 7 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 15,
|
||||
"type": "stat",
|
||||
"title": "Operators Degraded",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(cluster_operator_conditions{condition=\"Degraded\",status=\"True\"} == 1) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 3, "w": 5, "x": 19, "y": 7 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 16,
|
||||
"type": "timeseries",
|
||||
"title": "CPU Usage per Node (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 10,
|
||||
"spanNulls": false,
|
||||
"showPoints": "never"
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": {
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"calcs": ["mean", "max"]
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 10 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 17,
|
||||
"type": "timeseries",
|
||||
"title": "Memory Usage per Node (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 10,
|
||||
"spanNulls": false,
|
||||
"showPoints": "never"
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": {
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"calcs": ["mean", "max"]
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 10 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 18,
|
||||
"type": "timeseries",
|
||||
"title": "Network Traffic — Cluster Total",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo|veth.*|tun.*|ovn.*|br-int|br-ex\"}[5m]))",
|
||||
"refId": "A",
|
||||
"legendFormat": "Receive"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo|veth.*|tun.*|ovn.*|br-int|br-ex\"}[5m]))",
|
||||
"refId": "B",
|
||||
"legendFormat": "Transmit"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps",
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 10,
|
||||
"spanNulls": false,
|
||||
"showPoints": "never"
|
||||
}
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Receive" },
|
||||
"properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Transmit" },
|
||||
"properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "none" },
|
||||
"legend": {
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"calcs": ["mean", "max"]
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 18 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 19,
|
||||
"type": "timeseries",
|
||||
"title": "Pod Phases Over Time",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(kube_pod_status_phase{phase=\"Running\"} == 1)",
|
||||
"refId": "A",
|
||||
"legendFormat": "Running"
|
||||
},
|
||||
{
|
||||
"expr": "count(kube_pod_status_phase{phase=\"Pending\"} == 1) or vector(0)",
|
||||
"refId": "B",
|
||||
"legendFormat": "Pending"
|
||||
},
|
||||
{
|
||||
"expr": "count(kube_pod_status_phase{phase=\"Failed\"} == 1) or vector(0)",
|
||||
"refId": "C",
|
||||
"legendFormat": "Failed"
|
||||
},
|
||||
{
|
||||
"expr": "count(kube_pod_status_phase{phase=\"Unknown\"} == 1) or vector(0)",
|
||||
"refId": "D",
|
||||
"legendFormat": "Unknown"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short",
|
||||
"custom": {
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 15,
|
||||
"spanNulls": false,
|
||||
"showPoints": "never"
|
||||
}
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Running" },
|
||||
"properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Pending" },
|
||||
"properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Failed" },
|
||||
"properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Unknown" },
|
||||
"properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "none" },
|
||||
"legend": {
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"calcs": ["lastNotNull"]
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 18 }
|
||||
}
|
||||
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,637 @@
|
||||
apiVersion: grafana.integreatly.org/v1beta1
|
||||
kind: GrafanaDashboard
|
||||
metadata:
|
||||
name: okd-node-health
|
||||
namespace: observability
|
||||
spec:
|
||||
instanceSelector:
|
||||
matchLabels:
|
||||
dashboards: "grafana"
|
||||
json: |
|
||||
{
|
||||
"title": "Node Health",
|
||||
"uid": "okd-node-health",
|
||||
"schemaVersion": 36,
|
||||
"version": 2,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"tags": ["okd", "node", "health"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "node",
|
||||
"type": "query",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"query": { "query": "label_values(kube_node_info, node)", "refId": "A" },
|
||||
"refresh": 2,
|
||||
"includeAll": true,
|
||||
"multi": true,
|
||||
"allValue": ".*",
|
||||
"label": "Node",
|
||||
"sort": 1,
|
||||
"current": {},
|
||||
"options": []
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
|
||||
{
|
||||
"id": 1,
|
||||
"type": "stat",
|
||||
"title": "Total Nodes",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_node_info{node=~\"$node\"})", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 2,
|
||||
"type": "stat",
|
||||
"title": "Ready Nodes",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"$node\"} == 1)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 3,
|
||||
"type": "stat",
|
||||
"title": "Not Ready Nodes",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"false\",node=~\"$node\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 4,
|
||||
"type": "stat",
|
||||
"title": "Memory Pressure",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"MemoryPressure\",status=\"true\",node=~\"$node\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 5,
|
||||
"type": "stat",
|
||||
"title": "Disk Pressure",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"DiskPressure\",status=\"true\",node=~\"$node\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 6,
|
||||
"type": "stat",
|
||||
"title": "PID Pressure",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"PIDPressure\",status=\"true\",node=~\"$node\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "orange", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 7,
|
||||
"type": "stat",
|
||||
"title": "Unschedulable",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_node_spec_unschedulable{node=~\"$node\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 8,
|
||||
"type": "stat",
|
||||
"title": "Kubelet Up",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(up{job=\"kubelet\",metrics_path=\"/metrics\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 9,
|
||||
"type": "table",
|
||||
"title": "Node Conditions",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(node) (kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"$node\"})",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{node}}",
|
||||
"instant": true
|
||||
},
|
||||
{
|
||||
"expr": "sum by(node) (kube_node_status_condition{condition=\"MemoryPressure\",status=\"true\",node=~\"$node\"})",
|
||||
"refId": "B",
|
||||
"legendFormat": "{{node}}",
|
||||
"instant": true
|
||||
},
|
||||
{
|
||||
"expr": "sum by(node) (kube_node_status_condition{condition=\"DiskPressure\",status=\"true\",node=~\"$node\"})",
|
||||
"refId": "C",
|
||||
"legendFormat": "{{node}}",
|
||||
"instant": true
|
||||
},
|
||||
{
|
||||
"expr": "sum by(node) (kube_node_status_condition{condition=\"PIDPressure\",status=\"true\",node=~\"$node\"})",
|
||||
"refId": "D",
|
||||
"legendFormat": "{{node}}",
|
||||
"instant": true
|
||||
},
|
||||
{
|
||||
"expr": "sum by(node) (kube_node_spec_unschedulable{node=~\"$node\"})",
|
||||
"refId": "E",
|
||||
"legendFormat": "{{node}}",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "labelsToFields",
|
||||
"options": { "mode": "columns" }
|
||||
},
|
||||
{
|
||||
"id": "joinByField",
|
||||
"options": { "byField": "node", "mode": "outer" }
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"Time": true,
|
||||
"Time 1": true,
|
||||
"Time 2": true,
|
||||
"Time 3": true,
|
||||
"Time 4": true,
|
||||
"Time 5": true
|
||||
},
|
||||
"renameByName": {
|
||||
"node": "Node",
|
||||
"Value #A": "Ready",
|
||||
"Value #B": "Mem Pressure",
|
||||
"Value #C": "Disk Pressure",
|
||||
"Value #D": "PID Pressure",
|
||||
"Value #E": "Unschedulable"
|
||||
},
|
||||
"indexByName": {
|
||||
"node": 0,
|
||||
"Value #A": 1,
|
||||
"Value #B": 2,
|
||||
"Value #C": 3,
|
||||
"Value #D": 4,
|
||||
"Value #E": 5
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": { "displayMode": "color-background", "align": "center" }
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Node" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "auto" },
|
||||
{ "id": "custom.align", "value": "left" },
|
||||
{ "id": "custom.width", "value": 200 }
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Ready" },
|
||||
"properties": [
|
||||
{
|
||||
"id": "thresholds",
|
||||
"value": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] }
|
||||
},
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{
|
||||
"id": "mappings",
|
||||
"value": [
|
||||
{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"0": { "text": "✗ Not Ready", "color": "red", "index": 0 },
|
||||
"1": { "text": "✓ Ready", "color": "green", "index": 1 }
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byRegexp", "options": ".*Pressure" },
|
||||
"properties": [
|
||||
{
|
||||
"id": "thresholds",
|
||||
"value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] }
|
||||
},
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{
|
||||
"id": "mappings",
|
||||
"value": [
|
||||
{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"0": { "text": "✓ OK", "color": "green", "index": 0 },
|
||||
"1": { "text": "⚠ Active", "color": "red", "index": 1 }
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Unschedulable" },
|
||||
"properties": [
|
||||
{
|
||||
"id": "thresholds",
|
||||
"value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 1 }] }
|
||||
},
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{
|
||||
"id": "mappings",
|
||||
"value": [
|
||||
{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"0": { "text": "✓ Schedulable", "color": "green", "index": 0 },
|
||||
"1": { "text": "⚠ Cordoned", "color": "yellow", "index": 1 }
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": { "sortBy": [{ "displayName": "Node", "desc": false }] },
|
||||
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 10,
|
||||
"type": "timeseries",
|
||||
"title": "CPU Usage per Node (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 100,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 85 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 16, "x": 0, "y": 12 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 11,
|
||||
"type": "bargauge",
|
||||
"title": "CPU Usage \u2014 Current",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 85 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true,
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 12 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 12,
|
||||
"type": "timeseries",
|
||||
"title": "Memory Usage per Node (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 100,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 75 }, { "color": "red", "value": 90 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 16, "x": 0, "y": 20 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 13,
|
||||
"type": "bargauge",
|
||||
"title": "Memory Usage \u2014 Current",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 75 }, { "color": "red", "value": 90 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true,
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 20 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 14,
|
||||
"type": "timeseries",
|
||||
"title": "Root Disk Usage per Node (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"}))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 100,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 85 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 16, "x": 0, "y": 28 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 15,
|
||||
"type": "bargauge",
|
||||
"title": "Root Disk Usage \u2014 Current",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"}))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 85 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true,
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 28 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 16,
|
||||
"type": "timeseries",
|
||||
"title": "Network Traffic per Node",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(instance) (rate(node_network_receive_bytes_total{device!~\"lo|veth.*|tun.*|ovn.*|br.*\"}[5m]))",
|
||||
"refId": "A",
|
||||
"legendFormat": "rx {{instance}}"
|
||||
},
|
||||
{
|
||||
"expr": "sum by(instance) (rate(node_network_transmit_bytes_total{device!~\"lo|veth.*|tun.*|ovn.*|br.*\"}[5m]))",
|
||||
"refId": "B",
|
||||
"legendFormat": "tx {{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps",
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 36 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 17,
|
||||
"type": "bargauge",
|
||||
"title": "Pods per Node",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count by(node) (kube_pod_info{node=~\"$node\"})",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{node}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short",
|
||||
"min": 0,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 100 },
|
||||
{ "color": "red", "value": 200 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true,
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 36 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 18,
|
||||
"type": "timeseries",
|
||||
"title": "System Load Average (1m) per Node",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "node_load1",
|
||||
"refId": "A",
|
||||
"legendFormat": "1m \u2014 {{instance}}"
|
||||
},
|
||||
{
|
||||
"expr": "node_load5",
|
||||
"refId": "B",
|
||||
"legendFormat": "5m \u2014 {{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short",
|
||||
"min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 44 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 19,
|
||||
"type": "bargauge",
|
||||
"title": "Node Uptime",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "time() - node_boot_time_seconds",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s",
|
||||
"min": 0,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 300 },
|
||||
{ "color": "green", "value": 3600 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": false,
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 44 }
|
||||
}
|
||||
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,783 @@
|
||||
apiVersion: grafana.integreatly.org/v1beta1
|
||||
kind: GrafanaDashboard
|
||||
metadata:
|
||||
name: okd-workload-health
|
||||
namespace: observability
|
||||
spec:
|
||||
instanceSelector:
|
||||
matchLabels:
|
||||
dashboards: "grafana"
|
||||
json: |
|
||||
{
|
||||
"title": "Workload Health",
|
||||
"uid": "okd-workload-health",
|
||||
"schemaVersion": 36,
|
||||
"version": 3,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"tags": ["okd", "workload", "health"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "namespace",
|
||||
"type": "query",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"query": { "query": "label_values(kube_pod_info, namespace)", "refId": "A" },
|
||||
"refresh": 2,
|
||||
"includeAll": true,
|
||||
"multi": true,
|
||||
"allValue": ".*",
|
||||
"label": "Namespace",
|
||||
"sort": 1,
|
||||
"current": {},
|
||||
"options": []
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
|
||||
{
|
||||
"id": 1, "type": "stat", "title": "Total Pods",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_pod_info{namespace=~\"$namespace\"})", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 2, "type": "stat", "title": "Running Pods",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_pod_status_phase{phase=\"Running\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 3, "type": "stat", "title": "Pending Pods",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_pod_status_phase{phase=\"Pending\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 4, "type": "stat", "title": "Failed Pods",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_pod_status_phase{phase=\"Failed\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 5, "type": "stat", "title": "CrashLoopBackOff",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 6, "type": "stat", "title": "OOMKilled",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_pod_container_status_last_terminated_reason{reason=\"OOMKilled\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "orange", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 7, "type": "stat", "title": "Deployments Available",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_deployment_status_condition{condition=\"Available\",status=\"true\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 8, "type": "stat", "title": "Deployments Degraded",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_deployment_status_replicas_unavailable{namespace=~\"$namespace\"} > 0) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 9, "type": "row", "title": "Deployments", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 10,
|
||||
"type": "table",
|
||||
"title": "Deployment Status",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace,deployment)(kube_deployment_spec_replicas{namespace=~\"$namespace\"})",
|
||||
"refId": "A",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,deployment)(kube_deployment_status_replicas_ready{namespace=~\"$namespace\"})",
|
||||
"refId": "B",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,deployment)(kube_deployment_status_replicas_available{namespace=~\"$namespace\"})",
|
||||
"refId": "C",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,deployment)(kube_deployment_status_replicas_unavailable{namespace=~\"$namespace\"})",
|
||||
"refId": "D",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,deployment)(kube_deployment_status_replicas_updated{namespace=~\"$namespace\"})",
|
||||
"refId": "E",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "filterFieldsByName",
|
||||
"options": {
|
||||
"include": {
|
||||
"names": ["namespace", "deployment", "Value"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "joinByField",
|
||||
"options": {
|
||||
"byField": "deployment",
|
||||
"mode": "outer"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"namespace 1": true,
|
||||
"namespace 2": true,
|
||||
"namespace 3": true,
|
||||
"namespace 4": true
|
||||
},
|
||||
"renameByName": {
|
||||
"namespace": "Namespace",
|
||||
"deployment": "Deployment",
|
||||
"Value": "Desired",
|
||||
"Value 1": "Ready",
|
||||
"Value 2": "Available",
|
||||
"Value 3": "Unavailable",
|
||||
"Value 4": "Up-to-date"
|
||||
},
|
||||
"indexByName": {
|
||||
"namespace": 0,
|
||||
"deployment": 1,
|
||||
"Value": 2,
|
||||
"Value 1": 3,
|
||||
"Value 2": 4,
|
||||
"Value 3": 5,
|
||||
"Value 4": 6
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "sortBy",
|
||||
"options": {
|
||||
"fields": [{ "displayName": "Namespace", "desc": false }]
|
||||
}
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Namespace" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 200 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Deployment" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 220 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Unavailable" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{
|
||||
"id": "thresholds",
|
||||
"value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] }
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Ready" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{
|
||||
"id": "thresholds",
|
||||
"value": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] }
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": { "sortBy": [{ "displayName": "Namespace", "desc": false }] },
|
||||
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 11, "type": "row", "title": "StatefulSets & DaemonSets", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 12,
|
||||
"type": "table",
|
||||
"title": "StatefulSet Status",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace,statefulset)(kube_statefulset_replicas{namespace=~\"$namespace\"})",
|
||||
"refId": "A",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,statefulset)(kube_statefulset_status_replicas_ready{namespace=~\"$namespace\"})",
|
||||
"refId": "B",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,statefulset)(kube_statefulset_status_replicas_current{namespace=~\"$namespace\"})",
|
||||
"refId": "C",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,statefulset)(kube_statefulset_status_replicas_updated{namespace=~\"$namespace\"})",
|
||||
"refId": "D",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "filterFieldsByName",
|
||||
"options": {
|
||||
"include": {
|
||||
"names": ["namespace", "statefulset", "Value"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "joinByField",
|
||||
"options": {
|
||||
"byField": "statefulset",
|
||||
"mode": "outer"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"namespace 1": true,
|
||||
"namespace 2": true,
|
||||
"namespace 3": true
|
||||
},
|
||||
"renameByName": {
|
||||
"namespace": "Namespace",
|
||||
"statefulset": "StatefulSet",
|
||||
"Value": "Desired",
|
||||
"Value 1": "Ready",
|
||||
"Value 2": "Current",
|
||||
"Value 3": "Up-to-date"
|
||||
},
|
||||
"indexByName": {
|
||||
"namespace": 0,
|
||||
"statefulset": 1,
|
||||
"Value": 2,
|
||||
"Value 1": 3,
|
||||
"Value 2": 4,
|
||||
"Value 3": 5
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "sortBy",
|
||||
"options": { "fields": [{ "displayName": "Namespace", "desc": false }] }
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Namespace" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 180 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "StatefulSet" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 200 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Ready" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] } }
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 14 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 13,
|
||||
"type": "table",
|
||||
"title": "DaemonSet Status",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace,daemonset)(kube_daemonset_status_desired_number_scheduled{namespace=~\"$namespace\"})",
|
||||
"refId": "A",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,daemonset)(kube_daemonset_status_number_ready{namespace=~\"$namespace\"})",
|
||||
"refId": "B",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,daemonset)(kube_daemonset_status_number_unavailable{namespace=~\"$namespace\"})",
|
||||
"refId": "C",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,daemonset)(kube_daemonset_status_number_misscheduled{namespace=~\"$namespace\"})",
|
||||
"refId": "D",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "filterFieldsByName",
|
||||
"options": {
|
||||
"include": {
|
||||
"names": ["namespace", "daemonset", "Value"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "joinByField",
|
||||
"options": {
|
||||
"byField": "daemonset",
|
||||
"mode": "outer"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"namespace 1": true,
|
||||
"namespace 2": true,
|
||||
"namespace 3": true
|
||||
},
|
||||
"renameByName": {
|
||||
"namespace": "Namespace",
|
||||
"daemonset": "DaemonSet",
|
||||
"Value": "Desired",
|
||||
"Value 1": "Ready",
|
||||
"Value 2": "Unavailable",
|
||||
"Value 3": "Misscheduled"
|
||||
},
|
||||
"indexByName": {
|
||||
"namespace": 0,
|
||||
"daemonset": 1,
|
||||
"Value": 2,
|
||||
"Value 1": 3,
|
||||
"Value 2": 4,
|
||||
"Value 3": 5
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "sortBy",
|
||||
"options": { "fields": [{ "displayName": "Namespace", "desc": false }] }
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Namespace" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 180 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "DaemonSet" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 200 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Ready" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] } }
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Unavailable" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] } }
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Misscheduled" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "orange", "value": 1 }] } }
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 14 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 14, "type": "row", "title": "Pods", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 22 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 15,
|
||||
"type": "timeseries",
|
||||
"title": "Pod Phase over Time",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(phase)(kube_pod_status_phase{namespace=~\"$namespace\"})",
|
||||
"refId": "A", "legendFormat": "{{phase}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "Running" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Pending" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Failed" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Succeeded" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Unknown" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 16, "x": 0, "y": 23 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 16,
|
||||
"type": "piechart",
|
||||
"title": "Pod Phase — Now",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(phase)(kube_pod_status_phase{namespace=~\"$namespace\"})",
|
||||
"refId": "A", "instant": true, "legendFormat": "{{phase}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "unit": "short", "color": { "mode": "palette-classic" } },
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "Running" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Pending" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Failed" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Succeeded" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Unknown" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"pieType": "donut",
|
||||
"tooltip": { "mode": "single" },
|
||||
"legend": { "displayMode": "table", "placement": "right", "values": ["value", "percent"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 23 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 17,
|
||||
"type": "timeseries",
|
||||
"title": "Container Restarts over Time (total counter, top 10)",
|
||||
"description": "Absolute restart counter — each vertical step = a restart event. Flat line = healthy.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "topk(10,\n sum by(namespace, pod) (\n kube_pod_container_status_restarts_total{namespace=~\"$namespace\"}\n ) > 0\n)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}} / {{pod}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "auto", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 31 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 18,
|
||||
"type": "table",
|
||||
"title": "Container Total Restarts (non-zero)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace, pod, container) (kube_pod_container_status_restarts_total{namespace=~\"$namespace\"}) > 0",
|
||||
"refId": "A",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "filterFieldsByName",
|
||||
"options": {
|
||||
"include": { "names": ["namespace", "pod", "container", "Value"] }
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {},
|
||||
"renameByName": {
|
||||
"namespace": "Namespace",
|
||||
"pod": "Pod",
|
||||
"container": "Container",
|
||||
"Value": "Total Restarts"
|
||||
},
|
||||
"indexByName": { "namespace": 0, "pod": 1, "container": 2, "Value": 3 }
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "sortBy",
|
||||
"options": { "fields": [{ "displayName": "Total Restarts", "desc": true }] }
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "Namespace" }, "properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 160 }] },
|
||||
{ "matcher": { "id": "byName", "options": "Pod" }, "properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 260 }] },
|
||||
{ "matcher": { "id": "byName", "options": "Container" }, "properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 160 }] },
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Total Restarts" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "yellow", "value": null }, { "color": "orange", "value": 5 }, { "color": "red", "value": 20 }] } }
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 31 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 19, "type": "row", "title": "Resource Usage", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 39 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 20,
|
||||
"type": "timeseries",
|
||||
"title": "CPU Usage by Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace)(rate(container_cpu_usage_seconds_total{namespace=~\"$namespace\",container!=\"\",container!=\"POD\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "cores", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 40 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 21,
|
||||
"type": "timeseries",
|
||||
"title": "Memory Usage by Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace)(container_memory_working_set_bytes{namespace=~\"$namespace\",container!=\"\",container!=\"POD\"})",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 40 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 22,
|
||||
"type": "bargauge",
|
||||
"title": "CPU — Actual vs Requested (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace)(rate(container_cpu_usage_seconds_total{namespace=~\"$namespace\",container!=\"\",container!=\"POD\"}[5m]))\n/\nsum by(namespace)(kube_pod_container_resource_requests{resource=\"cpu\",namespace=~\"$namespace\",container!=\"\"})\n* 100",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 150,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 80 }, { "color": "red", "value": 100 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal", "displayMode": "gradient", "showUnfilled": true,
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 48 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 23,
|
||||
"type": "bargauge",
|
||||
"title": "Memory — Actual vs Requested (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace)(container_memory_working_set_bytes{namespace=~\"$namespace\",container!=\"\",container!=\"POD\"})\n/\nsum by(namespace)(kube_pod_container_resource_requests{resource=\"memory\",namespace=~\"$namespace\",container!=\"\"})\n* 100",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 150,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 80 }, { "color": "red", "value": 100 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal", "displayMode": "gradient", "showUnfilled": true,
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 48 }
|
||||
}
|
||||
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,955 @@
|
||||
apiVersion: grafana.integreatly.org/v1beta1
|
||||
kind: GrafanaDashboard
|
||||
metadata:
|
||||
name: okd-networking
|
||||
namespace: observability
|
||||
spec:
|
||||
instanceSelector:
|
||||
matchLabels:
|
||||
dashboards: "grafana"
|
||||
json: |
|
||||
{
|
||||
"title": "Networking",
|
||||
"uid": "okd-networking",
|
||||
"schemaVersion": 36,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"tags": ["okd", "networking"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "namespace",
|
||||
"type": "query",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"query": { "query": "label_values(kube_pod_info, namespace)", "refId": "A" },
|
||||
"refresh": 2,
|
||||
"includeAll": true,
|
||||
"multi": true,
|
||||
"allValue": ".*",
|
||||
"label": "Namespace",
|
||||
"sort": 1,
|
||||
"current": {},
|
||||
"options": []
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
|
||||
{
|
||||
"id": 1, "type": "stat", "title": "Network RX Rate",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(container_network_receive_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] },
|
||||
"unit": "Bps", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 2, "type": "stat", "title": "Network TX Rate",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(container_network_transmit_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] },
|
||||
"unit": "Bps", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 3, "type": "stat", "title": "RX Errors/s",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(container_network_receive_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "pps", "noValue": "0", "decimals": 2
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 4, "type": "stat", "title": "TX Errors/s",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(container_network_transmit_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "pps", "noValue": "0", "decimals": 2
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 5, "type": "stat", "title": "RX Drops/s",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(container_network_receive_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "orange", "value": 1 }] },
|
||||
"unit": "pps", "noValue": "0", "decimals": 2
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 6, "type": "stat", "title": "TX Drops/s",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(container_network_transmit_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "orange", "value": 1 }] },
|
||||
"unit": "pps", "noValue": "0", "decimals": 2
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 7, "type": "stat", "title": "DNS Queries/s",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(coredns_dns_requests_total[5m]))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] },
|
||||
"unit": "reqps", "noValue": "0", "decimals": 1
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 8, "type": "stat", "title": "DNS Error %",
|
||||
"description": "Percentage of DNS responses with non-NOERROR rcode over the last 5 minutes.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(coredns_dns_responses_total{rcode!=\"NOERROR\"}[5m])) / sum(rate(coredns_dns_responses_total[5m])) * 100",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 5 }
|
||||
]},
|
||||
"unit": "percent", "noValue": "0", "decimals": 2
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 9, "type": "row", "title": "Network I/O", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 10, "type": "timeseries", "title": "Receive Rate by Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(namespace)(rate(container_network_receive_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 11, "type": "timeseries", "title": "Transmit Rate by Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(namespace)(rate(container_network_transmit_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 12, "type": "row", "title": "Top Pod Consumers", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 13, "type": "timeseries", "title": "Top 10 Pods — RX Rate",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "topk(10, sum by(namespace,pod)(rate(container_network_receive_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m])))",
|
||||
"refId": "A", "legendFormat": "{{namespace}} / {{pod}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "auto", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 14 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 14, "type": "timeseries", "title": "Top 10 Pods — TX Rate",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "topk(10, sum by(namespace,pod)(rate(container_network_transmit_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m])))",
|
||||
"refId": "A", "legendFormat": "{{namespace}} / {{pod}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "auto", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 14 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 15,
|
||||
"type": "table",
|
||||
"title": "Pod Network I/O Summary",
|
||||
"description": "Current RX/TX rates, errors and drops per pod. Sorted by RX rate descending.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace,pod)(rate(container_network_receive_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "instant": true, "format": "table", "legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,pod)(rate(container_network_transmit_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "B", "instant": true, "format": "table", "legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,pod)(rate(container_network_receive_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "C", "instant": true, "format": "table", "legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,pod)(rate(container_network_transmit_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "D", "instant": true, "format": "table", "legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,pod)(rate(container_network_receive_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "E", "instant": true, "format": "table", "legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,pod)(rate(container_network_transmit_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "F", "instant": true, "format": "table", "legendFormat": ""
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "filterFieldsByName",
|
||||
"options": { "include": { "names": ["namespace", "pod", "Value"] } }
|
||||
},
|
||||
{
|
||||
"id": "joinByField",
|
||||
"options": { "byField": "pod", "mode": "outer" }
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"namespace 1": true,
|
||||
"namespace 2": true,
|
||||
"namespace 3": true,
|
||||
"namespace 4": true,
|
||||
"namespace 5": true
|
||||
},
|
||||
"renameByName": {
|
||||
"namespace": "Namespace",
|
||||
"pod": "Pod",
|
||||
"Value": "RX Rate",
|
||||
"Value 1": "TX Rate",
|
||||
"Value 2": "RX Errors/s",
|
||||
"Value 3": "TX Errors/s",
|
||||
"Value 4": "RX Drops/s",
|
||||
"Value 5": "TX Drops/s"
|
||||
},
|
||||
"indexByName": {
|
||||
"namespace": 0,
|
||||
"pod": 1,
|
||||
"Value": 2,
|
||||
"Value 1": 3,
|
||||
"Value 2": 4,
|
||||
"Value 3": 5,
|
||||
"Value 4": 6,
|
||||
"Value 5": 7
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "sortBy",
|
||||
"options": { "fields": [{ "displayName": "RX Rate", "desc": true }] }
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Namespace" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 160 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Pod" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 260 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byRegexp", "options": "^RX Rate$|^TX Rate$" },
|
||||
"properties": [
|
||||
{ "id": "unit", "value": "Bps" },
|
||||
{ "id": "custom.displayMode", "value": "color-background-solid" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 10000000 },
|
||||
{ "color": "orange", "value": 100000000 },
|
||||
{ "color": "red", "value": 500000000 }
|
||||
]}}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byRegexp", "options": "^RX Errors/s$|^TX Errors/s$" },
|
||||
"properties": [
|
||||
{ "id": "unit", "value": "pps" },
|
||||
{ "id": "decimals", "value": 3 },
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 0.001 }
|
||||
]}}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byRegexp", "options": "^RX Drops/s$|^TX Drops/s$" },
|
||||
"properties": [
|
||||
{ "id": "unit", "value": "pps" },
|
||||
{ "id": "decimals", "value": 3 },
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "orange", "value": 0.001 }
|
||||
]}}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {},
|
||||
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 22 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 16, "type": "row", "title": "Errors & Packet Loss", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 30 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 17, "type": "timeseries", "title": "RX Errors by Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(namespace)(rate(container_network_receive_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "pps", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 12, "x": 0, "y": 31 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 18, "type": "timeseries", "title": "TX Errors by Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(namespace)(rate(container_network_transmit_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "pps", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 12, "x": 12, "y": 31 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 19, "type": "timeseries", "title": "RX Packet Drops by Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(namespace)(rate(container_network_receive_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "pps", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 12, "x": 0, "y": 38 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 20, "type": "timeseries", "title": "TX Packet Drops by Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(namespace)(rate(container_network_transmit_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "pps", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 12, "x": 12, "y": 38 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 21, "type": "row", "title": "DNS (CoreDNS)", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 45 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 22, "type": "timeseries", "title": "DNS Request Rate by Query Type",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(type)(rate(coredns_dns_requests_total[5m]))",
|
||||
"refId": "A", "legendFormat": "{{type}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 46 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 23, "type": "timeseries", "title": "DNS Response Rate by Rcode",
|
||||
"description": "NOERROR = healthy. NXDOMAIN = name not found. SERVFAIL = upstream error.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(rcode)(rate(coredns_dns_responses_total[5m]))",
|
||||
"refId": "A", "legendFormat": "{{rcode}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "NOERROR" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "NXDOMAIN" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "SERVFAIL" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "REFUSED" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 46 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 24, "type": "timeseries", "title": "DNS Request Latency (p50 / p95 / p99)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.50, sum(rate(coredns_dns_request_duration_seconds_bucket[5m])) by (le))",
|
||||
"refId": "A", "legendFormat": "p50"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, sum(rate(coredns_dns_request_duration_seconds_bucket[5m])) by (le))",
|
||||
"refId": "B", "legendFormat": "p95"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(coredns_dns_request_duration_seconds_bucket[5m])) by (le))",
|
||||
"refId": "C", "legendFormat": "p99"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 46 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 25, "type": "timeseries", "title": "DNS Cache Hit Ratio (%)",
|
||||
"description": "High hit ratio = CoreDNS is serving responses from cache, reducing upstream load.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(coredns_cache_hits_total[5m])) / (sum(rate(coredns_cache_hits_total[5m])) + sum(rate(coredns_cache_misses_total[5m]))) * 100",
|
||||
"refId": "A", "legendFormat": "Cache Hit %"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 50 },
|
||||
{ "color": "green", "value": 80 }
|
||||
]},
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 20, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "single" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "lastNotNull"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 12, "x": 0, "y": 54 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 26, "type": "timeseries", "title": "DNS Forward Request Rate",
|
||||
"description": "Queries CoreDNS is forwarding upstream. Spike here with cache miss spike = upstream DNS pressure.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(coredns_forward_requests_total[5m]))",
|
||||
"refId": "A", "legendFormat": "Forward Requests/s"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(coredns_forward_responses_duration_seconds_count[5m]))",
|
||||
"refId": "B", "legendFormat": "Forward Responses/s"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 12, "x": 12, "y": 54 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 27, "type": "row", "title": "Services & Endpoints", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 61 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 28, "type": "stat", "title": "Total Services",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "count(kube_service_info{namespace=~\"$namespace\"})",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 8, "x": 0, "y": 62 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 29, "type": "stat", "title": "Endpoint Addresses Available",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(kube_endpoint_address_available{namespace=~\"$namespace\"})",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 8, "x": 8, "y": 62 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 30, "type": "stat", "title": "Endpoint Addresses Not Ready",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(kube_endpoint_address_not_ready{namespace=~\"$namespace\"}) or vector(0)",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 8, "x": 16, "y": 62 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 31,
|
||||
"type": "table",
|
||||
"title": "Endpoint Availability",
|
||||
"description": "Per-endpoint available vs not-ready address counts. Red Not Ready = pods backing this service are unhealthy.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace,endpoint)(kube_endpoint_address_available{namespace=~\"$namespace\"})",
|
||||
"refId": "A", "instant": true, "format": "table", "legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,endpoint)(kube_endpoint_address_not_ready{namespace=~\"$namespace\"})",
|
||||
"refId": "B", "instant": true, "format": "table", "legendFormat": ""
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "filterFieldsByName",
|
||||
"options": { "include": { "names": ["namespace", "endpoint", "Value"] } }
|
||||
},
|
||||
{
|
||||
"id": "joinByField",
|
||||
"options": { "byField": "endpoint", "mode": "outer" }
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": { "namespace 1": true },
|
||||
"renameByName": {
|
||||
"namespace": "Namespace",
|
||||
"endpoint": "Endpoint",
|
||||
"Value": "Available",
|
||||
"Value 1": "Not Ready"
|
||||
},
|
||||
"indexByName": {
|
||||
"namespace": 0,
|
||||
"endpoint": 1,
|
||||
"Value": 2,
|
||||
"Value 1": 3
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "sortBy",
|
||||
"options": { "fields": [{ "displayName": "Not Ready", "desc": true }] }
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Namespace" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 180 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Endpoint" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 220 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Available" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] } }
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Not Ready" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] } }
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {},
|
||||
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 66 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 32, "type": "row", "title": "OKD Router / Ingress (HAProxy)", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 74 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 33, "type": "timeseries", "title": "Router HTTP Request Rate by Code",
|
||||
"description": "Requires HAProxy router metrics to be scraped (port 1936). OKD exposes these via the openshift-ingress ServiceMonitor.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(code)(rate(haproxy_backend_http_responses_total[5m]))",
|
||||
"refId": "A", "legendFormat": "HTTP {{code}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "HTTP 2xx" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "HTTP 4xx" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "HTTP 5xx" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 75 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 34, "type": "timeseries", "title": "Router 4xx + 5xx Error Rate (%)",
|
||||
"description": "Client error (4xx) and server error (5xx) rates as a percentage of all requests.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(haproxy_backend_http_responses_total{code=\"4xx\"}[5m])) / sum(rate(haproxy_backend_http_responses_total[5m])) * 100",
|
||||
"refId": "A", "legendFormat": "4xx %"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(haproxy_backend_http_responses_total{code=\"5xx\"}[5m])) / sum(rate(haproxy_backend_http_responses_total[5m])) * 100",
|
||||
"refId": "B", "legendFormat": "5xx %"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 5 }
|
||||
]}
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "4xx %" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "5xx %" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 75 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 35, "type": "timeseries", "title": "Router Bytes In / Out",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(haproxy_frontend_bytes_in_total[5m]))",
|
||||
"refId": "A", "legendFormat": "Bytes In"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(haproxy_frontend_bytes_out_total[5m]))",
|
||||
"refId": "B", "legendFormat": "Bytes Out"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "Bytes In" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Bytes Out" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 83 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 36,
|
||||
"type": "table",
|
||||
"title": "Router Backend Server Status",
|
||||
"description": "HAProxy backend servers (routes). Value 0 = DOWN, 1 = UP.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "haproxy_server_up",
|
||||
"refId": "A", "instant": true, "format": "table", "legendFormat": ""
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "filterFieldsByName",
|
||||
"options": { "include": { "names": ["proxy", "server", "Value"] } }
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {},
|
||||
"renameByName": {
|
||||
"proxy": "Backend",
|
||||
"server": "Server",
|
||||
"Value": "Status"
|
||||
},
|
||||
"indexByName": { "proxy": 0, "server": 1, "Value": 2 }
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "sortBy",
|
||||
"options": { "fields": [{ "displayName": "Status", "desc": false }] }
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Backend" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 260 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Server" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 180 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Status" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "mappings", "value": [
|
||||
{ "type": "value", "options": { "0": { "text": "DOWN", "color": "red" } } },
|
||||
{ "type": "value", "options": { "1": { "text": "UP", "color": "green" } } }
|
||||
]},
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "green", "value": 1 }
|
||||
]}}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 83 }
|
||||
}
|
||||
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,607 @@
|
||||
apiVersion: grafana.integreatly.org/v1beta1
|
||||
kind: GrafanaDashboard
|
||||
metadata:
|
||||
name: storage-health
|
||||
namespace: observability
|
||||
spec:
|
||||
instanceSelector:
|
||||
matchLabels:
|
||||
dashboards: "grafana"
|
||||
|
||||
json: |
|
||||
{
|
||||
"title": "Storage Health",
|
||||
"uid": "storage-health",
|
||||
"schemaVersion": 36,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"panels": [
|
||||
|
||||
{
|
||||
"type": "row",
|
||||
"id": 1,
|
||||
"title": "PVC / PV Status",
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 2,
|
||||
"title": "Bound PVCs",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Bound\"}) or vector(0)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 4, "x": 0, "y": 1 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 3,
|
||||
"title": "Pending PVCs",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Pending\"}) or vector(0)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 4, "x": 4, "y": 1 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 4,
|
||||
"title": "Lost PVCs",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Lost\"}) or vector(0)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 4, "x": 8, "y": 1 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 5,
|
||||
"title": "Bound PVs / Available PVs",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(kube_persistentvolume_status_phase{phase=\"Bound\"}) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": "Bound"
|
||||
},
|
||||
{
|
||||
"expr": "sum(kube_persistentvolume_status_phase{phase=\"Available\"}) or vector(0)",
|
||||
"refId": "B",
|
||||
"legendFormat": "Available"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "blue", "value": null }]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 4, "x": 12, "y": 1 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 6,
|
||||
"title": "Ceph Cluster Health",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "ceph_health_status",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 2 }
|
||||
]
|
||||
},
|
||||
"mappings": [
|
||||
{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"0": { "text": "HEALTH_OK", "index": 0 },
|
||||
"1": { "text": "HEALTH_WARN", "index": 1 },
|
||||
"2": { "text": "HEALTH_ERR", "index": 2 }
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"textMode": "value"
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 4, "x": 16, "y": 1 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 7,
|
||||
"title": "OSDs Up / Total",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(ceph_osd_up) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": "Up"
|
||||
},
|
||||
{
|
||||
"expr": "count(ceph_osd_metadata) or vector(0)",
|
||||
"refId": "B",
|
||||
"legendFormat": "Total"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 4, "x": 20, "y": 1 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "row",
|
||||
"id": 8,
|
||||
"title": "Cluster Capacity",
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 6 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "gauge",
|
||||
"id": 9,
|
||||
"title": "Ceph Cluster Used (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes) / ceph_cluster_total_bytes",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 70 },
|
||||
{ "color": "red", "value": 85 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"showThresholdLabels": true,
|
||||
"showThresholdMarkers": true
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 5, "x": 0, "y": 7 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 10,
|
||||
"title": "Ceph Capacity — Total / Available",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "ceph_cluster_total_bytes",
|
||||
"refId": "A",
|
||||
"legendFormat": "Total"
|
||||
},
|
||||
{
|
||||
"expr": "ceph_cluster_total_bytes - (ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes)",
|
||||
"refId": "B",
|
||||
"legendFormat": "Available"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes",
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "blue", "value": null }]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "value",
|
||||
"graphMode": "none",
|
||||
"textMode": "auto",
|
||||
"orientation": "vertical"
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 4, "x": 5, "y": 7 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "bargauge",
|
||||
"id": 11,
|
||||
"title": "PV Allocated Capacity by Storage Class (Bound)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by (storageclass) (\n kube_persistentvolume_capacity_bytes\n * on(persistentvolume) group_left(storageclass)\n kube_persistentvolume_status_phase{phase=\"Bound\"}\n)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{storageclass}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes",
|
||||
"color": { "mode": "palette-classic" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "blue", "value": null }]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 7, "x": 9, "y": 7 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "piechart",
|
||||
"id": 12,
|
||||
"title": "PVC Phase Distribution",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Bound\"}) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": "Bound"
|
||||
},
|
||||
{
|
||||
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Pending\"}) or vector(0)",
|
||||
"refId": "B",
|
||||
"legendFormat": "Pending"
|
||||
},
|
||||
{
|
||||
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Lost\"}) or vector(0)",
|
||||
"refId": "C",
|
||||
"legendFormat": "Lost"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "color": { "mode": "palette-classic" } }
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"pieType": "pie",
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"values": ["value", "percent"]
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 7 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "row",
|
||||
"id": 13,
|
||||
"title": "Ceph Performance",
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 15 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "timeseries",
|
||||
"id": 14,
|
||||
"title": "Ceph Pool IOPS (Read / Write)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(ceph_pool_rd[5m])",
|
||||
"refId": "A",
|
||||
"legendFormat": "Read — pool {{pool_id}}"
|
||||
},
|
||||
{
|
||||
"expr": "rate(ceph_pool_wr[5m])",
|
||||
"refId": "B",
|
||||
"legendFormat": "Write — pool {{pool_id}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ops",
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 8 }
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "timeseries",
|
||||
"id": 15,
|
||||
"title": "Ceph Pool Throughput (Read / Write)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(ceph_pool_rd_bytes[5m])",
|
||||
"refId": "A",
|
||||
"legendFormat": "Read — pool {{pool_id}}"
|
||||
},
|
||||
{
|
||||
"expr": "rate(ceph_pool_wr_bytes[5m])",
|
||||
"refId": "B",
|
||||
"legendFormat": "Write — pool {{pool_id}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps",
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 8 }
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "row",
|
||||
"id": 16,
|
||||
"title": "Ceph OSD & Pool Details",
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 24 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "timeseries",
|
||||
"id": 17,
|
||||
"title": "Ceph Pool Space Used (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * ceph_pool_bytes_used / (ceph_pool_bytes_used + ceph_pool_max_avail)",
|
||||
"refId": "A",
|
||||
"legendFormat": "Pool {{pool_id}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 70 },
|
||||
{ "color": "red", "value": 85 }
|
||||
]
|
||||
},
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10 }
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 25 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "bargauge",
|
||||
"id": 18,
|
||||
"title": "OSD Status per Daemon (green = Up, red = Down)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "ceph_osd_up",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{ceph_daemon}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"min": 0,
|
||||
"max": 1,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "green", "value": 1 }
|
||||
]
|
||||
},
|
||||
"mappings": [
|
||||
{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"0": { "text": "DOWN", "index": 0 },
|
||||
"1": { "text": "UP", "index": 1 }
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"displayMode": "basic",
|
||||
"showUnfilled": true
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 25 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "row",
|
||||
"id": 19,
|
||||
"title": "Node Disk Usage",
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 33 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "timeseries",
|
||||
"id": 20,
|
||||
"title": "Node Root Disk Usage Over Time (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} * 100)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 70 },
|
||||
{ "color": "red", "value": 85 }
|
||||
]
|
||||
},
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10 }
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 34 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "bargauge",
|
||||
"id": 21,
|
||||
"title": "Current Disk Usage — All Nodes & Mountpoints",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 - (node_filesystem_avail_bytes{fstype!~\"tmpfs|overlay|squashfs\"} / node_filesystem_size_bytes{fstype!~\"tmpfs|overlay|squashfs\"} * 100)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}} — {{mountpoint}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 70 },
|
||||
{ "color": "red", "value": 85 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 34 }
|
||||
}
|
||||
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,744 @@
|
||||
apiVersion: grafana.integreatly.org/v1beta1
|
||||
kind: GrafanaDashboard
|
||||
metadata:
|
||||
name: okd-etcd
|
||||
namespace: observability
|
||||
spec:
|
||||
instanceSelector:
|
||||
matchLabels:
|
||||
dashboards: "grafana"
|
||||
json: |
|
||||
{
|
||||
"title": "etcd",
|
||||
"uid": "okd-etcd",
|
||||
"schemaVersion": 36,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"tags": ["okd", "etcd"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "instance",
|
||||
"type": "query",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"query": { "query": "label_values(etcd_server_has_leader, instance)", "refId": "A" },
|
||||
"refresh": 2,
|
||||
"includeAll": true,
|
||||
"multi": true,
|
||||
"allValue": ".*",
|
||||
"label": "Instance",
|
||||
"sort": 1,
|
||||
"current": {},
|
||||
"options": []
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
|
||||
{
|
||||
"id": 1, "type": "stat", "title": "Cluster Members",
|
||||
"description": "Total number of etcd members currently reporting metrics.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(etcd_server_has_leader)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "green", "value": 3 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 2, "type": "stat", "title": "Has Leader",
|
||||
"description": "min() across all members. 0 = at least one member has no quorum — cluster is degraded.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "min(etcd_server_has_leader)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "green", "value": 1 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0",
|
||||
"mappings": [
|
||||
{ "type": "value", "options": {
|
||||
"0": { "text": "NO LEADER", "color": "red" },
|
||||
"1": { "text": "OK", "color": "green" }
|
||||
}}
|
||||
]
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 3, "type": "stat", "title": "Leader Changes (1h)",
|
||||
"description": "Number of leader elections in the last hour. ≥3 indicates cluster instability.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "sum(changes(etcd_server_leader_changes_seen_total[1h]))", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 3 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 4, "type": "stat", "title": "DB Size (Max)",
|
||||
"description": "Largest boltdb file size across all members. Default etcd quota is 8 GiB.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "max(etcd_mvcc_db_total_size_in_bytes)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 2147483648 },
|
||||
{ "color": "orange", "value": 5368709120 },
|
||||
{ "color": "red", "value": 7516192768 }
|
||||
]},
|
||||
"unit": "bytes", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 5, "type": "stat", "title": "DB Fragmentation (Max)",
|
||||
"description": "% of DB space that is allocated but unused. >50% → run etcdctl defrag.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "max((etcd_mvcc_db_total_size_in_bytes - etcd_mvcc_db_total_size_in_use_in_bytes) / etcd_mvcc_db_total_size_in_bytes * 100)",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 25 },
|
||||
{ "color": "orange", "value": 50 },
|
||||
{ "color": "red", "value": 75 }
|
||||
]},
|
||||
"unit": "percent", "noValue": "0", "decimals": 1
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 6, "type": "stat", "title": "Failed Proposals/s",
|
||||
"description": "Rate of rejected Raft proposals. Any sustained non-zero value = cluster health problem.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "sum(rate(etcd_server_proposals_failed_total[5m]))", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 0.001 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0", "decimals": 3
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 7, "type": "stat", "title": "WAL Fsync p99",
|
||||
"description": "99th percentile WAL flush-to-disk time. >10ms is concerning; >100ms = serious I/O bottleneck.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) by (le))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 0.01 },
|
||||
{ "color": "orange", "value": 0.1 },
|
||||
{ "color": "red", "value": 0.5 }
|
||||
]},
|
||||
"unit": "s", "noValue": "0", "decimals": 4
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 8, "type": "stat", "title": "Backend Commit p99",
|
||||
"description": "99th percentile boltdb commit time. >25ms = warning; >100ms = critical backend I/O pressure.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) by (le))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 0.025 },
|
||||
{ "color": "orange", "value": 0.1 },
|
||||
{ "color": "red", "value": 0.25 }
|
||||
]},
|
||||
"unit": "s", "noValue": "0", "decimals": 4
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 9, "type": "row", "title": "Cluster Health", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 10, "type": "timeseries", "title": "Has Leader per Instance",
|
||||
"description": "1 = member has a leader; 0 = member lost quorum. A dip to 0 marks the exact moment of a leader election.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "etcd_server_has_leader{instance=~\"$instance\"}",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0, "max": 1.1,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 20, "showPoints": "never", "spanNulls": false },
|
||||
"mappings": [
|
||||
{ "type": "value", "options": {
|
||||
"0": { "text": "0 — no leader" },
|
||||
"1": { "text": "1 — ok" }
|
||||
}}
|
||||
]
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "none" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": [] }
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 8, "x": 0, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 11, "type": "timeseries", "title": "Leader Changes (cumulative)",
|
||||
"description": "Monotonically increasing counter per member. A step jump = one leader election. Correlated jumps across members = cluster-wide event.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "etcd_server_leader_changes_seen_total{instance=~\"$instance\"}",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "auto", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "none" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull"] }
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 8, "x": 8, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 12, "type": "timeseries", "title": "Slow Operations",
|
||||
"description": "slow_apply: proposals applied slower than expected. slow_read_index: linearizable reads timing out. heartbeat_failures: Raft heartbeat send errors (network partition indicator).",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "rate(etcd_server_slow_apply_total{instance=~\"$instance\"}[5m])", "refId": "A", "legendFormat": "Slow Apply — {{instance}}" },
|
||||
{ "expr": "rate(etcd_server_slow_read_indexes_total{instance=~\"$instance\"}[5m])", "refId": "B", "legendFormat": "Slow Read Index — {{instance}}" },
|
||||
{ "expr": "rate(etcd_server_heartbeat_send_failures_total{instance=~\"$instance\"}[5m])", "refId": "C", "legendFormat": "Heartbeat Failures — {{instance}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 8, "x": 16, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 13, "type": "row", "title": "gRPC Traffic", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 11 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 14, "type": "timeseries", "title": "gRPC Request Rate by Method",
|
||||
"description": "Unary calls/s per RPC method. High Put/Txn = heavy write load. High Range = heavy read load. High Watch = many controller watchers.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(grpc_method)(rate(grpc_server_started_total{job=~\".*etcd.*\",grpc_type=\"unary\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{grpc_method}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 12 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 15, "type": "timeseries", "title": "gRPC Error Rate by Status Code",
|
||||
"description": "Non-OK responses by gRPC status code. RESOURCE_EXHAUSTED = overloaded. UNAVAILABLE = leader election. DEADLINE_EXCEEDED = latency spike.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(grpc_code)(rate(grpc_server_handled_total{job=~\".*etcd.*\",grpc_code!=\"OK\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{grpc_code}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 12 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 16, "type": "timeseries", "title": "gRPC Request Latency (p50 / p95 / p99)",
|
||||
"description": "Unary call handling duration. p99 > 100ms for Put/Txn indicates disk or CPU pressure. p99 > 500ms will cause kube-apiserver timeouts.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum(rate(grpc_server_handling_seconds_bucket{job=~\".*etcd.*\",grpc_type=\"unary\"}[5m])) by (le))", "refId": "A", "legendFormat": "p50" },
|
||||
{ "expr": "histogram_quantile(0.95, sum(rate(grpc_server_handling_seconds_bucket{job=~\".*etcd.*\",grpc_type=\"unary\"}[5m])) by (le))", "refId": "B", "legendFormat": "p95" },
|
||||
{ "expr": "histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~\".*etcd.*\",grpc_type=\"unary\"}[5m])) by (le))", "refId": "C", "legendFormat": "p99" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 12 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 17, "type": "row", "title": "Raft Proposals", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 20 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 18, "type": "timeseries", "title": "Proposals Committed vs Applied",
|
||||
"description": "Committed = agreed by Raft quorum. Applied = persisted to boltdb. A widening gap between the two = backend apply backlog (disk too slow to keep up).",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "rate(etcd_server_proposals_committed_total{instance=~\"$instance\"}[5m])", "refId": "A", "legendFormat": "Committed — {{instance}}" },
|
||||
{ "expr": "rate(etcd_server_proposals_applied_total{instance=~\"$instance\"}[5m])", "refId": "B", "legendFormat": "Applied — {{instance}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 0, "y": 21 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 19, "type": "timeseries", "title": "Proposals Pending",
|
||||
"description": "In-flight Raft proposals not yet committed. Consistently high (>5) = cluster cannot keep up with write throughput.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "etcd_server_proposals_pending{instance=~\"$instance\"}",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false,
|
||||
"thresholdsStyle": { "mode": "line+area" }
|
||||
},
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 5 },
|
||||
{ "color": "red", "value": 10 }
|
||||
]}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 8, "y": 21 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 20, "type": "timeseries", "title": "Failed Proposals Rate",
|
||||
"description": "Raft proposals that were rejected. Root causes: quorum loss, leader timeout, network partition between members.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "rate(etcd_server_proposals_failed_total{instance=~\"$instance\"}[5m])",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2, "fillOpacity": 20, "showPoints": "never", "spanNulls": false,
|
||||
"thresholdsStyle": { "mode": "line" }
|
||||
},
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 0.001 }
|
||||
]}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 16, "y": 21 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 21, "type": "row", "title": "Disk I/O", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 28 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 22, "type": "timeseries", "title": "WAL Fsync Duration (p50 / p95 / p99) per Instance",
|
||||
"description": "Time to flush the write-ahead log to disk. etcd is extremely sensitive to WAL latency. >10ms p99 = storage is the bottleneck. Correlates directly with Raft commit latency.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum by(le,instance)(rate(etcd_disk_wal_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "A", "legendFormat": "p50 — {{instance}}" },
|
||||
{ "expr": "histogram_quantile(0.95, sum by(le,instance)(rate(etcd_disk_wal_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "B", "legendFormat": "p95 — {{instance}}" },
|
||||
{ "expr": "histogram_quantile(0.99, sum by(le,instance)(rate(etcd_disk_wal_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "C", "legendFormat": "p99 — {{instance}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 29 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 23, "type": "timeseries", "title": "Backend Commit Duration (p50 / p95 / p99) per Instance",
|
||||
"description": "Time for boltdb to commit a batch transaction. A spike here while WAL is healthy = backend I/O saturation or boltdb lock contention. Triggers apply backlog.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum by(le,instance)(rate(etcd_disk_backend_commit_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "A", "legendFormat": "p50 — {{instance}}" },
|
||||
{ "expr": "histogram_quantile(0.95, sum by(le,instance)(rate(etcd_disk_backend_commit_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "B", "legendFormat": "p95 — {{instance}}" },
|
||||
{ "expr": "histogram_quantile(0.99, sum by(le,instance)(rate(etcd_disk_backend_commit_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "C", "legendFormat": "p99 — {{instance}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 29 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 24, "type": "row", "title": "Network (Peer & Client)", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 37 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 25, "type": "timeseries", "title": "Peer RX Rate",
|
||||
"description": "Bytes received from Raft peers (log replication + heartbeats). A burst during a quiet period = large snapshot being streamed to a recovering member.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "rate(etcd_network_peer_received_bytes_total{instance=~\"$instance\"}[5m])",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 6, "x": 0, "y": 38 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 26, "type": "timeseries", "title": "Peer TX Rate",
|
||||
"description": "Bytes sent to Raft peers. Leader will have higher TX than followers (it replicates entries to all members).",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "rate(etcd_network_peer_sent_bytes_total{instance=~\"$instance\"}[5m])",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 6, "x": 6, "y": 38 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 27, "type": "timeseries", "title": "Client gRPC Received",
|
||||
"description": "Bytes received from API clients (kube-apiserver, operators). Spike = large write burst from controllers or kubectl apply.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "rate(etcd_network_client_grpc_received_bytes_total{instance=~\"$instance\"}[5m])",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 6, "x": 12, "y": 38 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 28, "type": "timeseries", "title": "Client gRPC Sent",
|
||||
"description": "Bytes sent to API clients (responses + watch events). Persistently high = many active Watch streams or large objects being served.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "rate(etcd_network_client_grpc_sent_bytes_total{instance=~\"$instance\"}[5m])",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 6, "x": 18, "y": 38 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 29, "type": "row", "title": "DB Size & Process Resources", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 45 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 30, "type": "timeseries", "title": "DB Total vs In-Use Size per Instance",
|
||||
"description": "Total = allocated boltdb file size. In Use = live key data. The gap between them = fragmentation. Steady growth of Total = compaction not keeping up with key churn.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "etcd_mvcc_db_total_size_in_bytes{instance=~\"$instance\"}", "refId": "A", "legendFormat": "Total — {{instance}}" },
|
||||
{ "expr": "etcd_mvcc_db_total_size_in_use_in_bytes{instance=~\"$instance\"}", "refId": "B", "legendFormat": "In Use — {{instance}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 46 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 31, "type": "timeseries", "title": "Process Resident Memory (RSS)",
|
||||
"description": "Physical RAM consumed by the etcd process. Monotonically growing RSS = memory leak or oversized watch cache. Typical healthy range: 500 MiB–2 GiB depending on cluster size.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "etcd_process_resident_memory_bytes{instance=~\"$instance\"}",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 46 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 32, "type": "timeseries", "title": "Open File Descriptors vs Limit",
|
||||
"description": "Open FD count (solid) and process FD limit (dashed). Approaching the limit will cause WAL file creation and new client connections to fail.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "etcd_process_open_fds{instance=~\"$instance\"}", "refId": "A", "legendFormat": "Open — {{instance}}" },
|
||||
{ "expr": "etcd_process_max_fds{instance=~\"$instance\"}", "refId": "B", "legendFormat": "Limit — {{instance}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byRegexp", "options": "^Limit.*" },
|
||||
"properties": [
|
||||
{ "id": "custom.lineWidth", "value": 1 },
|
||||
{ "id": "custom.lineStyle", "value": { "fill": "dash", "dash": [6, 4] } },
|
||||
{ "id": "custom.fillOpacity","value": 0 }
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 46 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 33, "type": "row", "title": "Snapshots", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 54 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 34, "type": "timeseries", "title": "Snapshot Save Duration (p50 / p95 / p99)",
|
||||
"description": "Time to write a full snapshot of the boltdb to disk. Slow saves delay Raft log compaction, causing the WAL to grow unboundedly and members to fall further behind.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum by(le)(rate(etcd_debugging_snap_save_total_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "A", "legendFormat": "p50" },
|
||||
{ "expr": "histogram_quantile(0.95, sum by(le)(rate(etcd_debugging_snap_save_total_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "B", "legendFormat": "p95" },
|
||||
{ "expr": "histogram_quantile(0.99, sum by(le)(rate(etcd_debugging_snap_save_total_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "C", "legendFormat": "p99" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 12, "x": 0, "y": 55 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 35, "type": "timeseries", "title": "Snapshot DB Fsync Duration (p50 / p95 / p99)",
|
||||
"description": "Time to fsync the snapshot file itself. Distinct from WAL fsync: this is flushing the entire boltdb copy to disk after a snapshot is taken.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum by(le)(rate(etcd_snap_db_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "A", "legendFormat": "p50" },
|
||||
{ "expr": "histogram_quantile(0.95, sum by(le)(rate(etcd_snap_db_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "B", "legendFormat": "p95" },
|
||||
{ "expr": "histogram_quantile(0.99, sum by(le)(rate(etcd_snap_db_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "C", "legendFormat": "p99" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 12, "x": 12, "y": 55 }
|
||||
}
|
||||
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,752 @@
|
||||
apiVersion: grafana.integreatly.org/v1beta1
|
||||
kind: GrafanaDashboard
|
||||
metadata:
|
||||
name: okd-control-plane-health
|
||||
namespace: observability
|
||||
spec:
|
||||
instanceSelector:
|
||||
matchLabels:
|
||||
dashboards: "grafana"
|
||||
json: |
|
||||
{
|
||||
"title": "Control Plane Health",
|
||||
"uid": "okd-control-plane",
|
||||
"schemaVersion": 36,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"tags": ["okd", "control-plane"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "instance",
|
||||
"type": "query",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"query": { "query": "label_values(apiserver_request_total, instance)", "refId": "A" },
|
||||
"refresh": 2,
|
||||
"includeAll": true,
|
||||
"multi": true,
|
||||
"allValue": ".*",
|
||||
"label": "API Server Instance",
|
||||
"sort": 1,
|
||||
"current": {},
|
||||
"options": []
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
|
||||
{
|
||||
"id": 1, "type": "stat", "title": "API Servers Up",
|
||||
"description": "Number of kube-apiserver instances currently scraped and up. Healthy HA cluster = 3.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(up{job=~\".*apiserver.*\"} == 1)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "green", "value": 3 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 2, "type": "stat", "title": "Controller Managers Up",
|
||||
"description": "kube-controller-manager instances up. In OKD only one holds the leader lease at a time; others are hot standbys.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(up{job=~\".*controller-manager.*\"} == 1)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "green", "value": 3 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 3, "type": "stat", "title": "Schedulers Up",
|
||||
"description": "kube-scheduler instances up. One holds the leader lease; rest are standbys. 0 = no scheduling of new pods.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(up{job=~\".*scheduler.*\"} == 1)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "green", "value": 3 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 4, "type": "stat", "title": "API 5xx Rate",
|
||||
"description": "Server-side errors (5xx) across all apiserver instances per second. Any sustained non-zero value = apiserver internal fault.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "sum(rate(apiserver_request_total{code=~\"5..\"}[5m]))", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 0.01 },
|
||||
{ "color": "red", "value": 1 }
|
||||
]},
|
||||
"unit": "reqps", "noValue": "0", "decimals": 3
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 5, "type": "stat", "title": "Inflight — Mutating",
|
||||
"description": "Current in-flight mutating requests (POST/PUT/PATCH/DELETE). Default OKD limit is ~1000. Hitting the limit = 429 errors for writes.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "sum(apiserver_current_inflight_requests{request_kind=\"mutating\"})", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 500 },
|
||||
{ "color": "orange", "value": 750 },
|
||||
{ "color": "red", "value": 900 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 6, "type": "stat", "title": "Inflight — Read-Only",
|
||||
"description": "Current in-flight non-mutating requests (GET/LIST/WATCH). Default OKD limit is ~3000. Hitting it = 429 for reads, impacting controllers and kubectl.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "sum(apiserver_current_inflight_requests{request_kind=\"readOnly\"})", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1500 },
|
||||
{ "color": "orange", "value": 2200 },
|
||||
{ "color": "red", "value": 2700 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 7, "type": "stat", "title": "API Request p99 (non-WATCH)",
|
||||
"description": "Overall p99 latency for all non-streaming verbs. >1s = noticeable kubectl sluggishness. >10s = controllers timing out on LIST/GET.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{verb!~\"WATCH|CONNECT\"}[5m])) by (le))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 0.5 },
|
||||
{ "color": "orange", "value": 1 },
|
||||
{ "color": "red", "value": 5 }
|
||||
]},
|
||||
"unit": "s", "noValue": "0", "decimals": 3
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 8, "type": "stat", "title": "APIServer → etcd p99",
|
||||
"description": "p99 time apiserver spends waiting on etcd calls. Spike here while WAL fsync is healthy = serialization or large object overhead.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(apiserver_storage_request_duration_seconds_bucket[5m])) by (le))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 0.05 },
|
||||
{ "color": "orange", "value": 0.2 },
|
||||
{ "color": "red", "value": 0.5 }
|
||||
]},
|
||||
"unit": "s", "noValue": "0", "decimals": 4
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 9, "type": "row", "title": "API Server — Request Rates & Errors", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 10, "type": "timeseries", "title": "Request Rate by Verb",
|
||||
"description": "Non-streaming calls per second broken down by verb. GET/LIST = read load from controllers. POST/PUT/PATCH/DELETE = write throughput. A sudden LIST spike = controller cache resync storm.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(verb)(rate(apiserver_request_total{instance=~\"$instance\",verb!~\"WATCH|CONNECT\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{verb}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 11, "type": "timeseries", "title": "Error Rate by HTTP Status Code",
|
||||
"description": "4xx/5xx responses per second by code. 429 = inflight limit hit (throttling). 422 = admission rejection or invalid object. 500/503 = internal apiserver fault or etcd unavailability.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(code)(rate(apiserver_request_total{instance=~\"$instance\",code=~\"[45]..\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "HTTP {{code}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 12, "type": "timeseries", "title": "In-Flight Requests — Mutating vs Read-Only",
|
||||
"description": "Instantaneous count of requests being actively handled. The two series correspond to the two inflight limit buckets enforced by the apiserver's Priority and Fairness (APF) or legacy inflight settings.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "sum by(request_kind)(apiserver_current_inflight_requests{instance=~\"$instance\"})", "refId": "A", "legendFormat": "{{request_kind}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 20, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 13, "type": "row", "title": "API Server — Latency", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 14, "type": "timeseries", "title": "Request Latency — p50 / p95 / p99 (non-WATCH)",
|
||||
"description": "Aggregated end-to-end request duration across all verbs except WATCH/CONNECT (which are unbounded streaming). A rising p99 without a matching rise in etcd latency = CPU saturation, admission webhook slowness, or serialization overhead.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum(rate(apiserver_request_duration_seconds_bucket{instance=~\"$instance\",verb!~\"WATCH|CONNECT\"}[5m])) by (le))", "refId": "A", "legendFormat": "p50" },
|
||||
{ "expr": "histogram_quantile(0.95, sum(rate(apiserver_request_duration_seconds_bucket{instance=~\"$instance\",verb!~\"WATCH|CONNECT\"}[5m])) by (le))", "refId": "B", "legendFormat": "p95" },
|
||||
{ "expr": "histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{instance=~\"$instance\",verb!~\"WATCH|CONNECT\"}[5m])) by (le))", "refId": "C", "legendFormat": "p99" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 14 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 15, "type": "timeseries", "title": "Request p99 Latency by Verb",
|
||||
"description": "p99 latency broken out per verb. LIST is inherently slower than GET due to serializing full collections. A POST/PUT spike = heavy admission webhook chain or large object writes. DELETE spikes are usually caused by cascading GC finalizer storms.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "histogram_quantile(0.99, sum by(verb,le)(rate(apiserver_request_duration_seconds_bucket{instance=~\"$instance\",verb!~\"WATCH|CONNECT\"}[5m])))",
|
||||
"refId": "A", "legendFormat": "{{verb}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 14 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 16, "type": "timeseries", "title": "APIServer → etcd Latency by Operation",
|
||||
"description": "Time apiserver spends waiting on etcd, split by operation type (get, list, create, update, delete, watch). Elevated get/list = etcd read pressure. Elevated create/update = write bottleneck, likely correlated with WAL fsync latency.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum by(operation,le)(rate(apiserver_storage_request_duration_seconds_bucket[5m])))", "refId": "A", "legendFormat": "p50 — {{operation}}" },
|
||||
{ "expr": "histogram_quantile(0.99, sum by(operation,le)(rate(apiserver_storage_request_duration_seconds_bucket[5m])))", "refId": "B", "legendFormat": "p99 — {{operation}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 14 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 17, "type": "row", "title": "API Server — Watches & Long-Running Requests", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 22 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 18, "type": "timeseries", "title": "Active Long-Running Requests (Watches) by Resource",
|
||||
"description": "Instantaneous count of open WATCH streams grouped by resource. Each controller typically holds one WATCH per resource type per apiserver instance. A sudden drop = controller restart; a runaway climb = operator creating watches without cleanup.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(resource)(apiserver_longrunning_requests{instance=~\"$instance\",verb=\"WATCH\"})",
|
||||
"refId": "A", "legendFormat": "{{resource}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 0, "y": 23 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 19, "type": "timeseries", "title": "Watch Events Dispatched Rate by Kind",
|
||||
"description": "Watch events sent to all active watchers per second, by object kind. Persistent high rate for a specific kind = that resource type is churning heavily, increasing etcd load and controller reconcile frequency.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(kind)(rate(apiserver_watch_events_total{instance=~\"$instance\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{kind}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 8, "y": 23 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 20, "type": "timeseries", "title": "Watch Event Size — p50 / p95 / p99 by Kind",
|
||||
"description": "Size of individual watch events dispatched to clients. Large events (MiB-scale) for Secrets or ConfigMaps = objects being stored with oversized data. Contributes to apiserver memory pressure and network saturation.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum by(kind,le)(rate(apiserver_watch_events_sizes_bucket{instance=~\"$instance\"}[5m])))", "refId": "A", "legendFormat": "p50 — {{kind}}" },
|
||||
{ "expr": "histogram_quantile(0.99, sum by(kind,le)(rate(apiserver_watch_events_sizes_bucket{instance=~\"$instance\"}[5m])))", "refId": "B", "legendFormat": "p99 — {{kind}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 16, "y": 23 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 21, "type": "row", "title": "Admission Webhooks", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 30 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 22, "type": "timeseries", "title": "Webhook Call Rate by Name",
|
||||
"description": "Mutating and validating admission webhook invocations per second by webhook name. A webhook invoked on every write (e.g., a mutating webhook with no object selector) can be a major source of write latency amplification.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(name,type)(rate(apiserver_admission_webhook_request_total{instance=~\"$instance\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{type}} — {{name}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 0, "y": 31 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 23, "type": "timeseries", "title": "Webhook Latency p99 by Name",
|
||||
"description": "p99 round-trip time per webhook call (network + webhook server processing). Default apiserver timeout is 10s; a webhook consistently near that limit causes cascading write latency for all resources it intercepts.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "histogram_quantile(0.99, sum by(name,le)(rate(apiserver_admission_webhook_admission_duration_seconds_bucket{instance=~\"$instance\"}[5m])))",
|
||||
"refId": "A", "legendFormat": "{{name}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 0.5 },
|
||||
{ "color": "red", "value": 2.0 }
|
||||
]}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 8, "y": 31 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 24, "type": "timeseries", "title": "Webhook Rejection Rate by Name",
|
||||
    "description": "Rate of admission denials per webhook. A validating webhook rejecting requests is expected behavior; a sudden surge indicates either a newly enforced policy or a misbehaving webhook rejecting valid objects.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(name,error_type)(rate(apiserver_admission_webhook_rejection_count{instance=~\"$instance\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{name}} ({{error_type}})"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 16, "y": 31 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 25, "type": "row", "title": "kube-controller-manager", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 38 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 26, "type": "timeseries", "title": "Work Queue Depth by Controller",
|
||||
"description": "Items waiting to be reconciled in each controller's work queue. Persistent non-zero depth = controller cannot keep up with the event rate. Identifies which specific controller is the bottleneck during overload incidents.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "topk(15, sum by(name)(workqueue_depth{job=~\".*controller-manager.*\"}))",
|
||||
"refId": "A", "legendFormat": "{{name}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false,
|
||||
"thresholdsStyle": { "mode": "line" }
|
||||
},
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 10 },
|
||||
{ "color": "red", "value": 50 }
|
||||
]}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 39 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 27, "type": "timeseries", "title": "Work Queue Item Processing Duration p99 by Controller",
|
||||
"description": "p99 time a work item spends being actively reconciled (inside the reconcile loop, excludes queue wait time). A slow reconcile = either the controller is doing expensive API calls or the etcd write path is slow.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "histogram_quantile(0.99, sum by(name,le)(rate(workqueue_work_duration_seconds_bucket{job=~\".*controller-manager.*\"}[5m])))",
|
||||
"refId": "A", "legendFormat": "{{name}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 39 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 28, "type": "timeseries", "title": "Work Queue Retry Rate by Controller",
|
||||
"description": "Rate of items being re-queued after a failed reconciliation. A persistently high retry rate for a controller = it is encountering recurring errors on the same objects (e.g., API permission errors, webhook rejections, or resource conflicts).",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "topk(15, sum by(name)(rate(workqueue_retries_total{job=~\".*controller-manager.*\"}[5m])))",
|
||||
"refId": "A", "legendFormat": "{{name}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 39 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 29, "type": "row", "title": "kube-scheduler", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 47 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 30, "type": "timeseries", "title": "Scheduling Attempt Rate by Result",
|
||||
"description": "Outcomes of scheduling cycles per second. scheduled = pod successfully bound to a node. unschedulable = no node met the pod's constraints. error = scheduler internal failure (API error, timeout). Persistent unschedulable = cluster capacity or taints/affinity misconfiguration.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(result)(rate(scheduler_schedule_attempts_total[5m]))",
|
||||
"refId": "A", "legendFormat": "{{result}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "scheduled" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "unschedulable" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "error" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 0, "y": 48 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 31, "type": "timeseries", "title": "Scheduling Latency — p50 / p95 / p99",
|
||||
"description": "Time from when a pod enters the active queue to when a binding decision is made (does not include bind API call time). Includes filter, score, and reserve plugin execution time. Spike = expensive affinity rules, large number of nodes, or slow extender webhooks.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket[5m])) by (le))", "refId": "A", "legendFormat": "p50" },
|
||||
{ "expr": "histogram_quantile(0.95, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket[5m])) by (le))", "refId": "B", "legendFormat": "p95" },
|
||||
{ "expr": "histogram_quantile(0.99, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket[5m])) by (le))", "refId": "C", "legendFormat": "p99" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 8, "y": 48 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 32, "type": "timeseries", "title": "Pending Pods by Queue",
|
||||
"description": "Pods waiting to be scheduled, split by internal queue. active = ready to be attempted now. backoff = recently failed, in exponential back-off. unschedulable = parked until cluster state changes. A growing unschedulable queue = systemic capacity or constraint problem.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(queue)(scheduler_pending_pods)",
|
||||
"refId": "A", "legendFormat": "{{queue}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false,
|
||||
"thresholdsStyle": { "mode": "line" }
|
||||
},
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 10 },
|
||||
{ "color": "red", "value": 50 }
|
||||
]}
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "unschedulable" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "backoff" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "active" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 16, "y": 48 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 33, "type": "row", "title": "Process Resources — All Control Plane Components", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 55 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 34, "type": "timeseries", "title": "CPU Usage by Component",
|
||||
"description": "Rate of CPU seconds consumed by each control plane process. apiserver CPU spike = surge in request volume or list serialization. controller-manager CPU spike = reconcile storm. scheduler CPU spike = large node count with complex affinity.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "sum by(job)(rate(process_cpu_seconds_total{job=~\".*apiserver.*\"}[5m]))", "refId": "A", "legendFormat": "apiserver — {{job}}" },
|
||||
{ "expr": "sum by(job)(rate(process_cpu_seconds_total{job=~\".*controller-manager.*\"}[5m]))", "refId": "B", "legendFormat": "controller-manager — {{job}}" },
|
||||
{ "expr": "sum by(job)(rate(process_cpu_seconds_total{job=~\".*scheduler.*\"}[5m]))", "refId": "C", "legendFormat": "scheduler — {{job}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percentunit", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 0, "y": 56 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 35, "type": "timeseries", "title": "RSS Memory by Component",
|
||||
    "description": "Resident set size of each control plane process. apiserver memory is dominated by the watch cache size and serialization buffers. controller-manager memory = informer caches. Monotonically growing RSS without restarts = memory leak or unbounded cache growth.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "sum by(job)(process_resident_memory_bytes{job=~\".*apiserver.*\"})", "refId": "A", "legendFormat": "apiserver — {{job}}" },
|
||||
{ "expr": "sum by(job)(process_resident_memory_bytes{job=~\".*controller-manager.*\"})", "refId": "B", "legendFormat": "controller-manager — {{job}}" },
|
||||
{ "expr": "sum by(job)(process_resident_memory_bytes{job=~\".*scheduler.*\"})", "refId": "C", "legendFormat": "scheduler — {{job}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 8, "y": 56 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 36, "type": "timeseries", "title": "Goroutines by Component",
|
||||
"description": "Number of live goroutines in each control plane process. Gradual upward drift = goroutine leak (often tied to unclosed watch streams or context leaks). A step-down = process restart. apiserver typically runs 200–600 goroutines; spikes above 1000 warrant investigation.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "sum by(job)(go_goroutines{job=~\".*apiserver.*\"})", "refId": "A", "legendFormat": "apiserver — {{job}}" },
|
||||
{ "expr": "sum by(job)(go_goroutines{job=~\".*controller-manager.*\"})", "refId": "B", "legendFormat": "controller-manager — {{job}}" },
|
||||
{ "expr": "sum by(job)(go_goroutines{job=~\".*scheduler.*\"})", "refId": "C", "legendFormat": "scheduler — {{job}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 16, "y": 56 }
|
||||
}
|
||||
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,741 @@
|
||||
apiVersion: grafana.integreatly.org/v1beta1
|
||||
kind: GrafanaDashboard
|
||||
metadata:
|
||||
name: okd-alerts-events
|
||||
namespace: observability
|
||||
spec:
|
||||
instanceSelector:
|
||||
matchLabels:
|
||||
dashboards: "grafana"
|
||||
json: |
|
||||
{
|
||||
"title": "Alerts & Events — Active Problems",
|
||||
"uid": "okd-alerts-events",
|
||||
"schemaVersion": 36,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-3h", "to": "now" },
|
||||
"tags": ["okd", "alerts", "events"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "severity",
|
||||
"type": "custom",
|
||||
"label": "Severity Filter",
|
||||
"query": "critical,warning,info",
|
||||
"current": { "selected": true, "text": "All", "value": "$__all" },
|
||||
"includeAll": true,
|
||||
"allValue": "critical|warning|info",
|
||||
"multi": false,
|
||||
"options": [
|
||||
{ "selected": true, "text": "All", "value": "$__all" },
|
||||
{ "selected": false, "text": "Critical", "value": "critical" },
|
||||
{ "selected": false, "text": "Warning", "value": "warning" },
|
||||
{ "selected": false, "text": "Info", "value": "info" }
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "namespace",
|
||||
"type": "query",
|
||||
"label": "Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"query": { "query": "label_values(ALERTS{alertstate=\"firing\"}, namespace)", "refId": "A" },
|
||||
"refresh": 2,
|
||||
"includeAll": true,
|
||||
"allValue": ".*",
|
||||
"multi": true,
|
||||
"sort": 1,
|
||||
"current": {},
|
||||
"options": []
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
|
||||
{
|
||||
"id": 1, "type": "stat", "title": "Critical Alerts Firing",
|
||||
"description": "Alerting rule instances currently in the firing state with severity=\"critical\". Any non-zero value represents a breached SLO or infrastructure condition requiring immediate on-call response. The ALERTS metric is generated by Prometheus directly from your alerting rules — it reflects what Prometheus knows, before Alertmanager routing or silencing.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(ALERTS{alertstate=\"firing\",severity=\"critical\"}) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 2, "type": "stat", "title": "Warning Alerts Firing",
|
||||
"description": "Firing alerts at severity=\"warning\". Warnings indicate a degraded or elevated-risk condition that has not yet crossed the critical threshold. A sustained or growing warning count often precedes a critical fire — treat them as early-warning signals, not background noise.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(ALERTS{alertstate=\"firing\",severity=\"warning\"}) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "orange", "value": 5 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 3, "type": "stat", "title": "Info / Unclassified Alerts Firing",
|
||||
"description": "Firing alerts with severity=\"info\" or no severity label. These are informational and do not normally require immediate action. A sudden large jump may reveal noisy alerting rules generating alert fatigue — rules worth reviewing for threshold tuning or adding inhibition rules.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(ALERTS{alertstate=\"firing\",severity!~\"critical|warning\"}) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "blue", "value": 1 },
|
||||
{ "color": "blue", "value": 25 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 4, "type": "stat", "title": "Alerts Silenced (Suppressed)",
|
||||
"description": "Alerts currently matched by an active Alertmanager silence rule and therefore not routed to receivers. Silences are intentional during maintenance windows, but a large suppressed count outside of planned maintenance = an overly broad silence masking real problems. Zero silences when a maintenance window is active = the silence has expired or was misconfigured.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "sum(alertmanager_alerts{state=\"suppressed\"}) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 20 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 5, "type": "stat", "title": "CrashLoopBackOff Pods",
|
||||
"description": "Container instances currently waiting in the CrashLoopBackOff state — the container crashed and Kubernetes is retrying with exponential back-off. Each instance is a pod that cannot stay running. Common root causes: OOM kill, bad entrypoint, missing Secret or ConfigMap, an unavailable init dependency, or a broken image layer.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 3 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 6, "type": "stat", "title": "OOMKilled Containers",
|
||||
"description": "Containers whose most recent termination reason was OOMKilled. This is a current-state snapshot: a container that was OOMKilled, restarted, and is now Running will still appear here until its next termination occurs for a different reason. Non-zero and stable = recurring OOM, likely a workload memory leak or under-provisioned memory limit.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_pod_container_status_last_terminated_reason{reason=\"OOMKilled\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "orange", "value": 1 },
|
||||
{ "color": "red", "value": 5 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 7, "type": "stat", "title": "NotReady Nodes",
|
||||
"description": "Nodes where the Ready condition is currently not True (False or Unknown). A NotReady node stops receiving new pod scheduling and, after the node eviction timeout (~5 min default), pods on it will be evicted. Control plane nodes going NotReady simultaneously = potential quorum loss. Any non-zero value is a tier-1 incident signal.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"true\"} == 0) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 8, "type": "stat", "title": "Degraded Cluster Operators (OKD)",
|
||||
"description": "OKD ClusterOperators currently reporting Degraded=True. Each ClusterOperator owns a core platform component — authentication, networking, image-registry, monitoring, ingress, storage, etc. A degraded operator means its managed component is impaired or unavailable. Zero is the only acceptable steady-state value outside of an active upgrade.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(cluster_operator_conditions{condition=\"Degraded\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 9, "type": "row", "title": "Alert Overview", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 10, "type": "timeseries", "title": "Firing Alert Count by Severity Over Time",
|
||||
"description": "Instantaneous count of firing ALERTS series grouped by severity over the selected window. A vertical rise = new alerting condition emerged. A horizontal plateau = a persistent, unresolved problem. A step-down = alert resolved or Prometheus rule evaluation stopped matching. Use the Severity Filter variable to narrow scope during triage.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "count by(severity)(ALERTS{alertstate=\"firing\",severity=~\"$severity\",namespace=~\"$namespace\"})",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{severity}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "critical" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "warning" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "info" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["max", "lastNotNull"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 11, "type": "timeseries", "title": "Alertmanager Notification Rate by Integration",
|
||||
"description": "Rate of notification delivery attempts from Alertmanager per second, split by integration type (slack, pagerduty, email, webhook, etc.). Solid lines = successful deliveries; dashed red lines = failed deliveries. A drop to zero on all integrations = Alertmanager is not processing or the cluster is completely quiet. Persistent failures on one integration = check that receiver's credentials or endpoint availability.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "sum by(integration)(rate(alertmanager_notifications_total[5m]))", "refId": "A", "legendFormat": "✓ {{integration}}" },
|
||||
{ "expr": "sum by(integration)(rate(alertmanager_notifications_failed_total[5m]))", "refId": "B", "legendFormat": "✗ {{integration}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byFrameRefID", "options": "B" },
|
||||
"properties": [
|
||||
{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } },
|
||||
{ "id": "custom.lineStyle", "value": { "dash": [6, 4], "fill": "dash" } },
|
||||
{ "id": "custom.lineWidth", "value": 1 }
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 12, "type": "bargauge", "title": "Longest-Firing Active Alerts",
|
||||
"description": "Duration (now - ALERTS_FOR_STATE timestamp) for each currently firing alert, sorted descending. Alerts at the top have been firing longest and are the most likely candidates for known-but-unresolved issues, stale firing conditions, or alerts that should have a silence applied. Red bars (> 2 hours) strongly suggest a problem that has been acknowledged but not resolved.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sort_desc(time() - ALERTS_FOR_STATE{alertstate=\"firing\",severity=~\"$severity\",namespace=~\"$namespace\"})",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{alertname}} · {{severity}} · {{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 300 },
|
||||
{ "color": "orange", "value": 1800 },
|
||||
{ "color": "red", "value": 7200 }
|
||||
]}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true,
|
||||
"valueMode": "color"
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 13, "type": "row", "title": "Active Firing Alerts — Full Detail", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 14, "type": "table", "title": "All Firing Alerts",
|
||||
"description": "Instant-query table of every currently firing alert visible to Prometheus, filtered by the Namespace and Severity variables above. Each row is one alert instance (unique label combination). The value column is omitted — by definition every row here is firing. Use the built-in column filter (funnel icon) to further narrow to a specific alertname, pod, or node. Columns are sparse: labels not defined in a given alert rule will show '—'.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "ALERTS{alertstate=\"firing\",severity=~\"$severity\",namespace=~\"$namespace\"}",
|
||||
"refId": "A",
|
||||
"instant": true,
|
||||
"legendFormat": ""
|
||||
}],
|
||||
"transformations": [
|
||||
{ "id": "labelsToFields", "options": { "mode": "columns" } },
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"alertstate": true,
|
||||
"__name__": true,
|
||||
"Value": true,
|
||||
"Time": true
|
||||
},
|
||||
"renameByName": {
|
||||
"alertname": "Alert Name",
|
||||
"severity": "Severity",
|
||||
"namespace": "Namespace",
|
||||
"pod": "Pod",
|
||||
"node": "Node",
|
||||
"container": "Container",
|
||||
"job": "Job",
|
||||
"service": "Service",
|
||||
"reason": "Reason",
|
||||
"instance": "Instance"
|
||||
},
|
||||
"indexByName": {
|
||||
"severity": 0,
|
||||
"alertname": 1,
|
||||
"namespace": 2,
|
||||
"pod": 3,
|
||||
"node": 4,
|
||||
"container": 5,
|
||||
"job": 6,
|
||||
"service": 7,
|
||||
"reason": 8,
|
||||
"instance": 9
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": { "align": "left", "filterable": true },
|
||||
"noValue": "—"
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Severity" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "custom.width", "value": 110 },
|
||||
{
|
||||
"id": "mappings",
|
||||
"value": [{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"critical": { "text": "CRITICAL", "color": "dark-red", "index": 0 },
|
||||
"warning": { "text": "WARNING", "color": "dark-yellow", "index": 1 },
|
||||
"info": { "text": "INFO", "color": "dark-blue", "index": 2 }
|
||||
}
|
||||
}]
|
||||
}
|
||||
]
|
||||
},
|
||||
{ "matcher": { "id": "byName", "options": "Alert Name" }, "properties": [{ "id": "custom.width", "value": 300 }] },
|
||||
{ "matcher": { "id": "byName", "options": "Namespace" }, "properties": [{ "id": "custom.width", "value": 180 }] },
|
||||
{ "matcher": { "id": "byName", "options": "Pod" }, "properties": [{ "id": "custom.width", "value": 200 }] },
|
||||
{ "matcher": { "id": "byName", "options": "Node" }, "properties": [{ "id": "custom.width", "value": 200 }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"sortBy": [{ "desc": false, "displayName": "Severity" }],
|
||||
"footer": { "show": false }
|
||||
},
|
||||
"gridPos": { "h": 12, "w": 24, "x": 0, "y": 14 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 15, "type": "row", "title": "Kubernetes Warning Events", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 26 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 16, "type": "timeseries", "title": "Warning Event Rate by Reason",
|
||||
"description": "Rate of Kubernetes Warning-type events per second grouped by reason code. BackOff = container is CrashLooping. FailedScheduling = no node satisfies pod constraints. FailedMount = volume attachment or CSI failure. Evicted = kubelet evicted a pod due to memory or disk pressure. NodeNotReady = node lost contact. A spike in a single reason narrows the incident root-cause immediately without needing to read raw event logs. Requires kube-state-metrics with --resources=events.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "topk(10, sum by(reason)(rate(kube_event_count{type=\"Warning\",namespace=~\"$namespace\"}[5m])))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{reason}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 27 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 17, "type": "bargauge", "title": "Warning Events — Top Namespaces (Accumulated Count)",
|
||||
"description": "Total accumulated Warning event count (the count field on the Kubernetes Event object) per namespace, showing the top 15 most active. A namespace dominating this chart is generating significantly more abnormal conditions than its peers, useful for identifying noisy tenants, misconfigured deployments, or namespaces experiencing a persistent infrastructure problem. Note this is the raw Event.count field — it resets if the event object is deleted and recreated.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "topk(15, sum by(namespace)(kube_event_count{type=\"Warning\"}))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 10 },
|
||||
{ "color": "orange", "value": 50 },
|
||||
{ "color": "red", "value": 200 }
|
||||
]}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 27 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 18, "type": "timeseries", "title": "Warning Events — Accumulated Count by Reason Over Time",
|
||||
"description": "Raw accumulated event count gauge over time, split by reason. Unlike the rate panel this shows total volume and slope simultaneously. A line that climbs steeply = events are occurring frequently right now. A line that plateaus = the condition causing that reason has stopped. A line that drops to zero = the event object was deleted and recreated or the condition fully resolved.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "topk(10, sum by(reason)(kube_event_count{type=\"Warning\",namespace=~\"$namespace\"}))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{reason}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 8, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 27 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 19, "type": "row", "title": "Pod Problems", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 35 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 20, "type": "timeseries", "title": "CrashLoopBackOff Pods by Namespace",
|
||||
"description": "Count of container instances in CrashLoopBackOff waiting state over time, broken down by namespace. A sudden rise in one namespace = a workload deployment is failing. A persistent baseline across many namespaces = a shared dependency (Secret, ConfigMap, network policy, or an upstream service) has become unavailable. Unlike restart rate, this panel shows the steady-state count of pods currently stuck — not flapping.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(namespace)(kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\",namespace=~\"$namespace\"} == 1)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false,
|
||||
"thresholdsStyle": { "mode": "line" }
|
||||
},
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 5 }
|
||||
]}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 0, "y": 36 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 21, "type": "timeseries", "title": "Container Restart Rate by Namespace",
|
||||
"description": "Rate of container restarts per second across all reasons (OOMKill, liveness probe failure, process exit) grouped by namespace. A namespace with a rising restart rate that has not yet entered CrashLoopBackOff is in the early failure window before the exponential back-off penalty kicks in. Cross-reference with the OOMKilled stat tile and the last-terminated-reason to separate crash types.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "topk(10, sum by(namespace)(rate(kube_pod_container_status_restarts_total{namespace=~\"$namespace\"}[5m])))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 8, "y": 36 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 22, "type": "timeseries", "title": "Pods by Problem Phase (Failed / Pending / Unknown)",
|
||||
"description": "Count of pods in Failed, Pending, or Unknown phase over time. Failed = container terminated with a non-zero exit code or was evicted and not rescheduled. Pending for more than a few minutes = scheduler unable to bind the pod (check FailedScheduling events, node capacity, and taint/toleration mismatches). Unknown = kubelet is not reporting to the apiserver, typically indicating a node network partition or kubelet crash.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "sum by(phase)(kube_pod_status_phase{phase=~\"Failed|Unknown\",namespace=~\"$namespace\"} == 1)", "refId": "A", "legendFormat": "{{phase}}" },
|
||||
{ "expr": "sum(kube_pod_status_phase{phase=\"Pending\",namespace=~\"$namespace\"} == 1)", "refId": "B", "legendFormat": "Pending" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false,
|
||||
"thresholdsStyle": { "mode": "line" }
|
||||
},
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]}
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "Failed" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Pending" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Unknown" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 16, "y": 36 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 23, "type": "row", "title": "Node & Cluster Operator Conditions", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 43 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 24, "type": "table", "title": "Node Condition Status Matrix",
|
||||
"description": "Instant snapshot of every active node condition across all nodes. Each row is one (node, condition, status) triple where value=1, meaning that combination is currently true. Ready=true is the normal healthy state; MemoryPressure=true, DiskPressure=true, PIDPressure=true, and NetworkUnavailable=true all indicate problem states that will affect pod scheduling on that node. Use the column filter to show only conditions where status=\"true\" and condition != \"Ready\" to isolate problems quickly.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "kube_node_status_condition == 1",
|
||||
"refId": "A",
|
||||
"instant": true,
|
||||
"legendFormat": ""
|
||||
}],
|
||||
"transformations": [
|
||||
{ "id": "labelsToFields", "options": { "mode": "columns" } },
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"Time": true,
|
||||
"Value": true,
|
||||
"__name__": true,
|
||||
"endpoint": true,
|
||||
"job": true,
|
||||
"service": true,
|
||||
"instance": true
|
||||
},
|
||||
"renameByName": {
|
||||
"node": "Node",
|
||||
"condition": "Condition",
|
||||
"status": "Status"
|
||||
},
|
||||
"indexByName": { "node": 0, "condition": 1, "status": 2 }
|
||||
}
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": { "align": "left", "filterable": true },
|
||||
"noValue": "—"
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Status" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "custom.width", "value": 90 },
|
||||
{
|
||||
"id": "mappings",
|
||||
"value": [{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"true": { "text": "true", "color": "green", "index": 0 },
|
||||
"false": { "text": "false", "color": "dark-red", "index": 1 },
|
||||
"unknown": { "text": "unknown", "color": "dark-orange", "index": 2 }
|
||||
}
|
||||
}]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Condition" },
|
||||
"properties": [
|
||||
{ "id": "custom.width", "value": 190 },
|
||||
{ "id": "custom.displayMode", "value": "color-text" },
|
||||
{
|
||||
"id": "mappings",
|
||||
"value": [{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"Ready": { "color": "green", "index": 0 },
|
||||
"MemoryPressure": { "color": "red", "index": 1 },
|
||||
"DiskPressure": { "color": "red", "index": 2 },
|
||||
"PIDPressure": { "color": "red", "index": 3 },
|
||||
"NetworkUnavailable": { "color": "red", "index": 4 }
|
||||
}
|
||||
}]
|
||||
}
|
||||
]
|
||||
},
|
||||
{ "matcher": { "id": "byName", "options": "Node" }, "properties": [{ "id": "custom.width", "value": 230 }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"sortBy": [{ "desc": false, "displayName": "Node" }],
|
||||
"footer": { "show": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 44 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 25, "type": "table", "title": "Cluster Operator Conditions — Degraded & Progressing (OKD)",
|
||||
"description": "Shows only ClusterOperator conditions that indicate a problem state: Degraded=True (operator has failed to achieve its desired state) or Progressing=True (operator is actively reconciling — normal during upgrades but alarming in steady state). Operators not appearing in this table are healthy. The reason column gives the operator's own explanation for the condition, which maps directly to the relevant operator log stream and OpenShift runbook.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "cluster_operator_conditions{condition=\"Degraded\"} == 1",
|
||||
"refId": "A",
|
||||
"instant": true,
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "cluster_operator_conditions{condition=\"Progressing\"} == 1",
|
||||
"refId": "B",
|
||||
"instant": true,
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{ "id": "labelsToFields", "options": { "mode": "columns" } },
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"Time": true,
|
||||
"Value": true,
|
||||
"__name__": true,
|
||||
"endpoint": true,
|
||||
"job": true,
|
||||
"service": true,
|
||||
"instance": true,
|
||||
"namespace": true
|
||||
},
|
||||
"renameByName": {
|
||||
"name": "Operator",
|
||||
"condition": "Condition",
|
||||
"reason": "Reason"
|
||||
},
|
||||
"indexByName": { "name": 0, "condition": 1, "reason": 2 }
|
||||
}
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": { "align": "left", "filterable": true },
|
||||
"noValue": "—"
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Condition" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "custom.width", "value": 140 },
|
||||
{
|
||||
"id": "mappings",
|
||||
"value": [{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"Degraded": { "text": "Degraded", "color": "dark-red", "index": 0 },
|
||||
"Progressing": { "text": "Progressing", "color": "dark-yellow", "index": 1 }
|
||||
}
|
||||
}]
|
||||
}
|
||||
]
|
||||
},
|
||||
{ "matcher": { "id": "byName", "options": "Operator" }, "properties": [{ "id": "custom.width", "value": 240 }] },
|
||||
{ "matcher": { "id": "byName", "options": "Reason" }, "properties": [{ "id": "custom.width", "value": 220 }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"sortBy": [{ "desc": false, "displayName": "Condition" }],
|
||||
"footer": { "show": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 44 }
|
||||
}
|
||||
|
||||
]
|
||||
}
|
||||
2
harmony/src/modules/monitoring/cluster_dashboards/mod.rs
Normal file
2
harmony/src/modules/monitoring/cluster_dashboards/mod.rs
Normal file
@@ -0,0 +1,2 @@
|
||||
//! Cluster-health dashboards module: wires a Grafana instance plus a set of
//! pre-built cluster dashboards. Only the score type is public.
mod score;
pub use score::ClusterDashboardsScore;
|
||||
557
harmony/src/modules/monitoring/cluster_dashboards/score.rs
Normal file
557
harmony/src/modules/monitoring/cluster_dashboards/score.rs
Normal file
@@ -0,0 +1,557 @@
|
||||
use async_trait::async_trait;
|
||||
use harmony_types::id::Id;
|
||||
use k8s_openapi::api::core::v1::{Namespace, Secret};
|
||||
use kube::api::ObjectMeta;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::BTreeMap;
|
||||
|
||||
use harmony_k8s::KubernetesDistribution;
|
||||
use log::debug;
|
||||
|
||||
use crate::{
|
||||
data::Version,
|
||||
interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome},
|
||||
inventory::Inventory,
|
||||
modules::k8s::resource::K8sResourceScore,
|
||||
modules::monitoring::kube_prometheus::crd::crd_grafana::{
|
||||
Grafana, GrafanaContainer, GrafanaDashboard, GrafanaDashboardSpec, GrafanaDatasource,
|
||||
GrafanaDatasourceConfig, GrafanaDatasourceJsonData, GrafanaDatasourceSecureJsonData,
|
||||
GrafanaDatasourceSpec, GrafanaDeployment, GrafanaDeploymentSpec, GrafanaIngress,
|
||||
GrafanaIngressBackend, GrafanaIngressBackendService, GrafanaIngressPath,
|
||||
GrafanaIngressRule, GrafanaIngressRuleHttp, GrafanaIngressServicePort, GrafanaIngressSpec,
|
||||
GrafanaPodSpec, GrafanaPodTemplate, GrafanaRoute, GrafanaRoutePort, GrafanaRouteSpec,
|
||||
GrafanaRouteTarget, GrafanaRouteTls, GrafanaSecretKeyRef, GrafanaSpec, GrafanaValueFrom,
|
||||
GrafanaValueSource, ResourceRequirements,
|
||||
},
|
||||
modules::monitoring::kube_prometheus::crd::crd_prometheuses::LabelSelector,
|
||||
score::Score,
|
||||
topology::{K8sclient, Topology},
|
||||
};
|
||||
|
||||
/// Declarative score that provisions a Grafana instance and a set of
/// cluster-health dashboards (alerts, events, pod problems, node and
/// operator conditions) in a single namespace.
#[derive(Clone, Debug, Serialize)]
pub struct ClusterDashboardsScore {
    /// Namespace in which every created resource lives.
    pub namespace: String,
    /// Grafana admin username — presumably wired into the Grafana deployment;
    /// verify against `create_grafana`.
    pub grafana_admin_user: String,
    /// Grafana admin password. NOTE(review): defaults to "password" — callers
    /// should supply a real secret for anything beyond local testing.
    pub grafana_admin_password: String,
}
|
||||
|
||||
impl Default for ClusterDashboardsScore {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
namespace: "harmony-observability".to_string(),
|
||||
grafana_admin_user: "admin".to_string(),
|
||||
grafana_admin_password: "password".to_string(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ClusterDashboardsScore {
|
||||
pub fn new(namespace: &str) -> Self {
|
||||
Self {
|
||||
namespace: namespace.to_string(),
|
||||
grafana_admin_user: "admin".to_string(),
|
||||
grafana_admin_password: "password".to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn with_credentials(namespace: &str, admin_user: &str, admin_password: &str) -> Self {
|
||||
Self {
|
||||
namespace: namespace.to_string(),
|
||||
grafana_admin_user: admin_user.to_string(),
|
||||
grafana_admin_password: admin_password.to_string(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Topology + K8sclient> Score<T> for ClusterDashboardsScore {
|
||||
fn name(&self) -> String {
|
||||
format!("ClusterDashboardsScore({})", self.namespace)
|
||||
}
|
||||
|
||||
#[doc(hidden)]
|
||||
fn create_interpret(&self) -> Box<dyn Interpret<T>> {
|
||||
Box::new(ClusterDashboardsInterpret {
|
||||
namespace: self.namespace.clone(),
|
||||
grafana_admin_user: self.grafana_admin_user.clone(),
|
||||
grafana_admin_password: self.grafana_admin_password.clone(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Interpret backing [`ClusterDashboardsScore`]: performs the actual resource
/// creation (namespace, RBAC, token secret, Grafana, datasource, dashboards).
#[derive(Debug, Clone)]
pub struct ClusterDashboardsInterpret {
    /// Target namespace for all created resources.
    namespace: String,
    /// Grafana admin credentials, copied verbatim from the score.
    grafana_admin_user: String,
    grafana_admin_password: String,
}
|
||||
|
||||
#[async_trait]
|
||||
impl<T: Topology + K8sclient> Interpret<T> for ClusterDashboardsInterpret {
|
||||
async fn execute(
|
||||
&self,
|
||||
inventory: &Inventory,
|
||||
topology: &T,
|
||||
) -> Result<Outcome, InterpretError> {
|
||||
self.create_namespace(inventory, topology).await?;
|
||||
self.create_rbac_resources(inventory, topology).await?;
|
||||
self.create_secret(inventory, topology).await?;
|
||||
self.create_grafana(inventory, topology).await?;
|
||||
self.create_datasource(inventory, topology).await?;
|
||||
self.create_dashboards(inventory, topology).await?;
|
||||
|
||||
Ok(Outcome::success(format!(
|
||||
"Cluster dashboards resources in namespace '{}' with {} dashboards successfully created",
|
||||
self.namespace, 8
|
||||
)))
|
||||
}
|
||||
|
||||
fn get_name(&self) -> InterpretName {
|
||||
InterpretName::Custom("ClusterDashboards")
|
||||
}
|
||||
|
||||
fn get_version(&self) -> Version {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn get_status(&self) -> InterpretStatus {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn get_children(&self) -> Vec<Id> {
|
||||
todo!()
|
||||
}
|
||||
}
|
||||
|
||||
impl ClusterDashboardsInterpret {
|
||||
async fn create_namespace(
|
||||
&self,
|
||||
inventory: &Inventory,
|
||||
topology: &(impl Topology + K8sclient),
|
||||
) -> Result<(), InterpretError> {
|
||||
let mut labels = BTreeMap::new();
|
||||
labels.insert(
|
||||
"openshift.io/cluster-monitoring".to_string(),
|
||||
"true".to_string(),
|
||||
);
|
||||
|
||||
let namespace = Namespace {
|
||||
metadata: ObjectMeta {
|
||||
name: Some(self.namespace.clone()),
|
||||
labels: Some(labels),
|
||||
..ObjectMeta::default()
|
||||
},
|
||||
..Namespace::default()
|
||||
};
|
||||
|
||||
K8sResourceScore::single(namespace, None)
|
||||
.interpret(inventory, topology)
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn create_rbac_resources(
|
||||
&self,
|
||||
inventory: &Inventory,
|
||||
topology: &(impl Topology + K8sclient),
|
||||
) -> Result<(), InterpretError> {
|
||||
let service_account_name = "grafana-prometheus-datasource-sa".to_string();
|
||||
let rbac_namespace = self.namespace.clone();
|
||||
|
||||
let service_account = {
|
||||
use k8s_openapi::api::core::v1::ServiceAccount;
|
||||
ServiceAccount {
|
||||
metadata: ObjectMeta {
|
||||
name: Some(service_account_name.clone()),
|
||||
namespace: Some(rbac_namespace.clone()),
|
||||
..ObjectMeta::default()
|
||||
},
|
||||
..ServiceAccount::default()
|
||||
}
|
||||
};
|
||||
|
||||
let cluster_role = {
|
||||
use k8s_openapi::api::rbac::v1::{ClusterRole, PolicyRule};
|
||||
ClusterRole {
|
||||
metadata: ObjectMeta {
|
||||
name: Some("grafana-prometheus-api-access".to_string()),
|
||||
..ObjectMeta::default()
|
||||
},
|
||||
rules: Some(vec![PolicyRule {
|
||||
api_groups: Some(vec!["monitoring.coreos.com".to_string()]),
|
||||
resources: Some(vec!["prometheuses/api".to_string()]),
|
||||
verbs: vec!["get".to_string()],
|
||||
..PolicyRule::default()
|
||||
}]),
|
||||
..ClusterRole::default()
|
||||
}
|
||||
};
|
||||
|
||||
let cluster_role_binding = {
|
||||
use k8s_openapi::api::rbac::v1::{ClusterRoleBinding, RoleRef, Subject};
|
||||
ClusterRoleBinding {
|
||||
metadata: ObjectMeta {
|
||||
name: Some("grafana-prometheus-api-access-binding".to_string()),
|
||||
..ObjectMeta::default()
|
||||
},
|
||||
subjects: Some(vec![Subject {
|
||||
kind: "ServiceAccount".to_string(),
|
||||
name: service_account_name.clone(),
|
||||
namespace: Some(rbac_namespace.clone()),
|
||||
..Subject::default()
|
||||
}]),
|
||||
role_ref: RoleRef {
|
||||
api_group: "rbac.authorization.k8s.io".to_string(),
|
||||
kind: "ClusterRole".to_string(),
|
||||
name: "grafana-prometheus-api-access".to_string(),
|
||||
},
|
||||
}
|
||||
};
|
||||
|
||||
let cluster_role_binding_cluster_monitoring = {
|
||||
use k8s_openapi::api::rbac::v1::{ClusterRoleBinding, RoleRef, Subject};
|
||||
ClusterRoleBinding {
|
||||
metadata: ObjectMeta {
|
||||
name: Some("grafana-cluster-monitoring-view".to_string()),
|
||||
..ObjectMeta::default()
|
||||
},
|
||||
subjects: Some(vec![Subject {
|
||||
kind: "ServiceAccount".to_string(),
|
||||
name: service_account_name.clone(),
|
||||
namespace: Some(rbac_namespace.clone()),
|
||||
..Subject::default()
|
||||
}]),
|
||||
role_ref: RoleRef {
|
||||
api_group: "rbac.authorization.k8s.io".to_string(),
|
||||
kind: "ClusterRole".to_string(),
|
||||
name: "cluster-monitoring-view".to_string(),
|
||||
},
|
||||
}
|
||||
};
|
||||
|
||||
K8sResourceScore::single(service_account, Some(rbac_namespace.clone()))
|
||||
.interpret(inventory, topology)
|
||||
.await?;
|
||||
K8sResourceScore::single(cluster_role, None)
|
||||
.interpret(inventory, topology)
|
||||
.await?;
|
||||
K8sResourceScore::single(cluster_role_binding, None)
|
||||
.interpret(inventory, topology)
|
||||
.await?;
|
||||
K8sResourceScore::single(cluster_role_binding_cluster_monitoring, None)
|
||||
.interpret(inventory, topology)
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn create_secret(
|
||||
&self,
|
||||
inventory: &Inventory,
|
||||
topology: &(impl Topology + K8sclient),
|
||||
) -> Result<(), InterpretError> {
|
||||
let service_account_name = "grafana-prometheus-datasource-sa".to_string();
|
||||
let secret_name = "grafana-prometheus-token".to_string();
|
||||
let secret_namespace = self.namespace.clone();
|
||||
|
||||
let secret = Secret {
|
||||
metadata: ObjectMeta {
|
||||
name: Some(secret_name),
|
||||
namespace: Some(secret_namespace),
|
||||
annotations: Some({
|
||||
let mut ann = BTreeMap::new();
|
||||
ann.insert(
|
||||
"kubernetes.io/service-account.name".to_string(),
|
||||
service_account_name,
|
||||
);
|
||||
ann
|
||||
}),
|
||||
..ObjectMeta::default()
|
||||
},
|
||||
type_: Some("kubernetes.io/service-account-token".to_string()),
|
||||
..Secret::default()
|
||||
};
|
||||
|
||||
K8sResourceScore::single(secret, Some(self.namespace.clone()))
|
||||
.interpret(inventory, topology)
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn create_grafana(
|
||||
&self,
|
||||
inventory: &Inventory,
|
||||
topology: &(impl Topology + K8sclient),
|
||||
) -> Result<(), InterpretError> {
|
||||
let labels: BTreeMap<String, String> =
|
||||
[("dashboards".to_string(), "grafana".to_string())].into();
|
||||
|
||||
let mut config: BTreeMap<String, BTreeMap<String, String>> = BTreeMap::new();
|
||||
config.insert("log".into(), [("mode".into(), "console".into())].into());
|
||||
config.insert(
|
||||
"security".into(),
|
||||
[
|
||||
("admin_user".into(), self.grafana_admin_user.clone()),
|
||||
("admin_password".into(), self.grafana_admin_password.clone()),
|
||||
]
|
||||
.into(),
|
||||
);
|
||||
config.insert(
|
||||
"users".into(),
|
||||
[("viewers_can_edit".into(), "false".into())].into(),
|
||||
);
|
||||
config.insert(
|
||||
"auth".into(),
|
||||
[("disable_login_form".into(), "false".into())].into(),
|
||||
);
|
||||
config.insert(
|
||||
"auth.anonymous".into(),
|
||||
[
|
||||
("enabled".into(), "true".into()),
|
||||
("org_role".into(), "Viewer".into()),
|
||||
]
|
||||
.into(),
|
||||
);
|
||||
|
||||
let resources = ResourceRequirements {
|
||||
requests: [
|
||||
("cpu".into(), "500m".into()),
|
||||
("memory".into(), "1Gi".into()),
|
||||
]
|
||||
.into(),
|
||||
limits: [("cpu".into(), "1".into()), ("memory".into(), "2Gi".into())].into(),
|
||||
};
|
||||
|
||||
let client = topology
|
||||
.k8s_client()
|
||||
.await
|
||||
.map_err(|e| InterpretError::new(format!("Failed to get k8s client: {e}")))?;
|
||||
let distribution = client
|
||||
.get_k8s_distribution()
|
||||
.await
|
||||
.map_err(|e| InterpretError::new(format!("Failed to detect k8s distribution: {e}")))?;
|
||||
|
||||
// OpenShift → Route (operator-managed); plain k8s → Ingress (operator-managed).
|
||||
let (route, ingress) = if matches!(distribution, KubernetesDistribution::OpenshiftFamily) {
|
||||
debug!("OpenShift detected; Grafana CR will use .spec.route");
|
||||
let route = GrafanaRoute {
|
||||
spec: Some(GrafanaRouteSpec {
|
||||
port: Some(GrafanaRoutePort { target_port: 3000 }),
|
||||
tls: Some(GrafanaRouteTls {
|
||||
termination: Some("edge".to_string()),
|
||||
insecure_edge_termination_policy: Some("Redirect".to_string()),
|
||||
}),
|
||||
to: Some(GrafanaRouteTarget {
|
||||
kind: "Service".to_string(),
|
||||
name: "cluster-grafana-service".to_string(),
|
||||
weight: Some(100),
|
||||
}),
|
||||
}),
|
||||
};
|
||||
(Some(route), None)
|
||||
} else {
|
||||
let hostname = client
|
||||
.get_domain("cluster-grafana")
|
||||
.await
|
||||
.map_err(|e| InterpretError::new(format!("Failed to resolve domain: {e}")))?;
|
||||
debug!("Non-OpenShift detected; Grafana CR will use .spec.ingress (host: {hostname})");
|
||||
let ingress = GrafanaIngress {
|
||||
spec: Some(GrafanaIngressSpec {
|
||||
ingress_class_name: None,
|
||||
rules: Some(vec![GrafanaIngressRule {
|
||||
host: Some(hostname),
|
||||
http: Some(GrafanaIngressRuleHttp {
|
||||
paths: vec![GrafanaIngressPath {
|
||||
path: "/".to_string(),
|
||||
path_type: "Prefix".to_string(),
|
||||
backend: GrafanaIngressBackend {
|
||||
service: GrafanaIngressBackendService {
|
||||
name: "cluster-grafana-service".to_string(),
|
||||
port: GrafanaIngressServicePort { number: 3000 },
|
||||
},
|
||||
},
|
||||
}],
|
||||
}),
|
||||
}]),
|
||||
}),
|
||||
};
|
||||
(None, Some(ingress))
|
||||
};
|
||||
|
||||
let grafana = Grafana {
|
||||
metadata: ObjectMeta {
|
||||
name: Some("cluster-grafana".to_string()),
|
||||
namespace: Some(self.namespace.clone()),
|
||||
labels: Some(labels),
|
||||
..ObjectMeta::default()
|
||||
},
|
||||
spec: GrafanaSpec {
|
||||
config: Some(config),
|
||||
deployment: Some(GrafanaDeployment {
|
||||
spec: Some(GrafanaDeploymentSpec {
|
||||
replicas: Some(1),
|
||||
template: Some(GrafanaPodTemplate {
|
||||
spec: Some(GrafanaPodSpec {
|
||||
containers: vec![GrafanaContainer {
|
||||
name: "grafana".to_string(),
|
||||
resources: Some(resources),
|
||||
}],
|
||||
}),
|
||||
}),
|
||||
}),
|
||||
}),
|
||||
route,
|
||||
ingress,
|
||||
},
|
||||
};
|
||||
|
||||
K8sResourceScore::single(grafana, Some(self.namespace.clone()))
|
||||
.interpret(inventory, topology)
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn create_datasource(
|
||||
&self,
|
||||
inventory: &Inventory,
|
||||
topology: &(impl Topology + K8sclient),
|
||||
) -> Result<(), InterpretError> {
|
||||
let labels: BTreeMap<String, String> =
|
||||
[("datasource".to_string(), "prometheus".to_string())].into();
|
||||
|
||||
let instance_selector = LabelSelector {
|
||||
match_labels: [("dashboards".to_string(), "grafana".to_string())].into(),
|
||||
match_expressions: vec![],
|
||||
};
|
||||
|
||||
let datasource = GrafanaDatasource {
|
||||
metadata: ObjectMeta {
|
||||
name: Some("prometheus-cluster".to_string()),
|
||||
namespace: Some(self.namespace.clone()),
|
||||
labels: Some(labels),
|
||||
..ObjectMeta::default()
|
||||
},
|
||||
spec: GrafanaDatasourceSpec {
|
||||
instance_selector,
|
||||
allow_cross_namespace_import: None,
|
||||
datasource: GrafanaDatasourceConfig {
|
||||
name: "Prometheus-Cluster".to_string(),
|
||||
r#type: "prometheus".to_string(),
|
||||
access: "proxy".to_string(),
|
||||
url: "https://prometheus-k8s.openshift-monitoring.svc:9091".to_string(),
|
||||
database: None,
|
||||
is_default: Some(true),
|
||||
editable: None,
|
||||
json_data: Some(GrafanaDatasourceJsonData {
|
||||
http_header_name1: Some("Authorization".to_string()),
|
||||
tls_skip_verify: Some(true),
|
||||
time_interval: Some("30s".to_string()),
|
||||
oauth_pass_thru: None,
|
||||
}),
|
||||
secure_json_data: Some(GrafanaDatasourceSecureJsonData {
|
||||
// Placeholder; real value comes from `values_from` at
|
||||
// reconcile time (see below).
|
||||
http_header_value1: Some("Bearer ${token}".to_string()),
|
||||
}),
|
||||
},
|
||||
values_from: Some(vec![GrafanaValueFrom {
|
||||
target_path: "secureJsonData.httpHeaderValue1".to_string(),
|
||||
value_from: GrafanaValueSource {
|
||||
secret_key_ref: GrafanaSecretKeyRef {
|
||||
name: "grafana-prometheus-token".to_string(),
|
||||
key: "token".to_string(),
|
||||
},
|
||||
},
|
||||
}]),
|
||||
},
|
||||
};
|
||||
|
||||
K8sResourceScore::single(datasource, Some(self.namespace.clone()))
|
||||
.interpret(inventory, topology)
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn create_dashboards(
|
||||
&self,
|
||||
inventory: &Inventory,
|
||||
topology: &(impl Topology + K8sclient),
|
||||
) -> Result<(), InterpretError> {
|
||||
let dashboards: &[(&str, &str)] = &[
|
||||
(
|
||||
"okd-cluster-overview",
|
||||
include_str!("dashboards/cluster-overview.json"),
|
||||
),
|
||||
(
|
||||
"okd-node-health",
|
||||
include_str!("dashboards/nodes-health.json"),
|
||||
),
|
||||
(
|
||||
"okd-workload-health",
|
||||
include_str!("dashboards/workloads-health.json"),
|
||||
),
|
||||
("okd-networking", include_str!("dashboards/networking.json")),
|
||||
("storage-health", include_str!("dashboards/storage.json")),
|
||||
("okd-etcd", include_str!("dashboards/etcd.json")),
|
||||
(
|
||||
"okd-control-plane",
|
||||
include_str!("dashboards/control-plane.json"),
|
||||
),
|
||||
(
|
||||
"okd-alerts-events",
|
||||
include_str!("dashboards/alerts-events-problems.json"),
|
||||
),
|
||||
];
|
||||
|
||||
for (dashboard_name, json_content) in dashboards {
|
||||
let labels: BTreeMap<String, String> =
|
||||
[("dashboard".to_string(), dashboard_name.to_string())].into();
|
||||
|
||||
let instance_selector = LabelSelector {
|
||||
match_labels: [("dashboards".to_string(), "grafana".to_string())].into(),
|
||||
match_expressions: vec![],
|
||||
};
|
||||
|
||||
let dashboard = GrafanaDashboard {
|
||||
metadata: ObjectMeta {
|
||||
name: Some(dashboard_name.to_string()),
|
||||
namespace: Some(self.namespace.clone()),
|
||||
labels: Some(labels),
|
||||
..ObjectMeta::default()
|
||||
},
|
||||
spec: GrafanaDashboardSpec {
|
||||
instance_selector,
|
||||
json: Some(json_content.to_string()),
|
||||
resync_period: None,
|
||||
datasources: None,
|
||||
grafana_com: None,
|
||||
},
|
||||
};
|
||||
|
||||
K8sResourceScore::single(dashboard, Some(self.namespace.clone()))
|
||||
.interpret(inventory, topology)
|
||||
.await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn get_name(&self) -> InterpretName {
|
||||
InterpretName::Custom("ClusterDashboards")
|
||||
}
|
||||
|
||||
fn get_version(&self) -> Version {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn get_status(&self) -> InterpretStatus {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn get_children(&self) -> Vec<Id> {
|
||||
todo!()
|
||||
}
|
||||
}
|
||||
@@ -1,10 +1,29 @@
|
||||
use async_trait::async_trait;
|
||||
use harmony_k8s::KubernetesDistribution;
|
||||
use harmony_macros::hurl;
|
||||
use harmony_types::id::Id;
|
||||
use k8s_openapi::api::rbac::v1::{ClusterRole, ClusterRoleBinding, PolicyRule, RoleRef, Subject};
|
||||
use k8s_openapi::apimachinery::pkg::apis::meta::v1::ObjectMeta;
|
||||
use log::debug;
|
||||
use non_blank_string_rs::NonBlankString;
|
||||
use serde::Serialize;
|
||||
use std::{collections::HashMap, str::FromStr};
|
||||
|
||||
use crate::modules::helm::chart::{HelmChartScore, HelmRepository};
|
||||
use crate::{
|
||||
data::Version,
|
||||
interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome},
|
||||
inventory::Inventory,
|
||||
modules::helm::chart::{HelmChartScore, HelmRepository},
|
||||
modules::k8s::resource::K8sResourceScore,
|
||||
score::Score,
|
||||
topology::{HelmCommand, K8sclient, Topology},
|
||||
};
|
||||
|
||||
pub fn grafana_helm_chart_score(ns: &str, namespace_scope: bool) -> HelmChartScore {
|
||||
pub fn grafana_helm_chart_score(
|
||||
ns: &str,
|
||||
namespace_scope: bool,
|
||||
chart_version: Option<&str>,
|
||||
) -> HelmChartScore {
|
||||
let mut values_overrides = HashMap::new();
|
||||
values_overrides.insert(
|
||||
NonBlankString::from_str("namespaceScope").unwrap(),
|
||||
@@ -14,7 +33,7 @@ pub fn grafana_helm_chart_score(ns: &str, namespace_scope: bool) -> HelmChartSco
|
||||
namespace: Some(NonBlankString::from_str(ns).unwrap()),
|
||||
release_name: NonBlankString::from_str("grafana-operator").unwrap(),
|
||||
chart_name: NonBlankString::from_str("grafana/grafana-operator").unwrap(),
|
||||
chart_version: None,
|
||||
chart_version: chart_version.map(|v| NonBlankString::from_str(v).unwrap()),
|
||||
values_overrides: Some(values_overrides),
|
||||
values_yaml: None,
|
||||
create_namespace: true,
|
||||
@@ -26,3 +45,173 @@ pub fn grafana_helm_chart_score(ns: &str, namespace_scope: bool) -> HelmChartSco
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Cluster-scoped RBAC so grafana-operator can watch `route.openshift.io/v1.Route`.
|
||||
/// The upstream chart's ClusterRole doesn't include these verbs and the chart
|
||||
/// exposes no values key to extend it, so we apply them separately.
|
||||
///
|
||||
/// Safe on non-OpenShift clusters: Kubernetes accepts a `ClusterRole`
|
||||
/// referencing a missing API group — the rule is simply never matched — but
|
||||
/// `GrafanaOperatorScore` only applies these on detected OpenShift clusters.
|
||||
pub fn grafana_operator_openshift_route_rbac_scores(
|
||||
ns: &str,
|
||||
) -> (
|
||||
K8sResourceScore<ClusterRole>,
|
||||
K8sResourceScore<ClusterRoleBinding>,
|
||||
) {
|
||||
let cluster_role_name = "harmony-grafana-operator-openshift-routes".to_string();
|
||||
let cluster_role_binding_name = "harmony-grafana-operator-openshift-routes-binding".to_string();
|
||||
let operator_sa_name = "grafana-operator".to_string();
|
||||
|
||||
let cluster_role = ClusterRole {
|
||||
metadata: ObjectMeta {
|
||||
name: Some(cluster_role_name.clone()),
|
||||
..ObjectMeta::default()
|
||||
},
|
||||
rules: Some(vec![PolicyRule {
|
||||
api_groups: Some(vec!["route.openshift.io".to_string()]),
|
||||
resources: Some(vec!["routes".to_string(), "routes/custom-host".to_string()]),
|
||||
verbs: vec![
|
||||
"get".to_string(),
|
||||
"list".to_string(),
|
||||
"watch".to_string(),
|
||||
"create".to_string(),
|
||||
"update".to_string(),
|
||||
"patch".to_string(),
|
||||
"delete".to_string(),
|
||||
],
|
||||
..PolicyRule::default()
|
||||
}]),
|
||||
..ClusterRole::default()
|
||||
};
|
||||
|
||||
let cluster_role_binding = ClusterRoleBinding {
|
||||
metadata: ObjectMeta {
|
||||
name: Some(cluster_role_binding_name),
|
||||
..ObjectMeta::default()
|
||||
},
|
||||
subjects: Some(vec![Subject {
|
||||
kind: "ServiceAccount".to_string(),
|
||||
name: operator_sa_name,
|
||||
namespace: Some(ns.to_string()),
|
||||
..Subject::default()
|
||||
}]),
|
||||
role_ref: RoleRef {
|
||||
api_group: "rbac.authorization.k8s.io".to_string(),
|
||||
kind: "ClusterRole".to_string(),
|
||||
name: cluster_role_name,
|
||||
},
|
||||
};
|
||||
|
||||
(
|
||||
K8sResourceScore::single(cluster_role, None),
|
||||
K8sResourceScore::single(cluster_role_binding, None),
|
||||
)
|
||||
}
|
||||
|
||||
/// Composite score: installs grafana-operator via Helm, and on OpenShift-family
|
||||
/// clusters also applies the `route.openshift.io` RBAC the operator needs to
|
||||
/// reconcile Routes. Distribution is detected at interpret time via the
|
||||
/// cluster's API discovery — no flag needed at call time.
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub struct GrafanaOperatorScore {
|
||||
pub namespace: String,
|
||||
pub namespace_scope: bool,
|
||||
pub chart_version: Option<String>,
|
||||
}
|
||||
|
||||
impl GrafanaOperatorScore {
|
||||
pub fn new(namespace: &str, chart_version: Option<&str>) -> Self {
|
||||
Self {
|
||||
namespace: namespace.to_string(),
|
||||
namespace_scope: false,
|
||||
chart_version: chart_version.map(|v| v.to_string()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Topology + K8sclient + HelmCommand> Score<T> for GrafanaOperatorScore {
|
||||
fn create_interpret(&self) -> Box<dyn Interpret<T>> {
|
||||
Box::new(GrafanaOperatorInterpret {
|
||||
namespace: self.namespace.clone(),
|
||||
namespace_scope: self.namespace_scope,
|
||||
chart_version: self.chart_version.clone(),
|
||||
})
|
||||
}
|
||||
|
||||
fn name(&self) -> String {
|
||||
format!("GrafanaOperatorScore({})", self.namespace)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
struct GrafanaOperatorInterpret {
|
||||
namespace: String,
|
||||
namespace_scope: bool,
|
||||
chart_version: Option<String>,
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl<T: Topology + K8sclient + HelmCommand> Interpret<T> for GrafanaOperatorInterpret {
|
||||
async fn execute(
|
||||
&self,
|
||||
inventory: &Inventory,
|
||||
topology: &T,
|
||||
) -> Result<Outcome, InterpretError> {
|
||||
let client = topology
|
||||
.k8s_client()
|
||||
.await
|
||||
.map_err(|e| InterpretError::new(format!("Failed to get k8s client: {e}")))?;
|
||||
|
||||
let distribution = client
|
||||
.get_k8s_distribution()
|
||||
.await
|
||||
.map_err(|e| InterpretError::new(format!("Failed to detect k8s distribution: {e}")))?;
|
||||
|
||||
if matches!(distribution, KubernetesDistribution::OpenshiftFamily) {
|
||||
debug!(
|
||||
"OpenShift detected; applying grafana-operator Route RBAC in namespace {}",
|
||||
self.namespace
|
||||
);
|
||||
let (cr, crb) = grafana_operator_openshift_route_rbac_scores(&self.namespace);
|
||||
cr.create_interpret().execute(inventory, topology).await?;
|
||||
crb.create_interpret().execute(inventory, topology).await?;
|
||||
} else {
|
||||
debug!(
|
||||
"Non-OpenShift distribution ({:?}); skipping Route RBAC",
|
||||
distribution
|
||||
);
|
||||
}
|
||||
|
||||
let helm_score = grafana_helm_chart_score(
|
||||
&self.namespace,
|
||||
self.namespace_scope,
|
||||
self.chart_version.as_deref(),
|
||||
);
|
||||
helm_score
|
||||
.create_interpret()
|
||||
.execute(inventory, topology)
|
||||
.await?;
|
||||
|
||||
Ok(Outcome::success(format!(
|
||||
"grafana-operator installed in namespace '{}' (distribution: {:?})",
|
||||
self.namespace, distribution
|
||||
)))
|
||||
}
|
||||
|
||||
fn get_name(&self) -> InterpretName {
|
||||
InterpretName::Custom("GrafanaOperator")
|
||||
}
|
||||
|
||||
fn get_version(&self) -> Version {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn get_status(&self) -> InterpretStatus {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn get_children(&self) -> Vec<Id> {
|
||||
todo!()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,7 +6,13 @@ use serde::{Deserialize, Serialize};
|
||||
|
||||
use super::crd_prometheuses::LabelSelector;
|
||||
|
||||
#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)]
|
||||
/// `Grafana` CR binding — audited against
|
||||
/// `grafanas.grafana.integreatly.org/v1beta1` on grafana-operator v5.22.
|
||||
/// Only the fields actively consumed by harmony callers are modeled.
|
||||
/// `.spec.config` is `map[string]map[string]string` upstream (grafana.ini
|
||||
/// sections); it is modeled as a nested `BTreeMap` rather than a struct to
|
||||
/// avoid losing sections like `auth.anonymous` (dotted keys).
|
||||
#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, Default, JsonSchema)]
|
||||
#[kube(
|
||||
group = "grafana.integreatly.org",
|
||||
version = "v1beta1",
|
||||
@@ -16,81 +22,177 @@ use super::crd_prometheuses::LabelSelector;
|
||||
)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct GrafanaSpec {
|
||||
/// `grafana.ini` content. Outer map key = section name (e.g. `security`,
|
||||
/// `auth.anonymous`); inner map = key/value pairs in that section.
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub config: Option<GrafanaConfig>,
|
||||
pub config: Option<BTreeMap<String, BTreeMap<String, String>>>,
|
||||
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub admin_user: Option<String>,
|
||||
pub deployment: Option<GrafanaDeployment>,
|
||||
|
||||
/// OpenShift-only: reconciled by grafana-operator when the
|
||||
/// `route.openshift.io` CRD is present.
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub admin_password: Option<String>,
|
||||
pub route: Option<GrafanaRoute>,
|
||||
|
||||
/// Standard k8s Ingress: reconciled by grafana-operator on non-OpenShift
|
||||
/// clusters. Mutually exclusive with `route` in practice.
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub ingress: Option<GrafanaIngress>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, Default, JsonSchema)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct GrafanaDeployment {
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub spec: Option<GrafanaDeploymentSpec>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, Default, JsonSchema)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct GrafanaDeploymentSpec {
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub replicas: Option<i32>,
|
||||
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub persistence: Option<GrafanaPersistence>,
|
||||
pub template: Option<GrafanaPodTemplate>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, Default, JsonSchema)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct GrafanaPodTemplate {
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub spec: Option<GrafanaPodSpec>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, Default, JsonSchema)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct GrafanaPodSpec {
|
||||
#[serde(default, skip_serializing_if = "Vec::is_empty")]
|
||||
pub containers: Vec<GrafanaContainer>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, Default, JsonSchema)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct GrafanaContainer {
|
||||
pub name: String,
|
||||
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub resources: Option<ResourceRequirements>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)]
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, Default, JsonSchema)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct GrafanaConfig {
|
||||
pub struct GrafanaRoute {
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub log: Option<GrafanaLogConfig>,
|
||||
pub spec: Option<GrafanaRouteSpec>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, Default, JsonSchema)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct GrafanaRouteSpec {
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub port: Option<GrafanaRoutePort>,
|
||||
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub security: Option<GrafanaSecurityConfig>,
|
||||
pub tls: Option<GrafanaRouteTls>,
|
||||
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub to: Option<GrafanaRouteTarget>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct GrafanaLogConfig {
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub mode: Option<String>,
|
||||
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub level: Option<String>,
|
||||
pub struct GrafanaRoutePort {
|
||||
/// Upstream schema is int-or-string; we only use integer.
|
||||
pub target_port: i32,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)]
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, Default, JsonSchema)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct GrafanaSecurityConfig {
|
||||
pub struct GrafanaRouteTls {
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub admin_user: Option<String>,
|
||||
pub termination: Option<String>,
|
||||
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub admin_password: Option<String>,
|
||||
pub insecure_edge_termination_policy: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)]
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, Default, JsonSchema)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct GrafanaRouteTarget {
|
||||
pub kind: String,
|
||||
pub name: String,
|
||||
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub weight: Option<i32>,
|
||||
}
|
||||
|
||||
// ---- Ingress types (mirrors standard k8s IngressSpec, narrow subset) ----
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, Default, JsonSchema)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct GrafanaIngress {
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub enabled: Option<bool>,
|
||||
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub hosts: Option<Vec<String>>,
|
||||
pub spec: Option<GrafanaIngressSpec>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)]
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, Default, JsonSchema)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct GrafanaPersistence {
|
||||
pub struct GrafanaIngressSpec {
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub enabled: Option<bool>,
|
||||
pub ingress_class_name: Option<String>,
|
||||
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub storage_class_name: Option<String>,
|
||||
pub rules: Option<Vec<GrafanaIngressRule>>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, Default, JsonSchema)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct GrafanaIngressRule {
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub host: Option<String>,
|
||||
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub size: Option<String>,
|
||||
pub http: Option<GrafanaIngressRuleHttp>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, Default, JsonSchema)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct GrafanaIngressRuleHttp {
|
||||
pub paths: Vec<GrafanaIngressPath>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, Default, JsonSchema)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct GrafanaIngressPath {
|
||||
pub path: String,
|
||||
pub path_type: String,
|
||||
pub backend: GrafanaIngressBackend,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, Default, JsonSchema)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct GrafanaIngressBackend {
|
||||
pub service: GrafanaIngressBackendService,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, Default, JsonSchema)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct GrafanaIngressBackendService {
|
||||
pub name: String,
|
||||
pub port: GrafanaIngressServicePort,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, Default, JsonSchema)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct GrafanaIngressServicePort {
|
||||
pub number: i32,
|
||||
}
|
||||
|
||||
// ------------------------------------------------------------------------------------------------
|
||||
|
||||
#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)]
|
||||
#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, Default, JsonSchema)]
|
||||
#[kube(
|
||||
group = "grafana.integreatly.org",
|
||||
version = "v1beta1",
|
||||
@@ -135,7 +237,7 @@ pub struct GrafanaCom {
|
||||
|
||||
// ------------------------------------------------------------------------------------------------
|
||||
|
||||
#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)]
|
||||
#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, Default, JsonSchema)]
|
||||
#[kube(
|
||||
group = "grafana.integreatly.org",
|
||||
version = "v1beta1",
|
||||
@@ -176,7 +278,7 @@ pub struct GrafanaSecretKeyRef {
|
||||
pub key: String,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)]
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, Default, JsonSchema)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct GrafanaDatasourceConfig {
|
||||
pub access: String,
|
||||
@@ -235,3 +337,23 @@ pub struct ResourceRequirements {
|
||||
#[serde(default, skip_serializing_if = "BTreeMap::is_empty")]
|
||||
pub requests: BTreeMap<String, String>,
|
||||
}
|
||||
|
||||
// `Default` impls on the `CustomResource`-generated wrappers so they satisfy
|
||||
// the `K: Default` bound on `K8sResourceScore<K>`.
|
||||
impl Default for Grafana {
|
||||
fn default() -> Self {
|
||||
Grafana::new("", GrafanaSpec::default())
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for GrafanaDashboard {
|
||||
fn default() -> Self {
|
||||
GrafanaDashboard::new("", GrafanaDashboardSpec::default())
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for GrafanaDatasource {
|
||||
fn default() -> Self {
|
||||
GrafanaDatasource::new("", GrafanaDatasourceSpec::default())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,3 +1,34 @@
|
||||
//! ⚠️ **STALE DUPLICATE — DO NOT COPY FROM**
|
||||
//!
|
||||
//! This file is a near-identical duplicate of `crd_grafana.rs` from before
|
||||
//! that file was audited against the upstream
|
||||
//! `grafanas.grafana.integreatly.org/v1beta1` schema (grafana-operator
|
||||
//! v5.22). Fields defined below are known to be **wrong** relative to
|
||||
//! upstream, in particular:
|
||||
//!
|
||||
//! - `GrafanaSpec.admin_user` / `admin_password` — do not exist at
|
||||
//! `.spec` top-level upstream; the real location is
|
||||
//! `.spec.config.security.admin_user/admin_password`.
|
||||
//! - `GrafanaSpec.persistence` — upstream key is `persistentVolumeClaim`,
|
||||
//! so writes here are silently dropped.
|
||||
//! - `GrafanaSpec.resources` — there is no `.spec.resources` upstream at
|
||||
//! all (container resources belong under
|
||||
//! `.spec.deployment.spec.template.spec.containers[].resources`).
|
||||
//! - `GrafanaSpec.ingress` — upstream `ingress` is `{ metadata, spec }`,
|
||||
//! not `{ enabled, hosts }` as modeled here.
|
||||
//! - `GrafanaConfig` as a typed struct — upstream `.spec.config` is
|
||||
//! `map[string]map[string]string` (grafana.ini sections). The struct
|
||||
//! form here cannot express sections like `auth.anonymous` (dotted
|
||||
//! keys) and loses anything beyond `log`/`security`.
|
||||
//!
|
||||
//! This file is kept only because `rhob_alerting_score.rs` still builds
|
||||
//! against it, and that caller happens to construct `GrafanaSpec` with
|
||||
//! every field set to `None` — so the bugs are latent, not active.
|
||||
//!
|
||||
//! If you need a correct binding, use `crd_grafana.rs`. If you extend this
|
||||
//! file, port the changes to `crd_grafana.rs` first, then dedupe — don't
|
||||
//! spread the rot.
|
||||
|
||||
use std::collections::BTreeMap;
|
||||
|
||||
use kube::CustomResource;
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
pub mod alert_channel;
|
||||
pub mod alert_rule;
|
||||
pub mod application_monitoring;
|
||||
pub mod cluster_dashboards;
|
||||
pub mod grafana;
|
||||
pub mod kube_prometheus;
|
||||
pub mod ntfy;
|
||||
|
||||
@@ -114,7 +114,7 @@ impl Prometheus {
|
||||
};
|
||||
|
||||
if let Some(ns) = namespace.as_deref() {
|
||||
grafana_helm_chart_score(ns, false)
|
||||
grafana_helm_chart_score(ns, false, None)
|
||||
.interpret(inventory, topology)
|
||||
.await
|
||||
} else {
|
||||
|
||||
@@ -542,14 +542,7 @@ impl K8sPrometheusCRDAlertingInterpret {
|
||||
labels: Some(label.clone()),
|
||||
..Default::default()
|
||||
},
|
||||
spec: GrafanaSpec {
|
||||
config: None,
|
||||
admin_user: None,
|
||||
admin_password: None,
|
||||
ingress: None,
|
||||
persistence: None,
|
||||
resources: None,
|
||||
},
|
||||
spec: GrafanaSpec::default(),
|
||||
};
|
||||
client
|
||||
.apply(&grafana, Some(&self.sender.namespace.clone()))
|
||||
|
||||
@@ -12,6 +12,9 @@ use std::process::Command;
|
||||
use crate::modules::k8s::ingress::{K8sIngressScore, PathType};
|
||||
use crate::modules::monitoring::kube_prometheus::crd::grafana_default_dashboard::build_default_dashboard;
|
||||
use crate::modules::monitoring::kube_prometheus::crd::rhob_alertmanager_config::RHOBObservability;
|
||||
// NOTE: `rhob_grafana` is a stale, incorrect duplicate of `crd_grafana`.
|
||||
// See the warning at the top of `rhob_grafana.rs`. Prefer `crd_grafana`
|
||||
// for any new work.
|
||||
use crate::modules::monitoring::kube_prometheus::crd::rhob_grafana::{
|
||||
Grafana, GrafanaDashboard, GrafanaDashboardSpec, GrafanaDatasource, GrafanaDatasourceConfig,
|
||||
GrafanaDatasourceSpec, GrafanaSpec,
|
||||
|
||||
Reference in New Issue
Block a user