Compare commits
11 Commits
example/vl
...
feat/clust
| Author | SHA1 | Date | |
|---|---|---|---|
| b1ff4e4a0f | |||
| ee8f033143 | |||
| 1298ac9a18 | |||
| 53e361e84e | |||
| 220e0c2bb8 | |||
| 82e47d22a2 | |||
| fb17d7ed40 | |||
| d4bf80779e | |||
| 28dadf3a70 | |||
| 15c454aa65 | |||
| f9a3e51529 |
12
Cargo.lock
generated
12
Cargo.lock
generated
@@ -7001,18 +7001,6 @@ version = "0.9.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
|
||||
|
||||
[[package]]
|
||||
name = "vllm"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"env_logger",
|
||||
"harmony",
|
||||
"harmony_cli",
|
||||
"k8s-openapi",
|
||||
"log",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wait-timeout"
|
||||
version = "0.2.1"
|
||||
|
||||
@@ -18,7 +18,7 @@ members = [
|
||||
"adr/agent_discovery/mdns",
|
||||
"brocade",
|
||||
"harmony_agent",
|
||||
"harmony_agent/deploy", "harmony_node_readiness", "harmony-k8s", "examples/vllm",
|
||||
"harmony_agent/deploy", "harmony_node_readiness", "harmony-k8s",
|
||||
]
|
||||
|
||||
[workspace.package]
|
||||
|
||||
16
examples/cluster_dashboards/Cargo.toml
Normal file
16
examples/cluster_dashboards/Cargo.toml
Normal file
@@ -0,0 +1,16 @@
|
||||
[workspace]
|
||||
|
||||
[package]
|
||||
name = "example-cluster-dashboards"
|
||||
edition = "2021"
|
||||
version = "0.1.0"
|
||||
license = "GNU AGPL v3"
|
||||
publish = false
|
||||
|
||||
[dependencies]
|
||||
harmony = { path = "../../harmony" }
|
||||
harmony_cli = { path = "../../harmony_cli" }
|
||||
harmony_types = { path = "../../harmony_types" }
|
||||
tokio = { version = "1.40", features = ["macros", "rt-multi-thread"] }
|
||||
log = "0.4"
|
||||
env_logger = "0.11"
|
||||
21
examples/cluster_dashboards/src/main.rs
Normal file
21
examples/cluster_dashboards/src/main.rs
Normal file
@@ -0,0 +1,21 @@
|
||||
use harmony::{
|
||||
inventory::Inventory,
|
||||
modules::monitoring::cluster_dashboards::ClusterDashboardsScore,
|
||||
topology::K8sAnywhereTopology,
|
||||
};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() {
|
||||
harmony_cli::cli_logger::init();
|
||||
|
||||
let cluster_dashboards_score = ClusterDashboardsScore::default();
|
||||
|
||||
harmony_cli::run(
|
||||
Inventory::autoload(),
|
||||
K8sAnywhereTopology::from_env(),
|
||||
vec![Box::new(cluster_dashboards_score)],
|
||||
None,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
@@ -14,6 +14,7 @@ async fn main() {
|
||||
..Default::default() // Use harmony defaults, they are based on CNPG's default values :
|
||||
// "default" namespace, 1 instance, 1Gi storage
|
||||
},
|
||||
hostname: "postgrestest.sto1.nationtech.io".to_string(),
|
||||
};
|
||||
|
||||
harmony_cli::run(
|
||||
|
||||
@@ -15,6 +15,7 @@ async fn main() {
|
||||
..Default::default() // Use harmony defaults, they are based on CNPG's default values :
|
||||
// 1 instance, 1Gi storage
|
||||
},
|
||||
hostname: "postgrestest.sto1.nationtech.io".to_string(),
|
||||
};
|
||||
|
||||
let test_connection = PostgreSQLConnectionScore {
|
||||
|
||||
@@ -1,15 +0,0 @@
|
||||
[package]
|
||||
name = "vllm"
|
||||
edition = "2024"
|
||||
version.workspace = true
|
||||
readme.workspace = true
|
||||
license.workspace = true
|
||||
publish = false
|
||||
|
||||
[dependencies]
|
||||
harmony = { path = "../../harmony" }
|
||||
harmony_cli = { path = "../../harmony_cli" }
|
||||
k8s-openapi = { workspace = true }
|
||||
tokio = { workspace = true }
|
||||
log = { workspace = true }
|
||||
env_logger = { workspace = true }
|
||||
@@ -1,523 +0,0 @@
|
||||
//! vLLM Deployment Example for Qwen3.5-27B-FP8 on NVIDIA RTX 5090
|
||||
//!
|
||||
//! This example deploys vLLM serving Qwen3.5-27B with FP8 quantization,
|
||||
//! optimized for single RTX 5090 (32GB VRAM) with tool calling support.
|
||||
//!
|
||||
//! # Architecture & Memory Constraints
|
||||
//!
|
||||
//! **Model Details:**
|
||||
//! - Parameters: 27B (dense, not sparse/MoE)
|
||||
//! - Quantization: FP8 (8-bit weights)
|
||||
//! - Model size: ~27-28GB in memory
|
||||
//! - Native context: 262,144 tokens (will NOT fit in 32GB VRAM)
|
||||
//!
|
||||
//! **VRAM Budget for RTX 5090 (32GB):**
|
||||
//! - Model weights (FP8): ~27GB
|
||||
//! - Framework overhead: ~1-2GB
|
||||
//! - KV cache: ~2-3GB (for 16k context)
|
||||
//! - CUDA context: ~500MB
|
||||
//! - Temporary buffers: ~500MB
|
||||
//! - **Total: ~31-33GB** (tight fit, leaves minimal headroom)
|
||||
//!
|
||||
//! # OpenShift/OKD Requirements
|
||||
//!
|
||||
//! **SCC (Security Context Constraint) Setup:**
|
||||
//!
|
||||
//! The official vLLM container runs as root and writes to `/root/.cache/huggingface`.
|
||||
//! On OpenShift/OKD with the default restricted SCC, containers run as arbitrary UIDs
|
||||
//! and cannot write to `/root`. For testing, grant the `anyuid` SCC:
|
||||
//!
|
||||
//! ```bash
|
||||
//! # As cluster admin, grant anyuid SCC to the namespace's service account:
|
||||
//! oc adm policy add-scc-to-user anyuid -z default -n vllm-qwen
|
||||
//! ```
|
||||
//!
|
||||
//! This allows pods in the `vllm-qwen` namespace to run as root (UID 0).
|
||||
//! For production, consider building a custom vLLM image that runs as non-root.
|
||||
//!
|
||||
//! # Critical Configuration Notes
|
||||
//!
|
||||
//! 1. **GPU_MEMORY_UTILIZATION=1.0**: Maximum GPU memory allocation.
|
||||
//! NEVER decrease this for dense models - CPU offloading destroys performance
|
||||
//! (100-1000x slower) for models where every parameter is used during inference.
|
||||
//!
|
||||
//! 2. **MAX_MODEL_LEN=16384**: Conservative context length that fits in available VRAM.
|
||||
//! Agentic workflows with long tool call histories will need careful context management.
|
||||
//!
|
||||
//! 3. **--language-model-only**: Skips loading the vision encoder, saving ~1-2GB VRAM.
|
||||
//! Essential for fitting the model in 32GB VRAM.
|
||||
//!
|
||||
//! 4. **PVC Size**: 50Gi for HuggingFace cache. Qwen3.5-27B-FP8 is ~30GB.
|
||||
//!
|
||||
//! # Performance Expectations
|
||||
//!
|
||||
//! - Single token latency: ~50-100ms (no CPU offloading)
|
||||
//! - With CPU offloading: ~5-50 seconds per token (unusable for real-time inference)
|
||||
//! - Throughput: ~10-20 tokens/second (single stream, no batching)
|
||||
//!
|
||||
//! # Next Steps for Production
|
||||
//!
|
||||
//! To increase context length:
|
||||
//! 1. Monitor GPU memory: `kubectl exec -it deployment/qwen3-5-27b -- nvidia-smi dmon -s u`
|
||||
//! 2. If stable, increase MAX_MODEL_LEN (try 32768, then 65536)
|
||||
//! 3. If OOM: revert to lower value
|
||||
//!
|
||||
//! For full 262k context, consider:
|
||||
//! - Multi-GPU setup with tensor parallelism (--tensor-parallel-size 8)
|
||||
//! - Or use a smaller model (Qwen3.5-7B-FP8)
|
||||
|
||||
use std::collections::BTreeMap;
|
||||
|
||||
use harmony::{
|
||||
inventory::Inventory,
|
||||
modules::{
|
||||
k8s::resource::K8sResourceScore,
|
||||
okd::{
|
||||
crd::route::{RoutePort, RouteSpec, RouteTargetReference, TLSConfig},
|
||||
route::OKDRouteScore,
|
||||
},
|
||||
},
|
||||
score::Score,
|
||||
topology::{K8sAnywhereTopology, TlsRouter},
|
||||
};
|
||||
use k8s_openapi::{
|
||||
api::{
|
||||
apps::v1::{Deployment, DeploymentSpec, DeploymentStrategy},
|
||||
core::v1::{
|
||||
Container, ContainerPort, EmptyDirVolumeSource, EnvVar, EnvVarSource,
|
||||
HTTPGetAction, PersistentVolumeClaim, PersistentVolumeClaimSpec,
|
||||
PersistentVolumeClaimVolumeSource, PodSpec, PodTemplateSpec, Probe,
|
||||
ResourceRequirements, Secret, SecretKeySelector, SecretVolumeSource, Service,
|
||||
ServicePort, ServiceSpec, Volume, VolumeMount, VolumeResourceRequirements,
|
||||
},
|
||||
},
|
||||
apimachinery::pkg::{
|
||||
api::resource::Quantity,
|
||||
apis::meta::v1::{LabelSelector, ObjectMeta},
|
||||
util::intstr::IntOrString,
|
||||
},
|
||||
ByteString,
|
||||
};
|
||||
use log::info;
|
||||
|
||||
const NAMESPACE: &str = "vllm-qwen";
|
||||
const MODEL_NAME: &str = "Qwen/Qwen3.5-27B-FP8";
|
||||
const DEPLOYMENT_NAME: &str = "qwen3-5-27b";
|
||||
const SERVICE_NAME: &str = DEPLOYMENT_NAME;
|
||||
const ROUTE_NAME: &str = DEPLOYMENT_NAME;
|
||||
const PVC_NAME: &str = "huggingface-cache";
|
||||
const SECRET_NAME: &str = "hf-token-secret";
|
||||
|
||||
const VLLM_IMAGE: &str = "vllm/vllm-openai:latest";
|
||||
const SERVICE_PORT: u16 = 8000;
|
||||
const TARGET_PORT: u16 = 8000;
|
||||
|
||||
/// Maximum context length for the model (in tokens).
|
||||
///
|
||||
/// **Impact on VRAM:**
|
||||
/// - Qwen3.5-27B uses per-token KV cache storage for the context window
|
||||
/// - Larger context = more KV cache memory required
|
||||
/// - Approximate KV cache per token: ~32KB for FP8 (very rough estimate)
|
||||
/// - 16k tokens ≈ 0.5-1GB KV cache
|
||||
/// - 262k tokens ≈ 8-16GB KV cache (native context length - will NOT fit in 32GB VRAM)
|
||||
///
|
||||
/// **Performance Impact:**
|
||||
/// - Context length directly impacts memory for storing conversation history
|
||||
/// - Agentic workflows with long tool call histories benefit from more context
|
||||
/// - If context > available VRAM, vLLM will OOM and fail to start
|
||||
///
|
||||
/// **Recommendations for RTX 5090 (32GB):**
|
||||
/// - Start with 16384 (conservative, should work)
|
||||
/// - If no OOM, try 32768 (better for agentic workflows)
|
||||
/// - Monitor GPU memory with `nvidia-smi` during operation
|
||||
const MAX_MODEL_LEN: i64 = 16384;
|
||||
|
||||
/// Fraction of GPU memory to allocate for the model (0.0 to 1.0).
|
||||
///
|
||||
/// **CRITICAL WARNING: This is a dense model!**
|
||||
/// Qwen3.5-27B-FP8 is NOT a sparse/mixture-of-experts model. All 27B parameters
|
||||
/// are active during inference. CPU offloading will DESTROY performance.
|
||||
///
|
||||
/// **What this parameter controls:**
|
||||
/// - Controls how much of GPU memory vLLM pre-allocates for:
|
||||
/// 1. Model weights (~27GB for FP8 quantization)
|
||||
/// 2. KV cache for context window
|
||||
/// 3. Activation buffers for inference
|
||||
/// 4. Runtime overhead
|
||||
///
|
||||
/// **VRAM Allocation Example:**
|
||||
/// - GPU: 32GB RTX 5090
|
||||
/// - GPU_MEMORY_UTILIZATION: 0.95
|
||||
/// - vLLM will try to use: 32GB * 0.95 = 30.4GB
|
||||
/// - Model weights: ~27-28GB
|
||||
/// - Remaining for KV cache + runtime: ~2-3GB
|
||||
///
|
||||
/// **If set too LOW (e.g., 0.7):**
|
||||
/// - vLLM restricts itself to 32GB * 0.7 = 22.4GB
|
||||
/// - Model weights alone need ~27GB
|
||||
/// - vLLM will OFFLOAD model weights to CPU memory
|
||||
/// - Performance: **100-1000x slower** (single token generation can take seconds instead of milliseconds)
|
||||
/// - This is catastrophic for a dense model where every layer needs all parameters
|
||||
///
|
||||
/// **If set too HIGH (e.g., 0.99):**
|
||||
/// - vLLM tries to allocate nearly all GPU memory
|
||||
/// - Risk: CUDA OOM if any other process needs GPU memory
|
||||
/// - Risk: KV cache allocation fails during inference
|
||||
/// - System instability
|
||||
///
|
||||
/// **Current Setting: 0.95**
|
||||
/// - Leaves 5% buffer (1.6GB) for CUDA overhead, system processes
|
||||
/// - Maximum allocation for model + KV cache: ~30.4GB
|
||||
/// - Should leave enough headroom for:
|
||||
/// - CUDA context: ~500MB
|
||||
/// - Temporary buffers: ~500MB
|
||||
/// - Safety margin: ~600MB
|
||||
///
|
||||
/// **How to tune:**
|
||||
/// 1. Start with 0.95 (current setting)
|
||||
/// 2. Monitor with `nvidia-smi dmon -s u` during operation
|
||||
/// 3. If OOM during inference: reduce MAX_MODEL_LEN first
|
||||
/// 4. If stable: try increasing MAX_MODEL_LEN before increasing this
|
||||
/// 5. Only increase this if you're certain no other GPU processes run
|
||||
///
|
||||
/// **NEVER decrease this for dense models!**
|
||||
/// If model doesn't fit, use a smaller model or quantization, not CPU offloading.
|
||||
const GPU_MEMORY_UTILIZATION : f32 = 1.0;
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
env_logger::init();
|
||||
|
||||
info!("Deploying vLLM with Qwen3.5-27B-FP8 model");
|
||||
info!("Configuration:");
|
||||
info!(" Model: {}", MODEL_NAME);
|
||||
info!(" Max context length: {} tokens", MAX_MODEL_LEN);
|
||||
info!(" GPU memory utilization: {}", GPU_MEMORY_UTILIZATION);
|
||||
info!(" Language model only: true");
|
||||
info!(" Tool calling enabled: true");
|
||||
|
||||
let topology = K8sAnywhereTopology::from_env();
|
||||
let domain = topology
|
||||
.get_internal_domain()
|
||||
.await
|
||||
.ok()
|
||||
.flatten()
|
||||
.unwrap_or_else(|| "cluster.local".to_string());
|
||||
|
||||
let host = format!("{}-{}.apps.{}", SERVICE_NAME, NAMESPACE, domain);
|
||||
info!("Creating route with host: {}", host);
|
||||
|
||||
let scores: Vec<Box<dyn Score<K8sAnywhereTopology>>> = vec![
|
||||
create_namespace(),
|
||||
create_pvc(),
|
||||
create_secret(),
|
||||
create_deployment(),
|
||||
create_service(),
|
||||
create_route(&host),
|
||||
];
|
||||
|
||||
harmony_cli::run(Inventory::autoload(), topology, scores, None)
|
||||
.await
|
||||
.map_err(|e| format!("Failed to deploy: {}", e))?;
|
||||
|
||||
info!("Successfully deployed vLLM with Qwen3.5-27B-FP8");
|
||||
info!("Access the API at: http://{}.apps.<cluster-domain>", SERVICE_NAME);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn create_namespace() -> Box<dyn Score<K8sAnywhereTopology>> {
|
||||
use k8s_openapi::api::core::v1::Namespace;
|
||||
|
||||
let namespace = Namespace {
|
||||
metadata: ObjectMeta {
|
||||
name: Some(NAMESPACE.to_string()),
|
||||
..Default::default()
|
||||
},
|
||||
spec: None,
|
||||
status: None,
|
||||
};
|
||||
|
||||
Box::new(K8sResourceScore::single(namespace, None))
|
||||
}
|
||||
|
||||
fn create_pvc() -> Box<dyn Score<K8sAnywhereTopology>> {
|
||||
let pvc = PersistentVolumeClaim {
|
||||
metadata: ObjectMeta {
|
||||
name: Some(PVC_NAME.to_string()),
|
||||
namespace: Some(NAMESPACE.to_string()),
|
||||
..Default::default()
|
||||
},
|
||||
spec: Some(PersistentVolumeClaimSpec {
|
||||
access_modes: Some(vec!["ReadWriteOnce".to_string()]),
|
||||
resources: Some(VolumeResourceRequirements {
|
||||
requests: Some(BTreeMap::from([(
|
||||
"storage".to_string(),
|
||||
Quantity("50Gi".to_string()),
|
||||
)])),
|
||||
limits: None,
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
status: None,
|
||||
};
|
||||
|
||||
Box::new(K8sResourceScore::single(
|
||||
pvc,
|
||||
Some(NAMESPACE.to_string()),
|
||||
))
|
||||
}
|
||||
|
||||
fn create_secret() -> Box<dyn Score<K8sAnywhereTopology>> {
|
||||
let mut data = BTreeMap::new();
|
||||
data.insert(
|
||||
"token".to_string(),
|
||||
ByteString("".to_string().into_bytes()),
|
||||
);
|
||||
|
||||
let secret = Secret {
|
||||
metadata: ObjectMeta {
|
||||
name: Some(SECRET_NAME.to_string()),
|
||||
namespace: Some(NAMESPACE.to_string()),
|
||||
..Default::default()
|
||||
},
|
||||
data: Some(data),
|
||||
immutable: Some(false),
|
||||
type_: Some("Opaque".to_string()),
|
||||
string_data: None,
|
||||
};
|
||||
|
||||
Box::new(K8sResourceScore::single(
|
||||
secret,
|
||||
Some(NAMESPACE.to_string()),
|
||||
))
|
||||
}
|
||||
|
||||
fn create_deployment() -> Box<dyn Score<K8sAnywhereTopology>> {
|
||||
let deployment = Deployment {
|
||||
metadata: ObjectMeta {
|
||||
name: Some(DEPLOYMENT_NAME.to_string()),
|
||||
namespace: Some(NAMESPACE.to_string()),
|
||||
labels: Some(BTreeMap::from([(
|
||||
"app".to_string(),
|
||||
DEPLOYMENT_NAME.to_string(),
|
||||
)])),
|
||||
..Default::default()
|
||||
},
|
||||
spec: Some(DeploymentSpec {
|
||||
replicas: Some(1),
|
||||
selector: LabelSelector {
|
||||
match_labels: Some(BTreeMap::from([(
|
||||
"app".to_string(),
|
||||
DEPLOYMENT_NAME.to_string(),
|
||||
)])),
|
||||
..Default::default()
|
||||
},
|
||||
strategy: Some(DeploymentStrategy {
|
||||
type_: Some("Recreate".to_string()),
|
||||
..Default::default()
|
||||
}),
|
||||
template: PodTemplateSpec {
|
||||
metadata: Some(ObjectMeta {
|
||||
labels: Some(BTreeMap::from([(
|
||||
"app".to_string(),
|
||||
DEPLOYMENT_NAME.to_string(),
|
||||
)])),
|
||||
..Default::default()
|
||||
}),
|
||||
spec: Some(PodSpec {
|
||||
node_selector: Some(BTreeMap::from([(
|
||||
"nvidia.com/gpu.product".to_string(),
|
||||
"NVIDIA-GeForce-RTX-5090".to_string(),
|
||||
)])),
|
||||
volumes: Some(vec![
|
||||
Volume {
|
||||
name: "cache-volume".to_string(),
|
||||
persistent_volume_claim: Some(PersistentVolumeClaimVolumeSource {
|
||||
claim_name: PVC_NAME.to_string(),
|
||||
read_only: Some(false),
|
||||
}),
|
||||
..Default::default()
|
||||
},
|
||||
Volume {
|
||||
name: "shm".to_string(),
|
||||
empty_dir: Some(EmptyDirVolumeSource {
|
||||
medium: Some("Memory".to_string()),
|
||||
size_limit: Some(Quantity("4Gi".to_string())),
|
||||
}),
|
||||
..Default::default()
|
||||
},
|
||||
Volume {
|
||||
name: "hf-token".to_string(),
|
||||
secret: Some(SecretVolumeSource {
|
||||
secret_name: Some(SECRET_NAME.to_string()),
|
||||
optional: Some(true),
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
},
|
||||
]),
|
||||
containers: vec![Container {
|
||||
name: DEPLOYMENT_NAME.to_string(),
|
||||
image: Some(VLLM_IMAGE.to_string()),
|
||||
command: Some(vec!["/bin/sh".to_string(), "-c".to_string()]),
|
||||
args: Some(vec![build_vllm_command()]),
|
||||
env: Some(vec![
|
||||
EnvVar {
|
||||
name: "HF_TOKEN".to_string(),
|
||||
value_from: Some(EnvVarSource {
|
||||
secret_key_ref: Some(SecretKeySelector {
|
||||
key: "token".to_string(),
|
||||
name: SECRET_NAME.to_string(),
|
||||
optional: Some(true),
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
value: None,
|
||||
},
|
||||
EnvVar {
|
||||
name: "VLLM_WORKER_MULTIPROC_METHOD".to_string(),
|
||||
value: Some("spawn".to_string()),
|
||||
value_from: None,
|
||||
},
|
||||
]),
|
||||
ports: Some(vec![ContainerPort {
|
||||
container_port: SERVICE_PORT as i32,
|
||||
protocol: Some("TCP".to_string()),
|
||||
..Default::default()
|
||||
}]),
|
||||
resources: Some(ResourceRequirements {
|
||||
limits: Some(BTreeMap::from([
|
||||
("cpu".to_string(), Quantity("10".to_string())),
|
||||
("memory".to_string(), Quantity("30Gi".to_string())),
|
||||
("nvidia.com/gpu".to_string(), Quantity("1".to_string())),
|
||||
])),
|
||||
requests: Some(BTreeMap::from([
|
||||
("cpu".to_string(), Quantity("2".to_string())),
|
||||
("memory".to_string(), Quantity("10Gi".to_string())),
|
||||
("nvidia.com/gpu".to_string(), Quantity("1".to_string())),
|
||||
])),
|
||||
claims: None,
|
||||
}),
|
||||
volume_mounts: Some(vec![
|
||||
VolumeMount {
|
||||
name: "cache-volume".to_string(),
|
||||
mount_path: "/root/.cache/huggingface".to_string(),
|
||||
read_only: Some(false),
|
||||
..Default::default()
|
||||
},
|
||||
VolumeMount {
|
||||
name: "shm".to_string(),
|
||||
mount_path: "/dev/shm".to_string(),
|
||||
..Default::default()
|
||||
},
|
||||
VolumeMount {
|
||||
name: "hf-token".to_string(),
|
||||
mount_path: "/etc/secrets/hf-token".to_string(),
|
||||
read_only: Some(true),
|
||||
..Default::default()
|
||||
},
|
||||
]),
|
||||
liveness_probe: Some(Probe {
|
||||
http_get: Some(HTTPGetAction {
|
||||
path: Some("/health".to_string()),
|
||||
port: IntOrString::Int(SERVICE_PORT as i32),
|
||||
..Default::default()
|
||||
}),
|
||||
initial_delay_seconds: Some(300),
|
||||
period_seconds: Some(30),
|
||||
..Default::default()
|
||||
}),
|
||||
readiness_probe: Some(Probe {
|
||||
http_get: Some(HTTPGetAction {
|
||||
path: Some("/health".to_string()),
|
||||
port: IntOrString::Int(SERVICE_PORT as i32),
|
||||
..Default::default()
|
||||
}),
|
||||
initial_delay_seconds: Some(120),
|
||||
period_seconds: Some(10),
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
}],
|
||||
..Default::default()
|
||||
}),
|
||||
},
|
||||
..Default::default()
|
||||
}),
|
||||
status: None,
|
||||
};
|
||||
|
||||
Box::new(K8sResourceScore::single(
|
||||
deployment,
|
||||
Some(NAMESPACE.to_string()),
|
||||
))
|
||||
}
|
||||
|
||||
fn build_vllm_command() -> String {
|
||||
format!(
|
||||
"vllm serve {} \
|
||||
--port {} \
|
||||
--max-model-len {} \
|
||||
--gpu-memory-utilization {} \
|
||||
--reasoning-parser qwen3 \
|
||||
--enable-auto-tool-choice \
|
||||
--tool-call-parser qwen3_coder \
|
||||
--language-model-only",
|
||||
MODEL_NAME, SERVICE_PORT, MAX_MODEL_LEN, GPU_MEMORY_UTILIZATION
|
||||
)
|
||||
}
|
||||
|
||||
fn create_service() -> Box<dyn Score<K8sAnywhereTopology>> {
|
||||
let service = Service {
|
||||
metadata: ObjectMeta {
|
||||
name: Some(SERVICE_NAME.to_string()),
|
||||
namespace: Some(NAMESPACE.to_string()),
|
||||
..Default::default()
|
||||
},
|
||||
spec: Some(ServiceSpec {
|
||||
ports: Some(vec![ServicePort {
|
||||
name: Some("http".to_string()),
|
||||
port: SERVICE_PORT as i32,
|
||||
protocol: Some("TCP".to_string()),
|
||||
target_port: Some(IntOrString::Int(TARGET_PORT as i32)),
|
||||
..Default::default()
|
||||
}]),
|
||||
selector: Some(BTreeMap::from([(
|
||||
"app".to_string(),
|
||||
DEPLOYMENT_NAME.to_string(),
|
||||
)])),
|
||||
type_: Some("ClusterIP".to_string()),
|
||||
..Default::default()
|
||||
}),
|
||||
status: None,
|
||||
};
|
||||
|
||||
Box::new(K8sResourceScore::single(
|
||||
service,
|
||||
Some(NAMESPACE.to_string()),
|
||||
))
|
||||
}
|
||||
|
||||
fn create_route(host: &str) -> Box<dyn Score<K8sAnywhereTopology>> {
|
||||
let route_spec = RouteSpec {
|
||||
to: RouteTargetReference {
|
||||
kind: "Service".to_string(),
|
||||
name: SERVICE_NAME.to_string(),
|
||||
weight: Some(100),
|
||||
},
|
||||
host: Some(host.to_string()),
|
||||
port: Some(RoutePort {
|
||||
target_port: SERVICE_PORT as u16,
|
||||
}),
|
||||
tls: Some(TLSConfig {
|
||||
termination: "edge".to_string(),
|
||||
insecure_edge_termination_policy: Some("Redirect".to_string()),
|
||||
..Default::default()
|
||||
}),
|
||||
wildcard_policy: None,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
Box::new(OKDRouteScore::new(ROUTE_NAME, NAMESPACE, route_spec))
|
||||
}
|
||||
@@ -109,13 +109,6 @@ impl K8sclient for K8sAnywhereTopology {
|
||||
|
||||
#[async_trait]
|
||||
impl TlsRouter for K8sAnywhereTopology {
|
||||
async fn get_public_domain(&self) -> Result<String, String> {
|
||||
match &self.config.public_domain {
|
||||
Some(public_domain) => Ok(public_domain.to_string()),
|
||||
None => Err("Public domain not available".to_string()),
|
||||
}
|
||||
}
|
||||
|
||||
async fn get_internal_domain(&self) -> Result<Option<String>, String> {
|
||||
match self.get_k8s_distribution().await.map_err(|e| {
|
||||
format!(
|
||||
@@ -1131,7 +1124,6 @@ pub struct K8sAnywhereConfig {
|
||||
///
|
||||
/// If the context name is not found, it will fail to initialize.
|
||||
pub k8s_context: Option<String>,
|
||||
public_domain: Option<String>,
|
||||
}
|
||||
|
||||
impl K8sAnywhereConfig {
|
||||
@@ -1159,7 +1151,6 @@ impl K8sAnywhereConfig {
|
||||
|
||||
let mut kubeconfig: Option<String> = None;
|
||||
let mut k8s_context: Option<String> = None;
|
||||
let mut public_domain: Option<String> = None;
|
||||
|
||||
for part in env_var_value.split(',') {
|
||||
let kv: Vec<&str> = part.splitn(2, '=').collect();
|
||||
@@ -1167,7 +1158,6 @@ impl K8sAnywhereConfig {
|
||||
match kv[0].trim() {
|
||||
"kubeconfig" => kubeconfig = Some(kv[1].trim().to_string()),
|
||||
"context" => k8s_context = Some(kv[1].trim().to_string()),
|
||||
"public_domain" => public_domain = Some(kv[1].trim().to_string()),
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
@@ -1185,7 +1175,6 @@ impl K8sAnywhereConfig {
|
||||
K8sAnywhereConfig {
|
||||
kubeconfig,
|
||||
k8s_context,
|
||||
public_domain,
|
||||
use_system_kubeconfig,
|
||||
autoinstall: false,
|
||||
use_local_k3d: false,
|
||||
@@ -1228,7 +1217,6 @@ impl K8sAnywhereConfig {
|
||||
use_local_k3d: std::env::var("HARMONY_USE_LOCAL_K3D")
|
||||
.map_or_else(|_| true, |v| v.parse().ok().unwrap_or(true)),
|
||||
k8s_context: std::env::var("HARMONY_K8S_CONTEXT").ok(),
|
||||
public_domain: std::env::var("HARMONY_PUBLIC_DOMAIN").ok(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -122,6 +122,4 @@ pub trait TlsRouter: Send + Sync {
|
||||
|
||||
/// Returns the port that this router exposes externally.
|
||||
async fn get_router_port(&self) -> u16;
|
||||
|
||||
async fn get_public_domain(&self) -> Result<String, String>;
|
||||
}
|
||||
|
||||
@@ -267,16 +267,10 @@ pub(crate) fn harmony_load_balancer_service_to_haproxy_xml(
|
||||
SSL::Default => "".into(),
|
||||
SSL::Other(other) => other.as_str().into(),
|
||||
};
|
||||
let path_without_query = path.split_once('?').map_or(path.as_str(), |(p, _)| p);
|
||||
let (port, port_name) = match port {
|
||||
Some(port) => (Some(port.to_string()), port.to_string()),
|
||||
None => (None, "serverport".to_string()),
|
||||
};
|
||||
|
||||
let haproxy_check = HAProxyHealthCheck {
|
||||
name: format!("HTTP_{http_method}_{path_without_query}_{port_name}"),
|
||||
name: format!("HTTP_{http_method}_{path}"),
|
||||
uuid: Uuid::new_v4().to_string(),
|
||||
http_method: http_method.to_string().to_lowercase().into(),
|
||||
http_method: http_method.to_string().into(),
|
||||
health_check_type: "http".to_string(),
|
||||
http_uri: path.clone().into(),
|
||||
interval: "2s".to_string(),
|
||||
@@ -320,10 +314,7 @@ pub(crate) fn harmony_load_balancer_service_to_haproxy_xml(
|
||||
let mut backend = HAProxyBackend {
|
||||
uuid: Uuid::new_v4().to_string(),
|
||||
enabled: 1,
|
||||
name: format!(
|
||||
"backend_{}",
|
||||
service.listening_port.to_string().replace(':', "_")
|
||||
),
|
||||
name: format!("backend_{}", service.listening_port),
|
||||
algorithm: "roundrobin".to_string(),
|
||||
random_draws: Some(2),
|
||||
stickiness_expire: "30m".to_string(),
|
||||
@@ -355,22 +346,10 @@ pub(crate) fn harmony_load_balancer_service_to_haproxy_xml(
|
||||
let frontend = Frontend {
|
||||
uuid: uuid::Uuid::new_v4().to_string(),
|
||||
enabled: 1,
|
||||
name: format!(
|
||||
"frontend_{}",
|
||||
service.listening_port.to_string().replace(':', "_")
|
||||
),
|
||||
name: format!("frontend_{}", service.listening_port),
|
||||
bind: service.listening_port.to_string(),
|
||||
mode: "tcp".to_string(), // TODO do not depend on health check here
|
||||
default_backend: Some(backend.uuid.clone()),
|
||||
stickiness_expire: "30m".to_string().into(),
|
||||
stickiness_size: "50k".to_string().into(),
|
||||
stickiness_conn_rate_period: "10s".to_string().into(),
|
||||
stickiness_sess_rate_period: "10s".to_string().into(),
|
||||
stickiness_http_req_rate_period: "10s".to_string().into(),
|
||||
stickiness_http_err_rate_period: "10s".to_string().into(),
|
||||
stickiness_bytes_in_rate_period: "1m".to_string().into(),
|
||||
stickiness_bytes_out_rate_period: "1m".to_string().into(),
|
||||
ssl_hsts_max_age: 15768000,
|
||||
..Default::default()
|
||||
};
|
||||
info!("HAPRoxy frontend and backend mode currently hardcoded to tcp");
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
use std::fs::{self};
|
||||
use std::path::PathBuf;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::process;
|
||||
use std::sync::Arc;
|
||||
|
||||
use async_trait::async_trait;
|
||||
@@ -64,7 +65,6 @@ pub struct RustWebapp {
|
||||
///
|
||||
/// This is the place to put the public host name if this is a public facing webapp.
|
||||
pub dns: String,
|
||||
pub version: String,
|
||||
}
|
||||
|
||||
impl Application for RustWebapp {
|
||||
@@ -465,7 +465,6 @@ impl RustWebapp {
|
||||
|
||||
let app_name = &self.name;
|
||||
let service_port = self.service_port;
|
||||
let version = &self.version;
|
||||
// Create Chart.yaml
|
||||
let chart_yaml = format!(
|
||||
r#"
|
||||
@@ -473,7 +472,7 @@ apiVersion: v2
|
||||
name: {chart_name}
|
||||
description: A Helm chart for the {app_name} web application.
|
||||
type: application
|
||||
version: {version}
|
||||
version: 0.2.1
|
||||
appVersion: "{image_tag}"
|
||||
"#,
|
||||
);
|
||||
|
||||
@@ -0,0 +1,6 @@
|
||||
apiVersion: v1
|
||||
kind: Namespace
|
||||
metadata:
|
||||
name: observability
|
||||
labels:
|
||||
openshift.io/cluster-monitoring: "true"
|
||||
@@ -0,0 +1,43 @@
|
||||
apiVersion: v1
|
||||
kind: ServiceAccount
|
||||
metadata:
|
||||
name: cluster-grafana-sa
|
||||
namespace: observability
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRole
|
||||
metadata:
|
||||
name: grafana-prometheus-api-access
|
||||
rules:
|
||||
- apiGroups:
|
||||
- monitoring.coreos.com
|
||||
resources:
|
||||
- prometheuses/api
|
||||
verbs:
|
||||
- get
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRoleBinding
|
||||
metadata:
|
||||
name: grafana-prometheus-api-access-binding
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: ClusterRole
|
||||
name: grafana-prometheus-api-access
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: cluster-grafana-sa
|
||||
namespace: observability
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRoleBinding
|
||||
metadata:
|
||||
name: grafana-cluster-monitoring-view
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: ClusterRole
|
||||
name: cluster-monitoring-view
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: cluster-grafana-sa
|
||||
namespace: observability
|
||||
@@ -0,0 +1,43 @@
|
||||
apiVersion: grafana.integreatly.org/v1beta1
|
||||
kind: Grafana
|
||||
metadata:
|
||||
name: cluster-grafana
|
||||
namespace: observability
|
||||
labels:
|
||||
dashboards: "grafana"
|
||||
spec:
|
||||
serviceAccountName: cluster-grafana-sa
|
||||
automountServiceAccountToken: true
|
||||
|
||||
config:
|
||||
log:
|
||||
mode: console
|
||||
|
||||
security:
|
||||
admin_user: admin
|
||||
admin_password: paul
|
||||
|
||||
users:
|
||||
viewers_can_edit: "false"
|
||||
|
||||
auth:
|
||||
disable_login_form: "false"
|
||||
|
||||
auth.anonymous:
|
||||
enabled: "true"
|
||||
org_role: Viewer
|
||||
|
||||
deployment:
|
||||
spec:
|
||||
replicas: 1
|
||||
template:
|
||||
spec:
|
||||
containers:
|
||||
- name: grafana
|
||||
resources:
|
||||
requests:
|
||||
cpu: 500m
|
||||
memory: 1Gi
|
||||
limits:
|
||||
cpu: 1
|
||||
memory: 2Gi
|
||||
@@ -0,0 +1,8 @@
|
||||
apiVersion: v1
|
||||
kind: Secret
|
||||
metadata:
|
||||
name: grafana-prometheus-token
|
||||
namespace: observability
|
||||
annotations:
|
||||
kubernetes.io/service-account.name: cluster-grafana-sa
|
||||
type: kubernetes.io/service-account-token
|
||||
@@ -0,0 +1,27 @@
|
||||
apiVersion: grafana.integreatly.org/v1beta1
|
||||
kind: GrafanaDatasource
|
||||
metadata:
|
||||
name: prometheus-cluster
|
||||
namespace: observability
|
||||
spec:
|
||||
instanceSelector:
|
||||
matchLabels:
|
||||
dashboards: "grafana"
|
||||
valuesFrom:
|
||||
- targetPath: "secureJsonData.httpHeaderValue1"
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: grafana-prometheus-token
|
||||
key: token
|
||||
datasource:
|
||||
name: Prometheus-Cluster
|
||||
type: prometheus
|
||||
access: proxy
|
||||
url: https://prometheus-k8s.openshift-monitoring.svc:9091
|
||||
isDefault: true
|
||||
jsonData:
|
||||
httpHeaderName1: "Authorization"
|
||||
tlsSkipVerify: true
|
||||
timeInterval: "30s"
|
||||
secureJsonData:
|
||||
httpHeaderValue1: "Bearer ${token}"
|
||||
@@ -0,0 +1,14 @@
|
||||
apiVersion: route.openshift.io/v1
|
||||
kind: Route
|
||||
metadata:
|
||||
name: grafana
|
||||
namespace: observability
|
||||
spec:
|
||||
to:
|
||||
kind: Service
|
||||
name: cluster-grafana-service
|
||||
port:
|
||||
targetPort: 3000
|
||||
tls:
|
||||
termination: edge
|
||||
insecureEdgeTerminationPolicy: Redirect
|
||||
@@ -0,0 +1,97 @@
|
||||
apiVersion: grafana.integreatly.org/v1beta1
|
||||
kind: GrafanaDashboard
|
||||
metadata:
|
||||
name: cluster-overview
|
||||
namespace: observability
|
||||
spec:
|
||||
instanceSelector:
|
||||
matchLabels:
|
||||
dashboards: "grafana"
|
||||
|
||||
json: |
|
||||
{
|
||||
"title": "Cluster Overview",
|
||||
"schemaVersion": 36,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"time": {
|
||||
"from": "now-1h",
|
||||
"to": "now"
|
||||
},
|
||||
"panels": [
|
||||
{
|
||||
"type": "stat",
|
||||
"title": "Ready Nodes",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "Prometheus-Cluster"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"true\"})",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"gridPos": { "h": 6, "w": 6, "x": 0, "y": 0 }
|
||||
},
|
||||
{
|
||||
"type": "stat",
|
||||
"title": "Running Pods",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "Prometheus-Cluster"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(kube_pod_status_phase{phase=\"Running\"})",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"gridPos": { "h": 6, "w": 6, "x": 6, "y": 0 }
|
||||
},
|
||||
{
|
||||
"type": "timeseries",
|
||||
"title": "Cluster CPU Usage (%)",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "Prometheus-Cluster"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 }
|
||||
},
|
||||
{
|
||||
"type": "timeseries",
|
||||
"title": "Cluster Memory Usage (%)",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "Prometheus-Cluster"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (sum(node_memory_MemAvailable_bytes) / sum(node_memory_MemTotal_bytes)))",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 }
|
||||
}
|
||||
]
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,769 @@
|
||||
apiVersion: grafana.integreatly.org/v1beta1
|
||||
kind: GrafanaDashboard
|
||||
metadata:
|
||||
name: okd-cluster-overview
|
||||
namespace: observability
|
||||
spec:
|
||||
instanceSelector:
|
||||
matchLabels:
|
||||
dashboards: "grafana"
|
||||
json: |
|
||||
{
|
||||
"title": "Cluster Overview",
|
||||
"uid": "okd-cluster-overview",
|
||||
"schemaVersion": 36,
|
||||
"version": 2,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"tags": ["okd", "cluster", "overview"],
|
||||
"panels": [
|
||||
|
||||
{
|
||||
"id": 1,
|
||||
"type": "stat",
|
||||
"title": "Ready Nodes",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"true\"} == 1)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "green", "value": 1 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 2,
|
||||
"type": "stat",
|
||||
"title": "Not Ready Nodes",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"false\"} == 1) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 3,
|
||||
"type": "stat",
|
||||
"title": "Running Pods",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(kube_pod_status_phase{phase=\"Running\"} == 1)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "green", "value": 1 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 4,
|
||||
"type": "stat",
|
||||
"title": "Pending Pods",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(kube_pod_status_phase{phase=\"Pending\"} == 1) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 5 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 5,
|
||||
"type": "stat",
|
||||
"title": "Failed Pods",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(kube_pod_status_phase{phase=\"Failed\"} == 1) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 6,
|
||||
"type": "stat",
|
||||
"title": "CrashLoopBackOff",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\"} == 1) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 7,
|
||||
"type": "stat",
|
||||
"title": "Critical Alerts",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(ALERTS{alertstate=\"firing\",severity=\"critical\"}) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 8,
|
||||
"type": "stat",
|
||||
"title": "Warning Alerts",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(ALERTS{alertstate=\"firing\",severity=\"warning\"}) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 10 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 9,
|
||||
"type": "gauge",
|
||||
"title": "CPU Usage",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))",
|
||||
"refId": "A",
|
||||
"legendFormat": "CPU"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 70 },
|
||||
{ "color": "red", "value": 85 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"showThresholdLabels": false,
|
||||
"showThresholdMarkers": true,
|
||||
"orientation": "auto"
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 5, "x": 0, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 10,
|
||||
"type": "gauge",
|
||||
"title": "Memory Usage",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (sum(node_memory_MemAvailable_bytes) / sum(node_memory_MemTotal_bytes)))",
|
||||
"refId": "A",
|
||||
"legendFormat": "Memory"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 75 },
|
||||
{ "color": "red", "value": 90 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"showThresholdLabels": false,
|
||||
"showThresholdMarkers": true,
|
||||
"orientation": "auto"
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 5, "x": 5, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 11,
|
||||
"type": "gauge",
|
||||
"title": "Root Disk Usage",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (sum(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"})))",
|
||||
"refId": "A",
|
||||
"legendFormat": "Disk"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 70 },
|
||||
{ "color": "red", "value": 85 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"showThresholdLabels": false,
|
||||
"showThresholdMarkers": true,
|
||||
"orientation": "auto"
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 4, "x": 10, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 12,
|
||||
"type": "stat",
|
||||
"title": "etcd Has Leader",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "min(etcd_server_has_leader)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "green", "value": 1 }
|
||||
]
|
||||
},
|
||||
"mappings": [
|
||||
{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"0": { "text": "NO LEADER", "color": "red" },
|
||||
"1": { "text": "LEADER OK", "color": "green" }
|
||||
}
|
||||
}
|
||||
],
|
||||
"unit": "short",
|
||||
"noValue": "?"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 3, "w": 5, "x": 14, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 13,
|
||||
"type": "stat",
|
||||
"title": "API Servers Up",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(up{job=\"apiserver\"})",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "green", "value": 2 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 3, "w": 5, "x": 19, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 14,
|
||||
"type": "stat",
|
||||
"title": "etcd Members Up",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(up{job=\"etcd\"})",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 2 },
|
||||
{ "color": "green", "value": 3 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 3, "w": 5, "x": 14, "y": 7 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 15,
|
||||
"type": "stat",
|
||||
"title": "Operators Degraded",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(cluster_operator_conditions{condition=\"Degraded\",status=\"True\"} == 1) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 3, "w": 5, "x": 19, "y": 7 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 16,
|
||||
"type": "timeseries",
|
||||
"title": "CPU Usage per Node (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 10,
|
||||
"spanNulls": false,
|
||||
"showPoints": "never"
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": {
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"calcs": ["mean", "max"]
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 10 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 17,
|
||||
"type": "timeseries",
|
||||
"title": "Memory Usage per Node (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 10,
|
||||
"spanNulls": false,
|
||||
"showPoints": "never"
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": {
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"calcs": ["mean", "max"]
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 10 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 18,
|
||||
"type": "timeseries",
|
||||
"title": "Network Traffic — Cluster Total",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo|veth.*|tun.*|ovn.*|br-int|br-ex\"}[5m]))",
|
||||
"refId": "A",
|
||||
"legendFormat": "Receive"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo|veth.*|tun.*|ovn.*|br-int|br-ex\"}[5m]))",
|
||||
"refId": "B",
|
||||
"legendFormat": "Transmit"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps",
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 10,
|
||||
"spanNulls": false,
|
||||
"showPoints": "never"
|
||||
}
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Receive" },
|
||||
"properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Transmit" },
|
||||
"properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "none" },
|
||||
"legend": {
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"calcs": ["mean", "max"]
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 18 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 19,
|
||||
"type": "timeseries",
|
||||
"title": "Pod Phases Over Time",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(kube_pod_status_phase{phase=\"Running\"} == 1)",
|
||||
"refId": "A",
|
||||
"legendFormat": "Running"
|
||||
},
|
||||
{
|
||||
"expr": "count(kube_pod_status_phase{phase=\"Pending\"} == 1) or vector(0)",
|
||||
"refId": "B",
|
||||
"legendFormat": "Pending"
|
||||
},
|
||||
{
|
||||
"expr": "count(kube_pod_status_phase{phase=\"Failed\"} == 1) or vector(0)",
|
||||
"refId": "C",
|
||||
"legendFormat": "Failed"
|
||||
},
|
||||
{
|
||||
"expr": "count(kube_pod_status_phase{phase=\"Unknown\"} == 1) or vector(0)",
|
||||
"refId": "D",
|
||||
"legendFormat": "Unknown"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short",
|
||||
"custom": {
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 15,
|
||||
"spanNulls": false,
|
||||
"showPoints": "never"
|
||||
}
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Running" },
|
||||
"properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Pending" },
|
||||
"properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Failed" },
|
||||
"properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Unknown" },
|
||||
"properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "none" },
|
||||
"legend": {
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"calcs": ["lastNotNull"]
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 18 }
|
||||
}
|
||||
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,637 @@
|
||||
apiVersion: grafana.integreatly.org/v1beta1
|
||||
kind: GrafanaDashboard
|
||||
metadata:
|
||||
name: okd-node-health
|
||||
namespace: observability
|
||||
spec:
|
||||
instanceSelector:
|
||||
matchLabels:
|
||||
dashboards: "grafana"
|
||||
json: |
|
||||
{
|
||||
"title": "Node Health",
|
||||
"uid": "okd-node-health",
|
||||
"schemaVersion": 36,
|
||||
"version": 2,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"tags": ["okd", "node", "health"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "node",
|
||||
"type": "query",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"query": { "query": "label_values(kube_node_info, node)", "refId": "A" },
|
||||
"refresh": 2,
|
||||
"includeAll": true,
|
||||
"multi": true,
|
||||
"allValue": ".*",
|
||||
"label": "Node",
|
||||
"sort": 1,
|
||||
"current": {},
|
||||
"options": []
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
|
||||
{
|
||||
"id": 1,
|
||||
"type": "stat",
|
||||
"title": "Total Nodes",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_node_info{node=~\"$node\"})", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 2,
|
||||
"type": "stat",
|
||||
"title": "Ready Nodes",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"$node\"} == 1)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 3,
|
||||
"type": "stat",
|
||||
"title": "Not Ready Nodes",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"false\",node=~\"$node\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 4,
|
||||
"type": "stat",
|
||||
"title": "Memory Pressure",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"MemoryPressure\",status=\"true\",node=~\"$node\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 5,
|
||||
"type": "stat",
|
||||
"title": "Disk Pressure",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"DiskPressure\",status=\"true\",node=~\"$node\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 6,
|
||||
"type": "stat",
|
||||
"title": "PID Pressure",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"PIDPressure\",status=\"true\",node=~\"$node\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "orange", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 7,
|
||||
"type": "stat",
|
||||
"title": "Unschedulable",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_node_spec_unschedulable{node=~\"$node\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 8,
|
||||
"type": "stat",
|
||||
"title": "Kubelet Up",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(up{job=\"kubelet\",metrics_path=\"/metrics\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 9,
|
||||
"type": "table",
|
||||
"title": "Node Conditions",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(node) (kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"$node\"})",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{node}}",
|
||||
"instant": true
|
||||
},
|
||||
{
|
||||
"expr": "sum by(node) (kube_node_status_condition{condition=\"MemoryPressure\",status=\"true\",node=~\"$node\"})",
|
||||
"refId": "B",
|
||||
"legendFormat": "{{node}}",
|
||||
"instant": true
|
||||
},
|
||||
{
|
||||
"expr": "sum by(node) (kube_node_status_condition{condition=\"DiskPressure\",status=\"true\",node=~\"$node\"})",
|
||||
"refId": "C",
|
||||
"legendFormat": "{{node}}",
|
||||
"instant": true
|
||||
},
|
||||
{
|
||||
"expr": "sum by(node) (kube_node_status_condition{condition=\"PIDPressure\",status=\"true\",node=~\"$node\"})",
|
||||
"refId": "D",
|
||||
"legendFormat": "{{node}}",
|
||||
"instant": true
|
||||
},
|
||||
{
|
||||
"expr": "sum by(node) (kube_node_spec_unschedulable{node=~\"$node\"})",
|
||||
"refId": "E",
|
||||
"legendFormat": "{{node}}",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "labelsToFields",
|
||||
"options": { "mode": "columns" }
|
||||
},
|
||||
{
|
||||
"id": "joinByField",
|
||||
"options": { "byField": "node", "mode": "outer" }
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"Time": true,
|
||||
"Time 1": true,
|
||||
"Time 2": true,
|
||||
"Time 3": true,
|
||||
"Time 4": true,
|
||||
"Time 5": true
|
||||
},
|
||||
"renameByName": {
|
||||
"node": "Node",
|
||||
"Value #A": "Ready",
|
||||
"Value #B": "Mem Pressure",
|
||||
"Value #C": "Disk Pressure",
|
||||
"Value #D": "PID Pressure",
|
||||
"Value #E": "Unschedulable"
|
||||
},
|
||||
"indexByName": {
|
||||
"node": 0,
|
||||
"Value #A": 1,
|
||||
"Value #B": 2,
|
||||
"Value #C": 3,
|
||||
"Value #D": 4,
|
||||
"Value #E": 5
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": { "displayMode": "color-background", "align": "center" }
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Node" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "auto" },
|
||||
{ "id": "custom.align", "value": "left" },
|
||||
{ "id": "custom.width", "value": 200 }
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Ready" },
|
||||
"properties": [
|
||||
{
|
||||
"id": "thresholds",
|
||||
"value": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] }
|
||||
},
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{
|
||||
"id": "mappings",
|
||||
"value": [
|
||||
{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"0": { "text": "✗ Not Ready", "color": "red", "index": 0 },
|
||||
"1": { "text": "✓ Ready", "color": "green", "index": 1 }
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byRegexp", "options": ".*Pressure" },
|
||||
"properties": [
|
||||
{
|
||||
"id": "thresholds",
|
||||
"value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] }
|
||||
},
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{
|
||||
"id": "mappings",
|
||||
"value": [
|
||||
{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"0": { "text": "✓ OK", "color": "green", "index": 0 },
|
||||
"1": { "text": "⚠ Active", "color": "red", "index": 1 }
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Unschedulable" },
|
||||
"properties": [
|
||||
{
|
||||
"id": "thresholds",
|
||||
"value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 1 }] }
|
||||
},
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{
|
||||
"id": "mappings",
|
||||
"value": [
|
||||
{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"0": { "text": "✓ Schedulable", "color": "green", "index": 0 },
|
||||
"1": { "text": "⚠ Cordoned", "color": "yellow", "index": 1 }
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": { "sortBy": [{ "displayName": "Node", "desc": false }] },
|
||||
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 10,
|
||||
"type": "timeseries",
|
||||
"title": "CPU Usage per Node (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 100,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 85 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 16, "x": 0, "y": 12 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 11,
|
||||
"type": "bargauge",
|
||||
"title": "CPU Usage \u2014 Current",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 85 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true,
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 12 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 12,
|
||||
"type": "timeseries",
|
||||
"title": "Memory Usage per Node (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 100,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 75 }, { "color": "red", "value": 90 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 16, "x": 0, "y": 20 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 13,
|
||||
"type": "bargauge",
|
||||
"title": "Memory Usage \u2014 Current",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 75 }, { "color": "red", "value": 90 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true,
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 20 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 14,
|
||||
"type": "timeseries",
|
||||
"title": "Root Disk Usage per Node (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"}))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 100,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 85 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 16, "x": 0, "y": 28 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 15,
|
||||
"type": "bargauge",
|
||||
"title": "Root Disk Usage \u2014 Current",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"}))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 85 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true,
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 28 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 16,
|
||||
"type": "timeseries",
|
||||
"title": "Network Traffic per Node",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(instance) (rate(node_network_receive_bytes_total{device!~\"lo|veth.*|tun.*|ovn.*|br.*\"}[5m]))",
|
||||
"refId": "A",
|
||||
"legendFormat": "rx {{instance}}"
|
||||
},
|
||||
{
|
||||
"expr": "sum by(instance) (rate(node_network_transmit_bytes_total{device!~\"lo|veth.*|tun.*|ovn.*|br.*\"}[5m]))",
|
||||
"refId": "B",
|
||||
"legendFormat": "tx {{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps",
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 36 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 17,
|
||||
"type": "bargauge",
|
||||
"title": "Pods per Node",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count by(node) (kube_pod_info{node=~\"$node\"})",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{node}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short",
|
||||
"min": 0,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 100 },
|
||||
{ "color": "red", "value": 200 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true,
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 36 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 18,
|
||||
"type": "timeseries",
|
||||
"title": "System Load Average (1m) per Node",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "node_load1",
|
||||
"refId": "A",
|
||||
"legendFormat": "1m \u2014 {{instance}}"
|
||||
},
|
||||
{
|
||||
"expr": "node_load5",
|
||||
"refId": "B",
|
||||
"legendFormat": "5m \u2014 {{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short",
|
||||
"min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 44 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 19,
|
||||
"type": "bargauge",
|
||||
"title": "Node Uptime",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "time() - node_boot_time_seconds",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s",
|
||||
"min": 0,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 300 },
|
||||
{ "color": "green", "value": 3600 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": false,
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 44 }
|
||||
}
|
||||
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,783 @@
|
||||
apiVersion: grafana.integreatly.org/v1beta1
|
||||
kind: GrafanaDashboard
|
||||
metadata:
|
||||
name: okd-workload-health
|
||||
namespace: observability
|
||||
spec:
|
||||
instanceSelector:
|
||||
matchLabels:
|
||||
dashboards: "grafana"
|
||||
json: |
|
||||
{
|
||||
"title": "Workload Health",
|
||||
"uid": "okd-workload-health",
|
||||
"schemaVersion": 36,
|
||||
"version": 3,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"tags": ["okd", "workload", "health"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "namespace",
|
||||
"type": "query",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"query": { "query": "label_values(kube_pod_info, namespace)", "refId": "A" },
|
||||
"refresh": 2,
|
||||
"includeAll": true,
|
||||
"multi": true,
|
||||
"allValue": ".*",
|
||||
"label": "Namespace",
|
||||
"sort": 1,
|
||||
"current": {},
|
||||
"options": []
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
|
||||
{
|
||||
"id": 1, "type": "stat", "title": "Total Pods",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_pod_info{namespace=~\"$namespace\"})", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 2, "type": "stat", "title": "Running Pods",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_pod_status_phase{phase=\"Running\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 3, "type": "stat", "title": "Pending Pods",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_pod_status_phase{phase=\"Pending\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 4, "type": "stat", "title": "Failed Pods",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_pod_status_phase{phase=\"Failed\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 5, "type": "stat", "title": "CrashLoopBackOff",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 6, "type": "stat", "title": "OOMKilled",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_pod_container_status_last_terminated_reason{reason=\"OOMKilled\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "orange", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 7, "type": "stat", "title": "Deployments Available",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_deployment_status_condition{condition=\"Available\",status=\"true\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 8, "type": "stat", "title": "Deployments Degraded",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_deployment_status_replicas_unavailable{namespace=~\"$namespace\"} > 0) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 9, "type": "row", "title": "Deployments", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 10,
|
||||
"type": "table",
|
||||
"title": "Deployment Status",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace,deployment)(kube_deployment_spec_replicas{namespace=~\"$namespace\"})",
|
||||
"refId": "A",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,deployment)(kube_deployment_status_replicas_ready{namespace=~\"$namespace\"})",
|
||||
"refId": "B",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,deployment)(kube_deployment_status_replicas_available{namespace=~\"$namespace\"})",
|
||||
"refId": "C",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,deployment)(kube_deployment_status_replicas_unavailable{namespace=~\"$namespace\"})",
|
||||
"refId": "D",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,deployment)(kube_deployment_status_replicas_updated{namespace=~\"$namespace\"})",
|
||||
"refId": "E",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "filterFieldsByName",
|
||||
"options": {
|
||||
"include": {
|
||||
"names": ["namespace", "deployment", "Value"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "joinByField",
|
||||
"options": {
|
||||
"byField": "deployment",
|
||||
"mode": "outer"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"namespace 1": true,
|
||||
"namespace 2": true,
|
||||
"namespace 3": true,
|
||||
"namespace 4": true
|
||||
},
|
||||
"renameByName": {
|
||||
"namespace": "Namespace",
|
||||
"deployment": "Deployment",
|
||||
"Value": "Desired",
|
||||
"Value 1": "Ready",
|
||||
"Value 2": "Available",
|
||||
"Value 3": "Unavailable",
|
||||
"Value 4": "Up-to-date"
|
||||
},
|
||||
"indexByName": {
|
||||
"namespace": 0,
|
||||
"deployment": 1,
|
||||
"Value": 2,
|
||||
"Value 1": 3,
|
||||
"Value 2": 4,
|
||||
"Value 3": 5,
|
||||
"Value 4": 6
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "sortBy",
|
||||
"options": {
|
||||
"fields": [{ "displayName": "Namespace", "desc": false }]
|
||||
}
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Namespace" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 200 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Deployment" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 220 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Unavailable" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{
|
||||
"id": "thresholds",
|
||||
"value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] }
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Ready" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{
|
||||
"id": "thresholds",
|
||||
"value": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] }
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": { "sortBy": [{ "displayName": "Namespace", "desc": false }] },
|
||||
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 11, "type": "row", "title": "StatefulSets & DaemonSets", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 12,
|
||||
"type": "table",
|
||||
"title": "StatefulSet Status",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace,statefulset)(kube_statefulset_replicas{namespace=~\"$namespace\"})",
|
||||
"refId": "A",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,statefulset)(kube_statefulset_status_replicas_ready{namespace=~\"$namespace\"})",
|
||||
"refId": "B",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,statefulset)(kube_statefulset_status_replicas_current{namespace=~\"$namespace\"})",
|
||||
"refId": "C",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,statefulset)(kube_statefulset_status_replicas_updated{namespace=~\"$namespace\"})",
|
||||
"refId": "D",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "filterFieldsByName",
|
||||
"options": {
|
||||
"include": {
|
||||
"names": ["namespace", "statefulset", "Value"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "joinByField",
|
||||
"options": {
|
||||
"byField": "statefulset",
|
||||
"mode": "outer"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"namespace 1": true,
|
||||
"namespace 2": true,
|
||||
"namespace 3": true
|
||||
},
|
||||
"renameByName": {
|
||||
"namespace": "Namespace",
|
||||
"statefulset": "StatefulSet",
|
||||
"Value": "Desired",
|
||||
"Value 1": "Ready",
|
||||
"Value 2": "Current",
|
||||
"Value 3": "Up-to-date"
|
||||
},
|
||||
"indexByName": {
|
||||
"namespace": 0,
|
||||
"statefulset": 1,
|
||||
"Value": 2,
|
||||
"Value 1": 3,
|
||||
"Value 2": 4,
|
||||
"Value 3": 5
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "sortBy",
|
||||
"options": { "fields": [{ "displayName": "Namespace", "desc": false }] }
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Namespace" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 180 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "StatefulSet" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 200 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Ready" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] } }
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 14 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 13,
|
||||
"type": "table",
|
||||
"title": "DaemonSet Status",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace,daemonset)(kube_daemonset_status_desired_number_scheduled{namespace=~\"$namespace\"})",
|
||||
"refId": "A",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,daemonset)(kube_daemonset_status_number_ready{namespace=~\"$namespace\"})",
|
||||
"refId": "B",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,daemonset)(kube_daemonset_status_number_unavailable{namespace=~\"$namespace\"})",
|
||||
"refId": "C",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,daemonset)(kube_daemonset_status_number_misscheduled{namespace=~\"$namespace\"})",
|
||||
"refId": "D",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "filterFieldsByName",
|
||||
"options": {
|
||||
"include": {
|
||||
"names": ["namespace", "daemonset", "Value"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "joinByField",
|
||||
"options": {
|
||||
"byField": "daemonset",
|
||||
"mode": "outer"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"namespace 1": true,
|
||||
"namespace 2": true,
|
||||
"namespace 3": true
|
||||
},
|
||||
"renameByName": {
|
||||
"namespace": "Namespace",
|
||||
"daemonset": "DaemonSet",
|
||||
"Value": "Desired",
|
||||
"Value 1": "Ready",
|
||||
"Value 2": "Unavailable",
|
||||
"Value 3": "Misscheduled"
|
||||
},
|
||||
"indexByName": {
|
||||
"namespace": 0,
|
||||
"daemonset": 1,
|
||||
"Value": 2,
|
||||
"Value 1": 3,
|
||||
"Value 2": 4,
|
||||
"Value 3": 5
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "sortBy",
|
||||
"options": { "fields": [{ "displayName": "Namespace", "desc": false }] }
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Namespace" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 180 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "DaemonSet" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 200 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Ready" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] } }
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Unavailable" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] } }
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Misscheduled" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "orange", "value": 1 }] } }
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 14 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 14, "type": "row", "title": "Pods", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 22 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 15,
|
||||
"type": "timeseries",
|
||||
"title": "Pod Phase over Time",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(phase)(kube_pod_status_phase{namespace=~\"$namespace\"})",
|
||||
"refId": "A", "legendFormat": "{{phase}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "Running" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Pending" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Failed" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Succeeded" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Unknown" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 16, "x": 0, "y": 23 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 16,
|
||||
"type": "piechart",
|
||||
"title": "Pod Phase — Now",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(phase)(kube_pod_status_phase{namespace=~\"$namespace\"})",
|
||||
"refId": "A", "instant": true, "legendFormat": "{{phase}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "unit": "short", "color": { "mode": "palette-classic" } },
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "Running" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Pending" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Failed" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Succeeded" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Unknown" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"pieType": "donut",
|
||||
"tooltip": { "mode": "single" },
|
||||
"legend": { "displayMode": "table", "placement": "right", "values": ["value", "percent"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 23 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 17,
|
||||
"type": "timeseries",
|
||||
"title": "Container Restarts over Time (total counter, top 10)",
|
||||
"description": "Absolute restart counter — each vertical step = a restart event. Flat line = healthy.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "topk(10,\n sum by(namespace, pod) (\n kube_pod_container_status_restarts_total{namespace=~\"$namespace\"}\n ) > 0\n)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}} / {{pod}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "auto", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 31 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 18,
|
||||
"type": "table",
|
||||
"title": "Container Total Restarts (non-zero)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace, pod, container) (kube_pod_container_status_restarts_total{namespace=~\"$namespace\"}) > 0",
|
||||
"refId": "A",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "filterFieldsByName",
|
||||
"options": {
|
||||
"include": { "names": ["namespace", "pod", "container", "Value"] }
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {},
|
||||
"renameByName": {
|
||||
"namespace": "Namespace",
|
||||
"pod": "Pod",
|
||||
"container": "Container",
|
||||
"Value": "Total Restarts"
|
||||
},
|
||||
"indexByName": { "namespace": 0, "pod": 1, "container": 2, "Value": 3 }
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "sortBy",
|
||||
"options": { "fields": [{ "displayName": "Total Restarts", "desc": true }] }
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "Namespace" }, "properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 160 }] },
|
||||
{ "matcher": { "id": "byName", "options": "Pod" }, "properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 260 }] },
|
||||
{ "matcher": { "id": "byName", "options": "Container" }, "properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 160 }] },
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Total Restarts" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "yellow", "value": null }, { "color": "orange", "value": 5 }, { "color": "red", "value": 20 }] } }
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 31 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 19, "type": "row", "title": "Resource Usage", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 39 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 20,
|
||||
"type": "timeseries",
|
||||
"title": "CPU Usage by Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace)(rate(container_cpu_usage_seconds_total{namespace=~\"$namespace\",container!=\"\",container!=\"POD\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "cores", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 40 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 21,
|
||||
"type": "timeseries",
|
||||
"title": "Memory Usage by Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace)(container_memory_working_set_bytes{namespace=~\"$namespace\",container!=\"\",container!=\"POD\"})",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 40 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 22,
|
||||
"type": "bargauge",
|
||||
"title": "CPU — Actual vs Requested (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace)(rate(container_cpu_usage_seconds_total{namespace=~\"$namespace\",container!=\"\",container!=\"POD\"}[5m]))\n/\nsum by(namespace)(kube_pod_container_resource_requests{resource=\"cpu\",namespace=~\"$namespace\",container!=\"\"})\n* 100",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 150,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 80 }, { "color": "red", "value": 100 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal", "displayMode": "gradient", "showUnfilled": true,
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 48 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 23,
|
||||
"type": "bargauge",
|
||||
"title": "Memory — Actual vs Requested (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace)(container_memory_working_set_bytes{namespace=~\"$namespace\",container!=\"\",container!=\"POD\"})\n/\nsum by(namespace)(kube_pod_container_resource_requests{resource=\"memory\",namespace=~\"$namespace\",container!=\"\"})\n* 100",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 150,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 80 }, { "color": "red", "value": 100 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal", "displayMode": "gradient", "showUnfilled": true,
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 48 }
|
||||
}
|
||||
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,955 @@
|
||||
apiVersion: grafana.integreatly.org/v1beta1
|
||||
kind: GrafanaDashboard
|
||||
metadata:
|
||||
name: okd-networking
|
||||
namespace: observability
|
||||
spec:
|
||||
instanceSelector:
|
||||
matchLabels:
|
||||
dashboards: "grafana"
|
||||
json: |
|
||||
{
|
||||
"title": "Networking",
|
||||
"uid": "okd-networking",
|
||||
"schemaVersion": 36,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"tags": ["okd", "networking"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "namespace",
|
||||
"type": "query",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"query": { "query": "label_values(kube_pod_info, namespace)", "refId": "A" },
|
||||
"refresh": 2,
|
||||
"includeAll": true,
|
||||
"multi": true,
|
||||
"allValue": ".*",
|
||||
"label": "Namespace",
|
||||
"sort": 1,
|
||||
"current": {},
|
||||
"options": []
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
|
||||
{
|
||||
"id": 1, "type": "stat", "title": "Network RX Rate",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(container_network_receive_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] },
|
||||
"unit": "Bps", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 2, "type": "stat", "title": "Network TX Rate",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(container_network_transmit_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] },
|
||||
"unit": "Bps", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 3, "type": "stat", "title": "RX Errors/s",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(container_network_receive_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "pps", "noValue": "0", "decimals": 2
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 4, "type": "stat", "title": "TX Errors/s",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(container_network_transmit_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "pps", "noValue": "0", "decimals": 2
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 5, "type": "stat", "title": "RX Drops/s",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(container_network_receive_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "orange", "value": 1 }] },
|
||||
"unit": "pps", "noValue": "0", "decimals": 2
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 6, "type": "stat", "title": "TX Drops/s",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(container_network_transmit_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "orange", "value": 1 }] },
|
||||
"unit": "pps", "noValue": "0", "decimals": 2
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 7, "type": "stat", "title": "DNS Queries/s",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(coredns_dns_requests_total[5m]))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] },
|
||||
"unit": "reqps", "noValue": "0", "decimals": 1
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 8, "type": "stat", "title": "DNS Error %",
|
||||
"description": "Percentage of DNS responses with non-NOERROR rcode over the last 5 minutes.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(coredns_dns_responses_total{rcode!=\"NOERROR\"}[5m])) / sum(rate(coredns_dns_responses_total[5m])) * 100",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 5 }
|
||||
]},
|
||||
"unit": "percent", "noValue": "0", "decimals": 2
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 9, "type": "row", "title": "Network I/O", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 10, "type": "timeseries", "title": "Receive Rate by Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(namespace)(rate(container_network_receive_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 11, "type": "timeseries", "title": "Transmit Rate by Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(namespace)(rate(container_network_transmit_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 12, "type": "row", "title": "Top Pod Consumers", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 13, "type": "timeseries", "title": "Top 10 Pods — RX Rate",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "topk(10, sum by(namespace,pod)(rate(container_network_receive_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m])))",
|
||||
"refId": "A", "legendFormat": "{{namespace}} / {{pod}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "auto", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 14 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 14, "type": "timeseries", "title": "Top 10 Pods — TX Rate",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "topk(10, sum by(namespace,pod)(rate(container_network_transmit_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m])))",
|
||||
"refId": "A", "legendFormat": "{{namespace}} / {{pod}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "auto", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 14 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 15,
|
||||
"type": "table",
|
||||
"title": "Pod Network I/O Summary",
|
||||
"description": "Current RX/TX rates, errors and drops per pod. Sorted by RX rate descending.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace,pod)(rate(container_network_receive_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "instant": true, "format": "table", "legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,pod)(rate(container_network_transmit_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "B", "instant": true, "format": "table", "legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,pod)(rate(container_network_receive_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "C", "instant": true, "format": "table", "legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,pod)(rate(container_network_transmit_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "D", "instant": true, "format": "table", "legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,pod)(rate(container_network_receive_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "E", "instant": true, "format": "table", "legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,pod)(rate(container_network_transmit_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "F", "instant": true, "format": "table", "legendFormat": ""
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "filterFieldsByName",
|
||||
"options": { "include": { "names": ["namespace", "pod", "Value"] } }
|
||||
},
|
||||
{
|
||||
"id": "joinByField",
|
||||
"options": { "byField": "pod", "mode": "outer" }
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"namespace 1": true,
|
||||
"namespace 2": true,
|
||||
"namespace 3": true,
|
||||
"namespace 4": true,
|
||||
"namespace 5": true
|
||||
},
|
||||
"renameByName": {
|
||||
"namespace": "Namespace",
|
||||
"pod": "Pod",
|
||||
"Value": "RX Rate",
|
||||
"Value 1": "TX Rate",
|
||||
"Value 2": "RX Errors/s",
|
||||
"Value 3": "TX Errors/s",
|
||||
"Value 4": "RX Drops/s",
|
||||
"Value 5": "TX Drops/s"
|
||||
},
|
||||
"indexByName": {
|
||||
"namespace": 0,
|
||||
"pod": 1,
|
||||
"Value": 2,
|
||||
"Value 1": 3,
|
||||
"Value 2": 4,
|
||||
"Value 3": 5,
|
||||
"Value 4": 6,
|
||||
"Value 5": 7
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "sortBy",
|
||||
"options": { "fields": [{ "displayName": "RX Rate", "desc": true }] }
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Namespace" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 160 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Pod" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 260 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byRegexp", "options": "^RX Rate$|^TX Rate$" },
|
||||
"properties": [
|
||||
{ "id": "unit", "value": "Bps" },
|
||||
{ "id": "custom.displayMode", "value": "color-background-solid" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 10000000 },
|
||||
{ "color": "orange", "value": 100000000 },
|
||||
{ "color": "red", "value": 500000000 }
|
||||
]}}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byRegexp", "options": "^RX Errors/s$|^TX Errors/s$" },
|
||||
"properties": [
|
||||
{ "id": "unit", "value": "pps" },
|
||||
{ "id": "decimals", "value": 3 },
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 0.001 }
|
||||
]}}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byRegexp", "options": "^RX Drops/s$|^TX Drops/s$" },
|
||||
"properties": [
|
||||
{ "id": "unit", "value": "pps" },
|
||||
{ "id": "decimals", "value": 3 },
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "orange", "value": 0.001 }
|
||||
]}}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {},
|
||||
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 22 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 16, "type": "row", "title": "Errors & Packet Loss", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 30 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 17, "type": "timeseries", "title": "RX Errors by Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(namespace)(rate(container_network_receive_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "pps", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 12, "x": 0, "y": 31 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 18, "type": "timeseries", "title": "TX Errors by Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(namespace)(rate(container_network_transmit_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "pps", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 12, "x": 12, "y": 31 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 19, "type": "timeseries", "title": "RX Packet Drops by Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(namespace)(rate(container_network_receive_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "pps", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 12, "x": 0, "y": 38 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 20, "type": "timeseries", "title": "TX Packet Drops by Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(namespace)(rate(container_network_transmit_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "pps", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 12, "x": 12, "y": 38 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 21, "type": "row", "title": "DNS (CoreDNS)", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 45 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 22, "type": "timeseries", "title": "DNS Request Rate by Query Type",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(type)(rate(coredns_dns_requests_total[5m]))",
|
||||
"refId": "A", "legendFormat": "{{type}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 46 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 23, "type": "timeseries", "title": "DNS Response Rate by Rcode",
|
||||
"description": "NOERROR = healthy. NXDOMAIN = name not found. SERVFAIL = upstream error.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(rcode)(rate(coredns_dns_responses_total[5m]))",
|
||||
"refId": "A", "legendFormat": "{{rcode}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "NOERROR" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "NXDOMAIN" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "SERVFAIL" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "REFUSED" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 46 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 24, "type": "timeseries", "title": "DNS Request Latency (p50 / p95 / p99)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.50, sum(rate(coredns_dns_request_duration_seconds_bucket[5m])) by (le))",
|
||||
"refId": "A", "legendFormat": "p50"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, sum(rate(coredns_dns_request_duration_seconds_bucket[5m])) by (le))",
|
||||
"refId": "B", "legendFormat": "p95"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(coredns_dns_request_duration_seconds_bucket[5m])) by (le))",
|
||||
"refId": "C", "legendFormat": "p99"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 46 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 25, "type": "timeseries", "title": "DNS Cache Hit Ratio (%)",
|
||||
"description": "High hit ratio = CoreDNS is serving responses from cache, reducing upstream load.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(coredns_cache_hits_total[5m])) / (sum(rate(coredns_cache_hits_total[5m])) + sum(rate(coredns_cache_misses_total[5m]))) * 100",
|
||||
"refId": "A", "legendFormat": "Cache Hit %"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 50 },
|
||||
{ "color": "green", "value": 80 }
|
||||
]},
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 20, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "single" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "lastNotNull"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 12, "x": 0, "y": 54 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 26, "type": "timeseries", "title": "DNS Forward Request Rate",
|
||||
"description": "Queries CoreDNS is forwarding upstream. Spike here with cache miss spike = upstream DNS pressure.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(coredns_forward_requests_total[5m]))",
|
||||
"refId": "A", "legendFormat": "Forward Requests/s"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(coredns_forward_responses_duration_seconds_count[5m]))",
|
||||
"refId": "B", "legendFormat": "Forward Responses/s"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 12, "x": 12, "y": 54 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 27, "type": "row", "title": "Services & Endpoints", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 61 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 28, "type": "stat", "title": "Total Services",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "count(kube_service_info{namespace=~\"$namespace\"})",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 8, "x": 0, "y": 62 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 29, "type": "stat", "title": "Endpoint Addresses Available",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(kube_endpoint_address_available{namespace=~\"$namespace\"})",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 8, "x": 8, "y": 62 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 30, "type": "stat", "title": "Endpoint Addresses Not Ready",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(kube_endpoint_address_not_ready{namespace=~\"$namespace\"}) or vector(0)",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 8, "x": 16, "y": 62 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 31,
|
||||
"type": "table",
|
||||
"title": "Endpoint Availability",
|
||||
"description": "Per-endpoint available vs not-ready address counts. Red Not Ready = pods backing this service are unhealthy.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace,endpoint)(kube_endpoint_address_available{namespace=~\"$namespace\"})",
|
||||
"refId": "A", "instant": true, "format": "table", "legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,endpoint)(kube_endpoint_address_not_ready{namespace=~\"$namespace\"})",
|
||||
"refId": "B", "instant": true, "format": "table", "legendFormat": ""
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "filterFieldsByName",
|
||||
"options": { "include": { "names": ["namespace", "endpoint", "Value"] } }
|
||||
},
|
||||
{
|
||||
"id": "joinByField",
|
||||
"options": { "byField": "endpoint", "mode": "outer" }
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": { "namespace 1": true },
|
||||
"renameByName": {
|
||||
"namespace": "Namespace",
|
||||
"endpoint": "Endpoint",
|
||||
"Value": "Available",
|
||||
"Value 1": "Not Ready"
|
||||
},
|
||||
"indexByName": {
|
||||
"namespace": 0,
|
||||
"endpoint": 1,
|
||||
"Value": 2,
|
||||
"Value 1": 3
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "sortBy",
|
||||
"options": { "fields": [{ "displayName": "Not Ready", "desc": true }] }
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Namespace" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 180 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Endpoint" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 220 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Available" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] } }
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Not Ready" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] } }
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {},
|
||||
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 66 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 32, "type": "row", "title": "OKD Router / Ingress (HAProxy)", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 74 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 33, "type": "timeseries", "title": "Router HTTP Request Rate by Code",
|
||||
"description": "Requires HAProxy router metrics to be scraped (port 1936). OKD exposes these via the openshift-ingress ServiceMonitor.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(code)(rate(haproxy_backend_http_responses_total[5m]))",
|
||||
"refId": "A", "legendFormat": "HTTP {{code}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "HTTP 2xx" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "HTTP 4xx" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "HTTP 5xx" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 75 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 34, "type": "timeseries", "title": "Router 4xx + 5xx Error Rate (%)",
|
||||
"description": "Client error (4xx) and server error (5xx) rates as a percentage of all requests.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(haproxy_backend_http_responses_total{code=\"4xx\"}[5m])) / sum(rate(haproxy_backend_http_responses_total[5m])) * 100",
|
||||
"refId": "A", "legendFormat": "4xx %"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(haproxy_backend_http_responses_total{code=\"5xx\"}[5m])) / sum(rate(haproxy_backend_http_responses_total[5m])) * 100",
|
||||
"refId": "B", "legendFormat": "5xx %"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 5 }
|
||||
]}
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "4xx %" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "5xx %" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 75 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 35, "type": "timeseries", "title": "Router Bytes In / Out",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(haproxy_frontend_bytes_in_total[5m]))",
|
||||
"refId": "A", "legendFormat": "Bytes In"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(haproxy_frontend_bytes_out_total[5m]))",
|
||||
"refId": "B", "legendFormat": "Bytes Out"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "Bytes In" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Bytes Out" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 83 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 36,
|
||||
"type": "table",
|
||||
"title": "Router Backend Server Status",
|
||||
"description": "HAProxy backend servers (routes). Value 0 = DOWN, 1 = UP.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "haproxy_server_up",
|
||||
"refId": "A", "instant": true, "format": "table", "legendFormat": ""
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "filterFieldsByName",
|
||||
"options": { "include": { "names": ["proxy", "server", "Value"] } }
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {},
|
||||
"renameByName": {
|
||||
"proxy": "Backend",
|
||||
"server": "Server",
|
||||
"Value": "Status"
|
||||
},
|
||||
"indexByName": { "proxy": 0, "server": 1, "Value": 2 }
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "sortBy",
|
||||
"options": { "fields": [{ "displayName": "Status", "desc": false }] }
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Backend" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 260 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Server" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 180 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Status" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "mappings", "value": [
|
||||
{ "type": "value", "options": { "0": { "text": "DOWN", "color": "red" } } },
|
||||
{ "type": "value", "options": { "1": { "text": "UP", "color": "green" } } }
|
||||
]},
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "green", "value": 1 }
|
||||
]}}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 83 }
|
||||
}
|
||||
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,607 @@
|
||||
apiVersion: grafana.integreatly.org/v1beta1
|
||||
kind: GrafanaDashboard
|
||||
metadata:
|
||||
name: storage-health
|
||||
namespace: observability
|
||||
spec:
|
||||
instanceSelector:
|
||||
matchLabels:
|
||||
dashboards: "grafana"
|
||||
|
||||
json: |
|
||||
{
|
||||
"title": "Storage Health",
|
||||
"uid": "storage-health",
|
||||
"schemaVersion": 36,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"panels": [
|
||||
|
||||
{
|
||||
"type": "row",
|
||||
"id": 1,
|
||||
"title": "PVC / PV Status",
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 2,
|
||||
"title": "Bound PVCs",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Bound\"}) or vector(0)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 4, "x": 0, "y": 1 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 3,
|
||||
"title": "Pending PVCs",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Pending\"}) or vector(0)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 4, "x": 4, "y": 1 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 4,
|
||||
"title": "Lost PVCs",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Lost\"}) or vector(0)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 4, "x": 8, "y": 1 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 5,
|
||||
"title": "Bound PVs / Available PVs",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(kube_persistentvolume_status_phase{phase=\"Bound\"}) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": "Bound"
|
||||
},
|
||||
{
|
||||
"expr": "sum(kube_persistentvolume_status_phase{phase=\"Available\"}) or vector(0)",
|
||||
"refId": "B",
|
||||
"legendFormat": "Available"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "blue", "value": null }]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 4, "x": 12, "y": 1 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 6,
|
||||
"title": "Ceph Cluster Health",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "ceph_health_status",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 2 }
|
||||
]
|
||||
},
|
||||
"mappings": [
|
||||
{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"0": { "text": "HEALTH_OK", "index": 0 },
|
||||
"1": { "text": "HEALTH_WARN", "index": 1 },
|
||||
"2": { "text": "HEALTH_ERR", "index": 2 }
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"textMode": "value"
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 4, "x": 16, "y": 1 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 7,
|
||||
"title": "OSDs Up / Total",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(ceph_osd_up) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": "Up"
|
||||
},
|
||||
{
|
||||
"expr": "count(ceph_osd_metadata) or vector(0)",
|
||||
"refId": "B",
|
||||
"legendFormat": "Total"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 4, "x": 20, "y": 1 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "row",
|
||||
"id": 8,
|
||||
"title": "Cluster Capacity",
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 6 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "gauge",
|
||||
"id": 9,
|
||||
"title": "Ceph Cluster Used (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes) / ceph_cluster_total_bytes",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 70 },
|
||||
{ "color": "red", "value": 85 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"showThresholdLabels": true,
|
||||
"showThresholdMarkers": true
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 5, "x": 0, "y": 7 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 10,
|
||||
"title": "Ceph Capacity — Total / Available",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "ceph_cluster_total_bytes",
|
||||
"refId": "A",
|
||||
"legendFormat": "Total"
|
||||
},
|
||||
{
|
||||
"expr": "ceph_cluster_total_bytes - (ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes)",
|
||||
"refId": "B",
|
||||
"legendFormat": "Available"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes",
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "blue", "value": null }]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "value",
|
||||
"graphMode": "none",
|
||||
"textMode": "auto",
|
||||
"orientation": "vertical"
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 4, "x": 5, "y": 7 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "bargauge",
|
||||
"id": 11,
|
||||
"title": "PV Allocated Capacity by Storage Class (Bound)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by (storageclass) (\n kube_persistentvolume_capacity_bytes\n * on(persistentvolume) group_left(storageclass)\n kube_persistentvolume_status_phase{phase=\"Bound\"}\n)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{storageclass}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes",
|
||||
"color": { "mode": "palette-classic" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "blue", "value": null }]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 7, "x": 9, "y": 7 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "piechart",
|
||||
"id": 12,
|
||||
"title": "PVC Phase Distribution",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Bound\"}) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": "Bound"
|
||||
},
|
||||
{
|
||||
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Pending\"}) or vector(0)",
|
||||
"refId": "B",
|
||||
"legendFormat": "Pending"
|
||||
},
|
||||
{
|
||||
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Lost\"}) or vector(0)",
|
||||
"refId": "C",
|
||||
"legendFormat": "Lost"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "color": { "mode": "palette-classic" } }
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"pieType": "pie",
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"values": ["value", "percent"]
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 7 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "row",
|
||||
"id": 13,
|
||||
"title": "Ceph Performance",
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 15 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "timeseries",
|
||||
"id": 14,
|
||||
"title": "Ceph Pool IOPS (Read / Write)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(ceph_pool_rd[5m])",
|
||||
"refId": "A",
|
||||
"legendFormat": "Read — pool {{pool_id}}"
|
||||
},
|
||||
{
|
||||
"expr": "rate(ceph_pool_wr[5m])",
|
||||
"refId": "B",
|
||||
"legendFormat": "Write — pool {{pool_id}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ops",
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 8 }
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "timeseries",
|
||||
"id": 15,
|
||||
"title": "Ceph Pool Throughput (Read / Write)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(ceph_pool_rd_bytes[5m])",
|
||||
"refId": "A",
|
||||
"legendFormat": "Read — pool {{pool_id}}"
|
||||
},
|
||||
{
|
||||
"expr": "rate(ceph_pool_wr_bytes[5m])",
|
||||
"refId": "B",
|
||||
"legendFormat": "Write — pool {{pool_id}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps",
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 8 }
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "row",
|
||||
"id": 16,
|
||||
"title": "Ceph OSD & Pool Details",
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 24 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "timeseries",
|
||||
"id": 17,
|
||||
"title": "Ceph Pool Space Used (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * ceph_pool_bytes_used / (ceph_pool_bytes_used + ceph_pool_max_avail)",
|
||||
"refId": "A",
|
||||
"legendFormat": "Pool {{pool_id}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 70 },
|
||||
{ "color": "red", "value": 85 }
|
||||
]
|
||||
},
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10 }
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 25 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "bargauge",
|
||||
"id": 18,
|
||||
"title": "OSD Status per Daemon (green = Up, red = Down)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "ceph_osd_up",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{ceph_daemon}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"min": 0,
|
||||
"max": 1,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "green", "value": 1 }
|
||||
]
|
||||
},
|
||||
"mappings": [
|
||||
{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"0": { "text": "DOWN", "index": 0 },
|
||||
"1": { "text": "UP", "index": 1 }
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"displayMode": "basic",
|
||||
"showUnfilled": true
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 25 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "row",
|
||||
"id": 19,
|
||||
"title": "Node Disk Usage",
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 33 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "timeseries",
|
||||
"id": 20,
|
||||
"title": "Node Root Disk Usage Over Time (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} * 100)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 70 },
|
||||
{ "color": "red", "value": 85 }
|
||||
]
|
||||
},
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10 }
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 34 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "bargauge",
|
||||
"id": 21,
|
||||
"title": "Current Disk Usage — All Nodes & Mountpoints",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 - (node_filesystem_avail_bytes{fstype!~\"tmpfs|overlay|squashfs\"} / node_filesystem_size_bytes{fstype!~\"tmpfs|overlay|squashfs\"} * 100)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}} — {{mountpoint}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 70 },
|
||||
{ "color": "red", "value": 85 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 34 }
|
||||
}
|
||||
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,744 @@
|
||||
apiVersion: grafana.integreatly.org/v1beta1
|
||||
kind: GrafanaDashboard
|
||||
metadata:
|
||||
name: okd-etcd
|
||||
namespace: observability
|
||||
spec:
|
||||
instanceSelector:
|
||||
matchLabels:
|
||||
dashboards: "grafana"
|
||||
json: |
|
||||
{
|
||||
"title": "etcd",
|
||||
"uid": "okd-etcd",
|
||||
"schemaVersion": 36,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"tags": ["okd", "etcd"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "instance",
|
||||
"type": "query",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"query": { "query": "label_values(etcd_server_has_leader, instance)", "refId": "A" },
|
||||
"refresh": 2,
|
||||
"includeAll": true,
|
||||
"multi": true,
|
||||
"allValue": ".*",
|
||||
"label": "Instance",
|
||||
"sort": 1,
|
||||
"current": {},
|
||||
"options": []
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
|
||||
{
|
||||
"id": 1, "type": "stat", "title": "Cluster Members",
|
||||
"description": "Total number of etcd members currently reporting metrics.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(etcd_server_has_leader)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "green", "value": 3 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 2, "type": "stat", "title": "Has Leader",
|
||||
"description": "min() across all members. 0 = at least one member has no quorum — cluster is degraded.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "min(etcd_server_has_leader)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "green", "value": 1 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0",
|
||||
"mappings": [
|
||||
{ "type": "value", "options": {
|
||||
"0": { "text": "NO LEADER", "color": "red" },
|
||||
"1": { "text": "OK", "color": "green" }
|
||||
}}
|
||||
]
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 3, "type": "stat", "title": "Leader Changes (1h)",
|
||||
"description": "Number of leader elections in the last hour. ≥3 indicates cluster instability.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "sum(changes(etcd_server_leader_changes_seen_total[1h]))", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 3 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 4, "type": "stat", "title": "DB Size (Max)",
|
||||
"description": "Largest boltdb file size across all members. Default etcd quota is 8 GiB.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "max(etcd_mvcc_db_total_size_in_bytes)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 2147483648 },
|
||||
{ "color": "orange", "value": 5368709120 },
|
||||
{ "color": "red", "value": 7516192768 }
|
||||
]},
|
||||
"unit": "bytes", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 5, "type": "stat", "title": "DB Fragmentation (Max)",
|
||||
"description": "% of DB space that is allocated but unused. >50% → run etcdctl defrag.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "max((etcd_mvcc_db_total_size_in_bytes - etcd_mvcc_db_total_size_in_use_in_bytes) / etcd_mvcc_db_total_size_in_bytes * 100)",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 25 },
|
||||
{ "color": "orange", "value": 50 },
|
||||
{ "color": "red", "value": 75 }
|
||||
]},
|
||||
"unit": "percent", "noValue": "0", "decimals": 1
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 6, "type": "stat", "title": "Failed Proposals/s",
|
||||
"description": "Rate of rejected Raft proposals. Any sustained non-zero value = cluster health problem.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "sum(rate(etcd_server_proposals_failed_total[5m]))", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 0.001 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0", "decimals": 3
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 7, "type": "stat", "title": "WAL Fsync p99",
|
||||
"description": "99th percentile WAL flush-to-disk time. >10ms is concerning; >100ms = serious I/O bottleneck.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) by (le))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 0.01 },
|
||||
{ "color": "orange", "value": 0.1 },
|
||||
{ "color": "red", "value": 0.5 }
|
||||
]},
|
||||
"unit": "s", "noValue": "0", "decimals": 4
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 8, "type": "stat", "title": "Backend Commit p99",
|
||||
"description": "99th percentile boltdb commit time. >25ms = warning; >100ms = critical backend I/O pressure.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) by (le))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 0.025 },
|
||||
{ "color": "orange", "value": 0.1 },
|
||||
{ "color": "red", "value": 0.25 }
|
||||
]},
|
||||
"unit": "s", "noValue": "0", "decimals": 4
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 9, "type": "row", "title": "Cluster Health", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 10, "type": "timeseries", "title": "Has Leader per Instance",
|
||||
"description": "1 = member has a leader; 0 = member lost quorum. A dip to 0 marks the exact moment of a leader election.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "etcd_server_has_leader{instance=~\"$instance\"}",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0, "max": 1.1,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 20, "showPoints": "never", "spanNulls": false },
|
||||
"mappings": [
|
||||
{ "type": "value", "options": {
|
||||
"0": { "text": "0 — no leader" },
|
||||
"1": { "text": "1 — ok" }
|
||||
}}
|
||||
]
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "none" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": [] }
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 8, "x": 0, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 11, "type": "timeseries", "title": "Leader Changes (cumulative)",
|
||||
"description": "Monotonically increasing counter per member. A step jump = one leader election. Correlated jumps across members = cluster-wide event.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "etcd_server_leader_changes_seen_total{instance=~\"$instance\"}",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "auto", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "none" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull"] }
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 8, "x": 8, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 12, "type": "timeseries", "title": "Slow Operations",
|
||||
"description": "slow_apply: proposals applied slower than expected. slow_read_index: linearizable reads timing out. heartbeat_failures: Raft heartbeat send errors (network partition indicator).",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "rate(etcd_server_slow_apply_total{instance=~\"$instance\"}[5m])", "refId": "A", "legendFormat": "Slow Apply — {{instance}}" },
|
||||
{ "expr": "rate(etcd_server_slow_read_indexes_total{instance=~\"$instance\"}[5m])", "refId": "B", "legendFormat": "Slow Read Index — {{instance}}" },
|
||||
{ "expr": "rate(etcd_server_heartbeat_send_failures_total{instance=~\"$instance\"}[5m])", "refId": "C", "legendFormat": "Heartbeat Failures — {{instance}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 8, "x": 16, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 13, "type": "row", "title": "gRPC Traffic", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 11 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 14, "type": "timeseries", "title": "gRPC Request Rate by Method",
|
||||
"description": "Unary calls/s per RPC method. High Put/Txn = heavy write load. High Range = heavy read load. High Watch = many controller watchers.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(grpc_method)(rate(grpc_server_started_total{job=~\".*etcd.*\",grpc_type=\"unary\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{grpc_method}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 12 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 15, "type": "timeseries", "title": "gRPC Error Rate by Status Code",
|
||||
"description": "Non-OK responses by gRPC status code. RESOURCE_EXHAUSTED = overloaded. UNAVAILABLE = leader election. DEADLINE_EXCEEDED = latency spike.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(grpc_code)(rate(grpc_server_handled_total{job=~\".*etcd.*\",grpc_code!=\"OK\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{grpc_code}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 12 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 16, "type": "timeseries", "title": "gRPC Request Latency (p50 / p95 / p99)",
|
||||
"description": "Unary call handling duration. p99 > 100ms for Put/Txn indicates disk or CPU pressure. p99 > 500ms will cause kube-apiserver timeouts.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum(rate(grpc_server_handling_seconds_bucket{job=~\".*etcd.*\",grpc_type=\"unary\"}[5m])) by (le))", "refId": "A", "legendFormat": "p50" },
|
||||
{ "expr": "histogram_quantile(0.95, sum(rate(grpc_server_handling_seconds_bucket{job=~\".*etcd.*\",grpc_type=\"unary\"}[5m])) by (le))", "refId": "B", "legendFormat": "p95" },
|
||||
{ "expr": "histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~\".*etcd.*\",grpc_type=\"unary\"}[5m])) by (le))", "refId": "C", "legendFormat": "p99" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 12 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 17, "type": "row", "title": "Raft Proposals", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 20 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 18, "type": "timeseries", "title": "Proposals Committed vs Applied",
|
||||
"description": "Committed = agreed by Raft quorum. Applied = persisted to boltdb. A widening gap between the two = backend apply backlog (disk too slow to keep up).",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "rate(etcd_server_proposals_committed_total{instance=~\"$instance\"}[5m])", "refId": "A", "legendFormat": "Committed — {{instance}}" },
|
||||
{ "expr": "rate(etcd_server_proposals_applied_total{instance=~\"$instance\"}[5m])", "refId": "B", "legendFormat": "Applied — {{instance}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 0, "y": 21 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 19, "type": "timeseries", "title": "Proposals Pending",
|
||||
"description": "In-flight Raft proposals not yet committed. Consistently high (>5) = cluster cannot keep up with write throughput.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "etcd_server_proposals_pending{instance=~\"$instance\"}",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false,
|
||||
"thresholdsStyle": { "mode": "line+area" }
|
||||
},
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 5 },
|
||||
{ "color": "red", "value": 10 }
|
||||
]}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 8, "y": 21 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 20, "type": "timeseries", "title": "Failed Proposals Rate",
|
||||
"description": "Raft proposals that were rejected. Root causes: quorum loss, leader timeout, network partition between members.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "rate(etcd_server_proposals_failed_total{instance=~\"$instance\"}[5m])",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2, "fillOpacity": 20, "showPoints": "never", "spanNulls": false,
|
||||
"thresholdsStyle": { "mode": "line" }
|
||||
},
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 0.001 }
|
||||
]}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 16, "y": 21 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 21, "type": "row", "title": "Disk I/O", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 28 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 22, "type": "timeseries", "title": "WAL Fsync Duration (p50 / p95 / p99) per Instance",
|
||||
"description": "Time to flush the write-ahead log to disk. etcd is extremely sensitive to WAL latency. >10ms p99 = storage is the bottleneck. Correlates directly with Raft commit latency.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum by(le,instance)(rate(etcd_disk_wal_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "A", "legendFormat": "p50 — {{instance}}" },
|
||||
{ "expr": "histogram_quantile(0.95, sum by(le,instance)(rate(etcd_disk_wal_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "B", "legendFormat": "p95 — {{instance}}" },
|
||||
{ "expr": "histogram_quantile(0.99, sum by(le,instance)(rate(etcd_disk_wal_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "C", "legendFormat": "p99 — {{instance}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 29 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 23, "type": "timeseries", "title": "Backend Commit Duration (p50 / p95 / p99) per Instance",
|
||||
"description": "Time for boltdb to commit a batch transaction. A spike here while WAL is healthy = backend I/O saturation or boltdb lock contention. Triggers apply backlog.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum by(le,instance)(rate(etcd_disk_backend_commit_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "A", "legendFormat": "p50 — {{instance}}" },
|
||||
{ "expr": "histogram_quantile(0.95, sum by(le,instance)(rate(etcd_disk_backend_commit_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "B", "legendFormat": "p95 — {{instance}}" },
|
||||
{ "expr": "histogram_quantile(0.99, sum by(le,instance)(rate(etcd_disk_backend_commit_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "C", "legendFormat": "p99 — {{instance}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 29 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 24, "type": "row", "title": "Network (Peer & Client)", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 37 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 25, "type": "timeseries", "title": "Peer RX Rate",
|
||||
"description": "Bytes received from Raft peers (log replication + heartbeats). A burst during a quiet period = large snapshot being streamed to a recovering member.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "rate(etcd_network_peer_received_bytes_total{instance=~\"$instance\"}[5m])",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 6, "x": 0, "y": 38 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 26, "type": "timeseries", "title": "Peer TX Rate",
|
||||
"description": "Bytes sent to Raft peers. Leader will have higher TX than followers (it replicates entries to all members).",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "rate(etcd_network_peer_sent_bytes_total{instance=~\"$instance\"}[5m])",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 6, "x": 6, "y": 38 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 27, "type": "timeseries", "title": "Client gRPC Received",
|
||||
"description": "Bytes received from API clients (kube-apiserver, operators). Spike = large write burst from controllers or kubectl apply.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "rate(etcd_network_client_grpc_received_bytes_total{instance=~\"$instance\"}[5m])",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 6, "x": 12, "y": 38 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 28, "type": "timeseries", "title": "Client gRPC Sent",
|
||||
"description": "Bytes sent to API clients (responses + watch events). Persistently high = many active Watch streams or large objects being served.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "rate(etcd_network_client_grpc_sent_bytes_total{instance=~\"$instance\"}[5m])",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 6, "x": 18, "y": 38 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 29, "type": "row", "title": "DB Size & Process Resources", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 45 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 30, "type": "timeseries", "title": "DB Total vs In-Use Size per Instance",
|
||||
"description": "Total = allocated boltdb file size. In Use = live key data. The gap between them = fragmentation. Steady growth of Total = compaction not keeping up with key churn.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "etcd_mvcc_db_total_size_in_bytes{instance=~\"$instance\"}", "refId": "A", "legendFormat": "Total — {{instance}}" },
|
||||
{ "expr": "etcd_mvcc_db_total_size_in_use_in_bytes{instance=~\"$instance\"}", "refId": "B", "legendFormat": "In Use — {{instance}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 46 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 31, "type": "timeseries", "title": "Process Resident Memory (RSS)",
|
||||
"description": "Physical RAM consumed by the etcd process. Monotonically growing RSS = memory leak or oversized watch cache. Typical healthy range: 500 MiB–2 GiB depending on cluster size.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "etcd_process_resident_memory_bytes{instance=~\"$instance\"}",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 46 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 32, "type": "timeseries", "title": "Open File Descriptors vs Limit",
|
||||
"description": "Open FD count (solid) and process FD limit (dashed). Approaching the limit will cause WAL file creation and new client connections to fail.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "etcd_process_open_fds{instance=~\"$instance\"}", "refId": "A", "legendFormat": "Open — {{instance}}" },
|
||||
{ "expr": "etcd_process_max_fds{instance=~\"$instance\"}", "refId": "B", "legendFormat": "Limit — {{instance}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byRegexp", "options": "^Limit.*" },
|
||||
"properties": [
|
||||
{ "id": "custom.lineWidth", "value": 1 },
|
||||
{ "id": "custom.lineStyle", "value": { "fill": "dash", "dash": [6, 4] } },
|
||||
{ "id": "custom.fillOpacity","value": 0 }
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 46 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 33, "type": "row", "title": "Snapshots", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 54 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 34, "type": "timeseries", "title": "Snapshot Save Duration (p50 / p95 / p99)",
|
||||
"description": "Time to write a full snapshot of the boltdb to disk. Slow saves delay Raft log compaction, causing the WAL to grow unboundedly and members to fall further behind.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum by(le)(rate(etcd_debugging_snap_save_total_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "A", "legendFormat": "p50" },
|
||||
{ "expr": "histogram_quantile(0.95, sum by(le)(rate(etcd_debugging_snap_save_total_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "B", "legendFormat": "p95" },
|
||||
{ "expr": "histogram_quantile(0.99, sum by(le)(rate(etcd_debugging_snap_save_total_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "C", "legendFormat": "p99" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 12, "x": 0, "y": 55 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 35, "type": "timeseries", "title": "Snapshot DB Fsync Duration (p50 / p95 / p99)",
|
||||
"description": "Time to fsync the snapshot file itself. Distinct from WAL fsync: this is flushing the entire boltdb copy to disk after a snapshot is taken.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum by(le)(rate(etcd_snap_db_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "A", "legendFormat": "p50" },
|
||||
{ "expr": "histogram_quantile(0.95, sum by(le)(rate(etcd_snap_db_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "B", "legendFormat": "p95" },
|
||||
{ "expr": "histogram_quantile(0.99, sum by(le)(rate(etcd_snap_db_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "C", "legendFormat": "p99" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 12, "x": 12, "y": 55 }
|
||||
}
|
||||
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,752 @@
|
||||
apiVersion: grafana.integreatly.org/v1beta1
|
||||
kind: GrafanaDashboard
|
||||
metadata:
|
||||
name: okd-control-plane-health
|
||||
namespace: observability
|
||||
spec:
|
||||
instanceSelector:
|
||||
matchLabels:
|
||||
dashboards: "grafana"
|
||||
json: |
|
||||
{
|
||||
"title": "Control Plane Health",
|
||||
"uid": "okd-control-plane",
|
||||
"schemaVersion": 36,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"tags": ["okd", "control-plane"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "instance",
|
||||
"type": "query",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"query": { "query": "label_values(apiserver_request_total, instance)", "refId": "A" },
|
||||
"refresh": 2,
|
||||
"includeAll": true,
|
||||
"multi": true,
|
||||
"allValue": ".*",
|
||||
"label": "API Server Instance",
|
||||
"sort": 1,
|
||||
"current": {},
|
||||
"options": []
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
|
||||
{
|
||||
"id": 1, "type": "stat", "title": "API Servers Up",
|
||||
"description": "Number of kube-apiserver instances currently scraped and up. Healthy HA cluster = 3.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(up{job=~\".*apiserver.*\"} == 1)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "green", "value": 3 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 2, "type": "stat", "title": "Controller Managers Up",
|
||||
"description": "kube-controller-manager instances up. In OKD only one holds the leader lease at a time; others are hot standbys.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(up{job=~\".*controller-manager.*\"} == 1)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "green", "value": 3 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 3, "type": "stat", "title": "Schedulers Up",
|
||||
"description": "kube-scheduler instances up. One holds the leader lease; rest are standbys. 0 = no scheduling of new pods.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(up{job=~\".*scheduler.*\"} == 1)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "green", "value": 3 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 4, "type": "stat", "title": "API 5xx Rate",
|
||||
"description": "Server-side errors (5xx) across all apiserver instances per second. Any sustained non-zero value = apiserver internal fault.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "sum(rate(apiserver_request_total{code=~\"5..\"}[5m]))", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 0.01 },
|
||||
{ "color": "red", "value": 1 }
|
||||
]},
|
||||
"unit": "reqps", "noValue": "0", "decimals": 3
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 5, "type": "stat", "title": "Inflight — Mutating",
|
||||
"description": "Current in-flight mutating requests (POST/PUT/PATCH/DELETE). Default OKD limit is ~1000. Hitting the limit = 429 errors for writes.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "sum(apiserver_current_inflight_requests{request_kind=\"mutating\"})", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 500 },
|
||||
{ "color": "orange", "value": 750 },
|
||||
{ "color": "red", "value": 900 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 6, "type": "stat", "title": "Inflight — Read-Only",
|
||||
"description": "Current in-flight non-mutating requests (GET/LIST/WATCH). Default OKD limit is ~3000. Hitting it = 429 for reads, impacting controllers and kubectl.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "sum(apiserver_current_inflight_requests{request_kind=\"readOnly\"})", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1500 },
|
||||
{ "color": "orange", "value": 2200 },
|
||||
{ "color": "red", "value": 2700 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 7, "type": "stat", "title": "API Request p99 (non-WATCH)",
|
||||
"description": "Overall p99 latency for all non-streaming verbs. >1s = noticeable kubectl sluggishness. >10s = controllers timing out on LIST/GET.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{verb!~\"WATCH|CONNECT\"}[5m])) by (le))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 0.5 },
|
||||
{ "color": "orange", "value": 1 },
|
||||
{ "color": "red", "value": 5 }
|
||||
]},
|
||||
"unit": "s", "noValue": "0", "decimals": 3
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 8, "type": "stat", "title": "APIServer → etcd p99",
|
||||
"description": "p99 time apiserver spends waiting on etcd calls. Spike here while WAL fsync is healthy = serialization or large object overhead.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(apiserver_storage_request_duration_seconds_bucket[5m])) by (le))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 0.05 },
|
||||
{ "color": "orange", "value": 0.2 },
|
||||
{ "color": "red", "value": 0.5 }
|
||||
]},
|
||||
"unit": "s", "noValue": "0", "decimals": 4
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 9, "type": "row", "title": "API Server — Request Rates & Errors", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 10, "type": "timeseries", "title": "Request Rate by Verb",
|
||||
"description": "Non-streaming calls per second broken down by verb. GET/LIST = read load from controllers. POST/PUT/PATCH/DELETE = write throughput. A sudden LIST spike = controller cache resync storm.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(verb)(rate(apiserver_request_total{instance=~\"$instance\",verb!~\"WATCH|CONNECT\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{verb}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 11, "type": "timeseries", "title": "Error Rate by HTTP Status Code",
|
||||
"description": "4xx/5xx responses per second by code. 429 = inflight limit hit (throttling). 422 = admission rejection or invalid object. 500/503 = internal apiserver fault or etcd unavailability.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(code)(rate(apiserver_request_total{instance=~\"$instance\",code=~\"[45]..\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "HTTP {{code}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 12, "type": "timeseries", "title": "In-Flight Requests — Mutating vs Read-Only",
|
||||
"description": "Instantaneous count of requests being actively handled. The two series correspond to the two inflight limit buckets enforced by the apiserver's Priority and Fairness (APF) or legacy inflight settings.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "sum by(request_kind)(apiserver_current_inflight_requests{instance=~\"$instance\"})", "refId": "A", "legendFormat": "{{request_kind}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 20, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 13, "type": "row", "title": "API Server — Latency", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 14, "type": "timeseries", "title": "Request Latency — p50 / p95 / p99 (non-WATCH)",
|
||||
"description": "Aggregated end-to-end request duration across all verbs except WATCH/CONNECT (which are unbounded streaming). A rising p99 without a matching rise in etcd latency = CPU saturation, admission webhook slowness, or serialization overhead.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum(rate(apiserver_request_duration_seconds_bucket{instance=~\"$instance\",verb!~\"WATCH|CONNECT\"}[5m])) by (le))", "refId": "A", "legendFormat": "p50" },
|
||||
{ "expr": "histogram_quantile(0.95, sum(rate(apiserver_request_duration_seconds_bucket{instance=~\"$instance\",verb!~\"WATCH|CONNECT\"}[5m])) by (le))", "refId": "B", "legendFormat": "p95" },
|
||||
{ "expr": "histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{instance=~\"$instance\",verb!~\"WATCH|CONNECT\"}[5m])) by (le))", "refId": "C", "legendFormat": "p99" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 14 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 15, "type": "timeseries", "title": "Request p99 Latency by Verb",
|
||||
"description": "p99 latency broken out per verb. LIST is inherently slower than GET due to serializing full collections. A POST/PUT spike = heavy admission webhook chain or large object writes. DELETE spikes are usually caused by cascading GC finalizer storms.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "histogram_quantile(0.99, sum by(verb,le)(rate(apiserver_request_duration_seconds_bucket{instance=~\"$instance\",verb!~\"WATCH|CONNECT\"}[5m])))",
|
||||
"refId": "A", "legendFormat": "{{verb}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 14 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 16, "type": "timeseries", "title": "APIServer → etcd Latency by Operation",
|
||||
"description": "Time apiserver spends waiting on etcd, split by operation type (get, list, create, update, delete, watch). Elevated get/list = etcd read pressure. Elevated create/update = write bottleneck, likely correlated with WAL fsync latency.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum by(operation,le)(rate(apiserver_storage_request_duration_seconds_bucket[5m])))", "refId": "A", "legendFormat": "p50 — {{operation}}" },
|
||||
{ "expr": "histogram_quantile(0.99, sum by(operation,le)(rate(apiserver_storage_request_duration_seconds_bucket[5m])))", "refId": "B", "legendFormat": "p99 — {{operation}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 14 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 17, "type": "row", "title": "API Server — Watches & Long-Running Requests", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 22 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 18, "type": "timeseries", "title": "Active Long-Running Requests (Watches) by Resource",
|
||||
"description": "Instantaneous count of open WATCH streams grouped by resource. Each controller typically holds one WATCH per resource type per apiserver instance. A sudden drop = controller restart; a runaway climb = operator creating watches without cleanup.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(resource)(apiserver_longrunning_requests{instance=~\"$instance\",verb=\"WATCH\"})",
|
||||
"refId": "A", "legendFormat": "{{resource}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 0, "y": 23 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 19, "type": "timeseries", "title": "Watch Events Dispatched Rate by Kind",
|
||||
"description": "Watch events sent to all active watchers per second, by object kind. Persistent high rate for a specific kind = that resource type is churning heavily, increasing etcd load and controller reconcile frequency.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(kind)(rate(apiserver_watch_events_total{instance=~\"$instance\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{kind}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 8, "y": 23 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 20, "type": "timeseries", "title": "Watch Event Size — p50 / p95 / p99 by Kind",
|
||||
"description": "Size of individual watch events dispatched to clients. Large events (MiB-scale) for Secrets or ConfigMaps = objects being stored with oversized data. Contributes to apiserver memory pressure and network saturation.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum by(kind,le)(rate(apiserver_watch_events_sizes_bucket{instance=~\"$instance\"}[5m])))", "refId": "A", "legendFormat": "p50 — {{kind}}" },
|
||||
{ "expr": "histogram_quantile(0.99, sum by(kind,le)(rate(apiserver_watch_events_sizes_bucket{instance=~\"$instance\"}[5m])))", "refId": "B", "legendFormat": "p99 — {{kind}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 16, "y": 23 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 21, "type": "row", "title": "Admission Webhooks", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 30 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 22, "type": "timeseries", "title": "Webhook Call Rate by Name",
|
||||
"description": "Mutating and validating admission webhook invocations per second by webhook name. A webhook invoked on every write (e.g., a mutating webhook with no object selector) can be a major source of write latency amplification.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(name,type)(rate(apiserver_admission_webhook_request_total{instance=~\"$instance\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{type}} — {{name}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 0, "y": 31 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 23, "type": "timeseries", "title": "Webhook Latency p99 by Name",
|
||||
"description": "p99 round-trip time per webhook call (network + webhook server processing). Default apiserver timeout is 10s; a webhook consistently near that limit causes cascading write latency for all resources it intercepts.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "histogram_quantile(0.99, sum by(name,le)(rate(apiserver_admission_webhook_admission_duration_seconds_bucket{instance=~\"$instance\"}[5m])))",
|
||||
"refId": "A", "legendFormat": "{{name}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 0.5 },
|
||||
{ "color": "red", "value": 2.0 }
|
||||
]}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 8, "y": 31 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 24, "type": "timeseries", "title": "Webhook Rejection Rate by Name",
|
||||
"description": "Rate of admission denials per webhook. A validating webhook rejecting requests is expected behaviour; a sudden surge indicates either a newly enforced policy or a misbehaving webhook rejecting valid objects.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(name,error_type)(rate(apiserver_admission_webhook_rejection_count{instance=~\"$instance\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{name}} ({{error_type}})"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 16, "y": 31 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 25, "type": "row", "title": "kube-controller-manager", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 38 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 26, "type": "timeseries", "title": "Work Queue Depth by Controller",
|
||||
"description": "Items waiting to be reconciled in each controller's work queue. Persistent non-zero depth = controller cannot keep up with the event rate. Identifies which specific controller is the bottleneck during overload incidents.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "topk(15, sum by(name)(workqueue_depth{job=~\".*controller-manager.*\"}))",
|
||||
"refId": "A", "legendFormat": "{{name}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false,
|
||||
"thresholdsStyle": { "mode": "line" }
|
||||
},
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 10 },
|
||||
{ "color": "red", "value": 50 }
|
||||
]}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 39 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 27, "type": "timeseries", "title": "Work Queue Item Processing Duration p99 by Controller",
|
||||
"description": "p99 time a work item spends being actively reconciled (inside the reconcile loop, excludes queue wait time). A slow reconcile = either the controller is doing expensive API calls or the etcd write path is slow.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "histogram_quantile(0.99, sum by(name,le)(rate(workqueue_work_duration_seconds_bucket{job=~\".*controller-manager.*\"}[5m])))",
|
||||
"refId": "A", "legendFormat": "{{name}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 39 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 28, "type": "timeseries", "title": "Work Queue Retry Rate by Controller",
|
||||
"description": "Rate of items being re-queued after a failed reconciliation. A persistently high retry rate for a controller = it is encountering recurring errors on the same objects (e.g., API permission errors, webhook rejections, or resource conflicts).",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "topk(15, sum by(name)(rate(workqueue_retries_total{job=~\".*controller-manager.*\"}[5m])))",
|
||||
"refId": "A", "legendFormat": "{{name}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 39 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 29, "type": "row", "title": "kube-scheduler", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 47 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 30, "type": "timeseries", "title": "Scheduling Attempt Rate by Result",
|
||||
"description": "Outcomes of scheduling cycles per second. scheduled = pod successfully bound to a node. unschedulable = no node met the pod's constraints. error = scheduler internal failure (API error, timeout). Persistent unschedulable = cluster capacity or taints/affinity misconfiguration.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(result)(rate(scheduler_schedule_attempts_total[5m]))",
|
||||
"refId": "A", "legendFormat": "{{result}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "scheduled" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "unschedulable" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "error" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 0, "y": 48 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 31, "type": "timeseries", "title": "Scheduling Latency — p50 / p95 / p99",
|
||||
"description": "Time from when a pod enters the active queue to when a binding decision is made (does not include bind API call time). Includes filter, score, and reserve plugin execution time. Spike = expensive affinity rules, large number of nodes, or slow extender webhooks.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket[5m])) by (le))", "refId": "A", "legendFormat": "p50" },
|
||||
{ "expr": "histogram_quantile(0.95, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket[5m])) by (le))", "refId": "B", "legendFormat": "p95" },
|
||||
{ "expr": "histogram_quantile(0.99, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket[5m])) by (le))", "refId": "C", "legendFormat": "p99" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 8, "y": 48 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 32, "type": "timeseries", "title": "Pending Pods by Queue",
|
||||
"description": "Pods waiting to be scheduled, split by internal queue. active = ready to be attempted now. backoff = recently failed, in exponential back-off. unschedulable = parked until cluster state changes. A growing unschedulable queue = systemic capacity or constraint problem.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(queue)(scheduler_pending_pods)",
|
||||
"refId": "A", "legendFormat": "{{queue}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false,
|
||||
"thresholdsStyle": { "mode": "line" }
|
||||
},
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 10 },
|
||||
{ "color": "red", "value": 50 }
|
||||
]}
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "unschedulable" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "backoff" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "active" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 16, "y": 48 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 33, "type": "row", "title": "Process Resources — All Control Plane Components", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 55 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 34, "type": "timeseries", "title": "CPU Usage by Component",
|
||||
"description": "Rate of CPU seconds consumed by each control plane process. apiserver CPU spike = surge in request volume or list serialization. controller-manager CPU spike = reconcile storm. scheduler CPU spike = large node count with complex affinity.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "sum by(job)(rate(process_cpu_seconds_total{job=~\".*apiserver.*\"}[5m]))", "refId": "A", "legendFormat": "apiserver — {{job}}" },
|
||||
{ "expr": "sum by(job)(rate(process_cpu_seconds_total{job=~\".*controller-manager.*\"}[5m]))", "refId": "B", "legendFormat": "controller-manager — {{job}}" },
|
||||
{ "expr": "sum by(job)(rate(process_cpu_seconds_total{job=~\".*scheduler.*\"}[5m]))", "refId": "C", "legendFormat": "scheduler — {{job}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percentunit", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 0, "y": 56 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 35, "type": "timeseries", "title": "RSS Memory by Component",
|
||||
"description": "Resident set size of each control plane process. apiserver memory is dominated by the watch cache size and serialisation buffers. controller-manager memory = informer caches. Monotonically growing RSS without restarts = memory leak or unbounded cache growth.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "sum by(job)(process_resident_memory_bytes{job=~\".*apiserver.*\"})", "refId": "A", "legendFormat": "apiserver — {{job}}" },
|
||||
{ "expr": "sum by(job)(process_resident_memory_bytes{job=~\".*controller-manager.*\"})", "refId": "B", "legendFormat": "controller-manager — {{job}}" },
|
||||
{ "expr": "sum by(job)(process_resident_memory_bytes{job=~\".*scheduler.*\"})", "refId": "C", "legendFormat": "scheduler — {{job}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 8, "y": 56 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 36, "type": "timeseries", "title": "Goroutines by Component",
|
||||
"description": "Number of live goroutines in each control plane process. Gradual upward drift = goroutine leak (often tied to unclosed watch streams or context leaks). A step-down = process restart. apiserver typically runs 200–600 goroutines; spikes above 1000 warrant investigation.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "sum by(job)(go_goroutines{job=~\".*apiserver.*\"})", "refId": "A", "legendFormat": "apiserver — {{job}}" },
|
||||
{ "expr": "sum by(job)(go_goroutines{job=~\".*controller-manager.*\"})", "refId": "B", "legendFormat": "controller-manager — {{job}}" },
|
||||
{ "expr": "sum by(job)(go_goroutines{job=~\".*scheduler.*\"})", "refId": "C", "legendFormat": "scheduler — {{job}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 16, "y": 56 }
|
||||
}
|
||||
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,741 @@
|
||||
apiVersion: grafana.integreatly.org/v1beta1
|
||||
kind: GrafanaDashboard
|
||||
metadata:
|
||||
name: okd-alerts-events
|
||||
namespace: observability
|
||||
spec:
|
||||
instanceSelector:
|
||||
matchLabels:
|
||||
dashboards: "grafana"
|
||||
json: |
|
||||
{
|
||||
"title": "Alerts & Events — Active Problems",
|
||||
"uid": "okd-alerts-events",
|
||||
"schemaVersion": 36,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-3h", "to": "now" },
|
||||
"tags": ["okd", "alerts", "events"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "severity",
|
||||
"type": "custom",
|
||||
"label": "Severity Filter",
|
||||
"query": "critical,warning,info",
|
||||
"current": { "selected": true, "text": "All", "value": "$__all" },
|
||||
"includeAll": true,
|
||||
"allValue": "critical|warning|info",
|
||||
"multi": false,
|
||||
"options": [
|
||||
{ "selected": true, "text": "All", "value": "$__all" },
|
||||
{ "selected": false, "text": "Critical", "value": "critical" },
|
||||
{ "selected": false, "text": "Warning", "value": "warning" },
|
||||
{ "selected": false, "text": "Info", "value": "info" }
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "namespace",
|
||||
"type": "query",
|
||||
"label": "Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"query": { "query": "label_values(ALERTS{alertstate=\"firing\"}, namespace)", "refId": "A" },
|
||||
"refresh": 2,
|
||||
"includeAll": true,
|
||||
"allValue": ".*",
|
||||
"multi": true,
|
||||
"sort": 1,
|
||||
"current": {},
|
||||
"options": []
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
|
||||
{
|
||||
"id": 1, "type": "stat", "title": "Critical Alerts Firing",
|
||||
"description": "Alerting rule instances currently in the firing state with severity=\"critical\". Any non-zero value represents a breached SLO or infrastructure condition requiring immediate on-call response. The ALERTS metric is generated by Prometheus directly from your alerting rules — it reflects what Prometheus knows, before Alertmanager routing or silencing.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(ALERTS{alertstate=\"firing\",severity=\"critical\"}) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 2, "type": "stat", "title": "Warning Alerts Firing",
|
||||
"description": "Firing alerts at severity=\"warning\". Warnings indicate a degraded or elevated-risk condition that has not yet crossed the critical threshold. A sustained or growing warning count often precedes a critical fire — treat them as early-warning signals, not background noise.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(ALERTS{alertstate=\"firing\",severity=\"warning\"}) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "orange", "value": 5 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 3, "type": "stat", "title": "Info / Unclassified Alerts Firing",
|
||||
"description": "Firing alerts with severity=\"info\" or no severity label. These are informational and do not normally require immediate action. A sudden large jump may reveal noisy alerting rules generating alert fatigue — rules worth reviewing for threshold tuning or adding inhibition rules.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(ALERTS{alertstate=\"firing\",severity!~\"critical|warning\"}) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "blue", "value": 1 },
|
||||
{ "color": "blue", "value": 25 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 4, "type": "stat", "title": "Alerts Silenced (Suppressed)",
|
||||
"description": "Alerts currently matched by an active Alertmanager silence rule and therefore not routed to receivers. Silences are intentional during maintenance windows, but a large suppressed count outside of planned maintenance = an overly broad silence masking real problems. Zero silences when a maintenance window is active = the silence has expired or was misconfigured.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "sum(alertmanager_alerts{state=\"suppressed\"}) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 20 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 5, "type": "stat", "title": "CrashLoopBackOff Pods",
|
||||
"description": "Container instances currently waiting in the CrashLoopBackOff state — the container crashed and Kubernetes is retrying with exponential back-off. Each instance is a pod that cannot stay running. Common root causes: OOM kill, bad entrypoint, missing Secret or ConfigMap, an unavailable init dependency, or a broken image layer.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 3 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 6, "type": "stat", "title": "OOMKilled Containers",
|
||||
"description": "Containers whose most recent termination reason was OOMKilled. This is a current-state snapshot: a container that was OOMKilled, restarted, and is now Running will still appear here until its next termination occurs for a different reason. Non-zero and stable = recurring OOM, likely a workload memory leak or under-provisioned memory limit.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_pod_container_status_last_terminated_reason{reason=\"OOMKilled\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "orange", "value": 1 },
|
||||
{ "color": "red", "value": 5 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 7, "type": "stat", "title": "NotReady Nodes",
|
||||
"description": "Nodes where the Ready condition is currently not True (False or Unknown). A NotReady node stops receiving new pod scheduling and, after the node eviction timeout (~5 min default), pods on it will be evicted. Control plane nodes going NotReady simultaneously = potential quorum loss. Any non-zero value is a tier-1 incident signal.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"true\"} == 0) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 8, "type": "stat", "title": "Degraded Cluster Operators (OKD)",
|
||||
"description": "OKD ClusterOperators currently reporting Degraded=True. Each ClusterOperator owns a core platform component — authentication, networking, image-registry, monitoring, ingress, storage, etc. A degraded operator means its managed component is impaired or unavailable. Zero is the only acceptable steady-state value outside of an active upgrade.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(cluster_operator_conditions{condition=\"Degraded\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 9, "type": "row", "title": "Alert Overview", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 10, "type": "timeseries", "title": "Firing Alert Count by Severity Over Time",
|
||||
"description": "Instantaneous count of firing ALERTS series grouped by severity over the selected window. A vertical rise = new alerting condition emerged. A horizontal plateau = a persistent, unresolved problem. A step-down = alert resolved or Prometheus rule evaluation stopped matching. Use the Severity Filter variable to narrow scope during triage.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "count by(severity)(ALERTS{alertstate=\"firing\",severity=~\"$severity\",namespace=~\"$namespace\"})",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{severity}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "critical" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "warning" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "info" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["max", "lastNotNull"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 11, "type": "timeseries", "title": "Alertmanager Notification Rate by Integration",
|
||||
"description": "Rate of notification delivery attempts from Alertmanager per second, split by integration type (slack, pagerduty, email, webhook, etc.). Solid lines = successful deliveries; dashed red lines = failed deliveries. A drop to zero on all integrations = Alertmanager is not processing or the cluster is completely quiet. Persistent failures on one integration = check that receiver's credentials or endpoint availability.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "sum by(integration)(rate(alertmanager_notifications_total[5m]))", "refId": "A", "legendFormat": "✓ {{integration}}" },
|
||||
{ "expr": "sum by(integration)(rate(alertmanager_notifications_failed_total[5m]))", "refId": "B", "legendFormat": "✗ {{integration}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byFrameRefID", "options": "B" },
|
||||
"properties": [
|
||||
{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } },
|
||||
{ "id": "custom.lineStyle", "value": { "dash": [6, 4], "fill": "dash" } },
|
||||
{ "id": "custom.lineWidth", "value": 1 }
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 12, "type": "bargauge", "title": "Longest-Firing Active Alerts",
|
||||
"description": "Duration (now - ALERTS_FOR_STATE timestamp) for each currently firing alert, sorted descending. Alerts at the top have been firing longest and are the most likely candidates for known-but-unresolved issues, stale firing conditions, or alerts that should have a silence applied. Red bars (> 2 hours) strongly suggest a problem that has been acknowledged but not resolved.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sort_desc(time() - ALERTS_FOR_STATE{alertstate=\"firing\",severity=~\"$severity\",namespace=~\"$namespace\"})",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{alertname}} · {{severity}} · {{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 300 },
|
||||
{ "color": "orange", "value": 1800 },
|
||||
{ "color": "red", "value": 7200 }
|
||||
]}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true,
|
||||
"valueMode": "color"
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 13, "type": "row", "title": "Active Firing Alerts — Full Detail", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 14, "type": "table", "title": "All Firing Alerts",
|
||||
"description": "Instant-query table of every currently firing alert visible to Prometheus, filtered by the Namespace and Severity variables above. Each row is one alert instance (unique label combination). The value column is omitted — by definition every row here is firing. Use the built-in column filter (funnel icon) to further narrow to a specific alertname, pod, or node. Columns are sparse: labels not defined in a given alert rule will show '—'.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "ALERTS{alertstate=\"firing\",severity=~\"$severity\",namespace=~\"$namespace\"}",
|
||||
"refId": "A",
|
||||
"instant": true,
|
||||
"legendFormat": ""
|
||||
}],
|
||||
"transformations": [
|
||||
{ "id": "labelsToFields", "options": { "mode": "columns" } },
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"alertstate": true,
|
||||
"__name__": true,
|
||||
"Value": true,
|
||||
"Time": true
|
||||
},
|
||||
"renameByName": {
|
||||
"alertname": "Alert Name",
|
||||
"severity": "Severity",
|
||||
"namespace": "Namespace",
|
||||
"pod": "Pod",
|
||||
"node": "Node",
|
||||
"container": "Container",
|
||||
"job": "Job",
|
||||
"service": "Service",
|
||||
"reason": "Reason",
|
||||
"instance": "Instance"
|
||||
},
|
||||
"indexByName": {
|
||||
"severity": 0,
|
||||
"alertname": 1,
|
||||
"namespace": 2,
|
||||
"pod": 3,
|
||||
"node": 4,
|
||||
"container": 5,
|
||||
"job": 6,
|
||||
"service": 7,
|
||||
"reason": 8,
|
||||
"instance": 9
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": { "align": "left", "filterable": true },
|
||||
"noValue": "—"
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Severity" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "custom.width", "value": 110 },
|
||||
{
|
||||
"id": "mappings",
|
||||
"value": [{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"critical": { "text": "CRITICAL", "color": "dark-red", "index": 0 },
|
||||
"warning": { "text": "WARNING", "color": "dark-yellow", "index": 1 },
|
||||
"info": { "text": "INFO", "color": "dark-blue", "index": 2 }
|
||||
}
|
||||
}]
|
||||
}
|
||||
]
|
||||
},
|
||||
{ "matcher": { "id": "byName", "options": "Alert Name" }, "properties": [{ "id": "custom.width", "value": 300 }] },
|
||||
{ "matcher": { "id": "byName", "options": "Namespace" }, "properties": [{ "id": "custom.width", "value": 180 }] },
|
||||
{ "matcher": { "id": "byName", "options": "Pod" }, "properties": [{ "id": "custom.width", "value": 200 }] },
|
||||
{ "matcher": { "id": "byName", "options": "Node" }, "properties": [{ "id": "custom.width", "value": 200 }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"sortBy": [{ "desc": false, "displayName": "Severity" }],
|
||||
"footer": { "show": false }
|
||||
},
|
||||
"gridPos": { "h": 12, "w": 24, "x": 0, "y": 14 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 15, "type": "row", "title": "Kubernetes Warning Events", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 26 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 16, "type": "timeseries", "title": "Warning Event Rate by Reason",
|
||||
"description": "Rate of Kubernetes Warning-type events per second grouped by reason code. BackOff = container is CrashLooping. FailedScheduling = no node satisfies pod constraints. FailedMount = volume attachment or CSI failure. Evicted = kubelet evicted a pod due to memory or disk pressure. NodeNotReady = node lost contact. A spike in a single reason narrows the incident root-cause immediately without needing to read raw event logs. Requires kube-state-metrics with --resources=events.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "topk(10, sum by(reason)(rate(kube_event_count{type=\"Warning\",namespace=~\"$namespace\"}[5m])))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{reason}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 27 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 17, "type": "bargauge", "title": "Warning Events — Top Namespaces (Accumulated Count)",
|
||||
"description": "Total accumulated Warning event count (the count field on the Kubernetes Event object) per namespace, showing the top 15 most active. A namespace dominating this chart is generating significantly more abnormal conditions than its peers, useful for identifying noisy tenants, misconfigured deployments, or namespaces experiencing a persistent infrastructure problem. Note this is the raw Event.count field — it resets if the event object is deleted and recreated.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "topk(15, sum by(namespace)(kube_event_count{type=\"Warning\"}))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 10 },
|
||||
{ "color": "orange", "value": 50 },
|
||||
{ "color": "red", "value": 200 }
|
||||
]}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 27 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 18, "type": "timeseries", "title": "Warning Events — Accumulated Count by Reason Over Time",
|
||||
"description": "Raw accumulated event count gauge over time, split by reason. Unlike the rate panel this shows total volume and slope simultaneously. A line that climbs steeply = events are occurring frequently right now. A line that plateaus = the condition causing that reason has stopped. A line that drops to zero = the event object was deleted and recreated or the condition fully resolved.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "topk(10, sum by(reason)(kube_event_count{type=\"Warning\",namespace=~\"$namespace\"}))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{reason}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 8, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 27 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 19, "type": "row", "title": "Pod Problems", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 35 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 20, "type": "timeseries", "title": "CrashLoopBackOff Pods by Namespace",
|
||||
"description": "Count of container instances in CrashLoopBackOff waiting state over time, broken down by namespace. A sudden rise in one namespace = a workload deployment is failing. A persistent baseline across many namespaces = a shared dependency (Secret, ConfigMap, network policy, or an upstream service) has become unavailable. Unlike restart rate, this panel shows the steady-state count of pods currently stuck — not flapping.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(namespace)(kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\",namespace=~\"$namespace\"} == 1)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false,
|
||||
"thresholdsStyle": { "mode": "line" }
|
||||
},
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 5 }
|
||||
]}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 0, "y": 36 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 21, "type": "timeseries", "title": "Container Restart Rate by Namespace",
|
||||
"description": "Rate of container restarts per second across all reasons (OOMKill, liveness probe failure, process exit) grouped by namespace. A namespace with a rising restart rate that has not yet entered CrashLoopBackOff is in the early failure window before the exponential back-off penalty kicks in. Cross-reference with the OOMKilled stat tile and the last-terminated-reason to separate crash types.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "topk(10, sum by(namespace)(rate(kube_pod_container_status_restarts_total{namespace=~\"$namespace\"}[5m])))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 8, "y": 36 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 22, "type": "timeseries", "title": "Pods by Problem Phase (Failed / Pending / Unknown)",
|
||||
"description": "Count of pods in Failed, Pending, or Unknown phase over time. Failed = container terminated with a non-zero exit code or was evicted and not rescheduled. Pending for more than a few minutes = scheduler unable to bind the pod (check FailedScheduling events, node capacity, and taint/toleration mismatches). Unknown = kubelet is not reporting to the apiserver, typically indicating a node network partition or kubelet crash.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "sum by(phase)(kube_pod_status_phase{phase=~\"Failed|Unknown\",namespace=~\"$namespace\"} == 1)", "refId": "A", "legendFormat": "{{phase}}" },
|
||||
{ "expr": "sum(kube_pod_status_phase{phase=\"Pending\",namespace=~\"$namespace\"} == 1)", "refId": "B", "legendFormat": "Pending" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false,
|
||||
"thresholdsStyle": { "mode": "line" }
|
||||
},
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]}
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "Failed" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Pending" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Unknown" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 16, "y": 36 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 23, "type": "row", "title": "Node & Cluster Operator Conditions", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 43 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 24, "type": "table", "title": "Node Condition Status Matrix",
|
||||
"description": "Instant snapshot of every active node condition across all nodes. Each row is one (node, condition, status) triple where value=1, meaning that combination is currently true. Ready=true is the normal healthy state; MemoryPressure=true, DiskPressure=true, PIDPressure=true, and NetworkUnavailable=true all indicate problem states that will affect pod scheduling on that node. Use the column filter to show only conditions where status=\"true\" and condition != \"Ready\" to isolate problems quickly.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "kube_node_status_condition == 1",
|
||||
"refId": "A",
|
||||
"instant": true,
|
||||
"legendFormat": ""
|
||||
}],
|
||||
"transformations": [
|
||||
{ "id": "labelsToFields", "options": { "mode": "columns" } },
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"Time": true,
|
||||
"Value": true,
|
||||
"__name__": true,
|
||||
"endpoint": true,
|
||||
"job": true,
|
||||
"service": true,
|
||||
"instance": true
|
||||
},
|
||||
"renameByName": {
|
||||
"node": "Node",
|
||||
"condition": "Condition",
|
||||
"status": "Status"
|
||||
},
|
||||
"indexByName": { "node": 0, "condition": 1, "status": 2 }
|
||||
}
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": { "align": "left", "filterable": true },
|
||||
"noValue": "—"
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Status" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "custom.width", "value": 90 },
|
||||
{
|
||||
"id": "mappings",
|
||||
"value": [{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"true": { "text": "true", "color": "green", "index": 0 },
|
||||
"false": { "text": "false", "color": "dark-red", "index": 1 },
|
||||
"unknown": { "text": "unknown", "color": "dark-orange", "index": 2 }
|
||||
}
|
||||
}]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Condition" },
|
||||
"properties": [
|
||||
{ "id": "custom.width", "value": 190 },
|
||||
{ "id": "custom.displayMode", "value": "color-text" },
|
||||
{
|
||||
"id": "mappings",
|
||||
"value": [{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"Ready": { "color": "green", "index": 0 },
|
||||
"MemoryPressure": { "color": "red", "index": 1 },
|
||||
"DiskPressure": { "color": "red", "index": 2 },
|
||||
"PIDPressure": { "color": "red", "index": 3 },
|
||||
"NetworkUnavailable": { "color": "red", "index": 4 }
|
||||
}
|
||||
}]
|
||||
}
|
||||
]
|
||||
},
|
||||
{ "matcher": { "id": "byName", "options": "Node" }, "properties": [{ "id": "custom.width", "value": 230 }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"sortBy": [{ "desc": false, "displayName": "Node" }],
|
||||
"footer": { "show": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 44 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 25, "type": "table", "title": "Cluster Operator Conditions — Degraded & Progressing (OKD)",
|
||||
"description": "Shows only ClusterOperator conditions that indicate a problem state: Degraded=True (operator has failed to achieve its desired state) or Progressing=True (operator is actively reconciling — normal during upgrades but alarming in steady state). Operators not appearing in this table are healthy. The reason column gives the operator's own explanation for the condition, which maps directly to the relevant operator log stream and OpenShift runbook.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "cluster_operator_conditions{condition=\"Degraded\"} == 1",
|
||||
"refId": "A",
|
||||
"instant": true,
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "cluster_operator_conditions{condition=\"Progressing\"} == 1",
|
||||
"refId": "B",
|
||||
"instant": true,
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{ "id": "labelsToFields", "options": { "mode": "columns" } },
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"Time": true,
|
||||
"Value": true,
|
||||
"__name__": true,
|
||||
"endpoint": true,
|
||||
"job": true,
|
||||
"service": true,
|
||||
"instance": true,
|
||||
"namespace": true
|
||||
},
|
||||
"renameByName": {
|
||||
"name": "Operator",
|
||||
"condition": "Condition",
|
||||
"reason": "Reason"
|
||||
},
|
||||
"indexByName": { "name": 0, "condition": 1, "reason": 2 }
|
||||
}
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": { "align": "left", "filterable": true },
|
||||
"noValue": "—"
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Condition" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "custom.width", "value": 140 },
|
||||
{
|
||||
"id": "mappings",
|
||||
"value": [{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"Degraded": { "text": "Degraded", "color": "dark-red", "index": 0 },
|
||||
"Progressing": { "text": "Progressing", "color": "dark-yellow", "index": 1 }
|
||||
}
|
||||
}]
|
||||
}
|
||||
]
|
||||
},
|
||||
{ "matcher": { "id": "byName", "options": "Operator" }, "properties": [{ "id": "custom.width", "value": 240 }] },
|
||||
{ "matcher": { "id": "byName", "options": "Reason" }, "properties": [{ "id": "custom.width", "value": 220 }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"sortBy": [{ "desc": false, "displayName": "Condition" }],
|
||||
"footer": { "show": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 44 }
|
||||
}
|
||||
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,49 @@
|
||||
# These are probably already created by rook-ceph operator, not sure, needs to validate.
|
||||
# in fact, 100% sure for the second one (rook-ceph-exporter)
|
||||
# i over-wrote the first one (rook-ceph-mgr) with what is here, it was probably already working
|
||||
# all what was missing was a label on the rook-ceph namespace to tell prometheus to look for monitors in this namespace
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: ServiceMonitor
|
||||
metadata:
|
||||
name: rook-ceph-mgr
|
||||
namespace: rook-ceph
|
||||
labels:
|
||||
# This specific label is what tells OKD's Prometheus to pick this up
|
||||
openshift.io/cluster-monitoring: "true"
|
||||
spec:
|
||||
namespaceSelector:
|
||||
matchNames:
|
||||
- rook-ceph
|
||||
selector:
|
||||
matchLabels:
|
||||
# This matches your 'rook-ceph-mgr' service
|
||||
app: rook-ceph-mgr
|
||||
endpoints:
|
||||
- port: ""
|
||||
# The port name in your service is empty/integers, so we use targetPort
|
||||
targetPort: 9283
|
||||
path: /metrics
|
||||
interval: 30s
|
||||
---
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: ServiceMonitor
|
||||
metadata:
|
||||
name: rook-ceph-exporter
|
||||
namespace: rook-ceph
|
||||
labels:
|
||||
# This label is required for OKD cluster-wide monitoring to pick it up
|
||||
openshift.io/cluster-monitoring: "true"
|
||||
team: rook
|
||||
spec:
|
||||
endpoints:
|
||||
- honorLabels: true
|
||||
interval: 10s
|
||||
path: /metrics
|
||||
port: ceph-exporter-http-metrics
|
||||
namespaceSelector:
|
||||
matchNames:
|
||||
- rook-ceph
|
||||
selector:
|
||||
matchLabels:
|
||||
app: rook-ceph-exporter
|
||||
rook_cluster: rook-ceph
|
||||
@@ -0,0 +1,23 @@
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: Role
|
||||
metadata:
|
||||
name: rook-ceph-metrics-viewer
|
||||
namespace: rook-ceph
|
||||
rules:
|
||||
- apiGroups: [""]
|
||||
resources: ["services", "endpoints", "pods"]
|
||||
verbs: ["get", "list", "watch"]
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: RoleBinding
|
||||
metadata:
|
||||
name: rook-ceph-metrics-viewer
|
||||
namespace: rook-ceph
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: Role
|
||||
name: rook-ceph-metrics-viewer
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: prometheus-k8s
|
||||
namespace: openshift-monitoring
|
||||
@@ -0,0 +1,7 @@
|
||||
apiVersion: v1
|
||||
kind: Namespace
|
||||
metadata:
|
||||
name: rook-ceph
|
||||
labels:
|
||||
# This is the critical label that allows OKD Prometheus to see the namespace
|
||||
openshift.io/cluster-monitoring: "true"
|
||||
@@ -0,0 +1,731 @@
|
||||
{
|
||||
"title": "Alerts & Events — Active Problems",
|
||||
"uid": "okd-alerts-events",
|
||||
"schemaVersion": 36,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-3h", "to": "now" },
|
||||
"tags": ["okd", "alerts", "events"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "severity",
|
||||
"type": "custom",
|
||||
"label": "Severity Filter",
|
||||
"query": "critical,warning,info",
|
||||
"current": { "selected": true, "text": "All", "value": "$__all" },
|
||||
"includeAll": true,
|
||||
"allValue": "critical|warning|info",
|
||||
"multi": false,
|
||||
"options": [
|
||||
{ "selected": true, "text": "All", "value": "$__all" },
|
||||
{ "selected": false, "text": "Critical", "value": "critical" },
|
||||
{ "selected": false, "text": "Warning", "value": "warning" },
|
||||
{ "selected": false, "text": "Info", "value": "info" }
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "namespace",
|
||||
"type": "query",
|
||||
"label": "Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"query": { "query": "label_values(ALERTS{alertstate=\"firing\"}, namespace)", "refId": "A" },
|
||||
"refresh": 2,
|
||||
"includeAll": true,
|
||||
"allValue": ".*",
|
||||
"multi": true,
|
||||
"sort": 1,
|
||||
"current": {},
|
||||
"options": []
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
|
||||
{
|
||||
"id": 1, "type": "stat", "title": "Critical Alerts Firing",
|
||||
"description": "Alerting rule instances currently in the firing state with severity=\"critical\". Any non-zero value represents a breached SLO or infrastructure condition requiring immediate on-call response. The ALERTS metric is generated by Prometheus directly from your alerting rules — it reflects what Prometheus knows, before Alertmanager routing or silencing.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(ALERTS{alertstate=\"firing\",severity=\"critical\"}) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 2, "type": "stat", "title": "Warning Alerts Firing",
|
||||
"description": "Firing alerts at severity=\"warning\". Warnings indicate a degraded or elevated-risk condition that has not yet crossed the critical threshold. A sustained or growing warning count often precedes a critical fire — treat them as early-warning signals, not background noise.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(ALERTS{alertstate=\"firing\",severity=\"warning\"}) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "orange", "value": 5 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 3, "type": "stat", "title": "Info / Unclassified Alerts Firing",
|
||||
"description": "Firing alerts with severity=\"info\" or no severity label. These are informational and do not normally require immediate action. A sudden large jump may reveal noisy alerting rules generating alert fatigue — rules worth reviewing for threshold tuning or adding inhibition rules.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(ALERTS{alertstate=\"firing\",severity!~\"critical|warning\"}) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "blue", "value": 1 },
|
||||
{ "color": "blue", "value": 25 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 4, "type": "stat", "title": "Alerts Silenced (Suppressed)",
|
||||
"description": "Alerts currently matched by an active Alertmanager silence rule and therefore not routed to receivers. Silences are intentional during maintenance windows, but a large suppressed count outside of planned maintenance = an overly broad silence masking real problems. Zero silences when a maintenance window is active = the silence has expired or was misconfigured.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "sum(alertmanager_alerts{state=\"suppressed\"}) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 20 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 5, "type": "stat", "title": "CrashLoopBackOff Pods",
|
||||
"description": "Container instances currently waiting in the CrashLoopBackOff state — the container crashed and Kubernetes is retrying with exponential back-off. Each instance is a pod that cannot stay running. Common root causes: OOM kill, bad entrypoint, missing Secret or ConfigMap, an unavailable init dependency, or a broken image layer.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 3 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 6, "type": "stat", "title": "OOMKilled Containers",
|
||||
"description": "Containers whose most recent termination reason was OOMKilled. This is a current-state snapshot: a container that was OOMKilled, restarted, and is now Running will still appear here until its next termination occurs for a different reason. Non-zero and stable = recurring OOM, likely a workload memory leak or under-provisioned memory limit.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_pod_container_status_last_terminated_reason{reason=\"OOMKilled\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "orange", "value": 1 },
|
||||
{ "color": "red", "value": 5 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 7, "type": "stat", "title": "NotReady Nodes",
|
||||
"description": "Nodes where the Ready condition is currently not True (False or Unknown). A NotReady node stops receiving new pod scheduling and, after the node eviction timeout (~5 min default), pods on it will be evicted. Control plane nodes going NotReady simultaneously = potential quorum loss. Any non-zero value is a tier-1 incident signal.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"true\"} == 0) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 8, "type": "stat", "title": "Degraded Cluster Operators (OKD)",
|
||||
"description": "OKD ClusterOperators currently reporting Degraded=True. Each ClusterOperator owns a core platform component — authentication, networking, image-registry, monitoring, ingress, storage, etc. A degraded operator means its managed component is impaired or unavailable. Zero is the only acceptable steady-state value outside of an active upgrade.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(cluster_operator_conditions{condition=\"Degraded\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 9, "type": "row", "title": "Alert Overview", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 10, "type": "timeseries", "title": "Firing Alert Count by Severity Over Time",
|
||||
"description": "Instantaneous count of firing ALERTS series grouped by severity over the selected window. A vertical rise = new alerting condition emerged. A horizontal plateau = a persistent, unresolved problem. A step-down = alert resolved or Prometheus rule evaluation stopped matching. Use the Severity Filter variable to narrow scope during triage.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "count by(severity)(ALERTS{alertstate=\"firing\",severity=~\"$severity\",namespace=~\"$namespace\"})",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{severity}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "critical" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "warning" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "info" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["max", "lastNotNull"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 11, "type": "timeseries", "title": "Alertmanager Notification Rate by Integration",
|
||||
"description": "Rate of notification delivery attempts from Alertmanager per second, split by integration type (slack, pagerduty, email, webhook, etc.). Solid lines = successful deliveries; dashed red lines = failed deliveries. A drop to zero on all integrations = Alertmanager is not processing or the cluster is completely quiet. Persistent failures on one integration = check that receiver's credentials or endpoint availability.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "sum by(integration)(rate(alertmanager_notifications_total[5m]))", "refId": "A", "legendFormat": "✓ {{integration}}" },
|
||||
{ "expr": "sum by(integration)(rate(alertmanager_notifications_failed_total[5m]))", "refId": "B", "legendFormat": "✗ {{integration}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byFrameRefID", "options": "B" },
|
||||
"properties": [
|
||||
{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } },
|
||||
{ "id": "custom.lineStyle", "value": { "dash": [6, 4], "fill": "dash" } },
|
||||
{ "id": "custom.lineWidth", "value": 1 }
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 12, "type": "bargauge", "title": "Longest-Firing Active Alerts",
|
||||
"description": "Duration (now - ALERTS_FOR_STATE timestamp) for each currently firing alert, sorted descending. Alerts at the top have been firing longest and are the most likely candidates for known-but-unresolved issues, stale firing conditions, or alerts that should have a silence applied. Red bars (> 2 hours) strongly suggest a problem that has been acknowledged but not resolved.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sort_desc(time() - ALERTS_FOR_STATE{alertstate=\"firing\",severity=~\"$severity\",namespace=~\"$namespace\"})",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{alertname}} · {{severity}} · {{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 300 },
|
||||
{ "color": "orange", "value": 1800 },
|
||||
{ "color": "red", "value": 7200 }
|
||||
]}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true,
|
||||
"valueMode": "color"
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 13, "type": "row", "title": "Active Firing Alerts — Full Detail", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 14, "type": "table", "title": "All Firing Alerts",
|
||||
"description": "Instant-query table of every currently firing alert visible to Prometheus, filtered by the Namespace and Severity variables above. Each row is one alert instance (unique label combination). The value column is omitted — by definition every row here is firing. Use the built-in column filter (funnel icon) to further narrow to a specific alertname, pod, or node. Columns are sparse: labels not defined in a given alert rule will show '—'.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "ALERTS{alertstate=\"firing\",severity=~\"$severity\",namespace=~\"$namespace\"}",
|
||||
"refId": "A",
|
||||
"instant": true,
|
||||
"legendFormat": ""
|
||||
}],
|
||||
"transformations": [
|
||||
{ "id": "labelsToFields", "options": { "mode": "columns" } },
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"alertstate": true,
|
||||
"__name__": true,
|
||||
"Value": true,
|
||||
"Time": true
|
||||
},
|
||||
"renameByName": {
|
||||
"alertname": "Alert Name",
|
||||
"severity": "Severity",
|
||||
"namespace": "Namespace",
|
||||
"pod": "Pod",
|
||||
"node": "Node",
|
||||
"container": "Container",
|
||||
"job": "Job",
|
||||
"service": "Service",
|
||||
"reason": "Reason",
|
||||
"instance": "Instance"
|
||||
},
|
||||
"indexByName": {
|
||||
"severity": 0,
|
||||
"alertname": 1,
|
||||
"namespace": 2,
|
||||
"pod": 3,
|
||||
"node": 4,
|
||||
"container": 5,
|
||||
"job": 6,
|
||||
"service": 7,
|
||||
"reason": 8,
|
||||
"instance": 9
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": { "align": "left", "filterable": true },
|
||||
"noValue": "—"
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Severity" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "custom.width", "value": 110 },
|
||||
{
|
||||
"id": "mappings",
|
||||
"value": [{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"critical": { "text": "CRITICAL", "color": "dark-red", "index": 0 },
|
||||
"warning": { "text": "WARNING", "color": "dark-yellow", "index": 1 },
|
||||
"info": { "text": "INFO", "color": "dark-blue", "index": 2 }
|
||||
}
|
||||
}]
|
||||
}
|
||||
]
|
||||
},
|
||||
{ "matcher": { "id": "byName", "options": "Alert Name" }, "properties": [{ "id": "custom.width", "value": 300 }] },
|
||||
{ "matcher": { "id": "byName", "options": "Namespace" }, "properties": [{ "id": "custom.width", "value": 180 }] },
|
||||
{ "matcher": { "id": "byName", "options": "Pod" }, "properties": [{ "id": "custom.width", "value": 200 }] },
|
||||
{ "matcher": { "id": "byName", "options": "Node" }, "properties": [{ "id": "custom.width", "value": 200 }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"sortBy": [{ "desc": false, "displayName": "Severity" }],
|
||||
"footer": { "show": false }
|
||||
},
|
||||
"gridPos": { "h": 12, "w": 24, "x": 0, "y": 14 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 15, "type": "row", "title": "Kubernetes Warning Events", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 26 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 16, "type": "timeseries", "title": "Warning Event Rate by Reason",
|
||||
"description": "Rate of Kubernetes Warning-type events per second grouped by reason code. BackOff = container is CrashLooping. FailedScheduling = no node satisfies pod constraints. FailedMount = volume attachment or CSI failure. Evicted = kubelet evicted a pod due to memory or disk pressure. NodeNotReady = node lost contact. A spike in a single reason narrows the incident root-cause immediately without needing to read raw event logs. Requires kube-state-metrics with --resources=events.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "topk(10, sum by(reason)(rate(kube_event_count{type=\"Warning\",namespace=~\"$namespace\"}[5m])))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{reason}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 27 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 17, "type": "bargauge", "title": "Warning Events — Top Namespaces (Accumulated Count)",
|
||||
"description": "Total accumulated Warning event count (the count field on the Kubernetes Event object) per namespace, showing the top 15 most active. A namespace dominating this chart is generating significantly more abnormal conditions than its peers, useful for identifying noisy tenants, misconfigured deployments, or namespaces experiencing a persistent infrastructure problem. Note this is the raw Event.count field — it resets if the event object is deleted and recreated.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "topk(15, sum by(namespace)(kube_event_count{type=\"Warning\"}))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 10 },
|
||||
{ "color": "orange", "value": 50 },
|
||||
{ "color": "red", "value": 200 }
|
||||
]}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 27 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 18, "type": "timeseries", "title": "Warning Events — Accumulated Count by Reason Over Time",
|
||||
"description": "Raw accumulated event count gauge over time, split by reason. Unlike the rate panel this shows total volume and slope simultaneously. A line that climbs steeply = events are occurring frequently right now. A line that plateaus = the condition causing that reason has stopped. A line that drops to zero = the event object was deleted and recreated or the condition fully resolved.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "topk(10, sum by(reason)(kube_event_count{type=\"Warning\",namespace=~\"$namespace\"}))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{reason}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 8, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 27 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 19, "type": "row", "title": "Pod Problems", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 35 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 20, "type": "timeseries", "title": "CrashLoopBackOff Pods by Namespace",
|
||||
"description": "Count of container instances in CrashLoopBackOff waiting state over time, broken down by namespace. A sudden rise in one namespace = a workload deployment is failing. A persistent baseline across many namespaces = a shared dependency (Secret, ConfigMap, network policy, or an upstream service) has become unavailable. Unlike restart rate, this panel shows the steady-state count of pods currently stuck — not flapping.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(namespace)(kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\",namespace=~\"$namespace\"} == 1)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false,
|
||||
"thresholdsStyle": { "mode": "line" }
|
||||
},
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 5 }
|
||||
]}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 0, "y": 36 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 21, "type": "timeseries", "title": "Container Restart Rate by Namespace",
|
||||
"description": "Rate of container restarts per second across all reasons (OOMKill, liveness probe failure, process exit) grouped by namespace. A namespace with a rising restart rate that has not yet entered CrashLoopBackOff is in the early failure window before the exponential back-off penalty kicks in. Cross-reference with the OOMKilled stat tile and the last-terminated-reason to separate crash types.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "topk(10, sum by(namespace)(rate(kube_pod_container_status_restarts_total{namespace=~\"$namespace\"}[5m])))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 8, "y": 36 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 22, "type": "timeseries", "title": "Pods by Problem Phase (Failed / Pending / Unknown)",
|
||||
"description": "Count of pods in Failed, Pending, or Unknown phase over time. Failed = container terminated with a non-zero exit code or was evicted and not rescheduled. Pending for more than a few minutes = scheduler unable to bind the pod (check FailedScheduling events, node capacity, and taint/toleration mismatches). Unknown = kubelet is not reporting to the apiserver, typically indicating a node network partition or kubelet crash.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "sum by(phase)(kube_pod_status_phase{phase=~\"Failed|Unknown\",namespace=~\"$namespace\"} == 1)", "refId": "A", "legendFormat": "{{phase}}" },
|
||||
{ "expr": "sum(kube_pod_status_phase{phase=\"Pending\",namespace=~\"$namespace\"} == 1)", "refId": "B", "legendFormat": "Pending" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false,
|
||||
"thresholdsStyle": { "mode": "line" }
|
||||
},
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]}
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "Failed" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Pending" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Unknown" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 16, "y": 36 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 23, "type": "row", "title": "Node & Cluster Operator Conditions", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 43 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 24, "type": "table", "title": "Node Condition Status Matrix",
|
||||
"description": "Instant snapshot of every active node condition across all nodes. Each row is one (node, condition, status) triple where value=1, meaning that combination is currently true. Ready=true is the normal healthy state; MemoryPressure=true, DiskPressure=true, PIDPressure=true, and NetworkUnavailable=true all indicate problem states that will affect pod scheduling on that node. Use the column filter to show only conditions where status=\"true\" and condition != \"Ready\" to isolate problems quickly.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "kube_node_status_condition == 1",
|
||||
"refId": "A",
|
||||
"instant": true,
|
||||
"legendFormat": ""
|
||||
}],
|
||||
"transformations": [
|
||||
{ "id": "labelsToFields", "options": { "mode": "columns" } },
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"Time": true,
|
||||
"Value": true,
|
||||
"__name__": true,
|
||||
"endpoint": true,
|
||||
"job": true,
|
||||
"service": true,
|
||||
"instance": true
|
||||
},
|
||||
"renameByName": {
|
||||
"node": "Node",
|
||||
"condition": "Condition",
|
||||
"status": "Status"
|
||||
},
|
||||
"indexByName": { "node": 0, "condition": 1, "status": 2 }
|
||||
}
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": { "align": "left", "filterable": true },
|
||||
"noValue": "—"
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Status" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "custom.width", "value": 90 },
|
||||
{
|
||||
"id": "mappings",
|
||||
"value": [{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"true": { "text": "true", "color": "green", "index": 0 },
|
||||
"false": { "text": "false", "color": "dark-red", "index": 1 },
|
||||
"unknown": { "text": "unknown", "color": "dark-orange", "index": 2 }
|
||||
}
|
||||
}]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Condition" },
|
||||
"properties": [
|
||||
{ "id": "custom.width", "value": 190 },
|
||||
{ "id": "custom.displayMode", "value": "color-text" },
|
||||
{
|
||||
"id": "mappings",
|
||||
"value": [{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"Ready": { "color": "green", "index": 0 },
|
||||
"MemoryPressure": { "color": "red", "index": 1 },
|
||||
"DiskPressure": { "color": "red", "index": 2 },
|
||||
"PIDPressure": { "color": "red", "index": 3 },
|
||||
"NetworkUnavailable": { "color": "red", "index": 4 }
|
||||
}
|
||||
}]
|
||||
}
|
||||
]
|
||||
},
|
||||
{ "matcher": { "id": "byName", "options": "Node" }, "properties": [{ "id": "custom.width", "value": 230 }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"sortBy": [{ "desc": false, "displayName": "Node" }],
|
||||
"footer": { "show": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 44 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 25, "type": "table", "title": "Cluster Operator Conditions — Degraded & Progressing (OKD)",
|
||||
"description": "Shows only ClusterOperator conditions that indicate a problem state: Degraded=True (operator has failed to achieve its desired state) or Progressing=True (operator is actively reconciling — normal during upgrades but alarming in steady state). Operators not appearing in this table are healthy. The reason column gives the operator's own explanation for the condition, which maps directly to the relevant operator log stream and OpenShift runbook.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "cluster_operator_conditions{condition=\"Degraded\"} == 1",
|
||||
"refId": "A",
|
||||
"instant": true,
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "cluster_operator_conditions{condition=\"Progressing\"} == 1",
|
||||
"refId": "B",
|
||||
"instant": true,
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{ "id": "labelsToFields", "options": { "mode": "columns" } },
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"Time": true,
|
||||
"Value": true,
|
||||
"__name__": true,
|
||||
"endpoint": true,
|
||||
"job": true,
|
||||
"service": true,
|
||||
"instance": true,
|
||||
"namespace": true
|
||||
},
|
||||
"renameByName": {
|
||||
"name": "Operator",
|
||||
"condition": "Condition",
|
||||
"reason": "Reason"
|
||||
},
|
||||
"indexByName": { "name": 0, "condition": 1, "reason": 2 }
|
||||
}
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": { "align": "left", "filterable": true },
|
||||
"noValue": "—"
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Condition" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "custom.width", "value": 140 },
|
||||
{
|
||||
"id": "mappings",
|
||||
"value": [{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"Degraded": { "text": "Degraded", "color": "dark-red", "index": 0 },
|
||||
"Progressing": { "text": "Progressing", "color": "dark-yellow", "index": 1 }
|
||||
}
|
||||
}]
|
||||
}
|
||||
]
|
||||
},
|
||||
{ "matcher": { "id": "byName", "options": "Operator" }, "properties": [{ "id": "custom.width", "value": 240 }] },
|
||||
{ "matcher": { "id": "byName", "options": "Reason" }, "properties": [{ "id": "custom.width", "value": 220 }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"sortBy": [{ "desc": false, "displayName": "Condition" }],
|
||||
"footer": { "show": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 44 }
|
||||
}
|
||||
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,739 @@
|
||||
{
|
||||
"title": "Cluster Overview",
|
||||
"uid": "okd-cluster-overview",
|
||||
"schemaVersion": 36,
|
||||
"version": 2,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"tags": ["okd", "cluster", "overview"],
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"type": "stat",
|
||||
"title": "Ready Nodes",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"true\"} == 1)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "green", "value": 1 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"type": "stat",
|
||||
"title": "Not Ready Nodes",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"false\"} == 1) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"type": "stat",
|
||||
"title": "Running Pods",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(kube_pod_status_phase{phase=\"Running\"} == 1)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "green", "value": 1 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"type": "stat",
|
||||
"title": "Pending Pods",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(kube_pod_status_phase{phase=\"Pending\"} == 1) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 5 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"type": "stat",
|
||||
"title": "Failed Pods",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(kube_pod_status_phase{phase=\"Failed\"} == 1) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"type": "stat",
|
||||
"title": "CrashLoopBackOff",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\"} == 1) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"type": "stat",
|
||||
"title": "Critical Alerts",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(ALERTS{alertstate=\"firing\",severity=\"critical\"}) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"type": "stat",
|
||||
"title": "Warning Alerts",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(ALERTS{alertstate=\"firing\",severity=\"warning\"}) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 10 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||
},
|
||||
{
|
||||
"id": 9,
|
||||
"type": "gauge",
|
||||
"title": "CPU Usage",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))",
|
||||
"refId": "A",
|
||||
"legendFormat": "CPU"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 70 },
|
||||
{ "color": "red", "value": 85 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"showThresholdLabels": false,
|
||||
"showThresholdMarkers": true,
|
||||
"orientation": "auto"
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 5, "x": 0, "y": 4 }
|
||||
},
|
||||
{
|
||||
"id": 10,
|
||||
"type": "gauge",
|
||||
"title": "Memory Usage",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (sum(node_memory_MemAvailable_bytes) / sum(node_memory_MemTotal_bytes)))",
|
||||
"refId": "A",
|
||||
"legendFormat": "Memory"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 75 },
|
||||
{ "color": "red", "value": 90 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"showThresholdLabels": false,
|
||||
"showThresholdMarkers": true,
|
||||
"orientation": "auto"
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 5, "x": 5, "y": 4 }
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"type": "gauge",
|
||||
"title": "Root Disk Usage",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (sum(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"})))",
|
||||
"refId": "A",
|
||||
"legendFormat": "Disk"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 70 },
|
||||
{ "color": "red", "value": 85 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"showThresholdLabels": false,
|
||||
"showThresholdMarkers": true,
|
||||
"orientation": "auto"
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 4, "x": 10, "y": 4 }
|
||||
},
|
||||
{
|
||||
"id": 12,
|
||||
"type": "stat",
|
||||
"title": "etcd Has Leader",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "min(etcd_server_has_leader)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "green", "value": 1 }
|
||||
]
|
||||
},
|
||||
"mappings": [
|
||||
{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"0": { "text": "NO LEADER", "color": "red" },
|
||||
"1": { "text": "LEADER OK", "color": "green" }
|
||||
}
|
||||
}
|
||||
],
|
||||
"unit": "short",
|
||||
"noValue": "?"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 3, "w": 5, "x": 14, "y": 4 }
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"type": "stat",
|
||||
"title": "API Servers Up",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(up{job=\"apiserver\"})",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "green", "value": 2 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 3, "w": 5, "x": 19, "y": 4 }
|
||||
},
|
||||
{
|
||||
"id": 14,
|
||||
"type": "stat",
|
||||
"title": "etcd Members Up",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(up{job=\"etcd\"})",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 2 },
|
||||
{ "color": "green", "value": 3 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 3, "w": 5, "x": 14, "y": 7 }
|
||||
},
|
||||
{
|
||||
"id": 15,
|
||||
"type": "stat",
|
||||
"title": "Operators Degraded",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(cluster_operator_conditions{condition=\"Degraded\",status=\"True\"} == 1) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 3, "w": 5, "x": 19, "y": 7 }
|
||||
},
|
||||
{
|
||||
"id": 16,
|
||||
"type": "timeseries",
|
||||
"title": "CPU Usage per Node (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 10,
|
||||
"spanNulls": false,
|
||||
"showPoints": "never"
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": {
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"calcs": ["mean", "max"]
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 10 }
|
||||
},
|
||||
{
|
||||
"id": 17,
|
||||
"type": "timeseries",
|
||||
"title": "Memory Usage per Node (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 10,
|
||||
"spanNulls": false,
|
||||
"showPoints": "never"
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": {
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"calcs": ["mean", "max"]
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 10 }
|
||||
},
|
||||
{
|
||||
"id": 18,
|
||||
"type": "timeseries",
|
||||
"title": "Network Traffic — Cluster Total",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo|veth.*|tun.*|ovn.*|br-int|br-ex\"}[5m]))",
|
||||
"refId": "A",
|
||||
"legendFormat": "Receive"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo|veth.*|tun.*|ovn.*|br-int|br-ex\"}[5m]))",
|
||||
"refId": "B",
|
||||
"legendFormat": "Transmit"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps",
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 10,
|
||||
"spanNulls": false,
|
||||
"showPoints": "never"
|
||||
}
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Receive" },
|
||||
"properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Transmit" },
|
||||
"properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "none" },
|
||||
"legend": {
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"calcs": ["mean", "max"]
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 18 }
|
||||
},
|
||||
{
|
||||
"id": 19,
|
||||
"type": "timeseries",
|
||||
"title": "Pod Phases Over Time",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(kube_pod_status_phase{phase=\"Running\"} == 1)",
|
||||
"refId": "A",
|
||||
"legendFormat": "Running"
|
||||
},
|
||||
{
|
||||
"expr": "count(kube_pod_status_phase{phase=\"Pending\"} == 1) or vector(0)",
|
||||
"refId": "B",
|
||||
"legendFormat": "Pending"
|
||||
},
|
||||
{
|
||||
"expr": "count(kube_pod_status_phase{phase=\"Failed\"} == 1) or vector(0)",
|
||||
"refId": "C",
|
||||
"legendFormat": "Failed"
|
||||
},
|
||||
{
|
||||
"expr": "count(kube_pod_status_phase{phase=\"Unknown\"} == 1) or vector(0)",
|
||||
"refId": "D",
|
||||
"legendFormat": "Unknown"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short",
|
||||
"custom": {
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 15,
|
||||
"spanNulls": false,
|
||||
"showPoints": "never"
|
||||
}
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Running" },
|
||||
"properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Pending" },
|
||||
"properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Failed" },
|
||||
"properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Unknown" },
|
||||
"properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "none" },
|
||||
"legend": {
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"calcs": ["lastNotNull"]
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 18 }
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,742 @@
|
||||
{
|
||||
"title": "Control Plane Health",
|
||||
"uid": "okd-control-plane",
|
||||
"schemaVersion": 36,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"tags": ["okd", "control-plane"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "instance",
|
||||
"type": "query",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"query": { "query": "label_values(apiserver_request_total, instance)", "refId": "A" },
|
||||
"refresh": 2,
|
||||
"includeAll": true,
|
||||
"multi": true,
|
||||
"allValue": ".*",
|
||||
"label": "API Server Instance",
|
||||
"sort": 1,
|
||||
"current": {},
|
||||
"options": []
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
|
||||
{
|
||||
"id": 1, "type": "stat", "title": "API Servers Up",
|
||||
"description": "Number of kube-apiserver instances currently scraped and up. Healthy HA cluster = 3.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(up{job=~\".*apiserver.*\"} == 1)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "green", "value": 3 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 2, "type": "stat", "title": "Controller Managers Up",
|
||||
"description": "kube-controller-manager instances up. In OKD only one holds the leader lease at a time; others are hot standbys.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(up{job=~\".*controller-manager.*\"} == 1)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "green", "value": 3 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 3, "type": "stat", "title": "Schedulers Up",
|
||||
"description": "kube-scheduler instances up. One holds the leader lease; rest are standbys. 0 = no scheduling of new pods.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(up{job=~\".*scheduler.*\"} == 1)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "green", "value": 3 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 4, "type": "stat", "title": "API 5xx Rate",
|
||||
"description": "Server-side errors (5xx) across all apiserver instances per second. Any sustained non-zero value = apiserver internal fault.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "sum(rate(apiserver_request_total{code=~\"5..\"}[5m]))", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 0.01 },
|
||||
{ "color": "red", "value": 1 }
|
||||
]},
|
||||
"unit": "reqps", "noValue": "0", "decimals": 3
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 5, "type": "stat", "title": "Inflight — Mutating",
|
||||
"description": "Current in-flight mutating requests (POST/PUT/PATCH/DELETE). Default OKD limit is ~1000. Hitting the limit = 429 errors for writes.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "sum(apiserver_current_inflight_requests{request_kind=\"mutating\"})", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 500 },
|
||||
{ "color": "orange", "value": 750 },
|
||||
{ "color": "red", "value": 900 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 6, "type": "stat", "title": "Inflight — Read-Only",
|
||||
"description": "Current in-flight non-mutating requests (GET/LIST/WATCH). Default OKD limit is ~3000. Hitting it = 429 for reads, impacting controllers and kubectl.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "sum(apiserver_current_inflight_requests{request_kind=\"readOnly\"})", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1500 },
|
||||
{ "color": "orange", "value": 2200 },
|
||||
{ "color": "red", "value": 2700 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 7, "type": "stat", "title": "API Request p99 (non-WATCH)",
|
||||
"description": "Overall p99 latency for all non-streaming verbs. >1s = noticeable kubectl sluggishness. >10s = controllers timing out on LIST/GET.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{verb!~\"WATCH|CONNECT\"}[5m])) by (le))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 0.5 },
|
||||
{ "color": "orange", "value": 1 },
|
||||
{ "color": "red", "value": 5 }
|
||||
]},
|
||||
"unit": "s", "noValue": "0", "decimals": 3
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 8, "type": "stat", "title": "APIServer → etcd p99",
|
||||
"description": "p99 time apiserver spends waiting on etcd calls. Spike here while WAL fsync is healthy = serialization or large object overhead.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(apiserver_storage_request_duration_seconds_bucket[5m])) by (le))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 0.05 },
|
||||
{ "color": "orange", "value": 0.2 },
|
||||
{ "color": "red", "value": 0.5 }
|
||||
]},
|
||||
"unit": "s", "noValue": "0", "decimals": 4
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 9, "type": "row", "title": "API Server — Request Rates & Errors", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 10, "type": "timeseries", "title": "Request Rate by Verb",
|
||||
"description": "Non-streaming calls per second broken down by verb. GET/LIST = read load from controllers. POST/PUT/PATCH/DELETE = write throughput. A sudden LIST spike = controller cache resync storm.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(verb)(rate(apiserver_request_total{instance=~\"$instance\",verb!~\"WATCH|CONNECT\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{verb}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 11, "type": "timeseries", "title": "Error Rate by HTTP Status Code",
|
||||
"description": "4xx/5xx responses per second by code. 429 = inflight limit hit (throttling). 422 = admission rejection or invalid object. 500/503 = internal apiserver fault or etcd unavailability.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(code)(rate(apiserver_request_total{instance=~\"$instance\",code=~\"[45]..\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "HTTP {{code}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 12, "type": "timeseries", "title": "In-Flight Requests — Mutating vs Read-Only",
|
||||
"description": "Instantaneous count of requests being actively handled. The two series correspond to the two inflight limit buckets enforced by the apiserver's Priority and Fairness (APF) or legacy inflight settings.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "sum by(request_kind)(apiserver_current_inflight_requests{instance=~\"$instance\"})", "refId": "A", "legendFormat": "{{request_kind}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 20, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 13, "type": "row", "title": "API Server — Latency", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 14, "type": "timeseries", "title": "Request Latency — p50 / p95 / p99 (non-WATCH)",
|
||||
"description": "Aggregated end-to-end request duration across all verbs except WATCH/CONNECT (which are unbounded streaming). A rising p99 without a matching rise in etcd latency = CPU saturation, admission webhook slowness, or serialization overhead.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum(rate(apiserver_request_duration_seconds_bucket{instance=~\"$instance\",verb!~\"WATCH|CONNECT\"}[5m])) by (le))", "refId": "A", "legendFormat": "p50" },
|
||||
{ "expr": "histogram_quantile(0.95, sum(rate(apiserver_request_duration_seconds_bucket{instance=~\"$instance\",verb!~\"WATCH|CONNECT\"}[5m])) by (le))", "refId": "B", "legendFormat": "p95" },
|
||||
{ "expr": "histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{instance=~\"$instance\",verb!~\"WATCH|CONNECT\"}[5m])) by (le))", "refId": "C", "legendFormat": "p99" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 14 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 15, "type": "timeseries", "title": "Request p99 Latency by Verb",
|
||||
"description": "p99 latency broken out per verb. LIST is inherently slower than GET due to serializing full collections. A POST/PUT spike = heavy admission webhook chain or large object writes. DELETE spikes are usually caused by cascading GC finalizer storms.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "histogram_quantile(0.99, sum by(verb,le)(rate(apiserver_request_duration_seconds_bucket{instance=~\"$instance\",verb!~\"WATCH|CONNECT\"}[5m])))",
|
||||
"refId": "A", "legendFormat": "{{verb}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 14 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 16, "type": "timeseries", "title": "APIServer → etcd Latency by Operation",
|
||||
"description": "Time apiserver spends waiting on etcd, split by operation type (get, list, create, update, delete, watch). Elevated get/list = etcd read pressure. Elevated create/update = write bottleneck, likely correlated with WAL fsync latency.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum by(operation,le)(rate(apiserver_storage_request_duration_seconds_bucket[5m])))", "refId": "A", "legendFormat": "p50 — {{operation}}" },
|
||||
{ "expr": "histogram_quantile(0.99, sum by(operation,le)(rate(apiserver_storage_request_duration_seconds_bucket[5m])))", "refId": "B", "legendFormat": "p99 — {{operation}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 14 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 17, "type": "row", "title": "API Server — Watches & Long-Running Requests", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 22 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 18, "type": "timeseries", "title": "Active Long-Running Requests (Watches) by Resource",
|
||||
"description": "Instantaneous count of open WATCH streams grouped by resource. Each controller typically holds one WATCH per resource type per apiserver instance. A sudden drop = controller restart; a runaway climb = operator creating watches without cleanup.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(resource)(apiserver_longrunning_requests{instance=~\"$instance\",verb=\"WATCH\"})",
|
||||
"refId": "A", "legendFormat": "{{resource}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 0, "y": 23 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 19, "type": "timeseries", "title": "Watch Events Dispatched Rate by Kind",
|
||||
"description": "Watch events sent to all active watchers per second, by object kind. Persistent high rate for a specific kind = that resource type is churning heavily, increasing etcd load and controller reconcile frequency.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(kind)(rate(apiserver_watch_events_total{instance=~\"$instance\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{kind}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 8, "y": 23 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 20, "type": "timeseries", "title": "Watch Event Size — p50 / p95 / p99 by Kind",
|
||||
"description": "Size of individual watch events dispatched to clients. Large events (MiB-scale) for Secrets or ConfigMaps = objects being stored with oversized data. Contributes to apiserver memory pressure and network saturation.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum by(kind,le)(rate(apiserver_watch_events_sizes_bucket{instance=~\"$instance\"}[5m])))", "refId": "A", "legendFormat": "p50 — {{kind}}" },
|
||||
{ "expr": "histogram_quantile(0.99, sum by(kind,le)(rate(apiserver_watch_events_sizes_bucket{instance=~\"$instance\"}[5m])))", "refId": "B", "legendFormat": "p99 — {{kind}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 16, "y": 23 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 21, "type": "row", "title": "Admission Webhooks", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 30 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 22, "type": "timeseries", "title": "Webhook Call Rate by Name",
|
||||
"description": "Mutating and validating admission webhook invocations per second by webhook name. A webhook invoked on every write (e.g., a mutating webhook with no object selector) can be a major source of write latency amplification.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(name,type)(rate(apiserver_admission_webhook_request_total{instance=~\"$instance\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{type}} — {{name}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 0, "y": 31 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 23, "type": "timeseries", "title": "Webhook Latency p99 by Name",
|
||||
"description": "p99 round-trip time per webhook call (network + webhook server processing). Default apiserver timeout is 10s; a webhook consistently near that limit causes cascading write latency for all resources it intercepts.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "histogram_quantile(0.99, sum by(name,le)(rate(apiserver_admission_webhook_admission_duration_seconds_bucket{instance=~\"$instance\"}[5m])))",
|
||||
"refId": "A", "legendFormat": "{{name}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 0.5 },
|
||||
{ "color": "red", "value": 2.0 }
|
||||
]}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 8, "y": 31 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 24, "type": "timeseries", "title": "Webhook Rejection Rate by Name",
|
||||
"description": "Rate of admission denials per webhook. A validating webhook rejecting requests is expected behaviour; a sudden surge indicates either a newly enforced policy or a misbehaving webhook rejecting valid objects.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(name,error_type)(rate(apiserver_admission_webhook_rejection_count{instance=~\"$instance\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{name}} ({{error_type}})"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 16, "y": 31 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 25, "type": "row", "title": "kube-controller-manager", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 38 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 26, "type": "timeseries", "title": "Work Queue Depth by Controller",
|
||||
"description": "Items waiting to be reconciled in each controller's work queue. Persistent non-zero depth = controller cannot keep up with the event rate. Identifies which specific controller is the bottleneck during overload incidents.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "topk(15, sum by(name)(workqueue_depth{job=~\".*controller-manager.*\"}))",
|
||||
"refId": "A", "legendFormat": "{{name}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false,
|
||||
"thresholdsStyle": { "mode": "line" }
|
||||
},
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 10 },
|
||||
{ "color": "red", "value": 50 }
|
||||
]}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 39 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 27, "type": "timeseries", "title": "Work Queue Item Processing Duration p99 by Controller",
|
||||
"description": "p99 time a work item spends being actively reconciled (inside the reconcile loop, excludes queue wait time). A slow reconcile = either the controller is doing expensive API calls or the etcd write path is slow.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "histogram_quantile(0.99, sum by(name,le)(rate(workqueue_work_duration_seconds_bucket{job=~\".*controller-manager.*\"}[5m])))",
|
||||
"refId": "A", "legendFormat": "{{name}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 39 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 28, "type": "timeseries", "title": "Work Queue Retry Rate by Controller",
|
||||
"description": "Rate of items being re-queued after a failed reconciliation. A persistently high retry rate for a controller = it is encountering recurring errors on the same objects (e.g., API permission errors, webhook rejections, or resource conflicts).",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "topk(15, sum by(name)(rate(workqueue_retries_total{job=~\".*controller-manager.*\"}[5m])))",
|
||||
"refId": "A", "legendFormat": "{{name}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 39 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 29, "type": "row", "title": "kube-scheduler", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 47 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 30, "type": "timeseries", "title": "Scheduling Attempt Rate by Result",
|
||||
"description": "Outcomes of scheduling cycles per second. scheduled = pod successfully bound to a node. unschedulable = no node met the pod's constraints. error = scheduler internal failure (API error, timeout). Persistent unschedulable = cluster capacity or taints/affinity misconfiguration.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(result)(rate(scheduler_schedule_attempts_total[5m]))",
|
||||
"refId": "A", "legendFormat": "{{result}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "scheduled" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "unschedulable" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "error" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 0, "y": 48 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 31, "type": "timeseries", "title": "Scheduling Latency — p50 / p95 / p99",
|
||||
"description": "Time from when a pod enters the active queue to when a binding decision is made (does not include bind API call time). Includes filter, score, and reserve plugin execution time. Spike = expensive affinity rules, large number of nodes, or slow extender webhooks.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket[5m])) by (le))", "refId": "A", "legendFormat": "p50" },
|
||||
{ "expr": "histogram_quantile(0.95, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket[5m])) by (le))", "refId": "B", "legendFormat": "p95" },
|
||||
{ "expr": "histogram_quantile(0.99, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket[5m])) by (le))", "refId": "C", "legendFormat": "p99" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 8, "y": 48 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 32, "type": "timeseries", "title": "Pending Pods by Queue",
|
||||
"description": "Pods waiting to be scheduled, split by internal queue. active = ready to be attempted now. backoff = recently failed, in exponential back-off. unschedulable = parked until cluster state changes. A growing unschedulable queue = systemic capacity or constraint problem.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(queue)(scheduler_pending_pods)",
|
||||
"refId": "A", "legendFormat": "{{queue}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false,
|
||||
"thresholdsStyle": { "mode": "line" }
|
||||
},
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 10 },
|
||||
{ "color": "red", "value": 50 }
|
||||
]}
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "unschedulable" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "backoff" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "active" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 16, "y": 48 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 33, "type": "row", "title": "Process Resources — All Control Plane Components", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 55 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 34, "type": "timeseries", "title": "CPU Usage by Component",
|
||||
"description": "Rate of CPU seconds consumed by each control plane process. apiserver CPU spike = surge in request volume or list serialization. controller-manager CPU spike = reconcile storm. scheduler CPU spike = large node count with complex affinity.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "sum by(job)(rate(process_cpu_seconds_total{job=~\".*apiserver.*\"}[5m]))", "refId": "A", "legendFormat": "apiserver — {{job}}" },
|
||||
{ "expr": "sum by(job)(rate(process_cpu_seconds_total{job=~\".*controller-manager.*\"}[5m]))", "refId": "B", "legendFormat": "controller-manager — {{job}}" },
|
||||
{ "expr": "sum by(job)(rate(process_cpu_seconds_total{job=~\".*scheduler.*\"}[5m]))", "refId": "C", "legendFormat": "scheduler — {{job}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percentunit", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 0, "y": 56 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 35, "type": "timeseries", "title": "RSS Memory by Component",
|
||||
"description": "Resident set size of each control plane process. apiserver memory is dominated by the watch cache size and serialisation buffers. controller-manager memory = informer caches. Monotonically growing RSS without restarts = memory leak or unbounded cache growth.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "sum by(job)(process_resident_memory_bytes{job=~\".*apiserver.*\"})", "refId": "A", "legendFormat": "apiserver — {{job}}" },
|
||||
{ "expr": "sum by(job)(process_resident_memory_bytes{job=~\".*controller-manager.*\"})", "refId": "B", "legendFormat": "controller-manager — {{job}}" },
|
||||
{ "expr": "sum by(job)(process_resident_memory_bytes{job=~\".*scheduler.*\"})", "refId": "C", "legendFormat": "scheduler — {{job}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 8, "y": 56 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 36, "type": "timeseries", "title": "Goroutines by Component",
|
||||
"description": "Number of live goroutines in each control plane process. Gradual upward drift = goroutine leak (often tied to unclosed watch streams or context leaks). A step-down = process restart. apiserver typically runs 200–600 goroutines; spikes above 1000 warrant investigation.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "sum by(job)(go_goroutines{job=~\".*apiserver.*\"})", "refId": "A", "legendFormat": "apiserver — {{job}}" },
|
||||
{ "expr": "sum by(job)(go_goroutines{job=~\".*controller-manager.*\"})", "refId": "B", "legendFormat": "controller-manager — {{job}}" },
|
||||
{ "expr": "sum by(job)(go_goroutines{job=~\".*scheduler.*\"})", "refId": "C", "legendFormat": "scheduler — {{job}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 16, "y": 56 }
|
||||
}
|
||||
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,734 @@
|
||||
{
|
||||
"title": "etcd",
|
||||
"uid": "okd-etcd",
|
||||
"schemaVersion": 36,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"tags": ["okd", "etcd"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "instance",
|
||||
"type": "query",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"query": { "query": "label_values(etcd_server_has_leader, instance)", "refId": "A" },
|
||||
"refresh": 2,
|
||||
"includeAll": true,
|
||||
"multi": true,
|
||||
"allValue": ".*",
|
||||
"label": "Instance",
|
||||
"sort": 1,
|
||||
"current": {},
|
||||
"options": []
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
|
||||
{
|
||||
"id": 1, "type": "stat", "title": "Cluster Members",
|
||||
"description": "Total number of etcd members currently reporting metrics.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(etcd_server_has_leader)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "green", "value": 3 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 2, "type": "stat", "title": "Has Leader",
|
||||
"description": "min() across all members. 0 = at least one member has no quorum — cluster is degraded.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "min(etcd_server_has_leader)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "green", "value": 1 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0",
|
||||
"mappings": [
|
||||
{ "type": "value", "options": {
|
||||
"0": { "text": "NO LEADER", "color": "red" },
|
||||
"1": { "text": "OK", "color": "green" }
|
||||
}}
|
||||
]
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 3, "type": "stat", "title": "Leader Changes (1h)",
|
||||
"description": "Number of leader elections in the last hour. ≥3 indicates cluster instability.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "sum(changes(etcd_server_leader_changes_seen_total[1h]))", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 3 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 4, "type": "stat", "title": "DB Size (Max)",
|
||||
"description": "Largest boltdb file size across all members. Default etcd quota is 8 GiB.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "max(etcd_mvcc_db_total_size_in_bytes)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 2147483648 },
|
||||
{ "color": "orange", "value": 5368709120 },
|
||||
{ "color": "red", "value": 7516192768 }
|
||||
]},
|
||||
"unit": "bytes", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 5, "type": "stat", "title": "DB Fragmentation (Max)",
|
||||
"description": "% of DB space that is allocated but unused. >50% → run etcdctl defrag.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "max((etcd_mvcc_db_total_size_in_bytes - etcd_mvcc_db_total_size_in_use_in_bytes) / etcd_mvcc_db_total_size_in_bytes * 100)",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 25 },
|
||||
{ "color": "orange", "value": 50 },
|
||||
{ "color": "red", "value": 75 }
|
||||
]},
|
||||
"unit": "percent", "noValue": "0", "decimals": 1
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 6, "type": "stat", "title": "Failed Proposals/s",
|
||||
"description": "Rate of rejected Raft proposals. Any sustained non-zero value = cluster health problem.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "sum(rate(etcd_server_proposals_failed_total[5m]))", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 0.001 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0", "decimals": 3
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 7, "type": "stat", "title": "WAL Fsync p99",
|
||||
"description": "99th percentile WAL flush-to-disk time. >10ms is concerning; >100ms = serious I/O bottleneck.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) by (le))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 0.01 },
|
||||
{ "color": "orange", "value": 0.1 },
|
||||
{ "color": "red", "value": 0.5 }
|
||||
]},
|
||||
"unit": "s", "noValue": "0", "decimals": 4
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 8, "type": "stat", "title": "Backend Commit p99",
|
||||
"description": "99th percentile boltdb commit time. >25ms = warning; >100ms = critical backend I/O pressure.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) by (le))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 0.025 },
|
||||
{ "color": "orange", "value": 0.1 },
|
||||
{ "color": "red", "value": 0.25 }
|
||||
]},
|
||||
"unit": "s", "noValue": "0", "decimals": 4
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 9, "type": "row", "title": "Cluster Health", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 10, "type": "timeseries", "title": "Has Leader per Instance",
|
||||
"description": "1 = member has a leader; 0 = member lost quorum. A dip to 0 marks the exact moment of a leader election.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "etcd_server_has_leader{instance=~\"$instance\"}",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0, "max": 1.1,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 20, "showPoints": "never", "spanNulls": false },
|
||||
"mappings": [
|
||||
{ "type": "value", "options": {
|
||||
"0": { "text": "0 — no leader" },
|
||||
"1": { "text": "1 — ok" }
|
||||
}}
|
||||
]
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "none" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": [] }
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 8, "x": 0, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 11, "type": "timeseries", "title": "Leader Changes (cumulative)",
|
||||
"description": "Monotonically increasing counter per member. A step jump = one leader election. Correlated jumps across members = cluster-wide event.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "etcd_server_leader_changes_seen_total{instance=~\"$instance\"}",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "auto", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "none" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull"] }
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 8, "x": 8, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 12, "type": "timeseries", "title": "Slow Operations",
|
||||
"description": "slow_apply: proposals applied slower than expected. slow_read_index: linearizable reads timing out. heartbeat_failures: Raft heartbeat send errors (network partition indicator).",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "rate(etcd_server_slow_apply_total{instance=~\"$instance\"}[5m])", "refId": "A", "legendFormat": "Slow Apply — {{instance}}" },
|
||||
{ "expr": "rate(etcd_server_slow_read_indexes_total{instance=~\"$instance\"}[5m])", "refId": "B", "legendFormat": "Slow Read Index — {{instance}}" },
|
||||
{ "expr": "rate(etcd_server_heartbeat_send_failures_total{instance=~\"$instance\"}[5m])", "refId": "C", "legendFormat": "Heartbeat Failures — {{instance}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 8, "x": 16, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 13, "type": "row", "title": "gRPC Traffic", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 11 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 14, "type": "timeseries", "title": "gRPC Request Rate by Method",
|
||||
"description": "Unary calls/s per RPC method. High Put/Txn = heavy write load. High Range = heavy read load. High Watch = many controller watchers.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(grpc_method)(rate(grpc_server_started_total{job=~\".*etcd.*\",grpc_type=\"unary\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{grpc_method}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 12 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 15, "type": "timeseries", "title": "gRPC Error Rate by Status Code",
|
||||
"description": "Non-OK responses by gRPC status code. RESOURCE_EXHAUSTED = overloaded. UNAVAILABLE = leader election. DEADLINE_EXCEEDED = latency spike.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(grpc_code)(rate(grpc_server_handled_total{job=~\".*etcd.*\",grpc_code!=\"OK\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{grpc_code}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 12 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 16, "type": "timeseries", "title": "gRPC Request Latency (p50 / p95 / p99)",
|
||||
"description": "Unary call handling duration. p99 > 100ms for Put/Txn indicates disk or CPU pressure. p99 > 500ms will cause kube-apiserver timeouts.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum(rate(grpc_server_handling_seconds_bucket{job=~\".*etcd.*\",grpc_type=\"unary\"}[5m])) by (le))", "refId": "A", "legendFormat": "p50" },
|
||||
{ "expr": "histogram_quantile(0.95, sum(rate(grpc_server_handling_seconds_bucket{job=~\".*etcd.*\",grpc_type=\"unary\"}[5m])) by (le))", "refId": "B", "legendFormat": "p95" },
|
||||
{ "expr": "histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~\".*etcd.*\",grpc_type=\"unary\"}[5m])) by (le))", "refId": "C", "legendFormat": "p99" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 12 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 17, "type": "row", "title": "Raft Proposals", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 20 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 18, "type": "timeseries", "title": "Proposals Committed vs Applied",
|
||||
"description": "Committed = agreed by Raft quorum. Applied = persisted to boltdb. A widening gap between the two = backend apply backlog (disk too slow to keep up).",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "rate(etcd_server_proposals_committed_total{instance=~\"$instance\"}[5m])", "refId": "A", "legendFormat": "Committed — {{instance}}" },
|
||||
{ "expr": "rate(etcd_server_proposals_applied_total{instance=~\"$instance\"}[5m])", "refId": "B", "legendFormat": "Applied — {{instance}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 0, "y": 21 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 19, "type": "timeseries", "title": "Proposals Pending",
|
||||
"description": "In-flight Raft proposals not yet committed. Consistently high (>5) = cluster cannot keep up with write throughput.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "etcd_server_proposals_pending{instance=~\"$instance\"}",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false,
|
||||
"thresholdsStyle": { "mode": "line+area" }
|
||||
},
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 5 },
|
||||
{ "color": "red", "value": 10 }
|
||||
]}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 8, "y": 21 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 20, "type": "timeseries", "title": "Failed Proposals Rate",
|
||||
"description": "Raft proposals that were rejected. Root causes: quorum loss, leader timeout, network partition between members.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "rate(etcd_server_proposals_failed_total{instance=~\"$instance\"}[5m])",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2, "fillOpacity": 20, "showPoints": "never", "spanNulls": false,
|
||||
"thresholdsStyle": { "mode": "line" }
|
||||
},
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 0.001 }
|
||||
]}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 16, "y": 21 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 21, "type": "row", "title": "Disk I/O", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 28 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 22, "type": "timeseries", "title": "WAL Fsync Duration (p50 / p95 / p99) per Instance",
|
||||
"description": "Time to flush the write-ahead log to disk. etcd is extremely sensitive to WAL latency. >10ms p99 = storage is the bottleneck. Correlates directly with Raft commit latency.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum by(le,instance)(rate(etcd_disk_wal_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "A", "legendFormat": "p50 — {{instance}}" },
|
||||
{ "expr": "histogram_quantile(0.95, sum by(le,instance)(rate(etcd_disk_wal_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "B", "legendFormat": "p95 — {{instance}}" },
|
||||
{ "expr": "histogram_quantile(0.99, sum by(le,instance)(rate(etcd_disk_wal_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "C", "legendFormat": "p99 — {{instance}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 29 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 23, "type": "timeseries", "title": "Backend Commit Duration (p50 / p95 / p99) per Instance",
|
||||
"description": "Time for boltdb to commit a batch transaction. A spike here while WAL is healthy = backend I/O saturation or boltdb lock contention. Triggers apply backlog.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum by(le,instance)(rate(etcd_disk_backend_commit_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "A", "legendFormat": "p50 — {{instance}}" },
|
||||
{ "expr": "histogram_quantile(0.95, sum by(le,instance)(rate(etcd_disk_backend_commit_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "B", "legendFormat": "p95 — {{instance}}" },
|
||||
{ "expr": "histogram_quantile(0.99, sum by(le,instance)(rate(etcd_disk_backend_commit_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "C", "legendFormat": "p99 — {{instance}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 29 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 24, "type": "row", "title": "Network (Peer & Client)", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 37 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 25, "type": "timeseries", "title": "Peer RX Rate",
|
||||
"description": "Bytes received from Raft peers (log replication + heartbeats). A burst during a quiet period = large snapshot being streamed to a recovering member.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "rate(etcd_network_peer_received_bytes_total{instance=~\"$instance\"}[5m])",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 6, "x": 0, "y": 38 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 26, "type": "timeseries", "title": "Peer TX Rate",
|
||||
"description": "Bytes sent to Raft peers. Leader will have higher TX than followers (it replicates entries to all members).",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "rate(etcd_network_peer_sent_bytes_total{instance=~\"$instance\"}[5m])",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 6, "x": 6, "y": 38 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 27, "type": "timeseries", "title": "Client gRPC Received",
|
||||
"description": "Bytes received from API clients (kube-apiserver, operators). Spike = large write burst from controllers or kubectl apply.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "rate(etcd_network_client_grpc_received_bytes_total{instance=~\"$instance\"}[5m])",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 6, "x": 12, "y": 38 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 28, "type": "timeseries", "title": "Client gRPC Sent",
|
||||
"description": "Bytes sent to API clients (responses + watch events). Persistently high = many active Watch streams or large objects being served.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "rate(etcd_network_client_grpc_sent_bytes_total{instance=~\"$instance\"}[5m])",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 6, "x": 18, "y": 38 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 29, "type": "row", "title": "DB Size & Process Resources", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 45 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 30, "type": "timeseries", "title": "DB Total vs In-Use Size per Instance",
|
||||
"description": "Total = allocated boltdb file size. In Use = live key data. The gap between them = fragmentation. Steady growth of Total = compaction not keeping up with key churn.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "etcd_mvcc_db_total_size_in_bytes{instance=~\"$instance\"}", "refId": "A", "legendFormat": "Total — {{instance}}" },
|
||||
{ "expr": "etcd_mvcc_db_total_size_in_use_in_bytes{instance=~\"$instance\"}", "refId": "B", "legendFormat": "In Use — {{instance}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 46 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 31, "type": "timeseries", "title": "Process Resident Memory (RSS)",
|
||||
"description": "Physical RAM consumed by the etcd process. Monotonically growing RSS = memory leak or oversized watch cache. Typical healthy range: 500 MiB–2 GiB depending on cluster size.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "etcd_process_resident_memory_bytes{instance=~\"$instance\"}",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 46 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 32, "type": "timeseries", "title": "Open File Descriptors vs Limit",
|
||||
"description": "Open FD count (solid) and process FD limit (dashed). Approaching the limit will cause WAL file creation and new client connections to fail.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "etcd_process_open_fds{instance=~\"$instance\"}", "refId": "A", "legendFormat": "Open — {{instance}}" },
|
||||
{ "expr": "etcd_process_max_fds{instance=~\"$instance\"}", "refId": "B", "legendFormat": "Limit — {{instance}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byRegexp", "options": "^Limit.*" },
|
||||
"properties": [
|
||||
{ "id": "custom.lineWidth", "value": 1 },
|
||||
{ "id": "custom.lineStyle", "value": { "fill": "dash", "dash": [6, 4] } },
|
||||
{ "id": "custom.fillOpacity","value": 0 }
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 46 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 33, "type": "row", "title": "Snapshots", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 54 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 34, "type": "timeseries", "title": "Snapshot Save Duration (p50 / p95 / p99)",
|
||||
"description": "Time to write a full snapshot of the boltdb to disk. Slow saves delay Raft log compaction, causing the WAL to grow unboundedly and members to fall further behind.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum by(le)(rate(etcd_debugging_snap_save_total_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "A", "legendFormat": "p50" },
|
||||
{ "expr": "histogram_quantile(0.95, sum by(le)(rate(etcd_debugging_snap_save_total_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "B", "legendFormat": "p95" },
|
||||
{ "expr": "histogram_quantile(0.99, sum by(le)(rate(etcd_debugging_snap_save_total_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "C", "legendFormat": "p99" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 12, "x": 0, "y": 55 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 35, "type": "timeseries", "title": "Snapshot DB Fsync Duration (p50 / p95 / p99)",
|
||||
"description": "Time to fsync the snapshot file itself. Distinct from WAL fsync: this is flushing the entire boltdb copy to disk after a snapshot is taken.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum by(le)(rate(etcd_snap_db_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "A", "legendFormat": "p50" },
|
||||
{ "expr": "histogram_quantile(0.95, sum by(le)(rate(etcd_snap_db_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "B", "legendFormat": "p95" },
|
||||
{ "expr": "histogram_quantile(0.99, sum by(le)(rate(etcd_snap_db_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "C", "legendFormat": "p99" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 12, "x": 12, "y": 55 }
|
||||
}
|
||||
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,945 @@
|
||||
{
|
||||
"title": "Networking",
|
||||
"uid": "okd-networking",
|
||||
"schemaVersion": 36,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"tags": ["okd", "networking"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "namespace",
|
||||
"type": "query",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"query": { "query": "label_values(kube_pod_info, namespace)", "refId": "A" },
|
||||
"refresh": 2,
|
||||
"includeAll": true,
|
||||
"multi": true,
|
||||
"allValue": ".*",
|
||||
"label": "Namespace",
|
||||
"sort": 1,
|
||||
"current": {},
|
||||
"options": []
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
|
||||
{
|
||||
"id": 1, "type": "stat", "title": "Network RX Rate",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(container_network_receive_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] },
|
||||
"unit": "Bps", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 2, "type": "stat", "title": "Network TX Rate",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(container_network_transmit_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] },
|
||||
"unit": "Bps", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 3, "type": "stat", "title": "RX Errors/s",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(container_network_receive_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "pps", "noValue": "0", "decimals": 2
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 4, "type": "stat", "title": "TX Errors/s",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(container_network_transmit_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "pps", "noValue": "0", "decimals": 2
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 5, "type": "stat", "title": "RX Drops/s",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(container_network_receive_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "orange", "value": 1 }] },
|
||||
"unit": "pps", "noValue": "0", "decimals": 2
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 6, "type": "stat", "title": "TX Drops/s",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(container_network_transmit_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "orange", "value": 1 }] },
|
||||
"unit": "pps", "noValue": "0", "decimals": 2
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 7, "type": "stat", "title": "DNS Queries/s",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(coredns_dns_requests_total[5m]))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] },
|
||||
"unit": "reqps", "noValue": "0", "decimals": 1
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 8, "type": "stat", "title": "DNS Error %",
|
||||
"description": "Percentage of DNS responses with non-NOERROR rcode over the last 5 minutes.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(coredns_dns_responses_total{rcode!=\"NOERROR\"}[5m])) / sum(rate(coredns_dns_responses_total[5m])) * 100",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 5 }
|
||||
]},
|
||||
"unit": "percent", "noValue": "0", "decimals": 2
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 9, "type": "row", "title": "Network I/O", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 10, "type": "timeseries", "title": "Receive Rate by Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(namespace)(rate(container_network_receive_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 11, "type": "timeseries", "title": "Transmit Rate by Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(namespace)(rate(container_network_transmit_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 12, "type": "row", "title": "Top Pod Consumers", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 13, "type": "timeseries", "title": "Top 10 Pods — RX Rate",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "topk(10, sum by(namespace,pod)(rate(container_network_receive_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m])))",
|
||||
"refId": "A", "legendFormat": "{{namespace}} / {{pod}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "auto", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 14 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 14, "type": "timeseries", "title": "Top 10 Pods — TX Rate",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "topk(10, sum by(namespace,pod)(rate(container_network_transmit_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m])))",
|
||||
"refId": "A", "legendFormat": "{{namespace}} / {{pod}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "auto", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 14 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 15,
|
||||
"type": "table",
|
||||
"title": "Pod Network I/O Summary",
|
||||
"description": "Current RX/TX rates, errors and drops per pod. Sorted by RX rate descending.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace,pod)(rate(container_network_receive_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "instant": true, "format": "table", "legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,pod)(rate(container_network_transmit_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "B", "instant": true, "format": "table", "legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,pod)(rate(container_network_receive_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "C", "instant": true, "format": "table", "legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,pod)(rate(container_network_transmit_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "D", "instant": true, "format": "table", "legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,pod)(rate(container_network_receive_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "E", "instant": true, "format": "table", "legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,pod)(rate(container_network_transmit_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "F", "instant": true, "format": "table", "legendFormat": ""
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "filterFieldsByName",
|
||||
"options": { "include": { "names": ["namespace", "pod", "Value"] } }
|
||||
},
|
||||
{
|
||||
"id": "joinByField",
|
||||
"options": { "byField": "pod", "mode": "outer" }
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"namespace 1": true,
|
||||
"namespace 2": true,
|
||||
"namespace 3": true,
|
||||
"namespace 4": true,
|
||||
"namespace 5": true
|
||||
},
|
||||
"renameByName": {
|
||||
"namespace": "Namespace",
|
||||
"pod": "Pod",
|
||||
"Value": "RX Rate",
|
||||
"Value 1": "TX Rate",
|
||||
"Value 2": "RX Errors/s",
|
||||
"Value 3": "TX Errors/s",
|
||||
"Value 4": "RX Drops/s",
|
||||
"Value 5": "TX Drops/s"
|
||||
},
|
||||
"indexByName": {
|
||||
"namespace": 0,
|
||||
"pod": 1,
|
||||
"Value": 2,
|
||||
"Value 1": 3,
|
||||
"Value 2": 4,
|
||||
"Value 3": 5,
|
||||
"Value 4": 6,
|
||||
"Value 5": 7
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "sortBy",
|
||||
"options": { "fields": [{ "displayName": "RX Rate", "desc": true }] }
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Namespace" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 160 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Pod" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 260 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byRegexp", "options": "^RX Rate$|^TX Rate$" },
|
||||
"properties": [
|
||||
{ "id": "unit", "value": "Bps" },
|
||||
{ "id": "custom.displayMode", "value": "color-background-solid" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 10000000 },
|
||||
{ "color": "orange", "value": 100000000 },
|
||||
{ "color": "red", "value": 500000000 }
|
||||
]}}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byRegexp", "options": "^RX Errors/s$|^TX Errors/s$" },
|
||||
"properties": [
|
||||
{ "id": "unit", "value": "pps" },
|
||||
{ "id": "decimals", "value": 3 },
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 0.001 }
|
||||
]}}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byRegexp", "options": "^RX Drops/s$|^TX Drops/s$" },
|
||||
"properties": [
|
||||
{ "id": "unit", "value": "pps" },
|
||||
{ "id": "decimals", "value": 3 },
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "orange", "value": 0.001 }
|
||||
]}}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {},
|
||||
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 22 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 16, "type": "row", "title": "Errors & Packet Loss", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 30 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 17, "type": "timeseries", "title": "RX Errors by Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(namespace)(rate(container_network_receive_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "pps", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 12, "x": 0, "y": 31 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 18, "type": "timeseries", "title": "TX Errors by Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(namespace)(rate(container_network_transmit_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "pps", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 12, "x": 12, "y": 31 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 19, "type": "timeseries", "title": "RX Packet Drops by Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(namespace)(rate(container_network_receive_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "pps", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 12, "x": 0, "y": 38 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 20, "type": "timeseries", "title": "TX Packet Drops by Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(namespace)(rate(container_network_transmit_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "pps", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 12, "x": 12, "y": 38 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 21, "type": "row", "title": "DNS (CoreDNS)", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 45 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 22, "type": "timeseries", "title": "DNS Request Rate by Query Type",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(type)(rate(coredns_dns_requests_total[5m]))",
|
||||
"refId": "A", "legendFormat": "{{type}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 46 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 23, "type": "timeseries", "title": "DNS Response Rate by Rcode",
|
||||
"description": "NOERROR = healthy. NXDOMAIN = name not found. SERVFAIL = upstream error.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(rcode)(rate(coredns_dns_responses_total[5m]))",
|
||||
"refId": "A", "legendFormat": "{{rcode}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "NOERROR" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "NXDOMAIN" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "SERVFAIL" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "REFUSED" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 46 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 24, "type": "timeseries", "title": "DNS Request Latency (p50 / p95 / p99)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.50, sum(rate(coredns_dns_request_duration_seconds_bucket[5m])) by (le))",
|
||||
"refId": "A", "legendFormat": "p50"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, sum(rate(coredns_dns_request_duration_seconds_bucket[5m])) by (le))",
|
||||
"refId": "B", "legendFormat": "p95"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(coredns_dns_request_duration_seconds_bucket[5m])) by (le))",
|
||||
"refId": "C", "legendFormat": "p99"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 46 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 25, "type": "timeseries", "title": "DNS Cache Hit Ratio (%)",
|
||||
"description": "High hit ratio = CoreDNS is serving responses from cache, reducing upstream load.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(coredns_cache_hits_total[5m])) / (sum(rate(coredns_cache_hits_total[5m])) + sum(rate(coredns_cache_misses_total[5m]))) * 100",
|
||||
"refId": "A", "legendFormat": "Cache Hit %"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 50 },
|
||||
{ "color": "green", "value": 80 }
|
||||
]},
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 20, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "single" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "lastNotNull"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 12, "x": 0, "y": 54 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 26, "type": "timeseries", "title": "DNS Forward Request Rate",
|
||||
"description": "Queries CoreDNS is forwarding upstream. Spike here with cache miss spike = upstream DNS pressure.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(coredns_forward_requests_total[5m]))",
|
||||
"refId": "A", "legendFormat": "Forward Requests/s"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(coredns_forward_responses_duration_seconds_count[5m]))",
|
||||
"refId": "B", "legendFormat": "Forward Responses/s"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 12, "x": 12, "y": 54 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 27, "type": "row", "title": "Services & Endpoints", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 61 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 28, "type": "stat", "title": "Total Services",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "count(kube_service_info{namespace=~\"$namespace\"})",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 8, "x": 0, "y": 62 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 29, "type": "stat", "title": "Endpoint Addresses Available",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(kube_endpoint_address_available{namespace=~\"$namespace\"})",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 8, "x": 8, "y": 62 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 30, "type": "stat", "title": "Endpoint Addresses Not Ready",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(kube_endpoint_address_not_ready{namespace=~\"$namespace\"}) or vector(0)",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 8, "x": 16, "y": 62 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 31,
|
||||
"type": "table",
|
||||
"title": "Endpoint Availability",
|
||||
"description": "Per-endpoint available vs not-ready address counts. Red Not Ready = pods backing this service are unhealthy.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace,endpoint)(kube_endpoint_address_available{namespace=~\"$namespace\"})",
|
||||
"refId": "A", "instant": true, "format": "table", "legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,endpoint)(kube_endpoint_address_not_ready{namespace=~\"$namespace\"})",
|
||||
"refId": "B", "instant": true, "format": "table", "legendFormat": ""
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "filterFieldsByName",
|
||||
"options": { "include": { "names": ["namespace", "endpoint", "Value"] } }
|
||||
},
|
||||
{
|
||||
"id": "joinByField",
|
||||
"options": { "byField": "endpoint", "mode": "outer" }
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": { "namespace 1": true },
|
||||
"renameByName": {
|
||||
"namespace": "Namespace",
|
||||
"endpoint": "Endpoint",
|
||||
"Value": "Available",
|
||||
"Value 1": "Not Ready"
|
||||
},
|
||||
"indexByName": {
|
||||
"namespace": 0,
|
||||
"endpoint": 1,
|
||||
"Value": 2,
|
||||
"Value 1": 3
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "sortBy",
|
||||
"options": { "fields": [{ "displayName": "Not Ready", "desc": true }] }
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Namespace" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 180 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Endpoint" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 220 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Available" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] } }
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Not Ready" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] } }
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {},
|
||||
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 66 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 32, "type": "row", "title": "OKD Router / Ingress (HAProxy)", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 74 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 33, "type": "timeseries", "title": "Router HTTP Request Rate by Code",
|
||||
"description": "Requires HAProxy router metrics to be scraped (port 1936). OKD exposes these via the openshift-ingress ServiceMonitor.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(code)(rate(haproxy_backend_http_responses_total[5m]))",
|
||||
"refId": "A", "legendFormat": "HTTP {{code}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "HTTP 2xx" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "HTTP 4xx" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "HTTP 5xx" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 75 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 34, "type": "timeseries", "title": "Router 4xx + 5xx Error Rate (%)",
|
||||
"description": "Client error (4xx) and server error (5xx) rates as a percentage of all requests.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(haproxy_backend_http_responses_total{code=\"4xx\"}[5m])) / sum(rate(haproxy_backend_http_responses_total[5m])) * 100",
|
||||
"refId": "A", "legendFormat": "4xx %"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(haproxy_backend_http_responses_total{code=\"5xx\"}[5m])) / sum(rate(haproxy_backend_http_responses_total[5m])) * 100",
|
||||
"refId": "B", "legendFormat": "5xx %"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 5 }
|
||||
]}
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "4xx %" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "5xx %" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 75 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 35, "type": "timeseries", "title": "Router Bytes In / Out",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(haproxy_frontend_bytes_in_total[5m]))",
|
||||
"refId": "A", "legendFormat": "Bytes In"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(haproxy_frontend_bytes_out_total[5m]))",
|
||||
"refId": "B", "legendFormat": "Bytes Out"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "Bytes In" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Bytes Out" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 83 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 36,
|
||||
"type": "table",
|
||||
"title": "Router Backend Server Status",
|
||||
"description": "HAProxy backend servers (routes). Value 0 = DOWN, 1 = UP.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "haproxy_server_up",
|
||||
"refId": "A", "instant": true, "format": "table", "legendFormat": ""
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "filterFieldsByName",
|
||||
"options": { "include": { "names": ["proxy", "server", "Value"] } }
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {},
|
||||
"renameByName": {
|
||||
"proxy": "Backend",
|
||||
"server": "Server",
|
||||
"Value": "Status"
|
||||
},
|
||||
"indexByName": { "proxy": 0, "server": 1, "Value": 2 }
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "sortBy",
|
||||
"options": { "fields": [{ "displayName": "Status", "desc": false }] }
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Backend" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 260 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Server" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 180 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Status" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "mappings", "value": [
|
||||
{ "type": "value", "options": { "0": { "text": "DOWN", "color": "red" } } },
|
||||
{ "type": "value", "options": { "1": { "text": "UP", "color": "green" } } }
|
||||
]},
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "green", "value": 1 }
|
||||
]}}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 83 }
|
||||
}
|
||||
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,627 @@
|
||||
{
|
||||
"title": "Node Health",
|
||||
"uid": "okd-node-health",
|
||||
"schemaVersion": 36,
|
||||
"version": 2,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"tags": ["okd", "node", "health"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "node",
|
||||
"type": "query",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"query": { "query": "label_values(kube_node_info, node)", "refId": "A" },
|
||||
"refresh": 2,
|
||||
"includeAll": true,
|
||||
"multi": true,
|
||||
"allValue": ".*",
|
||||
"label": "Node",
|
||||
"sort": 1,
|
||||
"current": {},
|
||||
"options": []
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
|
||||
{
|
||||
"id": 1,
|
||||
"type": "stat",
|
||||
"title": "Total Nodes",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_node_info{node=~\"$node\"})", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 2,
|
||||
"type": "stat",
|
||||
"title": "Ready Nodes",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"$node\"} == 1)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 3,
|
||||
"type": "stat",
|
||||
"title": "Not Ready Nodes",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"false\",node=~\"$node\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 4,
|
||||
"type": "stat",
|
||||
"title": "Memory Pressure",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"MemoryPressure\",status=\"true\",node=~\"$node\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 5,
|
||||
"type": "stat",
|
||||
"title": "Disk Pressure",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"DiskPressure\",status=\"true\",node=~\"$node\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 6,
|
||||
"type": "stat",
|
||||
"title": "PID Pressure",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"PIDPressure\",status=\"true\",node=~\"$node\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "orange", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 7,
|
||||
"type": "stat",
|
||||
"title": "Unschedulable",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_node_spec_unschedulable{node=~\"$node\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 8,
|
||||
"type": "stat",
|
||||
"title": "Kubelet Up",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(up{job=\"kubelet\",metrics_path=\"/metrics\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 9,
|
||||
"type": "table",
|
||||
"title": "Node Conditions",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(node) (kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"$node\"})",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{node}}",
|
||||
"instant": true
|
||||
},
|
||||
{
|
||||
"expr": "sum by(node) (kube_node_status_condition{condition=\"MemoryPressure\",status=\"true\",node=~\"$node\"})",
|
||||
"refId": "B",
|
||||
"legendFormat": "{{node}}",
|
||||
"instant": true
|
||||
},
|
||||
{
|
||||
"expr": "sum by(node) (kube_node_status_condition{condition=\"DiskPressure\",status=\"true\",node=~\"$node\"})",
|
||||
"refId": "C",
|
||||
"legendFormat": "{{node}}",
|
||||
"instant": true
|
||||
},
|
||||
{
|
||||
"expr": "sum by(node) (kube_node_status_condition{condition=\"PIDPressure\",status=\"true\",node=~\"$node\"})",
|
||||
"refId": "D",
|
||||
"legendFormat": "{{node}}",
|
||||
"instant": true
|
||||
},
|
||||
{
|
||||
"expr": "sum by(node) (kube_node_spec_unschedulable{node=~\"$node\"})",
|
||||
"refId": "E",
|
||||
"legendFormat": "{{node}}",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "labelsToFields",
|
||||
"options": { "mode": "columns" }
|
||||
},
|
||||
{
|
||||
"id": "joinByField",
|
||||
"options": { "byField": "node", "mode": "outer" }
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"Time": true,
|
||||
"Time 1": true,
|
||||
"Time 2": true,
|
||||
"Time 3": true,
|
||||
"Time 4": true,
|
||||
"Time 5": true
|
||||
},
|
||||
"renameByName": {
|
||||
"node": "Node",
|
||||
"Value #A": "Ready",
|
||||
"Value #B": "Mem Pressure",
|
||||
"Value #C": "Disk Pressure",
|
||||
"Value #D": "PID Pressure",
|
||||
"Value #E": "Unschedulable"
|
||||
},
|
||||
"indexByName": {
|
||||
"node": 0,
|
||||
"Value #A": 1,
|
||||
"Value #B": 2,
|
||||
"Value #C": 3,
|
||||
"Value #D": 4,
|
||||
"Value #E": 5
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": { "displayMode": "color-background", "align": "center" }
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Node" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "auto" },
|
||||
{ "id": "custom.align", "value": "left" },
|
||||
{ "id": "custom.width", "value": 200 }
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Ready" },
|
||||
"properties": [
|
||||
{
|
||||
"id": "thresholds",
|
||||
"value": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] }
|
||||
},
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{
|
||||
"id": "mappings",
|
||||
"value": [
|
||||
{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"0": { "text": "✗ Not Ready", "color": "red", "index": 0 },
|
||||
"1": { "text": "✓ Ready", "color": "green", "index": 1 }
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byRegexp", "options": ".*Pressure" },
|
||||
"properties": [
|
||||
{
|
||||
"id": "thresholds",
|
||||
"value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] }
|
||||
},
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{
|
||||
"id": "mappings",
|
||||
"value": [
|
||||
{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"0": { "text": "✓ OK", "color": "green", "index": 0 },
|
||||
"1": { "text": "⚠ Active", "color": "red", "index": 1 }
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Unschedulable" },
|
||||
"properties": [
|
||||
{
|
||||
"id": "thresholds",
|
||||
"value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 1 }] }
|
||||
},
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{
|
||||
"id": "mappings",
|
||||
"value": [
|
||||
{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"0": { "text": "✓ Schedulable", "color": "green", "index": 0 },
|
||||
"1": { "text": "⚠ Cordoned", "color": "yellow", "index": 1 }
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": { "sortBy": [{ "displayName": "Node", "desc": false }] },
|
||||
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 10,
|
||||
"type": "timeseries",
|
||||
"title": "CPU Usage per Node (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 100,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 85 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 16, "x": 0, "y": 12 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 11,
|
||||
"type": "bargauge",
|
||||
"title": "CPU Usage \u2014 Current",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 85 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true,
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 12 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 12,
|
||||
"type": "timeseries",
|
||||
"title": "Memory Usage per Node (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 100,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 75 }, { "color": "red", "value": 90 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 16, "x": 0, "y": 20 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 13,
|
||||
"type": "bargauge",
|
||||
"title": "Memory Usage \u2014 Current",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 75 }, { "color": "red", "value": 90 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true,
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 20 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 14,
|
||||
"type": "timeseries",
|
||||
"title": "Root Disk Usage per Node (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"}))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 100,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 85 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 16, "x": 0, "y": 28 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 15,
|
||||
"type": "bargauge",
|
||||
"title": "Root Disk Usage \u2014 Current",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"}))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 85 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true,
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 28 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 16,
|
||||
"type": "timeseries",
|
||||
"title": "Network Traffic per Node",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(instance) (rate(node_network_receive_bytes_total{device!~\"lo|veth.*|tun.*|ovn.*|br.*\"}[5m]))",
|
||||
"refId": "A",
|
||||
"legendFormat": "rx {{instance}}"
|
||||
},
|
||||
{
|
||||
"expr": "sum by(instance) (rate(node_network_transmit_bytes_total{device!~\"lo|veth.*|tun.*|ovn.*|br.*\"}[5m]))",
|
||||
"refId": "B",
|
||||
"legendFormat": "tx {{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps",
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 36 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 17,
|
||||
"type": "bargauge",
|
||||
"title": "Pods per Node",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count by(node) (kube_pod_info{node=~\"$node\"})",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{node}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short",
|
||||
"min": 0,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 100 },
|
||||
{ "color": "red", "value": 200 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true,
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 36 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 18,
|
||||
"type": "timeseries",
|
||||
"title": "System Load Average (1m) per Node",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "node_load1",
|
||||
"refId": "A",
|
||||
"legendFormat": "1m \u2014 {{instance}}"
|
||||
},
|
||||
{
|
||||
"expr": "node_load5",
|
||||
"refId": "B",
|
||||
"legendFormat": "5m \u2014 {{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short",
|
||||
"min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 44 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 19,
|
||||
"type": "bargauge",
|
||||
"title": "Node Uptime",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "time() - node_boot_time_seconds",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s",
|
||||
"min": 0,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 300 },
|
||||
{ "color": "green", "value": 3600 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": false,
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 44 }
|
||||
}
|
||||
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,596 @@
|
||||
{
|
||||
"title": "Storage Health",
|
||||
"uid": "storage-health",
|
||||
"schemaVersion": 36,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"panels": [
|
||||
|
||||
{
|
||||
"type": "row",
|
||||
"id": 1,
|
||||
"title": "PVC / PV Status",
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 2,
|
||||
"title": "Bound PVCs",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Bound\"}) or vector(0)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 4, "x": 0, "y": 1 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 3,
|
||||
"title": "Pending PVCs",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Pending\"}) or vector(0)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 4, "x": 4, "y": 1 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 4,
|
||||
"title": "Lost PVCs",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Lost\"}) or vector(0)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 4, "x": 8, "y": 1 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 5,
|
||||
"title": "Bound PVs / Available PVs",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(kube_persistentvolume_status_phase{phase=\"Bound\"}) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": "Bound"
|
||||
},
|
||||
{
|
||||
"expr": "sum(kube_persistentvolume_status_phase{phase=\"Available\"}) or vector(0)",
|
||||
"refId": "B",
|
||||
"legendFormat": "Available"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "blue", "value": null }]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 4, "x": 12, "y": 1 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 6,
|
||||
"title": "Ceph Cluster Health",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "ceph_health_status",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 2 }
|
||||
]
|
||||
},
|
||||
"mappings": [
|
||||
{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"0": { "text": "HEALTH_OK", "index": 0 },
|
||||
"1": { "text": "HEALTH_WARN", "index": 1 },
|
||||
"2": { "text": "HEALTH_ERR", "index": 2 }
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"textMode": "value"
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 4, "x": 16, "y": 1 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 7,
|
||||
"title": "OSDs Up / Total",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(ceph_osd_up) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": "Up"
|
||||
},
|
||||
{
|
||||
"expr": "count(ceph_osd_metadata) or vector(0)",
|
||||
"refId": "B",
|
||||
"legendFormat": "Total"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 4, "x": 20, "y": 1 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "row",
|
||||
"id": 8,
|
||||
"title": "Cluster Capacity",
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 6 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "gauge",
|
||||
"id": 9,
|
||||
"title": "Ceph Cluster Used (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes) / ceph_cluster_total_bytes",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 70 },
|
||||
{ "color": "red", "value": 85 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"showThresholdLabels": true,
|
||||
"showThresholdMarkers": true
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 5, "x": 0, "y": 7 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 10,
|
||||
"title": "Ceph Capacity — Total / Available",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "ceph_cluster_total_bytes",
|
||||
"refId": "A",
|
||||
"legendFormat": "Total"
|
||||
},
|
||||
{
|
||||
"expr": "ceph_cluster_total_bytes - (ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes)",
|
||||
"refId": "B",
|
||||
"legendFormat": "Available"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes",
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "blue", "value": null }]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "value",
|
||||
"graphMode": "none",
|
||||
"textMode": "auto",
|
||||
"orientation": "vertical"
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 4, "x": 5, "y": 7 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "bargauge",
|
||||
"id": 11,
|
||||
"title": "PV Allocated Capacity by Storage Class (Bound)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by (storageclass) (\n kube_persistentvolume_capacity_bytes\n * on(persistentvolume) group_left(storageclass)\n kube_persistentvolume_status_phase{phase=\"Bound\"}\n)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{storageclass}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes",
|
||||
"color": { "mode": "palette-classic" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "blue", "value": null }]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 7, "x": 9, "y": 7 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "piechart",
|
||||
"id": 12,
|
||||
"title": "PVC Phase Distribution",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Bound\"}) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": "Bound"
|
||||
},
|
||||
{
|
||||
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Pending\"}) or vector(0)",
|
||||
"refId": "B",
|
||||
"legendFormat": "Pending"
|
||||
},
|
||||
{
|
||||
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Lost\"}) or vector(0)",
|
||||
"refId": "C",
|
||||
"legendFormat": "Lost"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "color": { "mode": "palette-classic" } }
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"pieType": "pie",
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"values": ["value", "percent"]
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 7 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "row",
|
||||
"id": 13,
|
||||
"title": "Ceph Performance",
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 15 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "timeseries",
|
||||
"id": 14,
|
||||
"title": "Ceph Pool IOPS (Read / Write)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(ceph_pool_rd[5m])",
|
||||
"refId": "A",
|
||||
"legendFormat": "Read — pool {{pool_id}}"
|
||||
},
|
||||
{
|
||||
"expr": "rate(ceph_pool_wr[5m])",
|
||||
"refId": "B",
|
||||
"legendFormat": "Write — pool {{pool_id}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ops",
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 8 }
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "timeseries",
|
||||
"id": 15,
|
||||
"title": "Ceph Pool Throughput (Read / Write)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(ceph_pool_rd_bytes[5m])",
|
||||
"refId": "A",
|
||||
"legendFormat": "Read — pool {{pool_id}}"
|
||||
},
|
||||
{
|
||||
"expr": "rate(ceph_pool_wr_bytes[5m])",
|
||||
"refId": "B",
|
||||
"legendFormat": "Write — pool {{pool_id}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps",
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 8 }
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "row",
|
||||
"id": 16,
|
||||
"title": "Ceph OSD & Pool Details",
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 24 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "timeseries",
|
||||
"id": 17,
|
||||
"title": "Ceph Pool Space Used (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * ceph_pool_bytes_used / (ceph_pool_bytes_used + ceph_pool_max_avail)",
|
||||
"refId": "A",
|
||||
"legendFormat": "Pool {{pool_id}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 70 },
|
||||
{ "color": "red", "value": 85 }
|
||||
]
|
||||
},
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10 }
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 25 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "bargauge",
|
||||
"id": 18,
|
||||
"title": "OSD Status per Daemon (green = Up, red = Down)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "ceph_osd_up",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{ceph_daemon}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"min": 0,
|
||||
"max": 1,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "green", "value": 1 }
|
||||
]
|
||||
},
|
||||
"mappings": [
|
||||
{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"0": { "text": "DOWN", "index": 0 },
|
||||
"1": { "text": "UP", "index": 1 }
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"displayMode": "basic",
|
||||
"showUnfilled": true
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 25 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "row",
|
||||
"id": 19,
|
||||
"title": "Node Disk Usage",
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 33 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "timeseries",
|
||||
"id": 20,
|
||||
"title": "Node Root Disk Usage Over Time (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} * 100)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 70 },
|
||||
{ "color": "red", "value": 85 }
|
||||
]
|
||||
},
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10 }
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 34 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "bargauge",
|
||||
"id": 21,
|
||||
"title": "Current Disk Usage — All Nodes & Mountpoints",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 - (node_filesystem_avail_bytes{fstype!~\"tmpfs|overlay|squashfs\"} / node_filesystem_size_bytes{fstype!~\"tmpfs|overlay|squashfs\"} * 100)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}} — {{mountpoint}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 70 },
|
||||
{ "color": "red", "value": 85 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 34 }
|
||||
}
|
||||
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,773 @@
|
||||
{
|
||||
"title": "Workload Health",
|
||||
"uid": "okd-workload-health",
|
||||
"schemaVersion": 36,
|
||||
"version": 3,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"tags": ["okd", "workload", "health"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "namespace",
|
||||
"type": "query",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"query": { "query": "label_values(kube_pod_info, namespace)", "refId": "A" },
|
||||
"refresh": 2,
|
||||
"includeAll": true,
|
||||
"multi": true,
|
||||
"allValue": ".*",
|
||||
"label": "Namespace",
|
||||
"sort": 1,
|
||||
"current": {},
|
||||
"options": []
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
|
||||
{
|
||||
"id": 1, "type": "stat", "title": "Total Pods",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_pod_info{namespace=~\"$namespace\"})", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 2, "type": "stat", "title": "Running Pods",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_pod_status_phase{phase=\"Running\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 3, "type": "stat", "title": "Pending Pods",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_pod_status_phase{phase=\"Pending\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 4, "type": "stat", "title": "Failed Pods",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_pod_status_phase{phase=\"Failed\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 5, "type": "stat", "title": "CrashLoopBackOff",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 6, "type": "stat", "title": "OOMKilled",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_pod_container_status_last_terminated_reason{reason=\"OOMKilled\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "orange", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 7, "type": "stat", "title": "Deployments Available",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_deployment_status_condition{condition=\"Available\",status=\"true\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 8, "type": "stat", "title": "Deployments Degraded",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_deployment_status_replicas_unavailable{namespace=~\"$namespace\"} > 0) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 9, "type": "row", "title": "Deployments", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 10,
|
||||
"type": "table",
|
||||
"title": "Deployment Status",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace,deployment)(kube_deployment_spec_replicas{namespace=~\"$namespace\"})",
|
||||
"refId": "A",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,deployment)(kube_deployment_status_replicas_ready{namespace=~\"$namespace\"})",
|
||||
"refId": "B",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,deployment)(kube_deployment_status_replicas_available{namespace=~\"$namespace\"})",
|
||||
"refId": "C",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,deployment)(kube_deployment_status_replicas_unavailable{namespace=~\"$namespace\"})",
|
||||
"refId": "D",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,deployment)(kube_deployment_status_replicas_updated{namespace=~\"$namespace\"})",
|
||||
"refId": "E",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "filterFieldsByName",
|
||||
"options": {
|
||||
"include": {
|
||||
"names": ["namespace", "deployment", "Value"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "joinByField",
|
||||
"options": {
|
||||
"byField": "deployment",
|
||||
"mode": "outer"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"namespace 1": true,
|
||||
"namespace 2": true,
|
||||
"namespace 3": true,
|
||||
"namespace 4": true
|
||||
},
|
||||
"renameByName": {
|
||||
"namespace": "Namespace",
|
||||
"deployment": "Deployment",
|
||||
"Value": "Desired",
|
||||
"Value 1": "Ready",
|
||||
"Value 2": "Available",
|
||||
"Value 3": "Unavailable",
|
||||
"Value 4": "Up-to-date"
|
||||
},
|
||||
"indexByName": {
|
||||
"namespace": 0,
|
||||
"deployment": 1,
|
||||
"Value": 2,
|
||||
"Value 1": 3,
|
||||
"Value 2": 4,
|
||||
"Value 3": 5,
|
||||
"Value 4": 6
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "sortBy",
|
||||
"options": {
|
||||
"fields": [{ "displayName": "Namespace", "desc": false }]
|
||||
}
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Namespace" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 200 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Deployment" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 220 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Unavailable" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{
|
||||
"id": "thresholds",
|
||||
"value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] }
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Ready" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{
|
||||
"id": "thresholds",
|
||||
"value": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] }
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": { "sortBy": [{ "displayName": "Namespace", "desc": false }] },
|
||||
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 11, "type": "row", "title": "StatefulSets & DaemonSets", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 12,
|
||||
"type": "table",
|
||||
"title": "StatefulSet Status",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace,statefulset)(kube_statefulset_replicas{namespace=~\"$namespace\"})",
|
||||
"refId": "A",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,statefulset)(kube_statefulset_status_replicas_ready{namespace=~\"$namespace\"})",
|
||||
"refId": "B",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,statefulset)(kube_statefulset_status_replicas_current{namespace=~\"$namespace\"})",
|
||||
"refId": "C",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,statefulset)(kube_statefulset_status_replicas_updated{namespace=~\"$namespace\"})",
|
||||
"refId": "D",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "filterFieldsByName",
|
||||
"options": {
|
||||
"include": {
|
||||
"names": ["namespace", "statefulset", "Value"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "joinByField",
|
||||
"options": {
|
||||
"byField": "statefulset",
|
||||
"mode": "outer"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"namespace 1": true,
|
||||
"namespace 2": true,
|
||||
"namespace 3": true
|
||||
},
|
||||
"renameByName": {
|
||||
"namespace": "Namespace",
|
||||
"statefulset": "StatefulSet",
|
||||
"Value": "Desired",
|
||||
"Value 1": "Ready",
|
||||
"Value 2": "Current",
|
||||
"Value 3": "Up-to-date"
|
||||
},
|
||||
"indexByName": {
|
||||
"namespace": 0,
|
||||
"statefulset": 1,
|
||||
"Value": 2,
|
||||
"Value 1": 3,
|
||||
"Value 2": 4,
|
||||
"Value 3": 5
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "sortBy",
|
||||
"options": { "fields": [{ "displayName": "Namespace", "desc": false }] }
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Namespace" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 180 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "StatefulSet" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 200 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Ready" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] } }
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 14 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 13,
|
||||
"type": "table",
|
||||
"title": "DaemonSet Status",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace,daemonset)(kube_daemonset_status_desired_number_scheduled{namespace=~\"$namespace\"})",
|
||||
"refId": "A",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,daemonset)(kube_daemonset_status_number_ready{namespace=~\"$namespace\"})",
|
||||
"refId": "B",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,daemonset)(kube_daemonset_status_number_unavailable{namespace=~\"$namespace\"})",
|
||||
"refId": "C",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,daemonset)(kube_daemonset_status_number_misscheduled{namespace=~\"$namespace\"})",
|
||||
"refId": "D",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "filterFieldsByName",
|
||||
"options": {
|
||||
"include": {
|
||||
"names": ["namespace", "daemonset", "Value"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "joinByField",
|
||||
"options": {
|
||||
"byField": "daemonset",
|
||||
"mode": "outer"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"namespace 1": true,
|
||||
"namespace 2": true,
|
||||
"namespace 3": true
|
||||
},
|
||||
"renameByName": {
|
||||
"namespace": "Namespace",
|
||||
"daemonset": "DaemonSet",
|
||||
"Value": "Desired",
|
||||
"Value 1": "Ready",
|
||||
"Value 2": "Unavailable",
|
||||
"Value 3": "Misscheduled"
|
||||
},
|
||||
"indexByName": {
|
||||
"namespace": 0,
|
||||
"daemonset": 1,
|
||||
"Value": 2,
|
||||
"Value 1": 3,
|
||||
"Value 2": 4,
|
||||
"Value 3": 5
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "sortBy",
|
||||
"options": { "fields": [{ "displayName": "Namespace", "desc": false }] }
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Namespace" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 180 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "DaemonSet" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 200 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Ready" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] } }
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Unavailable" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] } }
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Misscheduled" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "orange", "value": 1 }] } }
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 14 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 14, "type": "row", "title": "Pods", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 22 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 15,
|
||||
"type": "timeseries",
|
||||
"title": "Pod Phase over Time",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(phase)(kube_pod_status_phase{namespace=~\"$namespace\"})",
|
||||
"refId": "A", "legendFormat": "{{phase}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "Running" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Pending" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Failed" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Succeeded" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Unknown" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 16, "x": 0, "y": 23 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 16,
|
||||
"type": "piechart",
|
||||
"title": "Pod Phase — Now",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(phase)(kube_pod_status_phase{namespace=~\"$namespace\"})",
|
||||
"refId": "A", "instant": true, "legendFormat": "{{phase}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "unit": "short", "color": { "mode": "palette-classic" } },
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "Running" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Pending" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Failed" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Succeeded" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Unknown" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"pieType": "donut",
|
||||
"tooltip": { "mode": "single" },
|
||||
"legend": { "displayMode": "table", "placement": "right", "values": ["value", "percent"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 23 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 17,
|
||||
"type": "timeseries",
|
||||
"title": "Container Restarts over Time (total counter, top 10)",
|
||||
"description": "Absolute restart counter — each vertical step = a restart event. Flat line = healthy.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "topk(10,\n sum by(namespace, pod) (\n kube_pod_container_status_restarts_total{namespace=~\"$namespace\"}\n ) > 0\n)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}} / {{pod}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "auto", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 31 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 18,
|
||||
"type": "table",
|
||||
"title": "Container Total Restarts (non-zero)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace, pod, container) (kube_pod_container_status_restarts_total{namespace=~\"$namespace\"}) > 0",
|
||||
"refId": "A",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "filterFieldsByName",
|
||||
"options": {
|
||||
"include": { "names": ["namespace", "pod", "container", "Value"] }
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {},
|
||||
"renameByName": {
|
||||
"namespace": "Namespace",
|
||||
"pod": "Pod",
|
||||
"container": "Container",
|
||||
"Value": "Total Restarts"
|
||||
},
|
||||
"indexByName": { "namespace": 0, "pod": 1, "container": 2, "Value": 3 }
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "sortBy",
|
||||
"options": { "fields": [{ "displayName": "Total Restarts", "desc": true }] }
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "Namespace" }, "properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 160 }] },
|
||||
{ "matcher": { "id": "byName", "options": "Pod" }, "properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 260 }] },
|
||||
{ "matcher": { "id": "byName", "options": "Container" }, "properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 160 }] },
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Total Restarts" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "yellow", "value": null }, { "color": "orange", "value": 5 }, { "color": "red", "value": 20 }] } }
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 31 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 19, "type": "row", "title": "Resource Usage", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 39 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 20,
|
||||
"type": "timeseries",
|
||||
"title": "CPU Usage by Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace)(rate(container_cpu_usage_seconds_total{namespace=~\"$namespace\",container!=\"\",container!=\"POD\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "cores", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 40 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 21,
|
||||
"type": "timeseries",
|
||||
"title": "Memory Usage by Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace)(container_memory_working_set_bytes{namespace=~\"$namespace\",container!=\"\",container!=\"POD\"})",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 40 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 22,
|
||||
"type": "bargauge",
|
||||
"title": "CPU — Actual vs Requested (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace)(rate(container_cpu_usage_seconds_total{namespace=~\"$namespace\",container!=\"\",container!=\"POD\"}[5m]))\n/\nsum by(namespace)(kube_pod_container_resource_requests{resource=\"cpu\",namespace=~\"$namespace\",container!=\"\"})\n* 100",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 150,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 80 }, { "color": "red", "value": 100 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal", "displayMode": "gradient", "showUnfilled": true,
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 48 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 23,
|
||||
"type": "bargauge",
|
||||
"title": "Memory — Actual vs Requested (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace)(container_memory_working_set_bytes{namespace=~\"$namespace\",container!=\"\",container!=\"POD\"})\n/\nsum by(namespace)(kube_pod_container_resource_requests{resource=\"memory\",namespace=~\"$namespace\",container!=\"\"})\n* 100",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 150,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 80 }, { "color": "red", "value": 100 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal", "displayMode": "gradient", "showUnfilled": true,
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 48 }
|
||||
}
|
||||
|
||||
]
|
||||
}
|
||||
2
harmony/src/modules/monitoring/cluster_dashboards/mod.rs
Normal file
2
harmony/src/modules/monitoring/cluster_dashboards/mod.rs
Normal file
@@ -0,0 +1,2 @@
|
||||
mod score;
|
||||
pub use score::ClusterDashboardsScore;
|
||||
507
harmony/src/modules/monitoring/cluster_dashboards/score.rs
Normal file
507
harmony/src/modules/monitoring/cluster_dashboards/score.rs
Normal file
@@ -0,0 +1,507 @@
|
||||
use async_trait::async_trait;
|
||||
use harmony_types::id::Id;
|
||||
use k8s_openapi::api::core::v1::{Namespace, Secret};
|
||||
use kube::{api::ObjectMeta, api::DynamicObject};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_yaml;
|
||||
use std::collections::BTreeMap;
|
||||
|
||||
use crate::{
|
||||
data::Version,
|
||||
interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome},
|
||||
inventory::Inventory,
|
||||
modules::k8s::resource::K8sResourceScore,
|
||||
modules::okd::crd::route::Route,
|
||||
score::Score,
|
||||
topology::{K8sclient, Topology},
|
||||
};
|
||||
|
||||
/// Score that provisions a set of cluster-monitoring Grafana dashboards
/// (node health, workload health, …) into a dedicated namespace.
#[derive(Clone, Debug, Serialize)]
pub struct ClusterDashboardsScore {
    // Kubernetes namespace in which the Grafana instance and dashboards are created.
    pub namespace: String,
    // Grafana admin user name passed to the interpret when creating Grafana resources.
    pub grafana_admin_user: String,
    // Grafana admin password. NOTE(review): defaults to a hardcoded value
    // ("password") — confirm real deployments override it via `with_credentials`.
    pub grafana_admin_password: String,
}
|
||||
|
||||
impl Default for ClusterDashboardsScore {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
namespace: "harmony-observability".to_string(),
|
||||
grafana_admin_user: "admin".to_string(),
|
||||
grafana_admin_password: "password".to_string(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ClusterDashboardsScore {
|
||||
pub fn new(namespace: &str) -> Self {
|
||||
Self {
|
||||
namespace: namespace.to_string(),
|
||||
grafana_admin_user: "admin".to_string(),
|
||||
grafana_admin_password: "password".to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn with_credentials(namespace: &str, admin_user: &str, admin_password: &str) -> Self {
|
||||
Self {
|
||||
namespace: namespace.to_string(),
|
||||
grafana_admin_user: admin_user.to_string(),
|
||||
grafana_admin_password: admin_password.to_string(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Score implementation: names the score after its target namespace and hands
/// all fields off to a `ClusterDashboardsInterpret` for execution.
impl<T: Topology + K8sclient> Score<T> for ClusterDashboardsScore {
    // Human-readable identifier, e.g. "ClusterDashboardsScore(harmony-observability)".
    fn name(&self) -> String {
        format!("ClusterDashboardsScore({})", self.namespace)
    }

    #[doc(hidden)]
    fn create_interpret(&self) -> Box<dyn Interpret<T>> {
        // The interpret owns copies of every field so it can outlive the score.
        Box::new(ClusterDashboardsInterpret {
            namespace: self.namespace.clone(),
            grafana_admin_user: self.grafana_admin_user.clone(),
            grafana_admin_password: self.grafana_admin_password.clone(),
        })
    }
}
|
||||
|
||||
/// Interpret that performs the actual resource creation for
/// `ClusterDashboardsScore`. Fields mirror the score's fields (copied in
/// `create_interpret`).
#[derive(Debug, Clone)]
pub struct ClusterDashboardsInterpret {
    // Target namespace for every created resource.
    namespace: String,
    // Grafana admin credentials, as supplied by the score.
    grafana_admin_user: String,
    grafana_admin_password: String,
}
|
||||
|
||||
#[async_trait]
impl<T: Topology + K8sclient> Interpret<T> for ClusterDashboardsInterpret {
    async fn execute(
        &self,
        inventory: &Inventory,
        topology: &T,
    ) -> Result<Outcome, InterpretError> {
        // Resources are applied in dependency order: namespace first,
        // then RBAC + token secret, then the Grafana instance and the
        // objects that attach to it (datasource, dashboards, route).
        // Any failure short-circuits via `?`.
        self.create_namespace(inventory, topology).await?;
        self.create_rbac_resources(inventory, topology).await?;
        self.create_secret(inventory, topology).await?;
        self.create_grafana(inventory, topology).await?;
        self.create_datasource(inventory, topology).await?;
        self.create_dashboards(inventory, topology).await?;
        self.create_route(inventory, topology).await?;

        Ok(Outcome::success(format!(
            "Cluster dashboards resources in namespace '{}' with {} dashboards successfully created",
            self.namespace,
            // NOTE(review): hard-coded count — must be kept in sync with
            // the dashboard list inside `create_dashboards`.
            8
        )))
    }

    fn get_name(&self) -> InterpretName {
        InterpretName::Custom("ClusterDashboards")
    }

    fn get_version(&self) -> Version {
        // Not implemented yet — calling this panics.
        todo!()
    }

    fn get_status(&self) -> InterpretStatus {
        // Not implemented yet — calling this panics.
        todo!()
    }

    fn get_children(&self) -> Vec<Id> {
        // Not implemented yet — calling this panics.
        todo!()
    }
}
|
||||
|
||||
impl ClusterDashboardsInterpret {
|
||||
async fn create_namespace(
|
||||
&self,
|
||||
inventory: &Inventory,
|
||||
topology: &(impl Topology + K8sclient),
|
||||
) -> Result<(), InterpretError> {
|
||||
let mut labels = BTreeMap::new();
|
||||
labels.insert(
|
||||
"openshift.io/cluster-monitoring".to_string(),
|
||||
"true".to_string(),
|
||||
);
|
||||
|
||||
let namespace = Namespace {
|
||||
metadata: ObjectMeta {
|
||||
name: Some(self.namespace.clone()),
|
||||
labels: Some(labels),
|
||||
..ObjectMeta::default()
|
||||
},
|
||||
..Namespace::default()
|
||||
};
|
||||
|
||||
K8sResourceScore::single(namespace, None)
|
||||
.interpret(inventory, topology)
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
    /// Create the RBAC plumbing Grafana needs to query the in-cluster
    /// Prometheus: a ServiceAccount plus two ClusterRoleBindings — one
    /// to a custom role granting `get` on `prometheuses/api`, and one to
    /// the stock `cluster-monitoring-view` role.
    async fn create_rbac_resources(
        &self,
        inventory: &Inventory,
        topology: &(impl Topology + K8sclient),
    ) -> Result<(), InterpretError> {
        // Same name is referenced by the token secret in `create_secret`
        // — keep the two in sync.
        let service_account_name = "cluster-grafana-sa".to_string();
        let rbac_namespace = self.namespace.clone();

        let service_account = {
            use k8s_openapi::api::core::v1::ServiceAccount;
            ServiceAccount {
                metadata: ObjectMeta {
                    name: Some(service_account_name.clone()),
                    namespace: Some(rbac_namespace.clone()),
                    ..ObjectMeta::default()
                },
                ..ServiceAccount::default()
            }
        };

        // Grants GET on the Prometheus API subresource
        // (monitoring.coreos.com/prometheuses/api).
        let cluster_role = {
            use k8s_openapi::api::rbac::v1::{ClusterRole, PolicyRule};
            ClusterRole {
                metadata: ObjectMeta {
                    name: Some("grafana-prometheus-api-access".to_string()),
                    ..ObjectMeta::default()
                },
                rules: Some(vec![PolicyRule {
                    api_groups: Some(vec!["monitoring.coreos.com".to_string()]),
                    resources: Some(vec!["prometheuses/api".to_string()]),
                    verbs: vec!["get".to_string()],
                    ..PolicyRule::default()
                }]),
                ..ClusterRole::default()
            }
        };

        // Binds the custom role above to the Grafana service account.
        let cluster_role_binding = {
            use k8s_openapi::api::rbac::v1::{ClusterRoleBinding, RoleRef, Subject};
            ClusterRoleBinding {
                metadata: ObjectMeta {
                    name: Some("grafana-prometheus-api-access-binding".to_string()),
                    ..ObjectMeta::default()
                },
                subjects: Some(vec![Subject {
                    kind: "ServiceAccount".to_string(),
                    name: service_account_name.clone(),
                    namespace: Some(rbac_namespace.clone()),
                    ..Subject::default()
                }]),
                role_ref: RoleRef {
                    api_group: "rbac.authorization.k8s.io".to_string(),
                    kind: "ClusterRole".to_string(),
                    name: "grafana-prometheus-api-access".to_string(),
                },
            }
        };

        // Additionally grant the built-in read-only monitoring view role.
        let cluster_role_binding_cluster_monitoring = {
            use k8s_openapi::api::rbac::v1::{ClusterRoleBinding, RoleRef, Subject};
            ClusterRoleBinding {
                metadata: ObjectMeta {
                    name: Some("grafana-cluster-monitoring-view".to_string()),
                    ..ObjectMeta::default()
                },
                subjects: Some(vec![Subject {
                    kind: "ServiceAccount".to_string(),
                    name: service_account_name.clone(),
                    namespace: Some(rbac_namespace.clone()),
                    ..Subject::default()
                }]),
                role_ref: RoleRef {
                    api_group: "rbac.authorization.k8s.io".to_string(),
                    kind: "ClusterRole".to_string(),
                    name: "cluster-monitoring-view".to_string(),
                },
            }
        };

        // ClusterRoles/Bindings are cluster-scoped, hence `None`.
        K8sResourceScore::single(service_account, Some(rbac_namespace.clone()))
            .interpret(inventory, topology)
            .await?;
        K8sResourceScore::single(cluster_role, None)
            .interpret(inventory, topology)
            .await?;
        K8sResourceScore::single(cluster_role_binding, None)
            .interpret(inventory, topology)
            .await?;
        K8sResourceScore::single(cluster_role_binding_cluster_monitoring, None)
            .interpret(inventory, topology)
            .await?;

        Ok(())
    }
|
||||
|
||||
async fn create_secret(
|
||||
&self,
|
||||
inventory: &Inventory,
|
||||
topology: &(impl Topology + K8sclient),
|
||||
) -> Result<(), InterpretError> {
|
||||
let service_account_name = "cluster-grafana-sa".to_string();
|
||||
let secret_name = "grafana-prometheus-token".to_string();
|
||||
let secret_namespace = self.namespace.clone();
|
||||
|
||||
let secret = Secret {
|
||||
metadata: ObjectMeta {
|
||||
name: Some(secret_name),
|
||||
namespace: Some(secret_namespace),
|
||||
annotations: Some({
|
||||
let mut ann = BTreeMap::new();
|
||||
ann.insert(
|
||||
"kubernetes.io/service-account.name".to_string(),
|
||||
service_account_name,
|
||||
);
|
||||
ann
|
||||
}),
|
||||
..ObjectMeta::default()
|
||||
},
|
||||
type_: Some("kubernetes.io/service-account-token".to_string()),
|
||||
..Secret::default()
|
||||
};
|
||||
|
||||
K8sResourceScore::single(secret, Some(self.namespace.clone()))
|
||||
.interpret(inventory, topology)
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn create_grafana(
|
||||
&self,
|
||||
inventory: &Inventory,
|
||||
topology: &(impl Topology + K8sclient),
|
||||
) -> Result<(), InterpretError> {
|
||||
let labels: BTreeMap<String, String> = vec![
|
||||
("dashboards".to_string(), "grafana".to_string()),
|
||||
]
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let client = topology
|
||||
.k8s_client()
|
||||
.await
|
||||
.map_err(|e| InterpretError::new(format!("Failed to get k8s client: {e}")))?;
|
||||
|
||||
let mut annotations = BTreeMap::new();
|
||||
annotations.insert(
|
||||
"kubectl.kubernetes.io/last-applied-configuration".to_string(),
|
||||
"".to_string(),
|
||||
);
|
||||
|
||||
let grafana_yaml = format!(r#"
|
||||
apiVersion: grafana.integreatly.org/v1beta1
|
||||
kind: Grafana
|
||||
metadata:
|
||||
name: cluster-grafana
|
||||
namespace: {}
|
||||
labels:
|
||||
dashboards: "grafana"
|
||||
spec:
|
||||
config:
|
||||
log:
|
||||
mode: console
|
||||
security:
|
||||
admin_user: {}
|
||||
admin_password: {}
|
||||
users:
|
||||
viewers_can_edit: "false"
|
||||
auth:
|
||||
disable_login_form: "false"
|
||||
"auth.anonymous":
|
||||
enabled: "true"
|
||||
org_role: "Viewer"
|
||||
deployment:
|
||||
spec:
|
||||
replicas: 1
|
||||
template:
|
||||
spec:
|
||||
containers:
|
||||
- name: grafana
|
||||
resources:
|
||||
requests:
|
||||
cpu: 500m
|
||||
memory: 1Gi
|
||||
limits:
|
||||
cpu: "1"
|
||||
memory: 2Gi
|
||||
"#, self.namespace, self.grafana_admin_user, self.grafana_admin_password);
|
||||
|
||||
let grafana_value: serde_json::Value = serde_yaml::from_str(grafana_yaml.as_str())
|
||||
.map_err(|e| InterpretError::new(format!("Failed to parse Grafana YAML: {e}")))?;
|
||||
|
||||
let grafana: DynamicObject = serde_json::from_value(grafana_value)
|
||||
.map_err(|e| InterpretError::new(format!("Failed to create DynamicObject: {e}")))?;
|
||||
|
||||
client.apply_dynamic(&grafana, Some(&self.namespace), false).await
|
||||
.map_err(|e| InterpretError::new(format!("Failed to apply Grafana: {e}")))?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn create_datasource(
|
||||
&self,
|
||||
inventory: &Inventory,
|
||||
topology: &(impl Topology + K8sclient),
|
||||
) -> Result<(), InterpretError> {
|
||||
let labels: BTreeMap<String, String> = vec![
|
||||
("datasource".to_string(), "prometheus".to_string()),
|
||||
]
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let client = topology
|
||||
.k8s_client()
|
||||
.await
|
||||
.map_err(|e| InterpretError::new(format!("Failed to get k8s client: {e}")))?;
|
||||
|
||||
let secure_json_data_value = "Bearer ${token}";
|
||||
|
||||
let datasource_yaml = format!(r#"
|
||||
apiVersion: grafana.integreatly.org/v1beta1
|
||||
kind: GrafanaDatasource
|
||||
metadata:
|
||||
name: prometheus-cluster
|
||||
namespace: {}
|
||||
labels:
|
||||
datasource: "prometheus"
|
||||
spec:
|
||||
instanceSelector:
|
||||
matchLabels:
|
||||
dashboards: "grafana"
|
||||
valuesFrom:
|
||||
- targetPath: "secureJsonData.httpHeaderValue1"
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: grafana-prometheus-token
|
||||
key: token
|
||||
datasource:
|
||||
name: Prometheus-Cluster
|
||||
type: prometheus
|
||||
access: proxy
|
||||
url: https://prometheus-k8s.openshift-monitoring.svc:9091
|
||||
isDefault: true
|
||||
jsonData:
|
||||
httpHeaderName1: "Authorization"
|
||||
tlsSkipVerify: true
|
||||
timeInterval: "30s"
|
||||
secureJsonData:
|
||||
httpHeaderValue1: "{}"
|
||||
"#, self.namespace, secure_json_data_value);
|
||||
|
||||
let datasource_value: serde_json::Value = serde_yaml::from_str(datasource_yaml.as_str())
|
||||
.map_err(|e| InterpretError::new(format!("Failed to parse Datasource YAML: {e}")))?;
|
||||
|
||||
let datasource: DynamicObject = serde_json::from_value(datasource_value)
|
||||
.map_err(|e| InterpretError::new(format!("Failed to create DynamicObject: {e}")))?;
|
||||
|
||||
client.apply_dynamic(&datasource, Some(&self.namespace), false).await
|
||||
.map_err(|e| InterpretError::new(format!("Failed to apply Datasource: {e}")))?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
    /// Apply the fixed set of GrafanaDashboard CRs. Dashboard JSON is
    /// compiled into the binary via `include_str!` from
    /// `src/dashboards/*.json`.
    ///
    /// NOTE(review): the list currently holds 8 entries and `execute`
    /// hard-codes that count in its success message — keep them in sync.
    async fn create_dashboards(
        &self,
        inventory: &Inventory,
        topology: &(impl Topology + K8sclient),
    ) -> Result<(), InterpretError> {
        let client = topology
            .k8s_client()
            .await
            .map_err(|e| InterpretError::new(format!("Failed to get k8s client: {e}")))?;

        // (CR name, embedded dashboard JSON) pairs.
        let dashboards: &[(&str, &str)] = &[
            ("okd-cluster-overview", include_str!("dashboards/cluster-overview.json")),
            ("okd-node-health", include_str!("dashboards/nodes-health.json")),
            ("okd-workload-health", include_str!("dashboards/workloads-health.json")),
            ("okd-networking", include_str!("dashboards/networking.json")),
            ("storage-health", include_str!("dashboards/storage.json")),
            ("okd-etcd", include_str!("dashboards/etcd.json")),
            ("okd-control-plane", include_str!("dashboards/control-plane.json")),
            ("okd-alerts-events", include_str!("dashboards/alerts-events-problems.json")),
        ];

        for (dashboard_name, json_content) in dashboards {
            // Each dashboard targets the Grafana instance labelled
            // `dashboards: "grafana"` via its instanceSelector.
            let dashboard: DynamicObject = serde_json::from_value(serde_json::json!({
                "apiVersion": "grafana.integreatly.org/v1beta1",
                "kind": "GrafanaDashboard",
                "metadata": {
                    "name": dashboard_name,
                    "namespace": self.namespace,
                    "labels": {
                        "dashboard": dashboard_name
                    }
                },
                "spec": {
                    "instanceSelector": {
                        "matchLabels": {
                            "dashboards": "grafana"
                        }
                    },
                    "json": json_content
                }
            })).map_err(|e| InterpretError::new(format!("Failed to create Dashboard {} DynamicObject: {e}", dashboard_name)))?;

            client.apply_dynamic(&dashboard, Some(&self.namespace), false).await
                .map_err(|e| InterpretError::new(format!("Failed to apply Dashboard {}: {e}", dashboard_name)))?;
        }

        Ok(())
    }
|
||||
|
||||
    /// Expose Grafana through an OpenShift Route named `grafana`:
    /// edge-terminated TLS to `cluster-grafana-service:3000`, with HTTP
    /// requests redirected to HTTPS.
    async fn create_route(
        &self,
        inventory: &Inventory,
        topology: &(impl Topology + K8sclient),
    ) -> Result<(), InterpretError> {
        let route = Route {
            metadata: ObjectMeta {
                name: Some("grafana".to_string()),
                namespace: Some(self.namespace.clone()),
                ..ObjectMeta::default()
            },
            spec: crate::modules::okd::crd::route::RouteSpec {
                // `cluster-grafana-service` is the Service the
                // grafana-operator creates for the `cluster-grafana` CR.
                to: crate::modules::okd::crd::route::RouteTargetReference {
                    kind: "Service".to_string(),
                    name: "cluster-grafana-service".to_string(),
                    weight: None,
                },
                port: Some(crate::modules::okd::crd::route::RoutePort {
                    target_port: 3000,
                }),
                tls: Some(crate::modules::okd::crd::route::TLSConfig {
                    termination: "edge".to_string(),
                    insecure_edge_termination_policy: Some("Redirect".to_string()),
                    ..crate::modules::okd::crd::route::TLSConfig::default()
                }),
                ..crate::modules::okd::crd::route::RouteSpec::default()
            },
            ..crate::modules::okd::crd::route::Route::default()
        };

        K8sResourceScore::single(route, Some(self.namespace.clone()))
            .interpret(inventory, topology)
            .await?;

        Ok(())
    }
|
||||
|
||||
fn get_name(&self) -> InterpretName {
|
||||
InterpretName::Custom("ClusterDashboards")
|
||||
}
|
||||
|
||||
fn get_version(&self) -> Version {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn get_status(&self) -> InterpretStatus {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn get_children(&self) -> Vec<Id> {
|
||||
todo!()
|
||||
}
|
||||
}
|
||||
@@ -1,6 +1,7 @@
|
||||
pub mod alert_channel;
|
||||
pub mod alert_rule;
|
||||
pub mod application_monitoring;
|
||||
pub mod cluster_dashboards;
|
||||
pub mod grafana;
|
||||
pub mod kube_prometheus;
|
||||
pub mod ntfy;
|
||||
|
||||
@@ -1,7 +1,3 @@
|
||||
use std::time::Duration;
|
||||
|
||||
use tokio_retry::{Retry, strategy::ExponentialBackoff};
|
||||
|
||||
use crate::modules::{
|
||||
cert_manager::{
|
||||
capability::{CertificateManagement, CertificateManagementConfig},
|
||||
@@ -73,28 +69,9 @@ where
|
||||
.await
|
||||
.map_err(|e| e.to_string())?;
|
||||
|
||||
let strategy = ExponentialBackoff::from_millis(250)
|
||||
.factor(2)
|
||||
.max_delay(Duration::from_millis(1000))
|
||||
.take(10);
|
||||
|
||||
Retry::spawn(strategy, || async {
|
||||
log::debug!("Attempting CA cert fetch");
|
||||
|
||||
let res = self
|
||||
.topology
|
||||
.get_ca_certificate(root_ca_cert_name.into(), &root_ca_config)
|
||||
.await;
|
||||
|
||||
match res {
|
||||
Ok(cert) => Ok(cert),
|
||||
Err(e) => {
|
||||
log::warn!("Retryable error: {:?}", e);
|
||||
Err(e)
|
||||
}
|
||||
}
|
||||
})
|
||||
.await
|
||||
.map_err(|e| format!("Retries exhausted: {:?}", e))
|
||||
self.topology
|
||||
.get_ca_certificate(root_ca_cert_name.into(), &root_ca_config)
|
||||
.await
|
||||
.map_err(|e| e.to_string())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,20 +4,7 @@ use log::warn;
|
||||
use crate::topology::{FailoverTopology, TlsRoute, TlsRouter};
|
||||
|
||||
#[async_trait]
|
||||
impl<T: TlsRouter + Send> TlsRouter for FailoverTopology<T> {
|
||||
async fn get_public_domain(&self) -> Result<String, String> {
|
||||
/*
|
||||
let primary_domain = self
|
||||
.primary
|
||||
.get_public_domain()
|
||||
.await
|
||||
.map_err(|e| e.to_string())?;
|
||||
|
||||
Ok(primary_domain)
|
||||
*/
|
||||
todo!()
|
||||
}
|
||||
|
||||
impl<T: TlsRouter> TlsRouter for FailoverTopology<T> {
|
||||
async fn get_internal_domain(&self) -> Result<Option<String>, String> {
|
||||
todo!()
|
||||
}
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
use async_trait::async_trait;
|
||||
use harmony_types::id::Id;
|
||||
use k8s_openapi::api::scheduling::v1::PriorityClass;
|
||||
use k8s_openapi::api::{
|
||||
apps::v1::{DaemonSet, DaemonSetSpec},
|
||||
core::v1::{
|
||||
@@ -145,19 +144,6 @@ impl<T: Topology + K8sclient> Interpret<T> for NodeHealthInterpret {
|
||||
},
|
||||
};
|
||||
|
||||
// PriorityClass
|
||||
let priority_class_name = "node-healthcheck-critical".to_string();
|
||||
let priority_class = PriorityClass {
|
||||
metadata: ObjectMeta {
|
||||
name: Some(priority_class_name.clone()),
|
||||
..ObjectMeta::default()
|
||||
},
|
||||
value: 1000000000,
|
||||
global_default: Some(false),
|
||||
preemption_policy: Some("PreemptLowerPriority".to_string()),
|
||||
description: Some("Highest priority for node health check daemonset - can preempt lower priority pods".to_string()),
|
||||
};
|
||||
|
||||
// DaemonSet
|
||||
let mut daemonset_labels = BTreeMap::new();
|
||||
daemonset_labels.insert("app".to_string(), "node-healthcheck".to_string());
|
||||
@@ -182,7 +168,6 @@ impl<T: Topology + K8sclient> Interpret<T> for NodeHealthInterpret {
|
||||
spec: Some(PodSpec {
|
||||
service_account_name: Some(service_account_name.clone()),
|
||||
host_network: Some(true),
|
||||
priority_class_name: Some(priority_class_name),
|
||||
tolerations: Some(vec![Toleration {
|
||||
operator: Some("Exists".to_string()),
|
||||
..Toleration::default()
|
||||
@@ -197,7 +182,6 @@ impl<T: Topology + K8sclient> Interpret<T> for NodeHealthInterpret {
|
||||
name: "NODE_NAME".to_string(),
|
||||
value_from: Some(EnvVarSource {
|
||||
field_ref: Some(ObjectFieldSelector {
|
||||
api_version: Some("v1".to_string()),
|
||||
field_path: "spec.nodeName".to_string(),
|
||||
..ObjectFieldSelector::default()
|
||||
}),
|
||||
@@ -249,9 +233,6 @@ impl<T: Topology + K8sclient> Interpret<T> for NodeHealthInterpret {
|
||||
K8sResourceScore::single(cluster_role_binding, None)
|
||||
.interpret(inventory, topology)
|
||||
.await?;
|
||||
K8sResourceScore::single(priority_class, None)
|
||||
.interpret(inventory, topology)
|
||||
.await?;
|
||||
K8sResourceScore::single(daemon_set, Some(namespace_name.clone()))
|
||||
.interpret(inventory, topology)
|
||||
.await?;
|
||||
|
||||
@@ -37,7 +37,6 @@ pub struct PostgreSQLConfig {
|
||||
/// settings incompatible with the default CNPG behavior.
|
||||
pub namespace: String,
|
||||
}
|
||||
|
||||
impl PostgreSQLConfig {
|
||||
pub fn with_namespace(&self, namespace: &str) -> PostgreSQLConfig {
|
||||
let mut new = self.clone();
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
use std::collections::BTreeMap;
|
||||
use std::collections::HashMap;
|
||||
|
||||
use kube::{CustomResource, api::ObjectMeta};
|
||||
use serde::{Deserialize, Serialize};
|
||||
@@ -20,10 +19,6 @@ pub struct ClusterSpec {
|
||||
pub image_name: Option<String>,
|
||||
pub storage: Storage,
|
||||
pub bootstrap: Bootstrap,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub external_clusters: Option<Vec<ExternalCluster>>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub replica: Option<ReplicaSpec>,
|
||||
/// This must be set to None if you want cnpg to generate a superuser secret
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub superuser_secret: Option<BTreeMap<String, String>>,
|
||||
@@ -46,8 +41,6 @@ impl Default for ClusterSpec {
|
||||
image_name: None,
|
||||
storage: Storage::default(),
|
||||
bootstrap: Bootstrap::default(),
|
||||
external_clusters: None,
|
||||
replica: None,
|
||||
superuser_secret: None,
|
||||
enable_superuser_access: false,
|
||||
}
|
||||
@@ -63,13 +56,7 @@ pub struct Storage {
|
||||
#[derive(Deserialize, Serialize, Clone, Debug, Default)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct Bootstrap {
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub initdb: Option<Initdb>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub recovery: Option<Recovery>,
|
||||
#[serde(rename = "pg_basebackup")]
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub pg_basebackup: Option<PgBaseBackup>,
|
||||
pub initdb: Initdb,
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Serialize, Clone, Debug, Default)]
|
||||
@@ -78,50 +65,3 @@ pub struct Initdb {
|
||||
pub database: String,
|
||||
pub owner: String,
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Serialize, Clone, Debug)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct Recovery {
|
||||
pub source: String,
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Serialize, Clone, Debug)]
|
||||
pub struct PgBaseBackup {
|
||||
pub source: String,
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Serialize, Clone, Debug)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct ExternalCluster {
|
||||
pub name: String,
|
||||
pub connection_parameters: HashMap<String, String>,
|
||||
pub ssl_key: Option<SecretKeySelector>,
|
||||
pub ssl_cert: Option<SecretKeySelector>,
|
||||
pub ssl_root_cert: Option<SecretKeySelector>,
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Serialize, Clone, Debug)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct ConnectionParameters {
|
||||
pub host: String,
|
||||
pub user: String,
|
||||
pub dbname: String,
|
||||
pub sslmode: String,
|
||||
pub sslnegotiation: String,
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Serialize, Clone, Debug)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct ReplicaSpec {
|
||||
pub enabled: bool,
|
||||
pub source: String,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub primary: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Serialize, Clone, Debug)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct SecretKeySelector {
|
||||
pub name: String,
|
||||
pub key: String,
|
||||
}
|
||||
|
||||
@@ -3,8 +3,6 @@ use log::debug;
|
||||
use log::info;
|
||||
use std::collections::HashMap;
|
||||
|
||||
use crate::interpret::InterpretError;
|
||||
use crate::topology::TlsRoute;
|
||||
use crate::topology::TlsRouter;
|
||||
use crate::{
|
||||
modules::postgresql::capability::{
|
||||
@@ -51,18 +49,8 @@ impl<T: PostgreSQL + TlsRouter> PostgreSQL for FailoverTopology<T> {
|
||||
// TODO we should be getting the public endpoint for a service by calling a method on
|
||||
// TlsRouter capability.
|
||||
// Something along the lines of `TlsRouter::get_hostname_for_service(...).await?;`
|
||||
let host = format!(
|
||||
"{}.{}.{}",
|
||||
config.cluster_name,
|
||||
config.namespace,
|
||||
self.primary
|
||||
.get_public_domain()
|
||||
.await
|
||||
.expect("failed to retrieve public domain")
|
||||
.to_string()
|
||||
);
|
||||
let endpoint = PostgreSQLEndpoint {
|
||||
host,
|
||||
host: "postgrestest.sto1.nationtech.io".to_string(),
|
||||
port: self.primary.get_router_port().await,
|
||||
};
|
||||
|
||||
@@ -71,46 +59,6 @@ impl<T: PostgreSQL + TlsRouter> PostgreSQL for FailoverTopology<T> {
|
||||
endpoint.host, endpoint.port
|
||||
);
|
||||
|
||||
info!("installing primary postgres route");
|
||||
let prim_hostname = format!(
|
||||
"{}.{}.{}",
|
||||
config.cluster_name,
|
||||
config.namespace,
|
||||
self.primary.get_public_domain().await?
|
||||
);
|
||||
let rw_backend = format!("{}-rw", config.cluster_name);
|
||||
let tls_route = TlsRoute {
|
||||
hostname: prim_hostname,
|
||||
backend: rw_backend,
|
||||
target_port: 5432,
|
||||
namespace: config.namespace.clone(),
|
||||
};
|
||||
// Expose RW publicly via TLS passthrough
|
||||
self.primary
|
||||
.install_route(tls_route.clone())
|
||||
.await
|
||||
.map_err(|e| InterpretError::new(e))?;
|
||||
|
||||
info!("installing replica postgres route");
|
||||
let rep_hostname = format!(
|
||||
"{}.{}.{}",
|
||||
config.cluster_name,
|
||||
config.namespace,
|
||||
self.replica.get_public_domain().await?
|
||||
);
|
||||
let rw_backend = format!("{}-rw", config.cluster_name);
|
||||
let tls_route = TlsRoute {
|
||||
hostname: rep_hostname,
|
||||
backend: rw_backend,
|
||||
target_port: 5432,
|
||||
namespace: config.namespace.clone(),
|
||||
};
|
||||
|
||||
// Expose RW publicly via TLS passthrough
|
||||
self.replica
|
||||
.install_route(tls_route.clone())
|
||||
.await
|
||||
.map_err(|e| InterpretError::new(e))?;
|
||||
info!("Configuring replica connection parameters and bootstrap");
|
||||
|
||||
let mut connection_parameters = HashMap::new();
|
||||
|
||||
@@ -1,21 +1,14 @@
|
||||
use crate::data::Version;
|
||||
use crate::interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome};
|
||||
use crate::inventory::Inventory;
|
||||
use std::collections::BTreeMap;
|
||||
|
||||
use serde::Serialize;
|
||||
|
||||
use crate::interpret::Interpret;
|
||||
use crate::modules::k8s::resource::K8sResourceScore;
|
||||
use crate::modules::postgresql::capability::PostgreSQLConfig;
|
||||
use crate::modules::postgresql::cnpg::{
|
||||
Bootstrap, Cluster, ClusterSpec, ExternalCluster, Initdb, PgBaseBackup, ReplicaSpec,
|
||||
SecretKeySelector, Storage,
|
||||
};
|
||||
use crate::modules::postgresql::cnpg::{Bootstrap, Cluster, ClusterSpec, Initdb, Storage};
|
||||
use crate::score::Score;
|
||||
use crate::topology::{K8sclient, Topology};
|
||||
use async_trait::async_trait;
|
||||
use harmony_types::id::Id;
|
||||
use k8s_openapi::ByteString;
|
||||
use k8s_openapi::api::core::v1::Secret;
|
||||
use k8s_openapi::apimachinery::pkg::apis::meta::v1::ObjectMeta;
|
||||
use serde::Serialize;
|
||||
|
||||
/// Deploys an opinionated, highly available PostgreSQL cluster managed by CNPG.
|
||||
///
|
||||
@@ -58,184 +51,37 @@ impl K8sPostgreSQLScore {
|
||||
|
||||
impl<T: Topology + K8sclient> Score<T> for K8sPostgreSQLScore {
|
||||
fn create_interpret(&self) -> Box<dyn Interpret<T>> {
|
||||
Box::new(K8sPostgreSQLInterpret {
|
||||
config: self.config.clone(),
|
||||
})
|
||||
let metadata = ObjectMeta {
|
||||
name: Some(self.config.cluster_name.clone()),
|
||||
namespace: Some(self.config.namespace.clone()),
|
||||
..ObjectMeta::default()
|
||||
};
|
||||
|
||||
let spec = ClusterSpec {
|
||||
instances: self.config.instances,
|
||||
storage: Storage {
|
||||
size: self.config.storage_size.to_string(),
|
||||
},
|
||||
bootstrap: Bootstrap {
|
||||
initdb: Initdb {
|
||||
database: "app".to_string(),
|
||||
owner: "app".to_string(),
|
||||
},
|
||||
},
|
||||
// superuser_secret: Some(BTreeMap::from([(
|
||||
// "name".to_string(),
|
||||
// format!("{}-superuser", self.config.cluster_name.clone()),
|
||||
// )])),
|
||||
enable_superuser_access: true,
|
||||
..ClusterSpec::default()
|
||||
};
|
||||
|
||||
let cluster = Cluster { metadata, spec };
|
||||
|
||||
K8sResourceScore::single(cluster, Some(self.config.namespace.clone())).create_interpret()
|
||||
}
|
||||
|
||||
fn name(&self) -> String {
|
||||
format!("PostgreSQLScore({})", self.config.namespace)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct K8sPostgreSQLInterpret {
|
||||
config: PostgreSQLConfig,
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl<T: Topology + K8sclient> Interpret<T> for K8sPostgreSQLInterpret {
|
||||
async fn execute(
|
||||
&self,
|
||||
inventory: &Inventory,
|
||||
topology: &T,
|
||||
) -> Result<Outcome, InterpretError> {
|
||||
match &self.config.role {
|
||||
super::capability::PostgreSQLClusterRole::Primary => {
|
||||
let metadata = ObjectMeta {
|
||||
name: Some(self.config.cluster_name.clone()),
|
||||
namespace: Some(self.config.namespace.clone()),
|
||||
..ObjectMeta::default()
|
||||
};
|
||||
|
||||
let spec = ClusterSpec {
|
||||
instances: self.config.instances,
|
||||
storage: Storage {
|
||||
size: self.config.storage_size.to_string(),
|
||||
},
|
||||
bootstrap: Bootstrap {
|
||||
initdb: Some(Initdb {
|
||||
database: "app".to_string(),
|
||||
owner: "app".to_string(),
|
||||
}),
|
||||
recovery: None,
|
||||
pg_basebackup: None,
|
||||
},
|
||||
enable_superuser_access: true,
|
||||
..ClusterSpec::default()
|
||||
};
|
||||
let cluster = Cluster { metadata, spec };
|
||||
|
||||
Ok(
|
||||
K8sResourceScore::single(cluster, Some(self.config.namespace.clone()))
|
||||
.create_interpret()
|
||||
.execute(inventory, topology)
|
||||
.await?,
|
||||
)
|
||||
}
|
||||
super::capability::PostgreSQLClusterRole::Replica(replica_config) => {
|
||||
let metadata = ObjectMeta {
|
||||
name: Some("streaming-replica-certs".to_string()),
|
||||
namespace: Some(self.config.namespace.clone()),
|
||||
..ObjectMeta::default()
|
||||
};
|
||||
|
||||
// The data must be base64-encoded. If you already have PEM strings in your config, encode them:
|
||||
let mut data = std::collections::BTreeMap::new();
|
||||
data.insert(
|
||||
"tls.key".to_string(),
|
||||
ByteString(
|
||||
replica_config
|
||||
.replication_certs
|
||||
.streaming_replica_key_pem
|
||||
.as_bytes()
|
||||
.to_vec(),
|
||||
),
|
||||
);
|
||||
data.insert(
|
||||
"tls.crt".to_string(),
|
||||
ByteString(
|
||||
replica_config
|
||||
.replication_certs
|
||||
.streaming_replica_cert_pem
|
||||
.as_bytes()
|
||||
.to_vec(),
|
||||
),
|
||||
);
|
||||
data.insert(
|
||||
"ca.crt".to_string(),
|
||||
ByteString(
|
||||
replica_config
|
||||
.replication_certs
|
||||
.ca_cert_pem
|
||||
.as_bytes()
|
||||
.to_vec(),
|
||||
),
|
||||
);
|
||||
|
||||
let secret = Secret {
|
||||
metadata,
|
||||
data: Some(data),
|
||||
string_data: None, // You could use string_data if you prefer raw strings
|
||||
type_: Some("Opaque".to_string()),
|
||||
..Secret::default()
|
||||
};
|
||||
|
||||
K8sResourceScore::single(secret, Some(self.config.namespace.clone()))
|
||||
.create_interpret()
|
||||
.execute(inventory, topology)
|
||||
.await?;
|
||||
|
||||
let metadata = ObjectMeta {
|
||||
name: Some(self.config.cluster_name.clone()),
|
||||
namespace: Some(self.config.namespace.clone()),
|
||||
..ObjectMeta::default()
|
||||
};
|
||||
|
||||
let spec = ClusterSpec {
|
||||
instances: self.config.instances,
|
||||
storage: Storage {
|
||||
size: self.config.storage_size.to_string(),
|
||||
},
|
||||
bootstrap: Bootstrap {
|
||||
initdb: None,
|
||||
recovery: None,
|
||||
pg_basebackup: Some(PgBaseBackup {
|
||||
source: replica_config.primary_cluster_name.clone(),
|
||||
}),
|
||||
},
|
||||
external_clusters: Some(vec![ExternalCluster {
|
||||
name: replica_config.primary_cluster_name.clone(),
|
||||
connection_parameters: replica_config
|
||||
.external_cluster
|
||||
.connection_parameters
|
||||
.clone(),
|
||||
ssl_key: Some(SecretKeySelector {
|
||||
name: "streaming-replica-certs".to_string(),
|
||||
key: "tls.key".to_string(),
|
||||
}),
|
||||
ssl_cert: Some(SecretKeySelector {
|
||||
name: "streaming-replica-certs".to_string(),
|
||||
key: "tls.crt".to_string(),
|
||||
}),
|
||||
ssl_root_cert: Some(SecretKeySelector {
|
||||
name: "streaming-replica-certs".to_string(),
|
||||
key: "ca.crt".to_string(),
|
||||
}),
|
||||
}]),
|
||||
replica: Some(ReplicaSpec {
|
||||
enabled: true,
|
||||
source: replica_config.primary_cluster_name.clone(),
|
||||
primary: None,
|
||||
}),
|
||||
..ClusterSpec::default()
|
||||
};
|
||||
|
||||
let cluster = Cluster { metadata, spec };
|
||||
|
||||
Ok(
|
||||
K8sResourceScore::single(cluster, Some(self.config.namespace.clone()))
|
||||
.create_interpret()
|
||||
.execute(inventory, topology)
|
||||
.await?,
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn get_name(&self) -> InterpretName {
|
||||
InterpretName::Custom("K8sPostgreSQLInterpret")
|
||||
}
|
||||
|
||||
fn get_version(&self) -> Version {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn get_status(&self) -> InterpretStatus {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn get_children(&self) -> Vec<Id> {
|
||||
todo!()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -18,31 +18,46 @@ use crate::topology::Topology;
|
||||
/// # Usage
|
||||
/// ```
|
||||
/// use harmony::modules::postgresql::PublicPostgreSQLScore;
|
||||
/// let score = PublicPostgreSQLScore::new("harmony");
|
||||
/// let score = PublicPostgreSQLScore::new("harmony", "pg-rw.example.com");
|
||||
/// ```
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub struct PublicPostgreSQLScore {
|
||||
/// Inner non-public Postgres cluster config.
|
||||
pub config: PostgreSQLConfig,
|
||||
/// Public hostname for RW TLS passthrough (port 443 → cluster-rw:5432).
|
||||
pub hostname: String,
|
||||
}
|
||||
|
||||
impl PublicPostgreSQLScore {
|
||||
pub fn new(namespace: &str) -> Self {
|
||||
pub fn new(namespace: &str, hostname: &str) -> Self {
|
||||
Self {
|
||||
config: PostgreSQLConfig::default().with_namespace(namespace),
|
||||
hostname: hostname.to_string(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Topology + PostgreSQL + TlsRouter> Score<T> for PublicPostgreSQLScore {
|
||||
fn create_interpret(&self) -> Box<dyn Interpret<T>> {
|
||||
let rw_backend = format!("{}-rw", self.config.cluster_name);
|
||||
let tls_route = TlsRoute {
|
||||
namespace: self.config.namespace.clone(),
|
||||
hostname: self.hostname.clone(),
|
||||
backend: rw_backend,
|
||||
target_port: 5432,
|
||||
};
|
||||
|
||||
Box::new(PublicPostgreSQLInterpret {
|
||||
config: self.config.clone(),
|
||||
tls_route,
|
||||
})
|
||||
}
|
||||
|
||||
fn name(&self) -> String {
|
||||
format!("PublicPostgreSQLScore({})", self.config.namespace)
|
||||
format!(
|
||||
"PublicPostgreSQLScore({}:{})",
|
||||
self.config.namespace, self.hostname
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -50,6 +65,7 @@ impl<T: Topology + PostgreSQL + TlsRouter> Score<T> for PublicPostgreSQLScore {
|
||||
#[derive(Debug, Clone)]
|
||||
struct PublicPostgreSQLInterpret {
|
||||
config: PostgreSQLConfig,
|
||||
tls_route: TlsRoute,
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
@@ -60,28 +76,15 @@ impl<T: Topology + PostgreSQL + TlsRouter> Interpret<T> for PublicPostgreSQLInte
|
||||
.await
|
||||
.map_err(|e| InterpretError::new(e))?;
|
||||
|
||||
let hostname = format!(
|
||||
"{}.{}.{}",
|
||||
self.config.cluster_name,
|
||||
self.config.namespace,
|
||||
topo.get_public_domain().await?
|
||||
);
|
||||
let rw_backend = format!("{}-rw", self.config.cluster_name);
|
||||
let tls_route = TlsRoute {
|
||||
hostname,
|
||||
backend: rw_backend,
|
||||
target_port: 5432,
|
||||
namespace: self.config.namespace.clone(),
|
||||
};
|
||||
// Expose RW publicly via TLS passthrough
|
||||
topo.install_route(tls_route.clone())
|
||||
topo.install_route(self.tls_route.clone())
|
||||
.await
|
||||
.map_err(|e| InterpretError::new(e))?;
|
||||
|
||||
Ok(Outcome::success(format!(
|
||||
"Public CNPG cluster '{}' deployed with TLS passthrough route '{}'",
|
||||
self.config.cluster_name.clone(),
|
||||
tls_route.hostname
|
||||
self.tls_route.hostname
|
||||
)))
|
||||
}
|
||||
|
||||
|
||||
@@ -344,7 +344,7 @@ pub struct StaticMap {
|
||||
pub mac: String,
|
||||
pub ipaddr: String,
|
||||
pub cid: Option<MaybeString>,
|
||||
pub hostname: Option<String>,
|
||||
pub hostname: String,
|
||||
pub descr: Option<MaybeString>,
|
||||
pub winsserver: MaybeString,
|
||||
pub dnsserver: MaybeString,
|
||||
@@ -383,24 +383,24 @@ pub struct Outbound {
|
||||
|
||||
#[derive(Default, PartialEq, Debug, YaSerialize, YaDeserialize)]
|
||||
pub struct NatRule {
|
||||
pub protocol: Option<String>,
|
||||
pub interface: Option<String>,
|
||||
pub category: Option<MaybeString>,
|
||||
pub ipprotocol: Option<String>,
|
||||
pub descr: Option<MaybeString>,
|
||||
pub tag: Option<MaybeString>,
|
||||
pub protocol: String,
|
||||
pub interface: String,
|
||||
pub category: MaybeString,
|
||||
pub ipprotocol: String,
|
||||
pub descr: MaybeString,
|
||||
pub tag: MaybeString,
|
||||
pub tagged: Option<MaybeString>,
|
||||
pub poolopts: Option<PoolOpts>,
|
||||
pub poolopts: PoolOpts,
|
||||
#[yaserde(rename = "associated-rule-id")]
|
||||
pub associated_rule_id: Option<MaybeString>,
|
||||
pub disabled: Option<u8>,
|
||||
pub target: Option<String>,
|
||||
pub target: String,
|
||||
#[yaserde(rename = "local-port")]
|
||||
pub local_port: Option<i32>,
|
||||
pub source: Option<Source>,
|
||||
pub destination: Option<Destination>,
|
||||
pub updated: Option<Updated>,
|
||||
pub created: Option<Created>,
|
||||
pub local_port: i32,
|
||||
pub source: Source,
|
||||
pub destination: Destination,
|
||||
pub updated: Updated,
|
||||
pub created: Created,
|
||||
}
|
||||
|
||||
#[derive(Default, PartialEq, Debug, YaSerialize, YaDeserialize)]
|
||||
@@ -1545,7 +1545,7 @@ pub struct Vlans {
|
||||
|
||||
#[derive(Default, PartialEq, Debug, YaSerialize, YaDeserialize)]
|
||||
pub struct Bridges {
|
||||
pub bridged: Option<RawXml>,
|
||||
pub bridged: Option<MaybeString>,
|
||||
}
|
||||
|
||||
#[derive(Default, PartialEq, Debug, YaSerialize, YaDeserialize)]
|
||||
|
||||
@@ -48,7 +48,7 @@ impl<'a> DhcpConfigLegacyISC<'a> {
|
||||
hostname: &str,
|
||||
) -> Result<(), DhcpError> {
|
||||
let mac = mac.to_string();
|
||||
let hostname = Some(hostname.to_string());
|
||||
let hostname = hostname.to_string();
|
||||
let lan_dhcpd = self.get_lan_dhcpd();
|
||||
let existing_mappings: &mut Vec<StaticMap> = &mut lan_dhcpd.staticmaps;
|
||||
|
||||
@@ -121,7 +121,7 @@ impl<'a> DhcpConfigLegacyISC<'a> {
|
||||
.map(|entry| StaticMap {
|
||||
mac: entry["mac"].as_str().unwrap_or_default().to_string(),
|
||||
ipaddr: entry["ipaddr"].as_str().unwrap_or_default().to_string(),
|
||||
hostname: Some(entry["hostname"].as_str().unwrap_or_default().to_string()),
|
||||
hostname: entry["hostname"].as_str().unwrap_or_default().to_string(),
|
||||
descr: entry["descr"].as_str().map(MaybeString::from),
|
||||
..Default::default()
|
||||
})
|
||||
|
||||
@@ -213,7 +213,7 @@ impl<'a> DhcpConfigDnsMasq<'a> {
|
||||
.map(|entry| StaticMap {
|
||||
mac: entry["mac"].as_str().unwrap_or_default().to_string(),
|
||||
ipaddr: entry["ipaddr"].as_str().unwrap_or_default().to_string(),
|
||||
hostname: Some(entry["hostname"].as_str().unwrap_or_default().to_string()),
|
||||
hostname: entry["hostname"].as_str().unwrap_or_default().to_string(),
|
||||
descr: entry["descr"].as_str().map(MaybeString::from),
|
||||
..Default::default()
|
||||
})
|
||||
|
||||
Reference in New Issue
Block a user