Compare commits
60 Commits
feat/impro
...
feat/clust
| Author | SHA1 | Date | |
|---|---|---|---|
| b1ff4e4a0f | |||
| ee8f033143 | |||
| 1298ac9a18 | |||
| 53e361e84e | |||
| 220e0c2bb8 | |||
| 82e47d22a2 | |||
| fb17d7ed40 | |||
| d4bf80779e | |||
| 28dadf3a70 | |||
| 15c454aa65 | |||
| f9a3e51529 | |||
| d10598d01e | |||
| 61ba7257d0 | |||
| b0e9594d92 | |||
| 2a7fa466cc | |||
| f463cd1e94 | |||
| e1da7949ec | |||
| d0a1a73710 | |||
| bc2b328296 | |||
| a93896707f | |||
| 0e9b23a320 | |||
| f532ba2b40 | |||
| fafca31798 | |||
| 5412c34957 | |||
| 787cc8feab | |||
| ce041f495b | |||
| bfb86f63ce | |||
| 55de206523 | |||
| 64893a84f5 | |||
| f941672662 | |||
| a98113dd40 | |||
| 5db1a31d33 | |||
| f5aac67af8 | |||
| d7e5bf11d5 | |||
| 2e1f1b8447 | |||
| 2b157ad7fd | |||
| a0c0905c3b | |||
| d920de34cf | |||
| 4276b9137b | |||
| 6ab88ab8d9 | |||
| fe52f69473 | |||
| d8338ad12c | |||
| ac9fedf853 | |||
| fd3705e382 | |||
| 4840c7fdc2 | |||
| 20172a7801 | |||
| 6bb33c5845 | |||
| d9357adad3 | |||
| a25ca86bdf | |||
| 646c5e723e | |||
| 69c382e8c6 | |||
| dca764395d | |||
| 53d0704a35 | |||
| 2738985edb | |||
| d9a21bf94b | |||
| 8f8bd34168 | |||
| b5e971b3b6 | |||
| a1c0e0e246 | |||
| d084cee8d5 | |||
| 63ef1c0ea7 |
2604
Cargo.lock
generated
2604
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -2,7 +2,6 @@
|
|||||||
resolver = "2"
|
resolver = "2"
|
||||||
members = [
|
members = [
|
||||||
"private_repos/*",
|
"private_repos/*",
|
||||||
"examples/*",
|
|
||||||
"harmony",
|
"harmony",
|
||||||
"harmony_types",
|
"harmony_types",
|
||||||
"harmony_macros",
|
"harmony_macros",
|
||||||
@@ -17,9 +16,9 @@ members = [
|
|||||||
"harmony_secret_derive",
|
"harmony_secret_derive",
|
||||||
"harmony_secret",
|
"harmony_secret",
|
||||||
"adr/agent_discovery/mdns",
|
"adr/agent_discovery/mdns",
|
||||||
"brocade",
|
"brocade",
|
||||||
"harmony_agent",
|
"harmony_agent",
|
||||||
"harmony_agent/deploy",
|
"harmony_agent/deploy", "harmony_node_readiness", "harmony-k8s",
|
||||||
]
|
]
|
||||||
|
|
||||||
[workspace.package]
|
[workspace.package]
|
||||||
@@ -38,6 +37,8 @@ tokio = { version = "1.40", features = [
|
|||||||
"macros",
|
"macros",
|
||||||
"rt-multi-thread",
|
"rt-multi-thread",
|
||||||
] }
|
] }
|
||||||
|
tokio-retry = "0.3.0"
|
||||||
|
tokio-util = "0.7.15"
|
||||||
cidr = { features = ["serde"], version = "0.2" }
|
cidr = { features = ["serde"], version = "0.2" }
|
||||||
russh = "0.45"
|
russh = "0.45"
|
||||||
russh-keys = "0.45"
|
russh-keys = "0.45"
|
||||||
|
|||||||
@@ -1,8 +1,7 @@
|
|||||||
use super::BrocadeClient;
|
use super::BrocadeClient;
|
||||||
use crate::{
|
use crate::{
|
||||||
BrocadeInfo, Error, ExecutionMode, InterSwitchLink, InterfaceInfo, MacAddressEntry,
|
BrocadeInfo, Error, ExecutionMode, InterSwitchLink, InterfaceInfo, MacAddressEntry,
|
||||||
PortChannelId, PortOperatingMode, SecurityLevel, parse_brocade_mac_address,
|
PortChannelId, PortOperatingMode, parse_brocade_mac_address, shell::BrocadeShell,
|
||||||
shell::BrocadeShell,
|
|
||||||
};
|
};
|
||||||
|
|
||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ use regex::Regex;
|
|||||||
use crate::{
|
use crate::{
|
||||||
BrocadeClient, BrocadeInfo, Error, ExecutionMode, InterSwitchLink, InterfaceInfo,
|
BrocadeClient, BrocadeInfo, Error, ExecutionMode, InterSwitchLink, InterfaceInfo,
|
||||||
InterfaceStatus, InterfaceType, MacAddressEntry, PortChannelId, PortOperatingMode,
|
InterfaceStatus, InterfaceType, MacAddressEntry, PortChannelId, PortOperatingMode,
|
||||||
SecurityLevel, parse_brocade_mac_address, shell::BrocadeShell,
|
parse_brocade_mac_address, shell::BrocadeShell,
|
||||||
};
|
};
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
|
|||||||
@@ -1,8 +1,8 @@
|
|||||||
use harmony::{
|
use harmony::{
|
||||||
inventory::Inventory,
|
inventory::Inventory,
|
||||||
modules::cert_manager::{
|
modules::cert_manager::{
|
||||||
capability::CertificateManagementConfig, score_cert_management::CertificateManagementScore,
|
capability::CertificateManagementConfig, score_certificate::CertificateScore,
|
||||||
score_certificate::CertificateScore, score_issuer::CertificateIssuerScore,
|
score_issuer::CertificateIssuerScore,
|
||||||
},
|
},
|
||||||
topology::K8sAnywhereTopology,
|
topology::K8sAnywhereTopology,
|
||||||
};
|
};
|
||||||
|
|||||||
16
examples/cluster_dashboards/Cargo.toml
Normal file
16
examples/cluster_dashboards/Cargo.toml
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
[workspace]
|
||||||
|
|
||||||
|
[package]
|
||||||
|
name = "example-cluster-dashboards"
|
||||||
|
edition = "2021"
|
||||||
|
version = "0.1.0"
|
||||||
|
license = "GNU AGPL v3"
|
||||||
|
publish = false
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
harmony = { path = "../../harmony" }
|
||||||
|
harmony_cli = { path = "../../harmony_cli" }
|
||||||
|
harmony_types = { path = "../../harmony_types" }
|
||||||
|
tokio = { version = "1.40", features = ["macros", "rt-multi-thread"] }
|
||||||
|
log = "0.4"
|
||||||
|
env_logger = "0.11"
|
||||||
21
examples/cluster_dashboards/src/main.rs
Normal file
21
examples/cluster_dashboards/src/main.rs
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
use harmony::{
|
||||||
|
inventory::Inventory,
|
||||||
|
modules::monitoring::cluster_dashboards::ClusterDashboardsScore,
|
||||||
|
topology::K8sAnywhereTopology,
|
||||||
|
};
|
||||||
|
|
||||||
|
#[tokio::main]
|
||||||
|
async fn main() {
|
||||||
|
harmony_cli::cli_logger::init();
|
||||||
|
|
||||||
|
let cluster_dashboards_score = ClusterDashboardsScore::default();
|
||||||
|
|
||||||
|
harmony_cli::run(
|
||||||
|
Inventory::autoload(),
|
||||||
|
K8sAnywhereTopology::from_env(),
|
||||||
|
vec![Box::new(cluster_dashboards_score)],
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
}
|
||||||
@@ -11,7 +11,7 @@ async fn main() {
|
|||||||
role: HostRole::Worker,
|
role: HostRole::Worker,
|
||||||
number_desired_hosts: 3,
|
number_desired_hosts: 3,
|
||||||
discovery_strategy: HarmonyDiscoveryStrategy::SUBNET {
|
discovery_strategy: HarmonyDiscoveryStrategy::SUBNET {
|
||||||
cidr: cidrv4!("192.168.2.0/24"),
|
cidr: cidrv4!("192.168.0.1/25"),
|
||||||
port: 25000,
|
port: 25000,
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
@@ -20,7 +20,7 @@ async fn main() {
|
|||||||
role: HostRole::ControlPlane,
|
role: HostRole::ControlPlane,
|
||||||
number_desired_hosts: 3,
|
number_desired_hosts: 3,
|
||||||
discovery_strategy: HarmonyDiscoveryStrategy::SUBNET {
|
discovery_strategy: HarmonyDiscoveryStrategy::SUBNET {
|
||||||
cidr: cidrv4!("192.168.2.0/24"),
|
cidr: cidrv4!("192.168.0.1/25"),
|
||||||
port: 25000,
|
port: 25000,
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
@@ -28,8 +28,7 @@ async fn main() {
|
|||||||
harmony_cli::run(
|
harmony_cli::run(
|
||||||
Inventory::autoload(),
|
Inventory::autoload(),
|
||||||
LocalhostTopology::new(),
|
LocalhostTopology::new(),
|
||||||
vec![Box::new(discover_worker)],
|
vec![Box::new(discover_worker), Box::new(discover_control_plane)],
|
||||||
//vec![Box::new(discover_worker), Box::new(discover_control_plane)],
|
|
||||||
None,
|
None,
|
||||||
)
|
)
|
||||||
.await
|
.await
|
||||||
|
|||||||
@@ -10,9 +10,10 @@ publish = false
|
|||||||
harmony = { path = "../../harmony" }
|
harmony = { path = "../../harmony" }
|
||||||
harmony_cli = { path = "../../harmony_cli" }
|
harmony_cli = { path = "../../harmony_cli" }
|
||||||
harmony_types = { path = "../../harmony_types" }
|
harmony_types = { path = "../../harmony_types" }
|
||||||
|
harmony_macros = { path = "../../harmony_macros" }
|
||||||
|
harmony-k8s = { path = "../../harmony-k8s" }
|
||||||
cidr.workspace = true
|
cidr.workspace = true
|
||||||
tokio.workspace = true
|
tokio.workspace = true
|
||||||
harmony_macros = { path = "../../harmony_macros" }
|
|
||||||
log.workspace = true
|
log.workspace = true
|
||||||
env_logger.workspace = true
|
env_logger.workspace = true
|
||||||
url.workspace = true
|
url.workspace = true
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
|
|
||||||
use harmony::topology::k8s::{DrainOptions, K8sClient};
|
use harmony_k8s::{DrainOptions, K8sClient};
|
||||||
use log::{info, trace};
|
use log::{info, trace};
|
||||||
|
|
||||||
#[tokio::main]
|
#[tokio::main]
|
||||||
|
|||||||
@@ -10,9 +10,10 @@ publish = false
|
|||||||
harmony = { path = "../../harmony" }
|
harmony = { path = "../../harmony" }
|
||||||
harmony_cli = { path = "../../harmony_cli" }
|
harmony_cli = { path = "../../harmony_cli" }
|
||||||
harmony_types = { path = "../../harmony_types" }
|
harmony_types = { path = "../../harmony_types" }
|
||||||
|
harmony_macros = { path = "../../harmony_macros" }
|
||||||
|
harmony-k8s = { path = "../../harmony-k8s" }
|
||||||
cidr.workspace = true
|
cidr.workspace = true
|
||||||
tokio.workspace = true
|
tokio.workspace = true
|
||||||
harmony_macros = { path = "../../harmony_macros" }
|
|
||||||
log.workspace = true
|
log.workspace = true
|
||||||
env_logger.workspace = true
|
env_logger.workspace = true
|
||||||
url.workspace = true
|
url.workspace = true
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
use harmony::topology::k8s::{DrainOptions, K8sClient, NodeFile};
|
use harmony_k8s::{K8sClient, NodeFile};
|
||||||
use log::{info, trace};
|
use log::{info, trace};
|
||||||
|
|
||||||
#[tokio::main]
|
#[tokio::main]
|
||||||
|
|||||||
16
examples/node_health/Cargo.toml
Normal file
16
examples/node_health/Cargo.toml
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
[package]
|
||||||
|
name = "example-node-health"
|
||||||
|
edition = "2024"
|
||||||
|
version.workspace = true
|
||||||
|
readme.workspace = true
|
||||||
|
license.workspace = true
|
||||||
|
publish = false
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
harmony = { path = "../../harmony" }
|
||||||
|
harmony_cli = { path = "../../harmony_cli" }
|
||||||
|
harmony_types = { path = "../../harmony_types" }
|
||||||
|
tokio = { workspace = true }
|
||||||
|
harmony_macros = { path = "../../harmony_macros" }
|
||||||
|
log = { workspace = true }
|
||||||
|
env_logger = { workspace = true }
|
||||||
17
examples/node_health/src/main.rs
Normal file
17
examples/node_health/src/main.rs
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
use harmony::{
|
||||||
|
inventory::Inventory, modules::node_health::NodeHealthScore, topology::K8sAnywhereTopology,
|
||||||
|
};
|
||||||
|
|
||||||
|
#[tokio::main]
|
||||||
|
async fn main() {
|
||||||
|
let node_health = NodeHealthScore {};
|
||||||
|
|
||||||
|
harmony_cli::run(
|
||||||
|
Inventory::autoload(),
|
||||||
|
K8sAnywhereTopology::from_env(),
|
||||||
|
vec![Box::new(node_health)],
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
}
|
||||||
@@ -6,7 +6,10 @@ use harmony::{
|
|||||||
data::{FileContent, FilePath},
|
data::{FileContent, FilePath},
|
||||||
modules::{
|
modules::{
|
||||||
inventory::HarmonyDiscoveryStrategy,
|
inventory::HarmonyDiscoveryStrategy,
|
||||||
okd::{installation::OKDInstallationPipeline, ipxe::OKDIpxeScore},
|
okd::{
|
||||||
|
installation::OKDInstallationPipeline, ipxe::OKDIpxeScore,
|
||||||
|
load_balancer::OKDLoadBalancerScore,
|
||||||
|
},
|
||||||
},
|
},
|
||||||
score::Score,
|
score::Score,
|
||||||
topology::HAClusterTopology,
|
topology::HAClusterTopology,
|
||||||
@@ -32,6 +35,7 @@ async fn main() {
|
|||||||
scores
|
scores
|
||||||
.append(&mut OKDInstallationPipeline::get_all_scores(HarmonyDiscoveryStrategy::MDNS).await);
|
.append(&mut OKDInstallationPipeline::get_all_scores(HarmonyDiscoveryStrategy::MDNS).await);
|
||||||
|
|
||||||
|
scores.push(Box::new(OKDLoadBalancerScore::new(&topology)));
|
||||||
harmony_cli::run(inventory, topology, scores, None)
|
harmony_cli::run(inventory, topology, scores, None)
|
||||||
.await
|
.await
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|||||||
@@ -1,63 +1,13 @@
|
|||||||
use std::str::FromStr;
|
|
||||||
|
|
||||||
use harmony::{
|
use harmony::{
|
||||||
inventory::Inventory,
|
inventory::Inventory, modules::openbao::OpenbaoScore, topology::K8sAnywhereTopology,
|
||||||
modules::helm::chart::{HelmChartScore, HelmRepository, NonBlankString},
|
|
||||||
topology::K8sAnywhereTopology,
|
|
||||||
};
|
};
|
||||||
use harmony_macros::hurl;
|
|
||||||
|
|
||||||
#[tokio::main]
|
#[tokio::main]
|
||||||
async fn main() {
|
async fn main() {
|
||||||
let values_yaml = Some(
|
let openbao = OpenbaoScore {
|
||||||
r#"server:
|
host: "openbao.sebastien.sto1.nationtech.io".to_string(),
|
||||||
standalone:
|
|
||||||
enabled: true
|
|
||||||
config: |
|
|
||||||
listener "tcp" {
|
|
||||||
tls_disable = true
|
|
||||||
address = "[::]:8200"
|
|
||||||
cluster_address = "[::]:8201"
|
|
||||||
}
|
|
||||||
|
|
||||||
storage "file" {
|
|
||||||
path = "/openbao/data"
|
|
||||||
}
|
|
||||||
|
|
||||||
service:
|
|
||||||
enabled: true
|
|
||||||
|
|
||||||
dataStorage:
|
|
||||||
enabled: true
|
|
||||||
size: 10Gi
|
|
||||||
storageClass: null
|
|
||||||
accessMode: ReadWriteOnce
|
|
||||||
|
|
||||||
auditStorage:
|
|
||||||
enabled: true
|
|
||||||
size: 10Gi
|
|
||||||
storageClass: null
|
|
||||||
accessMode: ReadWriteOnce"#
|
|
||||||
.to_string(),
|
|
||||||
);
|
|
||||||
let openbao = HelmChartScore {
|
|
||||||
namespace: Some(NonBlankString::from_str("openbao").unwrap()),
|
|
||||||
release_name: NonBlankString::from_str("openbao").unwrap(),
|
|
||||||
chart_name: NonBlankString::from_str("openbao/openbao").unwrap(),
|
|
||||||
chart_version: None,
|
|
||||||
values_overrides: None,
|
|
||||||
values_yaml,
|
|
||||||
create_namespace: true,
|
|
||||||
install_only: true,
|
|
||||||
repository: Some(HelmRepository::new(
|
|
||||||
"openbao".to_string(),
|
|
||||||
hurl!("https://openbao.github.io/openbao-helm"),
|
|
||||||
true,
|
|
||||||
)),
|
|
||||||
};
|
};
|
||||||
|
|
||||||
// TODO exec pod commands to initialize secret store if not already done
|
|
||||||
|
|
||||||
harmony_cli::run(
|
harmony_cli::run(
|
||||||
Inventory::autoload(),
|
Inventory::autoload(),
|
||||||
K8sAnywhereTopology::from_env(),
|
K8sAnywhereTopology::from_env(),
|
||||||
|
|||||||
@@ -1,5 +1,3 @@
|
|||||||
use std::str::FromStr;
|
|
||||||
|
|
||||||
use harmony::{
|
use harmony::{
|
||||||
inventory::Inventory,
|
inventory::Inventory,
|
||||||
modules::{k8s::apps::OperatorHubCatalogSourceScore, postgresql::CloudNativePgOperatorScore},
|
modules::{k8s::apps::OperatorHubCatalogSourceScore, postgresql::CloudNativePgOperatorScore},
|
||||||
@@ -9,7 +7,7 @@ use harmony::{
|
|||||||
#[tokio::main]
|
#[tokio::main]
|
||||||
async fn main() {
|
async fn main() {
|
||||||
let operatorhub_catalog = OperatorHubCatalogSourceScore::default();
|
let operatorhub_catalog = OperatorHubCatalogSourceScore::default();
|
||||||
let cnpg_operator = CloudNativePgOperatorScore::default();
|
let cnpg_operator = CloudNativePgOperatorScore::default_openshift();
|
||||||
|
|
||||||
harmony_cli::run(
|
harmony_cli::run(
|
||||||
Inventory::autoload(),
|
Inventory::autoload(),
|
||||||
|
|||||||
@@ -1,22 +1,13 @@
|
|||||||
use std::{
|
use std::sync::Arc;
|
||||||
net::{IpAddr, Ipv4Addr},
|
|
||||||
sync::Arc,
|
|
||||||
};
|
|
||||||
|
|
||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
use cidr::Ipv4Cidr;
|
|
||||||
use harmony::{
|
use harmony::{
|
||||||
executors::ExecutorError,
|
executors::ExecutorError,
|
||||||
hardware::{HostCategory, Location, PhysicalHost, SwitchGroup},
|
|
||||||
infra::opnsense::OPNSenseManagementInterface,
|
|
||||||
inventory::Inventory,
|
inventory::Inventory,
|
||||||
modules::opnsense::node_exporter::NodeExporterScore,
|
modules::opnsense::node_exporter::NodeExporterScore,
|
||||||
topology::{
|
topology::{PreparationError, PreparationOutcome, Topology, node_exporter::NodeExporter},
|
||||||
HAClusterTopology, LogicalHost, PreparationError, PreparationOutcome, Topology,
|
|
||||||
UnmanagedRouter, node_exporter::NodeExporter,
|
|
||||||
},
|
|
||||||
};
|
};
|
||||||
use harmony_macros::{ip, ipv4, mac_address};
|
use harmony_macros::ip;
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
struct OpnSenseTopology {
|
struct OpnSenseTopology {
|
||||||
|
|||||||
@@ -1,8 +1,7 @@
|
|||||||
use harmony::{
|
use harmony::{
|
||||||
inventory::Inventory,
|
inventory::Inventory,
|
||||||
modules::postgresql::{
|
modules::postgresql::{
|
||||||
K8sPostgreSQLScore, PostgreSQLConnectionScore, PublicPostgreSQLScore,
|
PostgreSQLConnectionScore, PublicPostgreSQLScore, capability::PostgreSQLConfig,
|
||||||
capability::PostgreSQLConfig,
|
|
||||||
},
|
},
|
||||||
topology::K8sAnywhereTopology,
|
topology::K8sAnywhereTopology,
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
use std::{collections::HashMap, path::PathBuf, sync::Arc};
|
use std::{path::PathBuf, sync::Arc};
|
||||||
|
|
||||||
use harmony::{
|
use harmony::{
|
||||||
inventory::Inventory,
|
inventory::Inventory,
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
use std::{collections::HashMap, path::PathBuf, sync::Arc};
|
use std::{path::PathBuf, sync::Arc};
|
||||||
|
|
||||||
use harmony::{
|
use harmony::{
|
||||||
inventory::Inventory,
|
inventory::Inventory,
|
||||||
|
|||||||
@@ -44,6 +44,7 @@ fn build_large_score() -> LoadBalancerScore {
|
|||||||
],
|
],
|
||||||
listening_port: SocketAddr::V4(SocketAddrV4::new(ipv4!("192.168.0.0"), 49387)),
|
listening_port: SocketAddr::V4(SocketAddrV4::new(ipv4!("192.168.0.0"), 49387)),
|
||||||
health_check: Some(HealthCheck::HTTP(
|
health_check: Some(HealthCheck::HTTP(
|
||||||
|
Some(1993),
|
||||||
"/some_long_ass_path_to_see_how_it_is_displayed_but_it_has_to_be_even_longer"
|
"/some_long_ass_path_to_see_how_it_is_displayed_but_it_has_to_be_even_longer"
|
||||||
.to_string(),
|
.to_string(),
|
||||||
HttpMethod::GET,
|
HttpMethod::GET,
|
||||||
|
|||||||
14
examples/zitadel/Cargo.toml
Normal file
14
examples/zitadel/Cargo.toml
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
[package]
|
||||||
|
name = "example-zitadel"
|
||||||
|
edition = "2024"
|
||||||
|
version.workspace = true
|
||||||
|
readme.workspace = true
|
||||||
|
license.workspace = true
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
harmony = { path = "../../harmony" }
|
||||||
|
harmony_cli = { path = "../../harmony_cli" }
|
||||||
|
harmony_macros = { path = "../../harmony_macros" }
|
||||||
|
harmony_types = { path = "../../harmony_types" }
|
||||||
|
tokio.workspace = true
|
||||||
|
url.workspace = true
|
||||||
20
examples/zitadel/src/main.rs
Normal file
20
examples/zitadel/src/main.rs
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
use harmony::{
|
||||||
|
inventory::Inventory, modules::zitadel::ZitadelScore, topology::K8sAnywhereTopology,
|
||||||
|
};
|
||||||
|
|
||||||
|
#[tokio::main]
|
||||||
|
async fn main() {
|
||||||
|
let zitadel = ZitadelScore {
|
||||||
|
host: "sso.sto1.nationtech.io".to_string(),
|
||||||
|
zitadel_version: "v4.12.1".to_string(),
|
||||||
|
};
|
||||||
|
|
||||||
|
harmony_cli::run(
|
||||||
|
Inventory::autoload(),
|
||||||
|
K8sAnywhereTopology::from_env(),
|
||||||
|
vec![Box::new(zitadel)],
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
}
|
||||||
BIN
examples/zitadel/zitadel-9.24.0.tgz
Normal file
BIN
examples/zitadel/zitadel-9.24.0.tgz
Normal file
Binary file not shown.
23
harmony-k8s/Cargo.toml
Normal file
23
harmony-k8s/Cargo.toml
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
[package]
|
||||||
|
name = "harmony-k8s"
|
||||||
|
edition = "2024"
|
||||||
|
version.workspace = true
|
||||||
|
readme.workspace = true
|
||||||
|
license.workspace = true
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
kube.workspace = true
|
||||||
|
k8s-openapi.workspace = true
|
||||||
|
tokio.workspace = true
|
||||||
|
tokio-retry.workspace = true
|
||||||
|
serde.workspace = true
|
||||||
|
serde_json.workspace = true
|
||||||
|
serde_yaml.workspace = true
|
||||||
|
log.workspace = true
|
||||||
|
similar.workspace = true
|
||||||
|
reqwest.workspace = true
|
||||||
|
url.workspace = true
|
||||||
|
inquire.workspace = true
|
||||||
|
|
||||||
|
[dev-dependencies]
|
||||||
|
pretty_assertions.workspace = true
|
||||||
593
harmony-k8s/src/apply.rs
Normal file
593
harmony-k8s/src/apply.rs
Normal file
@@ -0,0 +1,593 @@
|
|||||||
|
use kube::{
|
||||||
|
Client, Error, Resource,
|
||||||
|
api::{
|
||||||
|
Api, ApiResource, DynamicObject, GroupVersionKind, Patch, PatchParams, PostParams,
|
||||||
|
ResourceExt,
|
||||||
|
},
|
||||||
|
core::ErrorResponse,
|
||||||
|
discovery::Scope,
|
||||||
|
error::DiscoveryError,
|
||||||
|
};
|
||||||
|
use log::{debug, error, trace, warn};
|
||||||
|
use serde::{Serialize, de::DeserializeOwned};
|
||||||
|
use serde_json::Value;
|
||||||
|
use similar::TextDiff;
|
||||||
|
use url::Url;
|
||||||
|
|
||||||
|
use crate::client::K8sClient;
|
||||||
|
use crate::helper;
|
||||||
|
use crate::types::WriteMode;
|
||||||
|
|
||||||
|
/// The field-manager token sent with every server-side apply request.
|
||||||
|
pub const FIELD_MANAGER: &str = "harmony-k8s";
|
||||||
|
|
||||||
|
// ── Private helpers ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// Serialise any `Serialize` payload to a [`DynamicObject`] via JSON.
|
||||||
|
fn to_dynamic<T: Serialize>(payload: &T) -> Result<DynamicObject, Error> {
|
||||||
|
serde_json::from_value(serde_json::to_value(payload).map_err(Error::SerdeError)?)
|
||||||
|
.map_err(Error::SerdeError)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Fetch the current resource, display a unified diff against `payload`, and
|
||||||
|
/// return `()`. All output goes to stdout (same behaviour as before).
|
||||||
|
///
|
||||||
|
/// A 404 is treated as "resource would be created" — not an error.
|
||||||
|
async fn show_dry_run<T: Serialize>(
|
||||||
|
api: &Api<DynamicObject>,
|
||||||
|
name: &str,
|
||||||
|
payload: &T,
|
||||||
|
) -> Result<(), Error> {
|
||||||
|
let new_yaml = serde_yaml::to_string(payload)
|
||||||
|
.unwrap_or_else(|_| "Failed to serialize new resource".to_string());
|
||||||
|
|
||||||
|
match api.get(name).await {
|
||||||
|
Ok(current) => {
|
||||||
|
println!("\nDry-run for resource: '{name}'");
|
||||||
|
let mut current_val = serde_yaml::to_value(&current).unwrap_or(serde_yaml::Value::Null);
|
||||||
|
if let Some(map) = current_val.as_mapping_mut() {
|
||||||
|
map.remove(&serde_yaml::Value::String("status".to_string()));
|
||||||
|
}
|
||||||
|
let current_yaml = serde_yaml::to_string(&current_val)
|
||||||
|
.unwrap_or_else(|_| "Failed to serialize current resource".to_string());
|
||||||
|
|
||||||
|
if current_yaml == new_yaml {
|
||||||
|
println!("No changes detected.");
|
||||||
|
} else {
|
||||||
|
println!("Changes detected:");
|
||||||
|
let diff = TextDiff::from_lines(&current_yaml, &new_yaml);
|
||||||
|
for change in diff.iter_all_changes() {
|
||||||
|
let sign = match change.tag() {
|
||||||
|
similar::ChangeTag::Delete => "-",
|
||||||
|
similar::ChangeTag::Insert => "+",
|
||||||
|
similar::ChangeTag::Equal => " ",
|
||||||
|
};
|
||||||
|
print!("{sign}{change}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
Err(Error::Api(ErrorResponse { code: 404, .. })) => {
|
||||||
|
println!("\nDry-run for new resource: '{name}'");
|
||||||
|
println!("Resource does not exist. Would be created:");
|
||||||
|
for line in new_yaml.lines() {
|
||||||
|
println!("+{line}");
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
error!("Failed to fetch resource '{name}' for dry-run: {e}");
|
||||||
|
Err(e)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Execute the real (non-dry-run) apply, respecting [`WriteMode`].
|
||||||
|
async fn do_apply<T: Serialize + std::fmt::Debug>(
|
||||||
|
api: &Api<DynamicObject>,
|
||||||
|
name: &str,
|
||||||
|
payload: &T,
|
||||||
|
patch_params: &PatchParams,
|
||||||
|
write_mode: &WriteMode,
|
||||||
|
) -> Result<DynamicObject, Error> {
|
||||||
|
match write_mode {
|
||||||
|
WriteMode::CreateOrUpdate => {
|
||||||
|
// TODO refactor this arm to perform self.update and if fail with 404 self.create
|
||||||
|
// This will avoid the repetition of the api.patch and api.create calls within this
|
||||||
|
// function body. This makes the code more maintainable
|
||||||
|
match api.patch(name, patch_params, &Patch::Apply(payload)).await {
|
||||||
|
Ok(obj) => Ok(obj),
|
||||||
|
Err(Error::Api(ErrorResponse { code: 404, .. })) => {
|
||||||
|
debug!("Resource '{name}' not found via SSA, falling back to POST");
|
||||||
|
let dyn_obj = to_dynamic(payload)?;
|
||||||
|
api.create(&PostParams::default(), &dyn_obj)
|
||||||
|
.await
|
||||||
|
.map_err(|e| {
|
||||||
|
error!("Failed to create '{name}': {e}");
|
||||||
|
e
|
||||||
|
})
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
error!("Failed to apply '{name}': {e}");
|
||||||
|
Err(e)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
WriteMode::Create => {
|
||||||
|
let dyn_obj = to_dynamic(payload)?;
|
||||||
|
api.create(&PostParams::default(), &dyn_obj)
|
||||||
|
.await
|
||||||
|
.map_err(|e| {
|
||||||
|
error!("Failed to create '{name}': {e}");
|
||||||
|
e
|
||||||
|
})
|
||||||
|
}
|
||||||
|
WriteMode::Update => match api.patch(name, patch_params, &Patch::Apply(payload)).await {
|
||||||
|
Ok(obj) => Ok(obj),
|
||||||
|
Err(Error::Api(ErrorResponse { code: 404, .. })) => Err(Error::Api(ErrorResponse {
|
||||||
|
code: 404,
|
||||||
|
message: format!("Resource '{name}' not found and WriteMode is UpdateOnly"),
|
||||||
|
reason: "NotFound".to_string(),
|
||||||
|
status: "Failure".to_string(),
|
||||||
|
})),
|
||||||
|
Err(e) => {
|
||||||
|
error!("Failed to update '{name}': {e}");
|
||||||
|
Err(e)
|
||||||
|
}
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Public API ───────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
impl K8sClient {
|
||||||
|
/// Server-side apply: create if absent, update if present.
|
||||||
|
/// Equivalent to `kubectl apply`.
|
||||||
|
pub async fn apply<K>(&self, resource: &K, namespace: Option<&str>) -> Result<K, Error>
|
||||||
|
where
|
||||||
|
K: Resource + Clone + std::fmt::Debug + DeserializeOwned + Serialize,
|
||||||
|
<K as Resource>::DynamicType: Default,
|
||||||
|
{
|
||||||
|
self.apply_with_strategy(resource, namespace, WriteMode::CreateOrUpdate)
|
||||||
|
.await
|
||||||
|
}
|
||||||
|
|
||||||
|
/// POST only — returns an error if the resource already exists.
|
||||||
|
pub async fn create<K>(&self, resource: &K, namespace: Option<&str>) -> Result<K, Error>
|
||||||
|
where
|
||||||
|
K: Resource + Clone + std::fmt::Debug + DeserializeOwned + Serialize,
|
||||||
|
<K as Resource>::DynamicType: Default,
|
||||||
|
{
|
||||||
|
self.apply_with_strategy(resource, namespace, WriteMode::Create)
|
||||||
|
.await
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Server-side apply only — returns an error if the resource does not exist.
|
||||||
|
pub async fn update<K>(&self, resource: &K, namespace: Option<&str>) -> Result<K, Error>
|
||||||
|
where
|
||||||
|
K: Resource + Clone + std::fmt::Debug + DeserializeOwned + Serialize,
|
||||||
|
<K as Resource>::DynamicType: Default,
|
||||||
|
{
|
||||||
|
self.apply_with_strategy(resource, namespace, WriteMode::Update)
|
||||||
|
.await
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn apply_with_strategy<K>(
|
||||||
|
&self,
|
||||||
|
resource: &K,
|
||||||
|
namespace: Option<&str>,
|
||||||
|
write_mode: WriteMode,
|
||||||
|
) -> Result<K, Error>
|
||||||
|
where
|
||||||
|
K: Resource + Clone + std::fmt::Debug + DeserializeOwned + Serialize,
|
||||||
|
<K as Resource>::DynamicType: Default,
|
||||||
|
{
|
||||||
|
debug!(
|
||||||
|
"apply_with_strategy: {:?} ns={:?}",
|
||||||
|
resource.meta().name,
|
||||||
|
namespace
|
||||||
|
);
|
||||||
|
trace!("{:#}", serde_json::to_value(resource).unwrap_or_default());
|
||||||
|
|
||||||
|
let dyntype = K::DynamicType::default();
|
||||||
|
let gvk = GroupVersionKind {
|
||||||
|
group: K::group(&dyntype).to_string(),
|
||||||
|
version: K::version(&dyntype).to_string(),
|
||||||
|
kind: K::kind(&dyntype).to_string(),
|
||||||
|
};
|
||||||
|
|
||||||
|
let discovery = self.discovery().await?;
|
||||||
|
let (ar, caps) = discovery.resolve_gvk(&gvk).ok_or_else(|| {
|
||||||
|
Error::Discovery(DiscoveryError::MissingResource(format!(
|
||||||
|
"Cannot resolve GVK: {gvk:?}"
|
||||||
|
)))
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let effective_ns = if caps.scope == Scope::Cluster {
|
||||||
|
None
|
||||||
|
} else {
|
||||||
|
namespace.or_else(|| resource.meta().namespace.as_deref())
|
||||||
|
};
|
||||||
|
|
||||||
|
let api: Api<DynamicObject> =
|
||||||
|
get_dynamic_api(ar, caps, self.client.clone(), effective_ns, false);
|
||||||
|
|
||||||
|
let name = resource
|
||||||
|
.meta()
|
||||||
|
.name
|
||||||
|
.as_deref()
|
||||||
|
.expect("Kubernetes resource must have a name");
|
||||||
|
|
||||||
|
if self.dry_run {
|
||||||
|
show_dry_run(&api, name, resource).await?;
|
||||||
|
return Ok(resource.clone());
|
||||||
|
}
|
||||||
|
|
||||||
|
let patch_params = PatchParams::apply(FIELD_MANAGER);
|
||||||
|
do_apply(&api, name, resource, &patch_params, &write_mode)
|
||||||
|
.await
|
||||||
|
.and_then(helper::dyn_to_typed)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Applies resources in order, one at a time
|
||||||
|
pub async fn apply_many<K>(&self, resources: &[K], ns: Option<&str>) -> Result<Vec<K>, Error>
|
||||||
|
where
|
||||||
|
K: Resource + Clone + std::fmt::Debug + DeserializeOwned + Serialize,
|
||||||
|
<K as Resource>::DynamicType: Default,
|
||||||
|
{
|
||||||
|
let mut result = Vec::new();
|
||||||
|
for r in resources.iter() {
|
||||||
|
let res = self.apply(r, ns).await;
|
||||||
|
if res.is_err() {
|
||||||
|
// NOTE: this may log sensitive data; downgrade to debug if needed.
|
||||||
|
warn!(
|
||||||
|
"Failed to apply k8s resource: {}",
|
||||||
|
serde_json::to_string_pretty(r).map_err(Error::SerdeError)?
|
||||||
|
);
|
||||||
|
}
|
||||||
|
result.push(res?);
|
||||||
|
}
|
||||||
|
Ok(result)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Apply a [`DynamicObject`] resource using server-side apply.
|
||||||
|
pub async fn apply_dynamic(
|
||||||
|
&self,
|
||||||
|
resource: &DynamicObject,
|
||||||
|
namespace: Option<&str>,
|
||||||
|
force_conflicts: bool,
|
||||||
|
) -> Result<DynamicObject, Error> {
|
||||||
|
trace!("apply_dynamic {resource:#?} ns={namespace:?} force={force_conflicts}");
|
||||||
|
|
||||||
|
let discovery = self.discovery().await?;
|
||||||
|
let type_meta = resource.types.as_ref().ok_or_else(|| {
|
||||||
|
Error::BuildRequest(kube::core::request::Error::Validation(
|
||||||
|
"DynamicObject must have types (apiVersion and kind)".to_string(),
|
||||||
|
))
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let gvk = GroupVersionKind::try_from(type_meta).map_err(|_| {
|
||||||
|
Error::BuildRequest(kube::core::request::Error::Validation(format!(
|
||||||
|
"Invalid GVK in DynamicObject: {type_meta:?}"
|
||||||
|
)))
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let (ar, caps) = discovery.resolve_gvk(&gvk).ok_or_else(|| {
|
||||||
|
Error::Discovery(DiscoveryError::MissingResource(format!(
|
||||||
|
"Cannot resolve GVK: {gvk:?}"
|
||||||
|
)))
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let effective_ns = if caps.scope == Scope::Cluster {
|
||||||
|
None
|
||||||
|
} else {
|
||||||
|
namespace.or_else(|| resource.metadata.namespace.as_deref())
|
||||||
|
};
|
||||||
|
|
||||||
|
let api = get_dynamic_api(ar, caps, self.client.clone(), effective_ns, false);
|
||||||
|
let name = resource.metadata.name.as_deref().ok_or_else(|| {
|
||||||
|
Error::BuildRequest(kube::core::request::Error::Validation(
|
||||||
|
"DynamicObject must have metadata.name".to_string(),
|
||||||
|
))
|
||||||
|
})?;
|
||||||
|
|
||||||
|
debug!(
|
||||||
|
"apply_dynamic kind={:?} name='{name}' ns={effective_ns:?}",
|
||||||
|
resource.types.as_ref().map(|t| &t.kind),
|
||||||
|
);
|
||||||
|
|
||||||
|
// NOTE would be nice to improve cohesion between the dynamic and typed apis and avoid copy
|
||||||
|
// pasting the dry_run and some more logic
|
||||||
|
if self.dry_run {
|
||||||
|
show_dry_run(&api, name, resource).await?;
|
||||||
|
return Ok(resource.clone());
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut patch_params = PatchParams::apply(FIELD_MANAGER);
|
||||||
|
patch_params.force = force_conflicts;
|
||||||
|
|
||||||
|
do_apply(
|
||||||
|
&api,
|
||||||
|
name,
|
||||||
|
resource,
|
||||||
|
&patch_params,
|
||||||
|
&WriteMode::CreateOrUpdate,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn apply_dynamic_many(
|
||||||
|
&self,
|
||||||
|
resources: &[DynamicObject],
|
||||||
|
namespace: Option<&str>,
|
||||||
|
force_conflicts: bool,
|
||||||
|
) -> Result<Vec<DynamicObject>, Error> {
|
||||||
|
let mut result = Vec::new();
|
||||||
|
for r in resources.iter() {
|
||||||
|
result.push(self.apply_dynamic(r, namespace, force_conflicts).await?);
|
||||||
|
}
|
||||||
|
Ok(result)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn apply_yaml_many(
|
||||||
|
&self,
|
||||||
|
#[allow(clippy::ptr_arg)] yaml: &Vec<serde_yaml::Value>,
|
||||||
|
ns: Option<&str>,
|
||||||
|
) -> Result<(), Error> {
|
||||||
|
for y in yaml.iter() {
|
||||||
|
self.apply_yaml(y, ns).await?;
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn apply_yaml(
|
||||||
|
&self,
|
||||||
|
yaml: &serde_yaml::Value,
|
||||||
|
ns: Option<&str>,
|
||||||
|
) -> Result<(), Error> {
|
||||||
|
// NOTE wouldn't it be possible to parse this into a DynamicObject and simply call
|
||||||
|
// apply_dynamic instead of reimplementing api interactions?
|
||||||
|
let obj: DynamicObject =
|
||||||
|
serde_yaml::from_value(yaml.clone()).expect("YAML must deserialise to DynamicObject");
|
||||||
|
let name = obj.metadata.name.as_ref().expect("YAML must have a name");
|
||||||
|
|
||||||
|
let api_version = yaml["apiVersion"].as_str().expect("missing apiVersion");
|
||||||
|
let kind = yaml["kind"].as_str().expect("missing kind");
|
||||||
|
|
||||||
|
let mut it = api_version.splitn(2, '/');
|
||||||
|
let first = it.next().unwrap();
|
||||||
|
let (g, v) = match it.next() {
|
||||||
|
Some(second) => (first, second),
|
||||||
|
None => ("", first),
|
||||||
|
};
|
||||||
|
|
||||||
|
let api_resource = ApiResource::from_gvk(&GroupVersionKind::gvk(g, v, kind));
|
||||||
|
let namespace = ns.unwrap_or_else(|| {
|
||||||
|
obj.metadata
|
||||||
|
.namespace
|
||||||
|
.as_deref()
|
||||||
|
.expect("YAML must have a namespace when ns is not provided")
|
||||||
|
});
|
||||||
|
|
||||||
|
let api: Api<DynamicObject> =
|
||||||
|
Api::namespaced_with(self.client.clone(), namespace, &api_resource);
|
||||||
|
|
||||||
|
println!("Applying '{name}' in namespace '{namespace}'...");
|
||||||
|
let patch_params = PatchParams::apply(FIELD_MANAGER);
|
||||||
|
let result = api.patch(name, &patch_params, &Patch::Apply(&obj)).await?;
|
||||||
|
println!("Successfully applied '{}'.", result.name_any());
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Equivalent to `kubectl apply -f <url>`.
|
||||||
|
pub async fn apply_url(&self, url: Url, ns: Option<&str>) -> Result<(), Error> {
|
||||||
|
let patch_params = PatchParams::apply(FIELD_MANAGER);
|
||||||
|
let discovery = self.discovery().await?;
|
||||||
|
|
||||||
|
let yaml = reqwest::get(url)
|
||||||
|
.await
|
||||||
|
.expect("Could not fetch URL")
|
||||||
|
.text()
|
||||||
|
.await
|
||||||
|
.expect("Could not read response body");
|
||||||
|
|
||||||
|
for doc in multidoc_deserialize(&yaml).expect("Failed to parse YAML from URL") {
|
||||||
|
let obj: DynamicObject =
|
||||||
|
serde_yaml::from_value(doc).expect("YAML document is not a valid object");
|
||||||
|
let namespace = obj.metadata.namespace.as_deref().or(ns);
|
||||||
|
let type_meta = obj.types.as_ref().expect("Object is missing TypeMeta");
|
||||||
|
let gvk =
|
||||||
|
GroupVersionKind::try_from(type_meta).expect("Object has invalid GroupVersionKind");
|
||||||
|
let name = obj.name_any();
|
||||||
|
|
||||||
|
if let Some((ar, caps)) = discovery.resolve_gvk(&gvk) {
|
||||||
|
let api = get_dynamic_api(ar, caps, self.client.clone(), namespace, false);
|
||||||
|
trace!(
|
||||||
|
"Applying {}:\n{}",
|
||||||
|
gvk.kind,
|
||||||
|
serde_yaml::to_string(&obj).unwrap_or_default()
|
||||||
|
);
|
||||||
|
let data: Value = serde_json::to_value(&obj).expect("serialisation failed");
|
||||||
|
let _r = api.patch(&name, &patch_params, &Patch::Apply(data)).await?;
|
||||||
|
debug!("Applied {} '{name}'", gvk.kind);
|
||||||
|
} else {
|
||||||
|
warn!("Skipping document with unknown GVK: {gvk:?}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Build a dynamic API client from a [`DynamicObject`]'s type metadata.
|
||||||
|
pub(crate) fn get_api_for_dynamic_object(
|
||||||
|
&self,
|
||||||
|
object: &DynamicObject,
|
||||||
|
ns: Option<&str>,
|
||||||
|
) -> Result<Api<DynamicObject>, Error> {
|
||||||
|
let ar = object
|
||||||
|
.types
|
||||||
|
.as_ref()
|
||||||
|
.and_then(|t| {
|
||||||
|
let parts: Vec<&str> = t.api_version.split('/').collect();
|
||||||
|
match parts.as_slice() {
|
||||||
|
[version] => Some(ApiResource::from_gvk(&GroupVersionKind::gvk(
|
||||||
|
"", version, &t.kind,
|
||||||
|
))),
|
||||||
|
[group, version] => Some(ApiResource::from_gvk(&GroupVersionKind::gvk(
|
||||||
|
group, version, &t.kind,
|
||||||
|
))),
|
||||||
|
_ => None,
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.ok_or_else(|| {
|
||||||
|
Error::BuildRequest(kube::core::request::Error::Validation(format!(
|
||||||
|
"Invalid apiVersion in DynamicObject: {object:#?}"
|
||||||
|
)))
|
||||||
|
})?;
|
||||||
|
|
||||||
|
Ok(match ns {
|
||||||
|
Some(ns) => Api::namespaced_with(self.client.clone(), ns, &ar),
|
||||||
|
None => Api::default_namespaced_with(self.client.clone(), &ar),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Free functions ───────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
pub(crate) fn get_dynamic_api(
|
||||||
|
resource: kube::api::ApiResource,
|
||||||
|
capabilities: kube::discovery::ApiCapabilities,
|
||||||
|
client: Client,
|
||||||
|
ns: Option<&str>,
|
||||||
|
all: bool,
|
||||||
|
) -> Api<DynamicObject> {
|
||||||
|
if capabilities.scope == Scope::Cluster || all {
|
||||||
|
Api::all_with(client, &resource)
|
||||||
|
} else if let Some(namespace) = ns {
|
||||||
|
Api::namespaced_with(client, namespace, &resource)
|
||||||
|
} else {
|
||||||
|
Api::default_namespaced_with(client, &resource)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn multidoc_deserialize(
|
||||||
|
data: &str,
|
||||||
|
) -> Result<Vec<serde_yaml::Value>, serde_yaml::Error> {
|
||||||
|
use serde::Deserialize;
|
||||||
|
let mut docs = vec![];
|
||||||
|
for de in serde_yaml::Deserializer::from_str(data) {
|
||||||
|
docs.push(serde_yaml::Value::deserialize(de)?);
|
||||||
|
}
|
||||||
|
Ok(docs)
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Tests ────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
#[cfg(test)]
mod apply_tests {
    use std::collections::BTreeMap;
    use std::time::{SystemTime, UNIX_EPOCH};

    use k8s_openapi::api::core::v1::ConfigMap;
    use k8s_openapi::apimachinery::pkg::apis::meta::v1::ObjectMeta;
    use kube::api::{DeleteParams, TypeMeta};

    use super::*;

    /// Namespace every test object is created in.
    const NS: &str = "default";

    /// `<prefix>-<unix millis>`, so repeated runs do not collide on names.
    fn unique_name(prefix: &str) -> String {
        let millis = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .unwrap()
            .as_millis();
        format!("{prefix}-{millis}")
    }

    /// Object metadata carrying only a name and the test namespace.
    fn namespaced_meta(name: &str) -> ObjectMeta {
        ObjectMeta {
            name: Some(name.to_string()),
            namespace: Some(NS.to_string()),
            ..Default::default()
        }
    }

    /// A ConfigMap with a single `key: value` data entry.
    fn config_map(name: &str, key: &str, value: &str) -> ConfigMap {
        ConfigMap {
            metadata: namespaced_meta(name),
            data: Some(BTreeMap::from([(key.to_string(), value.to_string())])),
            ..Default::default()
        }
    }

    /// Best-effort removal of the ConfigMap created by a test; errors are ignored.
    async fn cleanup(client: &K8sClient, name: &str) {
        let api: Api<ConfigMap> = Api::namespaced(client.client.clone(), NS);
        let _ = api.delete(name, &DeleteParams::default()).await;
    }

    #[tokio::test]
    #[ignore = "requires kubernetes cluster"]
    async fn apply_creates_new_configmap() {
        let client = K8sClient::try_default().await.unwrap();
        let name = unique_name("test-cm");
        let cm = config_map(&name, "key1", "value1");

        assert!(client.apply(&cm, Some(NS)).await.is_ok());

        cleanup(&client, &name).await;
    }

    #[tokio::test]
    #[ignore = "requires kubernetes cluster"]
    async fn apply_is_idempotent() {
        let client = K8sClient::try_default().await.unwrap();
        let name = unique_name("test-idem");
        let cm = config_map(&name, "key", "value");

        assert!(
            client.apply(&cm, Some(NS)).await.is_ok(),
            "first apply failed"
        );
        assert!(
            client.apply(&cm, Some(NS)).await.is_ok(),
            "second apply failed (not idempotent)"
        );

        cleanup(&client, &name).await;
    }

    #[tokio::test]
    #[ignore = "requires kubernetes cluster"]
    async fn apply_dynamic_creates_new_resource() {
        let client = K8sClient::try_default().await.unwrap();
        let name = unique_name("test-dyn");

        let obj = DynamicObject {
            types: Some(TypeMeta {
                api_version: "v1".to_string(),
                kind: "ConfigMap".to_string(),
            }),
            metadata: namespaced_meta(&name),
            data: serde_json::json!({}),
        };

        let result = client.apply_dynamic(&obj, Some(NS), false).await;
        assert!(result.is_ok(), "apply_dynamic failed: {:?}", result.err());

        cleanup(&client, &name).await;
    }
}
|
||||||
@@ -25,9 +25,9 @@
|
|||||||
//!
|
//!
|
||||||
//! ## Example
|
//! ## Example
|
||||||
//!
|
//!
|
||||||
//! ```rust,no_run
|
//! ```
|
||||||
//! use harmony::topology::k8s::{K8sClient, helper};
|
//! use harmony_k8s::{K8sClient, helper};
|
||||||
//! use harmony::topology::KubernetesDistribution;
|
//! use harmony_k8s::KubernetesDistribution;
|
||||||
//!
|
//!
|
||||||
//! async fn write_network_config(client: &K8sClient, node: &str) {
|
//! async fn write_network_config(client: &K8sClient, node: &str) {
|
||||||
//! // Create a bundle with platform-specific RBAC
|
//! // Create a bundle with platform-specific RBAC
|
||||||
@@ -56,7 +56,7 @@ use kube::{Error, Resource, ResourceExt, api::DynamicObject};
|
|||||||
use serde::Serialize;
|
use serde::Serialize;
|
||||||
use serde_json;
|
use serde_json;
|
||||||
|
|
||||||
use crate::domain::topology::k8s::K8sClient;
|
use crate::K8sClient;
|
||||||
|
|
||||||
/// A ResourceBundle represents a logical unit of work consisting of multiple
|
/// A ResourceBundle represents a logical unit of work consisting of multiple
|
||||||
/// Kubernetes resources that should be applied or deleted together.
|
/// Kubernetes resources that should be applied or deleted together.
|
||||||
99
harmony-k8s/src/client.rs
Normal file
99
harmony-k8s/src/client.rs
Normal file
@@ -0,0 +1,99 @@
|
|||||||
|
use std::sync::Arc;
|
||||||
|
|
||||||
|
use kube::config::{KubeConfigOptions, Kubeconfig};
|
||||||
|
use kube::{Client, Config, Discovery, Error};
|
||||||
|
use log::error;
|
||||||
|
use serde::Serialize;
|
||||||
|
use tokio::sync::OnceCell;
|
||||||
|
|
||||||
|
use crate::types::KubernetesDistribution;
|
||||||
|
|
||||||
|
// TODO not cool, should use a proper configuration mechanism
|
||||||
|
// cli arg, env var, config file
|
||||||
|
/// Reads the `DRY_RUN` environment variable and reports whether dry-run mode
/// was requested.
///
/// The value is trimmed and compared case-insensitively, so `true`, `True`,
/// `TRUE` and `1` (with or without surrounding whitespace) all enable dry-run.
/// Any other value, an unset variable, or a non-UTF-8 value yields `false`.
///
/// Being lenient here matters: a user who exports `DRY_RUN=True` expects no
/// mutation to be sent, and a strict comparison would silently ignore them.
fn read_dry_run_from_env() -> bool {
    std::env::var("DRY_RUN")
        .map(|raw| {
            let flag = raw.trim();
            flag == "1" || flag.eq_ignore_ascii_case("true")
        })
        .unwrap_or(false)
}
|
||||||
|
|
||||||
|
#[derive(Clone)]
|
||||||
|
pub struct K8sClient {
|
||||||
|
pub(crate) client: Client,
|
||||||
|
/// When `true` no mutation is sent to the API server; diffs are printed
|
||||||
|
/// to stdout instead. Initialised from the `DRY_RUN` environment variable.
|
||||||
|
pub(crate) dry_run: bool,
|
||||||
|
pub(crate) k8s_distribution: Arc<OnceCell<KubernetesDistribution>>,
|
||||||
|
pub(crate) discovery: Arc<OnceCell<Discovery>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Serialize for K8sClient {
|
||||||
|
fn serialize<S>(&self, _serializer: S) -> Result<S::Ok, S::Error>
|
||||||
|
where
|
||||||
|
S: serde::Serializer,
|
||||||
|
{
|
||||||
|
todo!("K8sClient serialization is not meaningful; remove this impl if unused")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl std::fmt::Debug for K8sClient {
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
f.write_fmt(format_args!(
|
||||||
|
"K8sClient {{ namespace: {}, dry_run: {} }}",
|
||||||
|
self.client.default_namespace(),
|
||||||
|
self.dry_run,
|
||||||
|
))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl K8sClient {
|
||||||
|
/// Create a client, reading `DRY_RUN` from the environment.
|
||||||
|
pub fn new(client: Client) -> Self {
|
||||||
|
Self {
|
||||||
|
dry_run: read_dry_run_from_env(),
|
||||||
|
client,
|
||||||
|
k8s_distribution: Arc::new(OnceCell::new()),
|
||||||
|
discovery: Arc::new(OnceCell::new()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Create a client that always operates in dry-run mode, regardless of
|
||||||
|
/// the environment variable.
|
||||||
|
pub fn new_dry_run(client: Client) -> Self {
|
||||||
|
Self {
|
||||||
|
dry_run: true,
|
||||||
|
..Self::new(client)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns `true` if this client is operating in dry-run mode.
|
||||||
|
pub fn is_dry_run(&self) -> bool {
|
||||||
|
self.dry_run
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn try_default() -> Result<Self, Error> {
|
||||||
|
Ok(Self::new(Client::try_default().await?))
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn from_kubeconfig(path: &str) -> Option<Self> {
|
||||||
|
Self::from_kubeconfig_with_opts(path, &KubeConfigOptions::default()).await
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn from_kubeconfig_with_context(path: &str, context: Option<String>) -> Option<Self> {
|
||||||
|
let mut opts = KubeConfigOptions::default();
|
||||||
|
opts.context = context;
|
||||||
|
Self::from_kubeconfig_with_opts(path, &opts).await
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn from_kubeconfig_with_opts(path: &str, opts: &KubeConfigOptions) -> Option<Self> {
|
||||||
|
let k = match Kubeconfig::read_from(path) {
|
||||||
|
Ok(k) => k,
|
||||||
|
Err(e) => {
|
||||||
|
error!("Failed to load kubeconfig from {path}: {e}");
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
Some(Self::new(
|
||||||
|
Client::try_from(Config::from_custom_kubeconfig(k, opts).await.unwrap()).unwrap(),
|
||||||
|
))
|
||||||
|
}
|
||||||
|
}
|
||||||
83
harmony-k8s/src/discovery.rs
Normal file
83
harmony-k8s/src/discovery.rs
Normal file
@@ -0,0 +1,83 @@
|
|||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
|
use kube::{Discovery, Error};
|
||||||
|
use log::{debug, error, info, trace, warn};
|
||||||
|
use tokio::sync::Mutex;
|
||||||
|
use tokio_retry::{Retry, strategy::ExponentialBackoff};
|
||||||
|
|
||||||
|
use crate::client::K8sClient;
|
||||||
|
use crate::types::KubernetesDistribution;
|
||||||
|
|
||||||
|
impl K8sClient {
|
||||||
|
pub async fn get_apiserver_version(
|
||||||
|
&self,
|
||||||
|
) -> Result<k8s_openapi::apimachinery::pkg::version::Info, Error> {
|
||||||
|
self.client.clone().apiserver_version().await
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Runs (and caches) Kubernetes API discovery with exponential-backoff retries.
|
||||||
|
pub async fn discovery(&self) -> Result<&Discovery, Error> {
|
||||||
|
let retry_strategy = ExponentialBackoff::from_millis(1000)
|
||||||
|
.max_delay(Duration::from_secs(32))
|
||||||
|
.take(6);
|
||||||
|
|
||||||
|
let attempt = Mutex::new(0u32);
|
||||||
|
Retry::spawn(retry_strategy, || async {
|
||||||
|
let mut n = attempt.lock().await;
|
||||||
|
*n += 1;
|
||||||
|
match self
|
||||||
|
.discovery
|
||||||
|
.get_or_try_init(async || {
|
||||||
|
debug!("Running Kubernetes API discovery (attempt {})", *n);
|
||||||
|
let d = Discovery::new(self.client.clone()).run().await?;
|
||||||
|
debug!("Kubernetes API discovery completed");
|
||||||
|
Ok(d)
|
||||||
|
})
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
Ok(d) => Ok(d),
|
||||||
|
Err(e) => {
|
||||||
|
warn!("Kubernetes API discovery failed (attempt {}): {}", *n, e);
|
||||||
|
Err(e)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.await
|
||||||
|
.map_err(|e| {
|
||||||
|
error!("Kubernetes API discovery failed after all retries: {}", e);
|
||||||
|
e
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Detect which Kubernetes distribution is running. Result is cached for
|
||||||
|
/// the lifetime of the client.
|
||||||
|
pub async fn get_k8s_distribution(&self) -> Result<KubernetesDistribution, Error> {
|
||||||
|
self.k8s_distribution
|
||||||
|
.get_or_try_init(async || {
|
||||||
|
debug!("Detecting Kubernetes distribution");
|
||||||
|
let api_groups = self.client.list_api_groups().await?;
|
||||||
|
trace!("list_api_groups: {:?}", api_groups);
|
||||||
|
|
||||||
|
let version = self.get_apiserver_version().await?;
|
||||||
|
|
||||||
|
if api_groups
|
||||||
|
.groups
|
||||||
|
.iter()
|
||||||
|
.any(|g| g.name == "project.openshift.io")
|
||||||
|
{
|
||||||
|
info!("Detected distribution: OpenshiftFamily");
|
||||||
|
return Ok(KubernetesDistribution::OpenshiftFamily);
|
||||||
|
}
|
||||||
|
|
||||||
|
if version.git_version.contains("k3s") {
|
||||||
|
info!("Detected distribution: K3sFamily");
|
||||||
|
return Ok(KubernetesDistribution::K3sFamily);
|
||||||
|
}
|
||||||
|
|
||||||
|
info!("Distribution not identified, using Default");
|
||||||
|
Ok(KubernetesDistribution::Default)
|
||||||
|
})
|
||||||
|
.await
|
||||||
|
.cloned()
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,7 +1,7 @@
|
|||||||
use std::collections::BTreeMap;
|
use std::collections::BTreeMap;
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
|
|
||||||
use crate::topology::KubernetesDistribution;
|
use crate::KubernetesDistribution;
|
||||||
|
|
||||||
use super::bundle::ResourceBundle;
|
use super::bundle::ResourceBundle;
|
||||||
use super::config::PRIVILEGED_POD_IMAGE;
|
use super::config::PRIVILEGED_POD_IMAGE;
|
||||||
@@ -10,8 +10,10 @@ use k8s_openapi::api::core::v1::{
|
|||||||
};
|
};
|
||||||
use k8s_openapi::api::rbac::v1::{ClusterRoleBinding, RoleRef, Subject};
|
use k8s_openapi::api::rbac::v1::{ClusterRoleBinding, RoleRef, Subject};
|
||||||
use k8s_openapi::apimachinery::pkg::apis::meta::v1::ObjectMeta;
|
use k8s_openapi::apimachinery::pkg::apis::meta::v1::ObjectMeta;
|
||||||
|
use kube::api::DynamicObject;
|
||||||
use kube::error::DiscoveryError;
|
use kube::error::DiscoveryError;
|
||||||
use log::{debug, error, info, warn};
|
use log::{debug, error, info, warn};
|
||||||
|
use serde::de::DeserializeOwned;
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub struct PrivilegedPodConfig {
|
pub struct PrivilegedPodConfig {
|
||||||
@@ -131,9 +133,9 @@ pub fn host_root_volume() -> (Volume, VolumeMount) {
|
|||||||
///
|
///
|
||||||
/// # Example
|
/// # Example
|
||||||
///
|
///
|
||||||
/// ```rust,no_run
|
/// ```
|
||||||
/// # use harmony::topology::k8s::helper::{build_privileged_bundle, PrivilegedPodConfig};
|
/// use harmony_k8s::helper::{build_privileged_bundle, PrivilegedPodConfig};
|
||||||
/// # use harmony::topology::KubernetesDistribution;
|
/// use harmony_k8s::KubernetesDistribution;
|
||||||
/// let bundle = build_privileged_bundle(
|
/// let bundle = build_privileged_bundle(
|
||||||
/// PrivilegedPodConfig {
|
/// PrivilegedPodConfig {
|
||||||
/// name: "network-setup".to_string(),
|
/// name: "network-setup".to_string(),
|
||||||
@@ -279,6 +281,16 @@ pub fn prompt_drain_timeout_action(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// JSON round-trip: DynamicObject → K
|
||||||
|
///
|
||||||
|
/// Safe because the DynamicObject was produced by the apiserver from a
|
||||||
|
/// payload that was originally serialized from K, so the schema is identical.
|
||||||
|
pub(crate) fn dyn_to_typed<K: DeserializeOwned>(obj: DynamicObject) -> Result<K, kube::Error> {
|
||||||
|
serde_json::to_value(obj)
|
||||||
|
.and_then(serde_json::from_value)
|
||||||
|
.map_err(kube::Error::SerdeError)
|
||||||
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
13
harmony-k8s/src/lib.rs
Normal file
13
harmony-k8s/src/lib.rs
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
pub mod apply;
|
||||||
|
pub mod bundle;
|
||||||
|
pub mod client;
|
||||||
|
pub mod config;
|
||||||
|
pub mod discovery;
|
||||||
|
pub mod helper;
|
||||||
|
pub mod node;
|
||||||
|
pub mod pod;
|
||||||
|
pub mod resources;
|
||||||
|
pub mod types;
|
||||||
|
|
||||||
|
pub use client::K8sClient;
|
||||||
|
pub use types::{DrainOptions, KubernetesDistribution, NodeFile, ScopeResolver, WriteMode};
|
||||||
3
harmony-k8s/src/main.rs
Normal file
3
harmony-k8s/src/main.rs
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
/// Placeholder binary entry point: prints a fixed greeting and exits.
fn main() {
    println!("Hello, world!");
}
|
||||||
722
harmony-k8s/src/node.rs
Normal file
722
harmony-k8s/src/node.rs
Normal file
@@ -0,0 +1,722 @@
|
|||||||
|
use std::collections::BTreeMap;
|
||||||
|
use std::time::{Duration, SystemTime, UNIX_EPOCH};
|
||||||
|
|
||||||
|
use k8s_openapi::api::core::v1::{
|
||||||
|
ConfigMap, ConfigMapVolumeSource, Node, Pod, Volume, VolumeMount,
|
||||||
|
};
|
||||||
|
use k8s_openapi::apimachinery::pkg::apis::meta::v1::ObjectMeta;
|
||||||
|
use kube::{
|
||||||
|
Error,
|
||||||
|
api::{Api, DeleteParams, EvictParams, ListParams, PostParams},
|
||||||
|
core::ErrorResponse,
|
||||||
|
error::DiscoveryError,
|
||||||
|
};
|
||||||
|
use log::{debug, error, info, warn};
|
||||||
|
use tokio::time::sleep;
|
||||||
|
|
||||||
|
use crate::client::K8sClient;
|
||||||
|
use crate::helper::{self, PrivilegedPodConfig};
|
||||||
|
use crate::types::{DrainOptions, NodeFile};
|
||||||
|
|
||||||
|
impl K8sClient {
|
||||||
|
pub async fn cordon_node(&self, node_name: &str) -> Result<(), Error> {
|
||||||
|
Api::<Node>::all(self.client.clone())
|
||||||
|
.cordon(node_name)
|
||||||
|
.await?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn uncordon_node(&self, node_name: &str) -> Result<(), Error> {
|
||||||
|
Api::<Node>::all(self.client.clone())
|
||||||
|
.uncordon(node_name)
|
||||||
|
.await?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn wait_for_node_ready(&self, node_name: &str) -> Result<(), Error> {
|
||||||
|
self.wait_for_node_ready_with_timeout(node_name, Duration::from_secs(600))
|
||||||
|
.await
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn wait_for_node_ready_with_timeout(
|
||||||
|
&self,
|
||||||
|
node_name: &str,
|
||||||
|
timeout: Duration,
|
||||||
|
) -> Result<(), Error> {
|
||||||
|
let api: Api<Node> = Api::all(self.client.clone());
|
||||||
|
let start = tokio::time::Instant::now();
|
||||||
|
let poll = Duration::from_secs(5);
|
||||||
|
loop {
|
||||||
|
if start.elapsed() > timeout {
|
||||||
|
return Err(Error::Discovery(DiscoveryError::MissingResource(format!(
|
||||||
|
"Node '{node_name}' did not become Ready within {timeout:?}"
|
||||||
|
))));
|
||||||
|
}
|
||||||
|
match api.get(node_name).await {
|
||||||
|
Ok(node) => {
|
||||||
|
if node
|
||||||
|
.status
|
||||||
|
.as_ref()
|
||||||
|
.and_then(|s| s.conditions.as_ref())
|
||||||
|
.map(|conds| {
|
||||||
|
conds
|
||||||
|
.iter()
|
||||||
|
.any(|c| c.type_ == "Ready" && c.status == "True")
|
||||||
|
})
|
||||||
|
.unwrap_or(false)
|
||||||
|
{
|
||||||
|
debug!("Node '{node_name}' is Ready");
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(e) => debug!("Error polling node '{node_name}': {e}"),
|
||||||
|
}
|
||||||
|
sleep(poll).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn wait_for_node_not_ready(
|
||||||
|
&self,
|
||||||
|
node_name: &str,
|
||||||
|
timeout: Duration,
|
||||||
|
) -> Result<(), Error> {
|
||||||
|
let api: Api<Node> = Api::all(self.client.clone());
|
||||||
|
let start = tokio::time::Instant::now();
|
||||||
|
let poll = Duration::from_secs(5);
|
||||||
|
loop {
|
||||||
|
if start.elapsed() > timeout {
|
||||||
|
return Err(Error::Discovery(DiscoveryError::MissingResource(format!(
|
||||||
|
"Node '{node_name}' did not become NotReady within {timeout:?}"
|
||||||
|
))));
|
||||||
|
}
|
||||||
|
match api.get(node_name).await {
|
||||||
|
Ok(node) => {
|
||||||
|
let is_ready = node
|
||||||
|
.status
|
||||||
|
.as_ref()
|
||||||
|
.and_then(|s| s.conditions.as_ref())
|
||||||
|
.map(|conds| {
|
||||||
|
conds
|
||||||
|
.iter()
|
||||||
|
.any(|c| c.type_ == "Ready" && c.status == "True")
|
||||||
|
})
|
||||||
|
.unwrap_or(false);
|
||||||
|
if !is_ready {
|
||||||
|
debug!("Node '{node_name}' is NotReady");
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(e) => debug!("Error polling node '{node_name}': {e}"),
|
||||||
|
}
|
||||||
|
sleep(poll).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn list_pods_on_node(&self, node_name: &str) -> Result<Vec<Pod>, Error> {
|
||||||
|
let api: Api<Pod> = Api::all(self.client.clone());
|
||||||
|
Ok(api
|
||||||
|
.list(&ListParams::default().fields(&format!("spec.nodeName={node_name}")))
|
||||||
|
.await?
|
||||||
|
.items)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn is_mirror_pod(pod: &Pod) -> bool {
|
||||||
|
pod.metadata
|
||||||
|
.annotations
|
||||||
|
.as_ref()
|
||||||
|
.map(|a| a.contains_key("kubernetes.io/config.mirror"))
|
||||||
|
.unwrap_or(false)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn is_daemonset_pod(pod: &Pod) -> bool {
|
||||||
|
pod.metadata
|
||||||
|
.owner_references
|
||||||
|
.as_ref()
|
||||||
|
.map(|refs| refs.iter().any(|r| r.kind == "DaemonSet"))
|
||||||
|
.unwrap_or(false)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn has_emptydir_volume(pod: &Pod) -> bool {
|
||||||
|
pod.spec
|
||||||
|
.as_ref()
|
||||||
|
.and_then(|s| s.volumes.as_ref())
|
||||||
|
.map(|vols| vols.iter().any(|v| v.empty_dir.is_some()))
|
||||||
|
.unwrap_or(false)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn is_completed_pod(pod: &Pod) -> bool {
|
||||||
|
pod.status
|
||||||
|
.as_ref()
|
||||||
|
.and_then(|s| s.phase.as_deref())
|
||||||
|
.map(|phase| phase == "Succeeded" || phase == "Failed")
|
||||||
|
.unwrap_or(false)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn classify_pods_for_drain(
|
||||||
|
pods: &[Pod],
|
||||||
|
options: &DrainOptions,
|
||||||
|
) -> Result<(Vec<Pod>, Vec<String>), String> {
|
||||||
|
let mut evictable = Vec::new();
|
||||||
|
let mut skipped = Vec::new();
|
||||||
|
let mut blocking = Vec::new();
|
||||||
|
|
||||||
|
for pod in pods {
|
||||||
|
let name = pod.metadata.name.as_deref().unwrap_or("<unknown>");
|
||||||
|
let ns = pod.metadata.namespace.as_deref().unwrap_or("<unknown>");
|
||||||
|
let qualified = format!("{ns}/{name}");
|
||||||
|
|
||||||
|
if Self::is_mirror_pod(pod) {
|
||||||
|
skipped.push(format!("{qualified} (mirror pod)"));
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if Self::is_completed_pod(pod) {
|
||||||
|
skipped.push(format!("{qualified} (completed)"));
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if Self::is_daemonset_pod(pod) {
|
||||||
|
if options.ignore_daemonsets {
|
||||||
|
skipped.push(format!("{qualified} (DaemonSet-managed)"));
|
||||||
|
} else {
|
||||||
|
blocking.push(format!(
|
||||||
|
"{qualified} is managed by a DaemonSet (set ignore_daemonsets to skip)"
|
||||||
|
));
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if Self::has_emptydir_volume(pod) && !options.delete_emptydir_data {
|
||||||
|
blocking.push(format!(
|
||||||
|
"{qualified} uses emptyDir volumes (set delete_emptydir_data to allow eviction)"
|
||||||
|
));
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
evictable.push(pod.clone());
|
||||||
|
}
|
||||||
|
|
||||||
|
if !blocking.is_empty() {
|
||||||
|
return Err(format!(
|
||||||
|
"Cannot drain node — the following pods block eviction:\n - {}",
|
||||||
|
blocking.join("\n - ")
|
||||||
|
));
|
||||||
|
}
|
||||||
|
Ok((evictable, skipped))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn evict_pod(&self, pod: &Pod) -> Result<(), Error> {
|
||||||
|
let name = pod.metadata.name.as_deref().unwrap_or_default();
|
||||||
|
let ns = pod.metadata.namespace.as_deref().unwrap_or_default();
|
||||||
|
debug!("Evicting pod {ns}/{name}");
|
||||||
|
Api::<Pod>::namespaced(self.client.clone(), ns)
|
||||||
|
.evict(name, &EvictParams::default())
|
||||||
|
.await
|
||||||
|
.map(|_| ())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Drains a node: cordon → classify → evict & wait.
|
||||||
|
pub async fn drain_node(&self, node_name: &str, options: &DrainOptions) -> Result<(), Error> {
|
||||||
|
debug!("Cordoning '{node_name}'");
|
||||||
|
self.cordon_node(node_name).await?;
|
||||||
|
|
||||||
|
let pods = self.list_pods_on_node(node_name).await?;
|
||||||
|
debug!("Found {} pod(s) on '{node_name}'", pods.len());
|
||||||
|
|
||||||
|
let (evictable, skipped) =
|
||||||
|
Self::classify_pods_for_drain(&pods, options).map_err(|msg| {
|
||||||
|
error!("{msg}");
|
||||||
|
Error::Discovery(DiscoveryError::MissingResource(msg))
|
||||||
|
})?;
|
||||||
|
|
||||||
|
for s in &skipped {
|
||||||
|
info!("Skipping pod: {s}");
|
||||||
|
}
|
||||||
|
if evictable.is_empty() {
|
||||||
|
info!("No pods to evict on '{node_name}'");
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
info!("Evicting {} pod(s) from '{node_name}'", evictable.len());
|
||||||
|
|
||||||
|
let mut start = tokio::time::Instant::now();
|
||||||
|
let poll = Duration::from_secs(5);
|
||||||
|
let mut pending = evictable;
|
||||||
|
|
||||||
|
loop {
|
||||||
|
for pod in &pending {
|
||||||
|
match self.evict_pod(pod).await {
|
||||||
|
Ok(()) => {}
|
||||||
|
Err(Error::Api(ErrorResponse { code: 404, .. })) => {}
|
||||||
|
Err(Error::Api(ErrorResponse { code: 429, .. })) => {
|
||||||
|
warn!(
|
||||||
|
"PDB blocked eviction of {}/{}; will retry",
|
||||||
|
pod.metadata.namespace.as_deref().unwrap_or(""),
|
||||||
|
pod.metadata.name.as_deref().unwrap_or("")
|
||||||
|
);
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
error!(
|
||||||
|
"Failed to evict {}/{}: {e}",
|
||||||
|
pod.metadata.namespace.as_deref().unwrap_or(""),
|
||||||
|
pod.metadata.name.as_deref().unwrap_or("")
|
||||||
|
);
|
||||||
|
return Err(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
sleep(poll).await;
|
||||||
|
|
||||||
|
let mut still_present = Vec::new();
|
||||||
|
for pod in pending {
|
||||||
|
let ns = pod.metadata.namespace.as_deref().unwrap_or_default();
|
||||||
|
let name = pod.metadata.name.as_deref().unwrap_or_default();
|
||||||
|
match self.get_pod(name, Some(ns)).await? {
|
||||||
|
Some(_) => still_present.push(pod),
|
||||||
|
None => debug!("Pod {ns}/{name} evicted"),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
pending = still_present;
|
||||||
|
|
||||||
|
if pending.is_empty() {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if start.elapsed() > options.timeout {
|
||||||
|
match helper::prompt_drain_timeout_action(
|
||||||
|
node_name,
|
||||||
|
pending.len(),
|
||||||
|
options.timeout,
|
||||||
|
)? {
|
||||||
|
helper::DrainTimeoutAction::Accept => break,
|
||||||
|
helper::DrainTimeoutAction::Retry => {
|
||||||
|
start = tokio::time::Instant::now();
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
helper::DrainTimeoutAction::Abort => {
|
||||||
|
return Err(Error::Discovery(DiscoveryError::MissingResource(format!(
|
||||||
|
"Drain aborted. {} pod(s) remaining on '{node_name}'",
|
||||||
|
pending.len()
|
||||||
|
))));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
debug!("Waiting for {} pod(s) on '{node_name}'", pending.len());
|
||||||
|
}
|
||||||
|
|
||||||
|
debug!("'{node_name}' drained successfully");
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Safely reboots a node: drain → reboot → wait for Ready → uncordon.
|
||||||
|
pub async fn reboot_node(
|
||||||
|
&self,
|
||||||
|
node_name: &str,
|
||||||
|
drain_options: &DrainOptions,
|
||||||
|
timeout: Duration,
|
||||||
|
) -> Result<(), Error> {
|
||||||
|
info!("Starting reboot for '{node_name}'");
|
||||||
|
let node_api: Api<Node> = Api::all(self.client.clone());
|
||||||
|
|
||||||
|
let boot_id_before = node_api
|
||||||
|
.get(node_name)
|
||||||
|
.await?
|
||||||
|
.status
|
||||||
|
.as_ref()
|
||||||
|
.and_then(|s| s.node_info.as_ref())
|
||||||
|
.map(|ni| ni.boot_id.clone())
|
||||||
|
.ok_or_else(|| {
|
||||||
|
Error::Discovery(DiscoveryError::MissingResource(format!(
|
||||||
|
"Node '{node_name}' has no boot_id in status"
|
||||||
|
)))
|
||||||
|
})?;
|
||||||
|
|
||||||
|
info!("Draining '{node_name}'");
|
||||||
|
self.drain_node(node_name, drain_options).await?;
|
||||||
|
|
||||||
|
let start = tokio::time::Instant::now();
|
||||||
|
|
||||||
|
info!("Scheduling reboot for '{node_name}'");
|
||||||
|
let reboot_cmd =
|
||||||
|
"echo rebooting ; nohup bash -c 'sleep 5 && nsenter -t 1 -m -- systemctl reboot'";
|
||||||
|
match self
|
||||||
|
.run_privileged_command_on_node(node_name, reboot_cmd)
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
Ok(_) => debug!("Reboot command dispatched"),
|
||||||
|
Err(e) => debug!("Reboot command error (expected if node began shutdown): {e}"),
|
||||||
|
}
|
||||||
|
|
||||||
|
info!("Waiting for '{node_name}' to begin shutdown");
|
||||||
|
self.wait_for_node_not_ready(node_name, timeout.saturating_sub(start.elapsed()))
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
if start.elapsed() > timeout {
|
||||||
|
return Err(Error::Discovery(DiscoveryError::MissingResource(format!(
|
||||||
|
"Timeout during reboot of '{node_name}' (shutdown phase)"
|
||||||
|
))));
|
||||||
|
}
|
||||||
|
|
||||||
|
info!("Waiting for '{node_name}' to come back online");
|
||||||
|
self.wait_for_node_ready_with_timeout(node_name, timeout.saturating_sub(start.elapsed()))
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
if start.elapsed() > timeout {
|
||||||
|
return Err(Error::Discovery(DiscoveryError::MissingResource(format!(
|
||||||
|
"Timeout during reboot of '{node_name}' (ready phase)"
|
||||||
|
))));
|
||||||
|
}
|
||||||
|
|
||||||
|
let boot_id_after = node_api
|
||||||
|
.get(node_name)
|
||||||
|
.await?
|
||||||
|
.status
|
||||||
|
.as_ref()
|
||||||
|
.and_then(|s| s.node_info.as_ref())
|
||||||
|
.map(|ni| ni.boot_id.clone())
|
||||||
|
.ok_or_else(|| {
|
||||||
|
Error::Discovery(DiscoveryError::MissingResource(format!(
|
||||||
|
"Node '{node_name}' has no boot_id after reboot"
|
||||||
|
)))
|
||||||
|
})?;
|
||||||
|
|
||||||
|
if boot_id_before == boot_id_after {
|
||||||
|
return Err(Error::Discovery(DiscoveryError::MissingResource(format!(
|
||||||
|
"Node '{node_name}' did not actually reboot (boot_id unchanged: {boot_id_before})"
|
||||||
|
))));
|
||||||
|
}
|
||||||
|
|
||||||
|
info!("'{node_name}' rebooted ({boot_id_before} → {boot_id_after})");
|
||||||
|
self.uncordon_node(node_name).await?;
|
||||||
|
info!("'{node_name}' reboot complete ({:?})", start.elapsed());
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Write a set of files to a node's filesystem via a privileged ephemeral pod.
|
||||||
|
pub async fn write_files_to_node(
|
||||||
|
&self,
|
||||||
|
node_name: &str,
|
||||||
|
files: &[NodeFile],
|
||||||
|
) -> Result<String, Error> {
|
||||||
|
let ns = self.client.default_namespace();
|
||||||
|
let suffix = SystemTime::now()
|
||||||
|
.duration_since(UNIX_EPOCH)
|
||||||
|
.unwrap()
|
||||||
|
.as_millis();
|
||||||
|
let name = format!("harmony-k8s-writer-{suffix}");
|
||||||
|
|
||||||
|
debug!("Writing {} file(s) to '{node_name}'", files.len());
|
||||||
|
|
||||||
|
let mut data = BTreeMap::new();
|
||||||
|
let mut script = String::from("set -e\n");
|
||||||
|
for (i, file) in files.iter().enumerate() {
|
||||||
|
let key = format!("f{i}");
|
||||||
|
data.insert(key.clone(), file.content.clone());
|
||||||
|
script.push_str(&format!("mkdir -p \"$(dirname \"/host{}\")\"\n", file.path));
|
||||||
|
script.push_str(&format!("cp \"/payload/{key}\" \"/host{}\"\n", file.path));
|
||||||
|
script.push_str(&format!("chmod {:o} \"/host{}\"\n", file.mode, file.path));
|
||||||
|
}
|
||||||
|
|
||||||
|
let cm = ConfigMap {
|
||||||
|
metadata: ObjectMeta {
|
||||||
|
name: Some(name.clone()),
|
||||||
|
namespace: Some(ns.to_string()),
|
||||||
|
..Default::default()
|
||||||
|
},
|
||||||
|
data: Some(data),
|
||||||
|
..Default::default()
|
||||||
|
};
|
||||||
|
|
||||||
|
let cm_api: Api<ConfigMap> = Api::namespaced(self.client.clone(), ns);
|
||||||
|
cm_api.create(&PostParams::default(), &cm).await?;
|
||||||
|
debug!("Created ConfigMap '{name}'");
|
||||||
|
|
||||||
|
let (host_vol, host_mount) = helper::host_root_volume();
|
||||||
|
let payload_vol = Volume {
|
||||||
|
name: "payload".to_string(),
|
||||||
|
config_map: Some(ConfigMapVolumeSource {
|
||||||
|
name: name.clone(),
|
||||||
|
..Default::default()
|
||||||
|
}),
|
||||||
|
..Default::default()
|
||||||
|
};
|
||||||
|
let payload_mount = VolumeMount {
|
||||||
|
name: "payload".to_string(),
|
||||||
|
mount_path: "/payload".to_string(),
|
||||||
|
..Default::default()
|
||||||
|
};
|
||||||
|
|
||||||
|
let bundle = helper::build_privileged_bundle(
|
||||||
|
PrivilegedPodConfig {
|
||||||
|
name: name.clone(),
|
||||||
|
namespace: ns.to_string(),
|
||||||
|
node_name: node_name.to_string(),
|
||||||
|
container_name: "writer".to_string(),
|
||||||
|
command: vec!["/bin/bash".to_string(), "-c".to_string(), script],
|
||||||
|
volumes: vec![payload_vol, host_vol],
|
||||||
|
volume_mounts: vec![payload_mount, host_mount],
|
||||||
|
host_pid: false,
|
||||||
|
host_network: false,
|
||||||
|
},
|
||||||
|
&self.get_k8s_distribution().await?,
|
||||||
|
);
|
||||||
|
|
||||||
|
bundle.apply(self).await?;
|
||||||
|
debug!("Created privileged pod bundle '{name}'");
|
||||||
|
|
||||||
|
let result = self.wait_for_pod_completion(&name, ns).await;
|
||||||
|
|
||||||
|
debug!("Cleaning up '{name}'");
|
||||||
|
let _ = bundle.delete(self).await;
|
||||||
|
let _ = cm_api.delete(&name, &DeleteParams::default()).await;
|
||||||
|
|
||||||
|
result
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Run a privileged command on a node via an ephemeral pod.
|
||||||
|
pub async fn run_privileged_command_on_node(
|
||||||
|
&self,
|
||||||
|
node_name: &str,
|
||||||
|
command: &str,
|
||||||
|
) -> Result<String, Error> {
|
||||||
|
let namespace = self.client.default_namespace();
|
||||||
|
let suffix = SystemTime::now()
|
||||||
|
.duration_since(UNIX_EPOCH)
|
||||||
|
.unwrap()
|
||||||
|
.as_millis();
|
||||||
|
let name = format!("harmony-k8s-cmd-{suffix}");
|
||||||
|
|
||||||
|
debug!("Running privileged command on '{node_name}': {command}");
|
||||||
|
|
||||||
|
let (host_vol, host_mount) = helper::host_root_volume();
|
||||||
|
let bundle = helper::build_privileged_bundle(
|
||||||
|
PrivilegedPodConfig {
|
||||||
|
name: name.clone(),
|
||||||
|
namespace: namespace.to_string(),
|
||||||
|
node_name: node_name.to_string(),
|
||||||
|
container_name: "runner".to_string(),
|
||||||
|
command: vec![
|
||||||
|
"/bin/bash".to_string(),
|
||||||
|
"-c".to_string(),
|
||||||
|
command.to_string(),
|
||||||
|
],
|
||||||
|
volumes: vec![host_vol],
|
||||||
|
volume_mounts: vec![host_mount],
|
||||||
|
host_pid: true,
|
||||||
|
host_network: true,
|
||||||
|
},
|
||||||
|
&self.get_k8s_distribution().await?,
|
||||||
|
);
|
||||||
|
|
||||||
|
bundle.apply(self).await?;
|
||||||
|
debug!("Privileged pod '{name}' created");
|
||||||
|
|
||||||
|
let result = self.wait_for_pod_completion(&name, namespace).await;
|
||||||
|
|
||||||
|
debug!("Cleaning up '{name}'");
|
||||||
|
let _ = bundle.delete(self).await;
|
||||||
|
|
||||||
|
result
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Tests ────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use k8s_openapi::api::core::v1::{EmptyDirVolumeSource, PodSpec, PodStatus, Volume};
    use k8s_openapi::apimachinery::pkg::apis::meta::v1::{ObjectMeta, OwnerReference};

    use super::*;

    // ── Fixtures ─────────────────────────────────────────────────────────

    /// A pod status that only carries the given phase.
    fn status_in_phase(phase: &str) -> PodStatus {
        PodStatus {
            phase: Some(phase.to_string()),
            ..Default::default()
        }
    }

    /// A plain running pod with an empty spec.
    fn base_pod(name: &str, ns: &str) -> Pod {
        let metadata = ObjectMeta {
            name: Some(name.to_string()),
            namespace: Some(ns.to_string()),
            ..Default::default()
        };
        Pod {
            metadata,
            spec: Some(PodSpec::default()),
            status: Some(status_in_phase("Running")),
        }
    }

    /// A running pod carrying the `kubernetes.io/config.mirror` annotation.
    fn mirror_pod(name: &str, ns: &str) -> Pod {
        let mut annotations = std::collections::BTreeMap::new();
        annotations.insert(
            "kubernetes.io/config.mirror".to_string(),
            "abc123".to_string(),
        );
        let mut pod = base_pod(name, ns);
        pod.metadata.annotations = Some(annotations);
        pod
    }

    /// A running pod owned by a DaemonSet.
    fn daemonset_pod(name: &str, ns: &str) -> Pod {
        let owner = OwnerReference {
            api_version: "apps/v1".to_string(),
            kind: "DaemonSet".to_string(),
            name: "some-ds".to_string(),
            uid: "uid-ds".to_string(),
            ..Default::default()
        };
        let mut pod = base_pod(name, ns);
        pod.metadata.owner_references = Some(vec![owner]);
        pod
    }

    /// A running pod that mounts a single emptyDir volume.
    fn emptydir_pod(name: &str, ns: &str) -> Pod {
        let scratch = Volume {
            name: "scratch".to_string(),
            empty_dir: Some(EmptyDirVolumeSource::default()),
            ..Default::default()
        };
        let mut pod = base_pod(name, ns);
        pod.spec = Some(PodSpec {
            volumes: Some(vec![scratch]),
            ..Default::default()
        });
        pod
    }

    /// A pod whose status reports the given phase.
    fn completed_pod(name: &str, ns: &str, phase: &str) -> Pod {
        let mut pod = base_pod(name, ns);
        pod.status = Some(status_in_phase(phase));
        pod
    }

    fn default_opts() -> DrainOptions {
        DrainOptions::default()
    }

    // ── Classification ───────────────────────────────────────────────────

    #[test]
    fn empty_pod_list_returns_empty_vecs() {
        let (e, s) = K8sClient::classify_pods_for_drain(&[], &default_opts()).unwrap();
        assert!(e.is_empty());
        assert!(s.is_empty());
    }

    #[test]
    fn normal_pod_is_evictable() {
        let pods = vec![base_pod("web", "default")];
        let (e, s) = K8sClient::classify_pods_for_drain(&pods, &default_opts()).unwrap();
        assert_eq!(e.len(), 1);
        assert!(s.is_empty());
    }

    #[test]
    fn mirror_pod_is_skipped() {
        let pods = vec![mirror_pod("kube-apiserver", "kube-system")];
        let (e, s) = K8sClient::classify_pods_for_drain(&pods, &default_opts()).unwrap();
        assert!(e.is_empty());
        assert!(s[0].contains("mirror pod"));
    }

    #[test]
    fn completed_pods_are_skipped() {
        for phase in ["Succeeded", "Failed"] {
            let pods = vec![completed_pod("job", "batch", phase)];
            let (e, s) = K8sClient::classify_pods_for_drain(&pods, &default_opts()).unwrap();
            assert!(e.is_empty());
            assert!(s[0].contains("completed"));
        }
    }

    #[test]
    fn daemonset_skipped_when_ignored() {
        let opts = DrainOptions {
            ignore_daemonsets: true,
            ..default_opts()
        };
        let pods = vec![daemonset_pod("fluentd", "logging")];
        let (e, s) = K8sClient::classify_pods_for_drain(&pods, &opts).unwrap();
        assert!(e.is_empty());
        assert!(s[0].contains("DaemonSet-managed"));
    }

    #[test]
    fn daemonset_blocks_when_not_ignored() {
        let opts = DrainOptions {
            ignore_daemonsets: false,
            ..default_opts()
        };
        let pods = vec![daemonset_pod("fluentd", "logging")];
        let err = K8sClient::classify_pods_for_drain(&pods, &opts).unwrap_err();
        assert!(err.contains("DaemonSet") && err.contains("logging/fluentd"));
    }

    #[test]
    fn emptydir_blocks_without_flag() {
        let opts = DrainOptions {
            delete_emptydir_data: false,
            ..default_opts()
        };
        let pods = vec![emptydir_pod("cache", "default")];
        let err = K8sClient::classify_pods_for_drain(&pods, &opts).unwrap_err();
        assert!(err.contains("emptyDir") && err.contains("default/cache"));
    }

    #[test]
    fn emptydir_evictable_with_flag() {
        let opts = DrainOptions {
            delete_emptydir_data: true,
            ..default_opts()
        };
        let pods = vec![emptydir_pod("cache", "default")];
        let (e, s) = K8sClient::classify_pods_for_drain(&pods, &opts).unwrap();
        assert_eq!(e.len(), 1);
        assert!(s.is_empty());
    }

    #[test]
    fn multiple_blocking_all_reported() {
        let opts = DrainOptions {
            ignore_daemonsets: false,
            delete_emptydir_data: false,
            ..default_opts()
        };
        let pods = vec![daemonset_pod("ds", "ns1"), emptydir_pod("ed", "ns2")];
        let err = K8sClient::classify_pods_for_drain(&pods, &opts).unwrap_err();
        assert!(err.contains("ns1/ds") && err.contains("ns2/ed"));
    }

    #[test]
    fn mixed_pods_classified_correctly() {
        let pods = vec![
            base_pod("web", "default"),
            mirror_pod("kube-apiserver", "kube-system"),
            daemonset_pod("fluentd", "logging"),
            completed_pod("job", "batch", "Succeeded"),
            base_pod("api", "default"),
        ];
        let (e, s) = K8sClient::classify_pods_for_drain(&pods, &default_opts()).unwrap();
        let names: Vec<&str> = e
            .iter()
            .map(|p| p.metadata.name.as_deref().unwrap())
            .collect();
        assert_eq!(names, vec!["web", "api"]);
        assert_eq!(s.len(), 3);
    }

    // ── Precedence between skip reasons ──────────────────────────────────

    #[test]
    fn mirror_checked_before_completed() {
        let mut pod = mirror_pod("static-etcd", "kube-system");
        pod.status = Some(status_in_phase("Succeeded"));
        let (_, s) = K8sClient::classify_pods_for_drain(&[pod], &default_opts()).unwrap();
        assert!(s[0].contains("mirror pod"), "got: {}", s[0]);
    }

    #[test]
    fn completed_checked_before_daemonset() {
        let mut pod = daemonset_pod("collector", "monitoring");
        pod.status = Some(status_in_phase("Failed"));
        let (_, s) = K8sClient::classify_pods_for_drain(&[pod], &default_opts()).unwrap();
        assert!(s[0].contains("completed"), "got: {}", s[0]);
    }
}
|
||||||
193
harmony-k8s/src/pod.rs
Normal file
193
harmony-k8s/src/pod.rs
Normal file
@@ -0,0 +1,193 @@
|
|||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
|
use k8s_openapi::api::core::v1::Pod;
|
||||||
|
use kube::{
|
||||||
|
Error,
|
||||||
|
api::{Api, AttachParams, ListParams},
|
||||||
|
error::DiscoveryError,
|
||||||
|
runtime::reflector::Lookup,
|
||||||
|
};
|
||||||
|
use log::debug;
|
||||||
|
use tokio::io::AsyncReadExt;
|
||||||
|
use tokio::time::sleep;
|
||||||
|
|
||||||
|
use crate::client::K8sClient;
|
||||||
|
|
||||||
|
impl K8sClient {
|
||||||
|
pub async fn get_pod(&self, name: &str, namespace: Option<&str>) -> Result<Option<Pod>, Error> {
|
||||||
|
let api: Api<Pod> = match namespace {
|
||||||
|
Some(ns) => Api::namespaced(self.client.clone(), ns),
|
||||||
|
None => Api::default_namespaced(self.client.clone()),
|
||||||
|
};
|
||||||
|
api.get_opt(name).await
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn wait_for_pod_ready(
|
||||||
|
&self,
|
||||||
|
pod_name: &str,
|
||||||
|
namespace: Option<&str>,
|
||||||
|
) -> Result<(), Error> {
|
||||||
|
let mut elapsed = 0u64;
|
||||||
|
let interval = 5u64;
|
||||||
|
let timeout_secs = 120u64;
|
||||||
|
loop {
|
||||||
|
if let Some(p) = self.get_pod(pod_name, namespace).await? {
|
||||||
|
if let Some(phase) = p.status.and_then(|s| s.phase) {
|
||||||
|
if phase.to_lowercase() == "running" {
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if elapsed >= timeout_secs {
|
||||||
|
return Err(Error::Discovery(DiscoveryError::MissingResource(format!(
|
||||||
|
"Pod '{}' in '{}' did not become ready within {timeout_secs}s",
|
||||||
|
pod_name,
|
||||||
|
namespace.unwrap_or("<default>"),
|
||||||
|
))));
|
||||||
|
}
|
||||||
|
sleep(Duration::from_secs(interval)).await;
|
||||||
|
elapsed += interval;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Polls a pod until it reaches `Succeeded` or `Failed`, then returns its
|
||||||
|
/// logs. Used internally by node operations.
|
||||||
|
pub(crate) async fn wait_for_pod_completion(
|
||||||
|
&self,
|
||||||
|
name: &str,
|
||||||
|
namespace: &str,
|
||||||
|
) -> Result<String, Error> {
|
||||||
|
let api: Api<Pod> = Api::namespaced(self.client.clone(), namespace);
|
||||||
|
let poll_interval = Duration::from_secs(2);
|
||||||
|
for _ in 0..60 {
|
||||||
|
sleep(poll_interval).await;
|
||||||
|
let p = api.get(name).await?;
|
||||||
|
match p.status.and_then(|s| s.phase).as_deref() {
|
||||||
|
Some("Succeeded") => {
|
||||||
|
let logs = api
|
||||||
|
.logs(name, &Default::default())
|
||||||
|
.await
|
||||||
|
.unwrap_or_default();
|
||||||
|
debug!("Pod {namespace}/{name} succeeded. Logs: {logs}");
|
||||||
|
return Ok(logs);
|
||||||
|
}
|
||||||
|
Some("Failed") => {
|
||||||
|
let logs = api
|
||||||
|
.logs(name, &Default::default())
|
||||||
|
.await
|
||||||
|
.unwrap_or_default();
|
||||||
|
debug!("Pod {namespace}/{name} failed. Logs: {logs}");
|
||||||
|
return Err(Error::Discovery(DiscoveryError::MissingResource(format!(
|
||||||
|
"Pod '{name}' failed.\n{logs}"
|
||||||
|
))));
|
||||||
|
}
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(Error::Discovery(DiscoveryError::MissingResource(format!(
|
||||||
|
"Timed out waiting for pod '{name}'"
|
||||||
|
))))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Execute a command in the first pod matching `{label}={name}`.
|
||||||
|
pub async fn exec_app_capture_output(
|
||||||
|
&self,
|
||||||
|
name: String,
|
||||||
|
label: String,
|
||||||
|
namespace: Option<&str>,
|
||||||
|
command: Vec<&str>,
|
||||||
|
) -> Result<String, String> {
|
||||||
|
let api: Api<Pod> = match namespace {
|
||||||
|
Some(ns) => Api::namespaced(self.client.clone(), ns),
|
||||||
|
None => Api::default_namespaced(self.client.clone()),
|
||||||
|
};
|
||||||
|
let pod_list = api
|
||||||
|
.list(&ListParams::default().labels(&format!("{label}={name}")))
|
||||||
|
.await
|
||||||
|
.expect("Failed to list pods");
|
||||||
|
|
||||||
|
let pod_name = pod_list
|
||||||
|
.items
|
||||||
|
.first()
|
||||||
|
.expect("No matching pod")
|
||||||
|
.name()
|
||||||
|
.expect("Pod has no name")
|
||||||
|
.into_owned();
|
||||||
|
|
||||||
|
match api
|
||||||
|
.exec(
|
||||||
|
&pod_name,
|
||||||
|
command,
|
||||||
|
&AttachParams::default().stdout(true).stderr(true),
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
Err(e) => Err(e.to_string()),
|
||||||
|
Ok(mut process) => {
|
||||||
|
let status = process
|
||||||
|
.take_status()
|
||||||
|
.expect("No status handle")
|
||||||
|
.await
|
||||||
|
.expect("Status channel closed");
|
||||||
|
|
||||||
|
if let Some(s) = status.status {
|
||||||
|
let mut buf = String::new();
|
||||||
|
if let Some(mut stdout) = process.stdout() {
|
||||||
|
stdout
|
||||||
|
.read_to_string(&mut buf)
|
||||||
|
.await
|
||||||
|
.map_err(|e| format!("Failed to read stdout: {e}"))?;
|
||||||
|
}
|
||||||
|
debug!("exec status: {} - {:?}", s, status.details);
|
||||||
|
if s == "Success" { Ok(buf) } else { Err(s) }
|
||||||
|
} else {
|
||||||
|
Err("No inner status from pod exec".to_string())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Execute a command in the first pod matching
|
||||||
|
/// `app.kubernetes.io/name={name}`.
|
||||||
|
pub async fn exec_app(
|
||||||
|
&self,
|
||||||
|
name: String,
|
||||||
|
namespace: Option<&str>,
|
||||||
|
command: Vec<&str>,
|
||||||
|
) -> Result<(), String> {
|
||||||
|
let api: Api<Pod> = match namespace {
|
||||||
|
Some(ns) => Api::namespaced(self.client.clone(), ns),
|
||||||
|
None => Api::default_namespaced(self.client.clone()),
|
||||||
|
};
|
||||||
|
let pod_list = api
|
||||||
|
.list(&ListParams::default().labels(&format!("app.kubernetes.io/name={name}")))
|
||||||
|
.await
|
||||||
|
.expect("Failed to list pods");
|
||||||
|
|
||||||
|
let pod_name = pod_list
|
||||||
|
.items
|
||||||
|
.first()
|
||||||
|
.expect("No matching pod")
|
||||||
|
.name()
|
||||||
|
.expect("Pod has no name")
|
||||||
|
.into_owned();
|
||||||
|
|
||||||
|
match api.exec(&pod_name, command, &AttachParams::default()).await {
|
||||||
|
Err(e) => Err(e.to_string()),
|
||||||
|
Ok(mut process) => {
|
||||||
|
let status = process
|
||||||
|
.take_status()
|
||||||
|
.expect("No status handle")
|
||||||
|
.await
|
||||||
|
.expect("Status channel closed");
|
||||||
|
|
||||||
|
if let Some(s) = status.status {
|
||||||
|
debug!("exec status: {} - {:?}", s, status.details);
|
||||||
|
if s == "Success" { Ok(()) } else { Err(s) }
|
||||||
|
} else {
|
||||||
|
Err("No inner status from pod exec".to_string())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
316
harmony-k8s/src/resources.rs
Normal file
316
harmony-k8s/src/resources.rs
Normal file
@@ -0,0 +1,316 @@
|
|||||||
|
use std::collections::HashMap;
|
||||||
|
|
||||||
|
use k8s_openapi::api::{
|
||||||
|
apps::v1::Deployment,
|
||||||
|
core::v1::{Node, ServiceAccount},
|
||||||
|
};
|
||||||
|
use k8s_openapi::apiextensions_apiserver::pkg::apis::apiextensions::v1::CustomResourceDefinition;
|
||||||
|
use kube::api::ApiResource;
|
||||||
|
use kube::{
|
||||||
|
Error, Resource,
|
||||||
|
api::{Api, DynamicObject, GroupVersionKind, ListParams, ObjectList},
|
||||||
|
runtime::conditions,
|
||||||
|
runtime::wait::await_condition,
|
||||||
|
};
|
||||||
|
use log::debug;
|
||||||
|
use serde::de::DeserializeOwned;
|
||||||
|
use serde_json::Value;
|
||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
|
use crate::client::K8sClient;
|
||||||
|
use crate::types::ScopeResolver;
|
||||||
|
|
||||||
|
impl K8sClient {
|
||||||
|
pub async fn has_healthy_deployment_with_label(
|
||||||
|
&self,
|
||||||
|
namespace: &str,
|
||||||
|
label_selector: &str,
|
||||||
|
) -> Result<bool, Error> {
|
||||||
|
let api: Api<Deployment> = Api::namespaced(self.client.clone(), namespace);
|
||||||
|
let list = api
|
||||||
|
.list(&ListParams::default().labels(label_selector))
|
||||||
|
.await?;
|
||||||
|
for d in list.items {
|
||||||
|
let available = d
|
||||||
|
.status
|
||||||
|
.as_ref()
|
||||||
|
.and_then(|s| s.available_replicas)
|
||||||
|
.unwrap_or(0);
|
||||||
|
if available > 0 {
|
||||||
|
return Ok(true);
|
||||||
|
}
|
||||||
|
if let Some(conds) = d.status.as_ref().and_then(|s| s.conditions.as_ref()) {
|
||||||
|
if conds
|
||||||
|
.iter()
|
||||||
|
.any(|c| c.type_ == "Available" && c.status == "True")
|
||||||
|
{
|
||||||
|
return Ok(true);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(false)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn list_namespaces_with_healthy_deployments(
|
||||||
|
&self,
|
||||||
|
label_selector: &str,
|
||||||
|
) -> Result<Vec<String>, Error> {
|
||||||
|
let api: Api<Deployment> = Api::all(self.client.clone());
|
||||||
|
let list = api
|
||||||
|
.list(&ListParams::default().labels(label_selector))
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
let mut healthy_ns: HashMap<String, bool> = HashMap::new();
|
||||||
|
for d in list.items {
|
||||||
|
let ns = match d.metadata.namespace.clone() {
|
||||||
|
Some(n) => n,
|
||||||
|
None => continue,
|
||||||
|
};
|
||||||
|
let available = d
|
||||||
|
.status
|
||||||
|
.as_ref()
|
||||||
|
.and_then(|s| s.available_replicas)
|
||||||
|
.unwrap_or(0);
|
||||||
|
let is_healthy = if available > 0 {
|
||||||
|
true
|
||||||
|
} else {
|
||||||
|
d.status
|
||||||
|
.as_ref()
|
||||||
|
.and_then(|s| s.conditions.as_ref())
|
||||||
|
.map(|c| {
|
||||||
|
c.iter()
|
||||||
|
.any(|c| c.type_ == "Available" && c.status == "True")
|
||||||
|
})
|
||||||
|
.unwrap_or(false)
|
||||||
|
};
|
||||||
|
if is_healthy {
|
||||||
|
healthy_ns.insert(ns, true);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(healthy_ns.into_keys().collect())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn get_controller_service_account_name(
|
||||||
|
&self,
|
||||||
|
ns: &str,
|
||||||
|
) -> Result<Option<String>, Error> {
|
||||||
|
let api: Api<Deployment> = Api::namespaced(self.client.clone(), ns);
|
||||||
|
let list = api
|
||||||
|
.list(&ListParams::default().labels("app.kubernetes.io/component=controller"))
|
||||||
|
.await?;
|
||||||
|
if let Some(dep) = list.items.first() {
|
||||||
|
if let Some(sa) = dep
|
||||||
|
.spec
|
||||||
|
.as_ref()
|
||||||
|
.and_then(|s| s.template.spec.as_ref())
|
||||||
|
.and_then(|s| s.service_account_name.clone())
|
||||||
|
{
|
||||||
|
return Ok(Some(sa));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(None)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn list_clusterrolebindings_json(&self) -> Result<Vec<Value>, Error> {
|
||||||
|
let gvk = GroupVersionKind::gvk("rbac.authorization.k8s.io", "v1", "ClusterRoleBinding");
|
||||||
|
let ar = ApiResource::from_gvk(&gvk);
|
||||||
|
let api: Api<DynamicObject> = Api::all_with(self.client.clone(), &ar);
|
||||||
|
let list = api.list(&ListParams::default()).await?;
|
||||||
|
Ok(list
|
||||||
|
.items
|
||||||
|
.into_iter()
|
||||||
|
.map(|o| serde_json::to_value(&o).unwrap_or(Value::Null))
|
||||||
|
.collect())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn is_service_account_cluster_wide(&self, sa: &str, ns: &str) -> Result<bool, Error> {
|
||||||
|
let sa_user = format!("system:serviceaccount:{ns}:{sa}");
|
||||||
|
for crb in self.list_clusterrolebindings_json().await? {
|
||||||
|
if let Some(subjects) = crb.get("subjects").and_then(|s| s.as_array()) {
|
||||||
|
for subj in subjects {
|
||||||
|
let kind = subj.get("kind").and_then(|v| v.as_str()).unwrap_or("");
|
||||||
|
let name = subj.get("name").and_then(|v| v.as_str()).unwrap_or("");
|
||||||
|
let subj_ns = subj.get("namespace").and_then(|v| v.as_str()).unwrap_or("");
|
||||||
|
if (kind == "ServiceAccount" && name == sa && subj_ns == ns)
|
||||||
|
|| (kind == "User" && name == sa_user)
|
||||||
|
{
|
||||||
|
return Ok(true);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(false)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn has_crd(&self, name: &str) -> Result<bool, Error> {
|
||||||
|
let api: Api<CustomResourceDefinition> = Api::all(self.client.clone());
|
||||||
|
let crds = api
|
||||||
|
.list(&ListParams::default().fields(&format!("metadata.name={name}")))
|
||||||
|
.await?;
|
||||||
|
Ok(!crds.items.is_empty())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn service_account_api(&self, namespace: &str) -> Api<ServiceAccount> {
|
||||||
|
Api::namespaced(self.client.clone(), namespace)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn get_resource_json_value(
|
||||||
|
&self,
|
||||||
|
name: &str,
|
||||||
|
namespace: Option<&str>,
|
||||||
|
gvk: &GroupVersionKind,
|
||||||
|
) -> Result<DynamicObject, Error> {
|
||||||
|
let ar = ApiResource::from_gvk(gvk);
|
||||||
|
let api: Api<DynamicObject> = match namespace {
|
||||||
|
Some(ns) => Api::namespaced_with(self.client.clone(), ns, &ar),
|
||||||
|
None => Api::default_namespaced_with(self.client.clone(), &ar),
|
||||||
|
};
|
||||||
|
api.get(name).await
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn get_secret_json_value(
|
||||||
|
&self,
|
||||||
|
name: &str,
|
||||||
|
namespace: Option<&str>,
|
||||||
|
) -> Result<DynamicObject, Error> {
|
||||||
|
self.get_resource_json_value(
|
||||||
|
name,
|
||||||
|
namespace,
|
||||||
|
&GroupVersionKind {
|
||||||
|
group: String::new(),
|
||||||
|
version: "v1".to_string(),
|
||||||
|
kind: "Secret".to_string(),
|
||||||
|
},
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn get_deployment(
|
||||||
|
&self,
|
||||||
|
name: &str,
|
||||||
|
namespace: Option<&str>,
|
||||||
|
) -> Result<Option<Deployment>, Error> {
|
||||||
|
let api: Api<Deployment> = match namespace {
|
||||||
|
Some(ns) => {
|
||||||
|
debug!("Getting namespaced deployment '{name}' in '{ns}'");
|
||||||
|
Api::namespaced(self.client.clone(), ns)
|
||||||
|
}
|
||||||
|
None => {
|
||||||
|
debug!("Getting deployment '{name}' in default namespace");
|
||||||
|
Api::default_namespaced(self.client.clone())
|
||||||
|
}
|
||||||
|
};
|
||||||
|
api.get_opt(name).await
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn scale_deployment(
|
||||||
|
&self,
|
||||||
|
name: &str,
|
||||||
|
namespace: Option<&str>,
|
||||||
|
replicas: u32,
|
||||||
|
) -> Result<(), Error> {
|
||||||
|
let api: Api<Deployment> = match namespace {
|
||||||
|
Some(ns) => Api::namespaced(self.client.clone(), ns),
|
||||||
|
None => Api::default_namespaced(self.client.clone()),
|
||||||
|
};
|
||||||
|
use kube::api::{Patch, PatchParams};
|
||||||
|
use serde_json::json;
|
||||||
|
let patch = json!({ "spec": { "replicas": replicas } });
|
||||||
|
api.patch_scale(name, &PatchParams::default(), &Patch::Merge(&patch))
|
||||||
|
.await?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn delete_deployment(
|
||||||
|
&self,
|
||||||
|
name: &str,
|
||||||
|
namespace: Option<&str>,
|
||||||
|
) -> Result<(), Error> {
|
||||||
|
let api: Api<Deployment> = match namespace {
|
||||||
|
Some(ns) => Api::namespaced(self.client.clone(), ns),
|
||||||
|
None => Api::default_namespaced(self.client.clone()),
|
||||||
|
};
|
||||||
|
api.delete(name, &kube::api::DeleteParams::default())
|
||||||
|
.await?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn wait_until_deployment_ready(
|
||||||
|
&self,
|
||||||
|
name: &str,
|
||||||
|
namespace: Option<&str>,
|
||||||
|
timeout: Option<Duration>,
|
||||||
|
) -> Result<(), String> {
|
||||||
|
let api: Api<Deployment> = match namespace {
|
||||||
|
Some(ns) => Api::namespaced(self.client.clone(), ns),
|
||||||
|
None => Api::default_namespaced(self.client.clone()),
|
||||||
|
};
|
||||||
|
let timeout = timeout.unwrap_or(Duration::from_secs(120));
|
||||||
|
let establish = await_condition(api, name, conditions::is_deployment_completed());
|
||||||
|
tokio::time::timeout(timeout, establish)
|
||||||
|
.await
|
||||||
|
.map(|_| ())
|
||||||
|
.map_err(|_| "Timed out waiting for deployment".to_string())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Gets a single named resource, using the correct API scope for `K`.
|
||||||
|
pub async fn get_resource<K>(
|
||||||
|
&self,
|
||||||
|
name: &str,
|
||||||
|
namespace: Option<&str>,
|
||||||
|
) -> Result<Option<K>, Error>
|
||||||
|
where
|
||||||
|
K: Resource + Clone + std::fmt::Debug + DeserializeOwned,
|
||||||
|
<K as Resource>::Scope: ScopeResolver<K>,
|
||||||
|
<K as Resource>::DynamicType: Default,
|
||||||
|
{
|
||||||
|
let api: Api<K> =
|
||||||
|
<<K as Resource>::Scope as ScopeResolver<K>>::get_api(&self.client, namespace);
|
||||||
|
api.get_opt(name).await
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn list_resources<K>(
|
||||||
|
&self,
|
||||||
|
namespace: Option<&str>,
|
||||||
|
list_params: Option<ListParams>,
|
||||||
|
) -> Result<ObjectList<K>, Error>
|
||||||
|
where
|
||||||
|
K: Resource + Clone + std::fmt::Debug + DeserializeOwned,
|
||||||
|
<K as Resource>::Scope: ScopeResolver<K>,
|
||||||
|
<K as Resource>::DynamicType: Default,
|
||||||
|
{
|
||||||
|
let api: Api<K> =
|
||||||
|
<<K as Resource>::Scope as ScopeResolver<K>>::get_api(&self.client, namespace);
|
||||||
|
api.list(&list_params.unwrap_or_default()).await
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn list_all_resources_with_labels<K>(&self, labels: &str) -> Result<Vec<K>, Error>
|
||||||
|
where
|
||||||
|
K: Resource + Clone + std::fmt::Debug + DeserializeOwned,
|
||||||
|
<K as Resource>::DynamicType: Default,
|
||||||
|
{
|
||||||
|
Api::<K>::all(self.client.clone())
|
||||||
|
.list(&ListParams::default().labels(labels))
|
||||||
|
.await
|
||||||
|
.map(|l| l.items)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn get_all_resource_in_all_namespace<K>(&self) -> Result<Vec<K>, Error>
|
||||||
|
where
|
||||||
|
K: Resource + Clone + std::fmt::Debug + DeserializeOwned,
|
||||||
|
<K as Resource>::Scope: ScopeResolver<K>,
|
||||||
|
<K as Resource>::DynamicType: Default,
|
||||||
|
{
|
||||||
|
Api::<K>::all(self.client.clone())
|
||||||
|
.list(&Default::default())
|
||||||
|
.await
|
||||||
|
.map(|l| l.items)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn get_nodes(
|
||||||
|
&self,
|
||||||
|
list_params: Option<ListParams>,
|
||||||
|
) -> Result<ObjectList<Node>, Error> {
|
||||||
|
self.list_resources(None, list_params).await
|
||||||
|
}
|
||||||
|
}
|
||||||
100
harmony-k8s/src/types.rs
Normal file
100
harmony-k8s/src/types.rs
Normal file
@@ -0,0 +1,100 @@
|
|||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
|
use k8s_openapi::{ClusterResourceScope, NamespaceResourceScope};
|
||||||
|
use kube::{Api, Client, Resource};
|
||||||
|
use serde::Serialize;
|
||||||
|
|
||||||
|
/// Which Kubernetes distribution is running. Detected once at runtime via
|
||||||
|
/// [`crate::discovery::K8sClient::get_k8s_distribution`].
|
||||||
|
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
|
||||||
|
pub enum KubernetesDistribution {
|
||||||
|
Default,
|
||||||
|
OpenshiftFamily,
|
||||||
|
K3sFamily,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Description of a single file that should be materialised on a node's
/// filesystem.
#[derive(Debug, Clone)]
pub struct NodeFile {
    /// Absolute host path at which the file should be written.
    pub path: String,
    /// The file's content.
    pub content: String,
    /// UNIX permission bits for the file (e.g. `0o600`).
    pub mode: u32,
}
|
||||||
|
|
||||||
|
/// Options controlling the behaviour of a [`crate::K8sClient::drain_node`] operation.
#[derive(Debug, Clone)]
pub struct DrainOptions {
    /// Evict pods that use `emptyDir` volumes (ephemeral data is lost).
    /// Equivalent to `kubectl drain --delete-emptydir-data`.
    pub delete_emptydir_data: bool,
    /// Silently skip DaemonSet-managed pods instead of blocking the drain.
    /// Equivalent to `kubectl drain --ignore-daemonsets`.
    pub ignore_daemonsets: bool,
    /// Maximum wall-clock time to wait for all evictions to complete.
    pub timeout: Duration,
}

impl Default for DrainOptions {
    /// Conservative defaults: keep `emptyDir` data, skip DaemonSet pods, and
    /// wait up to two minutes for evictions.
    fn default() -> Self {
        Self {
            delete_emptydir_data: false,
            ignore_daemonsets: true,
            // A one-second budget (the previous default) expires before any
            // pod with a normal termination grace period can be evicted, so
            // every default-configured drain would time out.
            timeout: Self::DEFAULT_TIMEOUT,
        }
    }
}

impl DrainOptions {
    /// Time budget used when the caller does not pick one explicitly.
    const DEFAULT_TIMEOUT: Duration = Duration::from_secs(120);

    /// Same as [`DrainOptions::default`], but additionally evicts pods that
    /// use `emptyDir` volumes.
    pub fn default_ignore_daemonset_delete_emptydir_data() -> Self {
        Self {
            delete_emptydir_data: true,
            ignore_daemonsets: true,
            ..Self::default()
        }
    }
}
|
||||||
|
|
||||||
|
/// Controls how [`crate::K8sClient::apply_with_strategy`] behaves when the
/// resource already exists (or does not).
///
/// The enum is a plain, field-less selector, so it derives the usual value
/// traits to make it loggable, copyable and comparable.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum WriteMode {
    /// Server-side apply; create if absent, update if present (default).
    CreateOrUpdate,
    /// POST only; return an error if the resource already exists.
    Create,
    /// Server-side apply only; return an error if the resource does not exist.
    Update,
}
|
||||||
|
|
||||||
|
// ── Scope resolution trait ───────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// Resolves the correct [`kube::Api`] for a resource type based on its scope
|
||||||
|
/// (cluster-wide vs. namespace-scoped).
|
||||||
|
pub trait ScopeResolver<K: Resource> {
|
||||||
|
fn get_api(client: &Client, ns: Option<&str>) -> Api<K>;
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<K> ScopeResolver<K> for ClusterResourceScope
|
||||||
|
where
|
||||||
|
K: Resource<Scope = ClusterResourceScope>,
|
||||||
|
<K as Resource>::DynamicType: Default,
|
||||||
|
{
|
||||||
|
fn get_api(client: &Client, _ns: Option<&str>) -> Api<K> {
|
||||||
|
Api::all(client.clone())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<K> ScopeResolver<K> for NamespaceResourceScope
|
||||||
|
where
|
||||||
|
K: Resource<Scope = NamespaceResourceScope>,
|
||||||
|
<K as Resource>::DynamicType: Default,
|
||||||
|
{
|
||||||
|
fn get_api(client: &Client, ns: Option<&str>) -> Api<K> {
|
||||||
|
match ns {
|
||||||
|
Some(ns) => Api::namespaced(client.clone(), ns),
|
||||||
|
None => Api::default_namespaced(client.clone()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -21,6 +21,8 @@ semver = "1.0.23"
|
|||||||
serde.workspace = true
|
serde.workspace = true
|
||||||
serde_json.workspace = true
|
serde_json.workspace = true
|
||||||
tokio.workspace = true
|
tokio.workspace = true
|
||||||
|
tokio-retry.workspace = true
|
||||||
|
tokio-util.workspace = true
|
||||||
derive-new.workspace = true
|
derive-new.workspace = true
|
||||||
log.workspace = true
|
log.workspace = true
|
||||||
env_logger.workspace = true
|
env_logger.workspace = true
|
||||||
@@ -31,6 +33,7 @@ opnsense-config-xml = { path = "../opnsense-config-xml" }
|
|||||||
harmony_macros = { path = "../harmony_macros" }
|
harmony_macros = { path = "../harmony_macros" }
|
||||||
harmony_types = { path = "../harmony_types" }
|
harmony_types = { path = "../harmony_types" }
|
||||||
harmony_execution = { path = "../harmony_execution" }
|
harmony_execution = { path = "../harmony_execution" }
|
||||||
|
harmony-k8s = { path = "../harmony-k8s" }
|
||||||
uuid.workspace = true
|
uuid.workspace = true
|
||||||
url.workspace = true
|
url.workspace = true
|
||||||
kube = { workspace = true, features = ["derive"] }
|
kube = { workspace = true, features = ["derive"] }
|
||||||
@@ -60,7 +63,6 @@ temp-dir = "0.1.14"
|
|||||||
dyn-clone = "1.0.19"
|
dyn-clone = "1.0.19"
|
||||||
similar.workspace = true
|
similar.workspace = true
|
||||||
futures-util = "0.3.31"
|
futures-util = "0.3.31"
|
||||||
tokio-util = "0.7.15"
|
|
||||||
strum = { version = "0.27.1", features = ["derive"] }
|
strum = { version = "0.27.1", features = ["derive"] }
|
||||||
tempfile.workspace = true
|
tempfile.workspace = true
|
||||||
serde_with = "3.14.0"
|
serde_with = "3.14.0"
|
||||||
@@ -80,7 +82,7 @@ sqlx.workspace = true
|
|||||||
inquire.workspace = true
|
inquire.workspace = true
|
||||||
brocade = { path = "../brocade" }
|
brocade = { path = "../brocade" }
|
||||||
option-ext = "0.2.0"
|
option-ext = "0.2.0"
|
||||||
tokio-retry = "0.3.0"
|
rand.workspace = true
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
pretty_assertions.workspace = true
|
pretty_assertions.workspace = true
|
||||||
|
|||||||
@@ -4,8 +4,6 @@ use std::error::Error;
|
|||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
use derive_new::new;
|
use derive_new::new;
|
||||||
|
|
||||||
use crate::inventory::HostRole;
|
|
||||||
|
|
||||||
use super::{
|
use super::{
|
||||||
data::Version, executors::ExecutorError, inventory::Inventory, topology::PreparationError,
|
data::Version, executors::ExecutorError, inventory::Inventory, topology::PreparationError,
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
|
use harmony_k8s::K8sClient;
|
||||||
use harmony_macros::ip;
|
use harmony_macros::ip;
|
||||||
use harmony_types::{
|
use harmony_types::{
|
||||||
id::Id,
|
id::Id,
|
||||||
@@ -8,7 +9,7 @@ use harmony_types::{
|
|||||||
use log::debug;
|
use log::debug;
|
||||||
use log::info;
|
use log::info;
|
||||||
|
|
||||||
use crate::topology::PxeOptions;
|
use crate::topology::{HelmCommand, PxeOptions};
|
||||||
use crate::{data::FileContent, executors::ExecutorError, topology::node_exporter::NodeExporter};
|
use crate::{data::FileContent, executors::ExecutorError, topology::node_exporter::NodeExporter};
|
||||||
use crate::{infra::network_manager::OpenShiftNmStateNetworkManager, topology::PortConfig};
|
use crate::{infra::network_manager::OpenShiftNmStateNetworkManager, topology::PortConfig};
|
||||||
|
|
||||||
@@ -16,9 +17,12 @@ use super::{
|
|||||||
DHCPStaticEntry, DhcpServer, DnsRecord, DnsRecordType, DnsServer, Firewall, HostNetworkConfig,
|
DHCPStaticEntry, DhcpServer, DnsRecord, DnsRecordType, DnsServer, Firewall, HostNetworkConfig,
|
||||||
HttpServer, IpAddress, K8sclient, LoadBalancer, LoadBalancerService, LogicalHost, NetworkError,
|
HttpServer, IpAddress, K8sclient, LoadBalancer, LoadBalancerService, LogicalHost, NetworkError,
|
||||||
NetworkManager, PreparationError, PreparationOutcome, Router, Switch, SwitchClient,
|
NetworkManager, PreparationError, PreparationOutcome, Router, Switch, SwitchClient,
|
||||||
SwitchError, TftpServer, Topology, k8s::K8sClient,
|
SwitchError, TftpServer, Topology,
|
||||||
|
};
|
||||||
|
use std::{
|
||||||
|
process::Command,
|
||||||
|
sync::{Arc, OnceLock},
|
||||||
};
|
};
|
||||||
use std::sync::{Arc, OnceLock};
|
|
||||||
|
|
||||||
#[derive(Debug, Clone)]
|
#[derive(Debug, Clone)]
|
||||||
pub struct HAClusterTopology {
|
pub struct HAClusterTopology {
|
||||||
@@ -52,6 +56,30 @@ impl Topology for HAClusterTopology {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl HelmCommand for HAClusterTopology {
|
||||||
|
fn get_helm_command(&self) -> Command {
|
||||||
|
let mut cmd = Command::new("helm");
|
||||||
|
if let Some(k) = &self.kubeconfig {
|
||||||
|
cmd.args(["--kubeconfig", k]);
|
||||||
|
}
|
||||||
|
|
||||||
|
// FIXME we should support context anywhere there is a k8sclient
|
||||||
|
// This likely belongs in the k8sclient itself and should be extracted to a separate
|
||||||
|
// crate
|
||||||
|
//
|
||||||
|
// I feel like helm could very well be a feature of this external k8s client.
|
||||||
|
//
|
||||||
|
// Same for kustomize
|
||||||
|
//
|
||||||
|
// if let Some(c) = &self.k8s_context {
|
||||||
|
// cmd.args(["--kube-context", c]);
|
||||||
|
// }
|
||||||
|
|
||||||
|
info!("Using helm command {cmd:?}");
|
||||||
|
cmd
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[async_trait]
|
#[async_trait]
|
||||||
impl K8sclient for HAClusterTopology {
|
impl K8sclient for HAClusterTopology {
|
||||||
async fn k8s_client(&self) -> Result<Arc<K8sClient>, String> {
|
async fn k8s_client(&self) -> Result<Arc<K8sClient>, String> {
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -2,6 +2,7 @@ use std::{collections::BTreeMap, process::Command, sync::Arc, time::Duration};
|
|||||||
|
|
||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
use base64::{Engine, engine::general_purpose};
|
use base64::{Engine, engine::general_purpose};
|
||||||
|
use harmony_k8s::{K8sClient, KubernetesDistribution};
|
||||||
use harmony_types::rfc1123::Rfc1123Name;
|
use harmony_types::rfc1123::Rfc1123Name;
|
||||||
use k8s_openapi::api::{
|
use k8s_openapi::api::{
|
||||||
core::v1::{Pod, Secret},
|
core::v1::{Pod, Secret},
|
||||||
@@ -58,7 +59,6 @@ use crate::{
|
|||||||
use super::super::{
|
use super::super::{
|
||||||
DeploymentTarget, HelmCommand, K8sclient, MultiTargetTopology, PreparationError,
|
DeploymentTarget, HelmCommand, K8sclient, MultiTargetTopology, PreparationError,
|
||||||
PreparationOutcome, Topology,
|
PreparationOutcome, Topology,
|
||||||
k8s::K8sClient,
|
|
||||||
oberservability::monitoring::AlertReceiver,
|
oberservability::monitoring::AlertReceiver,
|
||||||
tenant::{
|
tenant::{
|
||||||
TenantConfig, TenantManager,
|
TenantConfig, TenantManager,
|
||||||
@@ -76,13 +76,6 @@ struct K8sState {
|
|||||||
message: String,
|
message: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize)]
|
|
||||||
pub enum KubernetesDistribution {
|
|
||||||
OpenshiftFamily,
|
|
||||||
K3sFamily,
|
|
||||||
Default,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Clone)]
|
#[derive(Debug, Clone)]
|
||||||
enum K8sSource {
|
enum K8sSource {
|
||||||
LocalK3d,
|
LocalK3d,
|
||||||
|
|||||||
@@ -1,7 +1,6 @@
|
|||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
interpret::Outcome,
|
|
||||||
inventory::Inventory,
|
inventory::Inventory,
|
||||||
modules::postgresql::{
|
modules::postgresql::{
|
||||||
K8sPostgreSQLScore,
|
K8sPostgreSQLScore,
|
||||||
|
|||||||
@@ -106,6 +106,7 @@ pub enum SSL {
|
|||||||
|
|
||||||
#[derive(Debug, Clone, PartialEq, Serialize)]
|
#[derive(Debug, Clone, PartialEq, Serialize)]
|
||||||
pub enum HealthCheck {
|
pub enum HealthCheck {
|
||||||
HTTP(String, HttpMethod, HttpStatusCode, SSL),
|
/// HTTP(None, "/healthz/ready", HttpMethod::GET, HttpStatusCode::Success2xx, SSL::Disabled)
|
||||||
|
HTTP(Option<u16>, String, HttpMethod, HttpStatusCode, SSL),
|
||||||
TCP(Option<u16>),
|
TCP(Option<u16>),
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -16,7 +16,6 @@ pub mod tenant;
|
|||||||
use derive_new::new;
|
use derive_new::new;
|
||||||
pub use k8s_anywhere::*;
|
pub use k8s_anywhere::*;
|
||||||
pub use localhost::*;
|
pub use localhost::*;
|
||||||
pub mod k8s;
|
|
||||||
mod load_balancer;
|
mod load_balancer;
|
||||||
pub mod router;
|
pub mod router;
|
||||||
mod tftp;
|
mod tftp;
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ use std::{
|
|||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
use brocade::PortOperatingMode;
|
use brocade::PortOperatingMode;
|
||||||
use derive_new::new;
|
use derive_new::new;
|
||||||
|
use harmony_k8s::K8sClient;
|
||||||
use harmony_types::{
|
use harmony_types::{
|
||||||
id::Id,
|
id::Id,
|
||||||
net::{IpAddress, MacAddress},
|
net::{IpAddress, MacAddress},
|
||||||
@@ -18,7 +19,7 @@ use serde::Serialize;
|
|||||||
|
|
||||||
use crate::executors::ExecutorError;
|
use crate::executors::ExecutorError;
|
||||||
|
|
||||||
use super::{LogicalHost, k8s::K8sClient};
|
use super::LogicalHost;
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub struct DHCPStaticEntry {
|
pub struct DHCPStaticEntry {
|
||||||
|
|||||||
@@ -1,10 +1,8 @@
|
|||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
|
||||||
use crate::{
|
use crate::executors::ExecutorError;
|
||||||
executors::ExecutorError,
|
|
||||||
topology::k8s::{ApplyStrategy, K8sClient},
|
|
||||||
};
|
|
||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
|
use harmony_k8s::K8sClient;
|
||||||
use k8s_openapi::{
|
use k8s_openapi::{
|
||||||
api::{
|
api::{
|
||||||
core::v1::{LimitRange, Namespace, ResourceQuota},
|
core::v1::{LimitRange, Namespace, ResourceQuota},
|
||||||
@@ -14,7 +12,7 @@ use k8s_openapi::{
|
|||||||
},
|
},
|
||||||
apimachinery::pkg::util::intstr::IntOrString,
|
apimachinery::pkg::util::intstr::IntOrString,
|
||||||
};
|
};
|
||||||
use kube::{Resource, api::DynamicObject};
|
use kube::Resource;
|
||||||
use log::debug;
|
use log::debug;
|
||||||
use serde::de::DeserializeOwned;
|
use serde::de::DeserializeOwned;
|
||||||
use serde_json::json;
|
use serde_json::json;
|
||||||
@@ -59,7 +57,6 @@ impl K8sTenantManager {
|
|||||||
) -> Result<K, ExecutorError>
|
) -> Result<K, ExecutorError>
|
||||||
where
|
where
|
||||||
<K as kube::Resource>::DynamicType: Default,
|
<K as kube::Resource>::DynamicType: Default,
|
||||||
<K as kube::Resource>::Scope: ApplyStrategy<K>,
|
|
||||||
{
|
{
|
||||||
self.apply_labels(&mut resource, config);
|
self.apply_labels(&mut resource, config);
|
||||||
self.k8s_client
|
self.k8s_client
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ use std::{
|
|||||||
|
|
||||||
use askama::Template;
|
use askama::Template;
|
||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
|
use harmony_k8s::{DrainOptions, K8sClient, NodeFile};
|
||||||
use harmony_types::id::Id;
|
use harmony_types::id::Id;
|
||||||
use k8s_openapi::api::core::v1::Node;
|
use k8s_openapi::api::core::v1::Node;
|
||||||
use kube::{
|
use kube::{
|
||||||
@@ -15,10 +16,7 @@ use log::{debug, info, warn};
|
|||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
modules::okd::crd::nmstate,
|
modules::okd::crd::nmstate,
|
||||||
topology::{
|
topology::{HostNetworkConfig, NetworkError, NetworkManager},
|
||||||
HostNetworkConfig, NetworkError, NetworkManager,
|
|
||||||
k8s::{DrainOptions, K8sClient, NodeFile},
|
|
||||||
},
|
|
||||||
};
|
};
|
||||||
|
|
||||||
/// NetworkManager bond configuration template
|
/// NetworkManager bond configuration template
|
||||||
|
|||||||
@@ -216,7 +216,15 @@ pub(crate) fn get_health_check_for_backend(
|
|||||||
SSL::Other(other.to_string())
|
SSL::Other(other.to_string())
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
Some(HealthCheck::HTTP(path, method, status_code, ssl))
|
|
||||||
|
let port = haproxy_health_check
|
||||||
|
.checkport
|
||||||
|
.content_string()
|
||||||
|
.parse::<u16>()
|
||||||
|
.ok();
|
||||||
|
debug!("Found haproxy healthcheck port {port:?}");
|
||||||
|
|
||||||
|
Some(HealthCheck::HTTP(port, path, method, status_code, ssl))
|
||||||
}
|
}
|
||||||
_ => panic!("Received unsupported health check type {}", uppercase),
|
_ => panic!("Received unsupported health check type {}", uppercase),
|
||||||
}
|
}
|
||||||
@@ -251,7 +259,7 @@ pub(crate) fn harmony_load_balancer_service_to_haproxy_xml(
|
|||||||
// frontend points to backend
|
// frontend points to backend
|
||||||
let healthcheck = if let Some(health_check) = &service.health_check {
|
let healthcheck = if let Some(health_check) = &service.health_check {
|
||||||
match health_check {
|
match health_check {
|
||||||
HealthCheck::HTTP(path, http_method, _http_status_code, ssl) => {
|
HealthCheck::HTTP(port, path, http_method, _http_status_code, ssl) => {
|
||||||
let ssl: MaybeString = match ssl {
|
let ssl: MaybeString = match ssl {
|
||||||
SSL::SSL => "ssl".into(),
|
SSL::SSL => "ssl".into(),
|
||||||
SSL::SNI => "sslni".into(),
|
SSL::SNI => "sslni".into(),
|
||||||
@@ -267,6 +275,7 @@ pub(crate) fn harmony_load_balancer_service_to_haproxy_xml(
|
|||||||
http_uri: path.clone().into(),
|
http_uri: path.clone().into(),
|
||||||
interval: "2s".to_string(),
|
interval: "2s".to_string(),
|
||||||
ssl,
|
ssl,
|
||||||
|
checkport: MaybeString::from(port.map(|p| p.to_string())),
|
||||||
..Default::default()
|
..Default::default()
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
use log::{debug, info, trace};
|
use log::{debug, info};
|
||||||
use serde::Serialize;
|
use serde::Serialize;
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
|
use harmony_k8s::K8sClient;
|
||||||
use harmony_macros::hurl;
|
use harmony_macros::hurl;
|
||||||
use log::{debug, info, trace, warn};
|
use log::{debug, info, trace, warn};
|
||||||
use non_blank_string_rs::NonBlankString;
|
use non_blank_string_rs::NonBlankString;
|
||||||
@@ -14,7 +15,7 @@ use crate::{
|
|||||||
helm::chart::{HelmChartScore, HelmRepository},
|
helm::chart::{HelmChartScore, HelmRepository},
|
||||||
},
|
},
|
||||||
score::Score,
|
score::Score,
|
||||||
topology::{HelmCommand, K8sclient, Topology, ingress::Ingress, k8s::K8sClient},
|
topology::{HelmCommand, K8sclient, Topology, ingress::Ingress},
|
||||||
};
|
};
|
||||||
use harmony_types::id::Id;
|
use harmony_types::id::Id;
|
||||||
|
|
||||||
|
|||||||
@@ -1,8 +1,9 @@
|
|||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
|
||||||
|
use harmony_k8s::K8sClient;
|
||||||
use log::{debug, info};
|
use log::{debug, info};
|
||||||
|
|
||||||
use crate::{interpret::InterpretError, topology::k8s::K8sClient};
|
use crate::interpret::InterpretError;
|
||||||
|
|
||||||
#[derive(Clone, Debug, PartialEq, Eq)]
|
#[derive(Clone, Debug, PartialEq, Eq)]
|
||||||
pub enum ArgoScope {
|
pub enum ArgoScope {
|
||||||
|
|||||||
@@ -44,6 +44,12 @@ pub struct BrocadeSwitchAuth {
|
|||||||
pub password: String,
|
pub password: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl BrocadeSwitchAuth {
|
||||||
|
pub fn user_pass(username: String, password: String) -> Self {
|
||||||
|
Self { username, password }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Secret, Clone, Debug, JsonSchema, Serialize, Deserialize)]
|
#[derive(Secret, Clone, Debug, JsonSchema, Serialize, Deserialize)]
|
||||||
pub struct BrocadeSnmpAuth {
|
pub struct BrocadeSnmpAuth {
|
||||||
pub username: String,
|
pub username: String,
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
use harmony_k8s::K8sClient;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
|
||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
@@ -11,7 +12,7 @@ use crate::{
|
|||||||
interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome},
|
interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome},
|
||||||
inventory::Inventory,
|
inventory::Inventory,
|
||||||
score::Score,
|
score::Score,
|
||||||
topology::{K8sclient, Topology, k8s::K8sClient},
|
topology::{K8sclient, Topology},
|
||||||
};
|
};
|
||||||
|
|
||||||
#[derive(Clone, Debug, Serialize)]
|
#[derive(Clone, Debug, Serialize)]
|
||||||
|
|||||||
@@ -45,7 +45,7 @@ impl<T: Topology> Interpret<T> for DiscoverHostForRoleInterpret {
|
|||||||
topology: &T,
|
topology: &T,
|
||||||
) -> Result<Outcome, InterpretError> {
|
) -> Result<Outcome, InterpretError> {
|
||||||
info!(
|
info!(
|
||||||
"Launching discovery agent, make sure that your nodes are successfully PXE booted and running inventory agent. They should answer on `http://<node_ip>:25000/inventory`"
|
"Launching discovery agent, make sure that your nodes are successfully PXE booted and running inventory agent. They should answer on `http://<node_ip>:8080/inventory`"
|
||||||
);
|
);
|
||||||
LaunchDiscoverInventoryAgentScore {
|
LaunchDiscoverInventoryAgentScore {
|
||||||
discovery_timeout: None,
|
discovery_timeout: None,
|
||||||
@@ -58,8 +58,6 @@ impl<T: Topology> Interpret<T> for DiscoverHostForRoleInterpret {
|
|||||||
let host_repo = InventoryRepositoryFactory::build().await?;
|
let host_repo = InventoryRepositoryFactory::build().await?;
|
||||||
|
|
||||||
let mut assigned_hosts = 0;
|
let mut assigned_hosts = 0;
|
||||||
// let hosts_for_role = host_repo.get_hosts_for_role(&self.score.role);
|
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
let all_hosts = host_repo.get_all_hosts().await?;
|
let all_hosts = host_repo.get_all_hosts().await?;
|
||||||
|
|
||||||
|
|||||||
@@ -54,6 +54,12 @@ pub enum HarmonyDiscoveryStrategy {
|
|||||||
SUBNET { cidr: cidr::Ipv4Cidr, port: u16 },
|
SUBNET { cidr: cidr::Ipv4Cidr, port: u16 },
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl Default for HarmonyDiscoveryStrategy {
|
||||||
|
fn default() -> Self {
|
||||||
|
HarmonyDiscoveryStrategy::MDNS
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[async_trait]
|
#[async_trait]
|
||||||
impl<T: Topology> Interpret<T> for DiscoverInventoryAgentInterpret {
|
impl<T: Topology> Interpret<T> for DiscoverInventoryAgentInterpret {
|
||||||
async fn execute(
|
async fn execute(
|
||||||
|
|||||||
@@ -3,7 +3,8 @@ use std::sync::Arc;
|
|||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
use log::warn;
|
use log::warn;
|
||||||
|
|
||||||
use crate::topology::{FailoverTopology, K8sclient, k8s::K8sClient};
|
use crate::topology::{FailoverTopology, K8sclient};
|
||||||
|
use harmony_k8s::K8sClient;
|
||||||
|
|
||||||
#[async_trait]
|
#[async_trait]
|
||||||
impl<T: K8sclient> K8sclient for FailoverTopology<T> {
|
impl<T: K8sclient> K8sclient for FailoverTopology<T> {
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
use k8s_openapi::NamespaceResourceScope;
|
use k8s_openapi::ResourceScope;
|
||||||
use kube::Resource;
|
use kube::Resource;
|
||||||
use log::info;
|
use log::info;
|
||||||
use serde::{Serialize, de::DeserializeOwned};
|
use serde::{Serialize, de::DeserializeOwned};
|
||||||
@@ -29,7 +29,7 @@ impl<K: Resource + std::fmt::Debug> K8sResourceScore<K> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl<
|
impl<
|
||||||
K: Resource<Scope = NamespaceResourceScope>
|
K: Resource<Scope: ResourceScope>
|
||||||
+ std::fmt::Debug
|
+ std::fmt::Debug
|
||||||
+ Sync
|
+ Sync
|
||||||
+ DeserializeOwned
|
+ DeserializeOwned
|
||||||
@@ -61,7 +61,7 @@ pub struct K8sResourceInterpret<K: Resource + std::fmt::Debug + Sync + Send> {
|
|||||||
|
|
||||||
#[async_trait]
|
#[async_trait]
|
||||||
impl<
|
impl<
|
||||||
K: Resource<Scope = NamespaceResourceScope>
|
K: Resource<Scope: ResourceScope>
|
||||||
+ Clone
|
+ Clone
|
||||||
+ std::fmt::Debug
|
+ std::fmt::Debug
|
||||||
+ DeserializeOwned
|
+ DeserializeOwned
|
||||||
@@ -109,7 +109,7 @@ where
|
|||||||
topology
|
topology
|
||||||
.k8s_client()
|
.k8s_client()
|
||||||
.await
|
.await
|
||||||
.expect("Environment should provide enough information to instanciate a client")
|
.map_err(|e| InterpretError::new(format!("Failed to get k8s client : {e}")))?
|
||||||
.apply_many(&self.score.resource, self.score.namespace.as_deref())
|
.apply_many(&self.score.resource, self.score.namespace.as_deref())
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
|
|||||||
@@ -15,10 +15,13 @@ pub mod load_balancer;
|
|||||||
pub mod monitoring;
|
pub mod monitoring;
|
||||||
pub mod nats;
|
pub mod nats;
|
||||||
pub mod network;
|
pub mod network;
|
||||||
|
pub mod node_health;
|
||||||
pub mod okd;
|
pub mod okd;
|
||||||
|
pub mod openbao;
|
||||||
pub mod opnsense;
|
pub mod opnsense;
|
||||||
pub mod postgresql;
|
pub mod postgresql;
|
||||||
pub mod prometheus;
|
pub mod prometheus;
|
||||||
pub mod storage;
|
pub mod storage;
|
||||||
pub mod tenant;
|
pub mod tenant;
|
||||||
pub mod tftp;
|
pub mod tftp;
|
||||||
|
pub mod zitadel;
|
||||||
|
|||||||
@@ -0,0 +1,6 @@
|
|||||||
|
apiVersion: v1
|
||||||
|
kind: Namespace
|
||||||
|
metadata:
|
||||||
|
name: observability
|
||||||
|
labels:
|
||||||
|
openshift.io/cluster-monitoring: "true"
|
||||||
@@ -0,0 +1,43 @@
|
|||||||
|
apiVersion: v1
|
||||||
|
kind: ServiceAccount
|
||||||
|
metadata:
|
||||||
|
name: cluster-grafana-sa
|
||||||
|
namespace: observability
|
||||||
|
---
|
||||||
|
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
kind: ClusterRole
|
||||||
|
metadata:
|
||||||
|
name: grafana-prometheus-api-access
|
||||||
|
rules:
|
||||||
|
- apiGroups:
|
||||||
|
- monitoring.coreos.com
|
||||||
|
resources:
|
||||||
|
- prometheuses/api
|
||||||
|
verbs:
|
||||||
|
- get
|
||||||
|
---
|
||||||
|
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
kind: ClusterRoleBinding
|
||||||
|
metadata:
|
||||||
|
name: grafana-prometheus-api-access-binding
|
||||||
|
roleRef:
|
||||||
|
apiGroup: rbac.authorization.k8s.io
|
||||||
|
kind: ClusterRole
|
||||||
|
name: grafana-prometheus-api-access
|
||||||
|
subjects:
|
||||||
|
- kind: ServiceAccount
|
||||||
|
name: cluster-grafana-sa
|
||||||
|
namespace: observability
|
||||||
|
---
|
||||||
|
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
kind: ClusterRoleBinding
|
||||||
|
metadata:
|
||||||
|
name: grafana-cluster-monitoring-view
|
||||||
|
roleRef:
|
||||||
|
apiGroup: rbac.authorization.k8s.io
|
||||||
|
kind: ClusterRole
|
||||||
|
name: cluster-monitoring-view
|
||||||
|
subjects:
|
||||||
|
- kind: ServiceAccount
|
||||||
|
name: cluster-grafana-sa
|
||||||
|
namespace: observability
|
||||||
@@ -0,0 +1,43 @@
|
|||||||
|
apiVersion: grafana.integreatly.org/v1beta1
|
||||||
|
kind: Grafana
|
||||||
|
metadata:
|
||||||
|
name: cluster-grafana
|
||||||
|
namespace: observability
|
||||||
|
labels:
|
||||||
|
dashboards: "grafana"
|
||||||
|
spec:
|
||||||
|
serviceAccountName: cluster-grafana-sa
|
||||||
|
automountServiceAccountToken: true
|
||||||
|
|
||||||
|
config:
|
||||||
|
log:
|
||||||
|
mode: console
|
||||||
|
|
||||||
|
security:
|
||||||
|
admin_user: admin
|
||||||
|
admin_password: paul
|
||||||
|
|
||||||
|
users:
|
||||||
|
viewers_can_edit: "false"
|
||||||
|
|
||||||
|
auth:
|
||||||
|
disable_login_form: "false"
|
||||||
|
|
||||||
|
auth.anonymous:
|
||||||
|
enabled: "true"
|
||||||
|
org_role: Viewer
|
||||||
|
|
||||||
|
deployment:
|
||||||
|
spec:
|
||||||
|
replicas: 1
|
||||||
|
template:
|
||||||
|
spec:
|
||||||
|
containers:
|
||||||
|
- name: grafana
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
cpu: 500m
|
||||||
|
memory: 1Gi
|
||||||
|
limits:
|
||||||
|
cpu: 1
|
||||||
|
memory: 2Gi
|
||||||
@@ -0,0 +1,8 @@
|
|||||||
|
apiVersion: v1
|
||||||
|
kind: Secret
|
||||||
|
metadata:
|
||||||
|
name: grafana-prometheus-token
|
||||||
|
namespace: observability
|
||||||
|
annotations:
|
||||||
|
kubernetes.io/service-account.name: cluster-grafana-sa
|
||||||
|
type: kubernetes.io/service-account-token
|
||||||
@@ -0,0 +1,27 @@
|
|||||||
|
apiVersion: grafana.integreatly.org/v1beta1
|
||||||
|
kind: GrafanaDatasource
|
||||||
|
metadata:
|
||||||
|
name: prometheus-cluster
|
||||||
|
namespace: observability
|
||||||
|
spec:
|
||||||
|
instanceSelector:
|
||||||
|
matchLabels:
|
||||||
|
dashboards: "grafana"
|
||||||
|
valuesFrom:
|
||||||
|
- targetPath: "secureJsonData.httpHeaderValue1"
|
||||||
|
valueFrom:
|
||||||
|
secretKeyRef:
|
||||||
|
name: grafana-prometheus-token
|
||||||
|
key: token
|
||||||
|
datasource:
|
||||||
|
name: Prometheus-Cluster
|
||||||
|
type: prometheus
|
||||||
|
access: proxy
|
||||||
|
url: https://prometheus-k8s.openshift-monitoring.svc:9091
|
||||||
|
isDefault: true
|
||||||
|
jsonData:
|
||||||
|
httpHeaderName1: "Authorization"
|
||||||
|
tlsSkipVerify: true
|
||||||
|
timeInterval: "30s"
|
||||||
|
secureJsonData:
|
||||||
|
httpHeaderValue1: "Bearer ${token}"
|
||||||
@@ -0,0 +1,14 @@
|
|||||||
|
apiVersion: route.openshift.io/v1
|
||||||
|
kind: Route
|
||||||
|
metadata:
|
||||||
|
name: grafana
|
||||||
|
namespace: observability
|
||||||
|
spec:
|
||||||
|
to:
|
||||||
|
kind: Service
|
||||||
|
name: cluster-grafana-service
|
||||||
|
port:
|
||||||
|
targetPort: 3000
|
||||||
|
tls:
|
||||||
|
termination: edge
|
||||||
|
insecureEdgeTerminationPolicy: Redirect
|
||||||
@@ -0,0 +1,97 @@
|
|||||||
|
apiVersion: grafana.integreatly.org/v1beta1
|
||||||
|
kind: GrafanaDashboard
|
||||||
|
metadata:
|
||||||
|
name: cluster-overview
|
||||||
|
namespace: observability
|
||||||
|
spec:
|
||||||
|
instanceSelector:
|
||||||
|
matchLabels:
|
||||||
|
dashboards: "grafana"
|
||||||
|
|
||||||
|
json: |
|
||||||
|
{
|
||||||
|
"title": "Cluster Overview",
|
||||||
|
"schemaVersion": 36,
|
||||||
|
"version": 1,
|
||||||
|
"refresh": "30s",
|
||||||
|
"time": {
|
||||||
|
"from": "now-1h",
|
||||||
|
"to": "now"
|
||||||
|
},
|
||||||
|
"panels": [
|
||||||
|
{
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Ready Nodes",
|
||||||
|
"datasource": {
|
||||||
|
"type": "prometheus",
|
||||||
|
"uid": "Prometheus-Cluster"
|
||||||
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"true\"})",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"gridPos": { "h": 6, "w": 6, "x": 0, "y": 0 }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Running Pods",
|
||||||
|
"datasource": {
|
||||||
|
"type": "prometheus",
|
||||||
|
"uid": "Prometheus-Cluster"
|
||||||
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "count(kube_pod_status_phase{phase=\"Running\"})",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"gridPos": { "h": 6, "w": 6, "x": 6, "y": 0 }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Cluster CPU Usage (%)",
|
||||||
|
"datasource": {
|
||||||
|
"type": "prometheus",
|
||||||
|
"uid": "Prometheus-Cluster"
|
||||||
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "100 * (1 - avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent",
|
||||||
|
"min": 0,
|
||||||
|
"max": 100
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Cluster Memory Usage (%)",
|
||||||
|
"datasource": {
|
||||||
|
"type": "prometheus",
|
||||||
|
"uid": "Prometheus-Cluster"
|
||||||
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "100 * (1 - (sum(node_memory_MemAvailable_bytes) / sum(node_memory_MemTotal_bytes)))",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent",
|
||||||
|
"min": 0,
|
||||||
|
"max": 100
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 }
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,769 @@
|
|||||||
|
apiVersion: grafana.integreatly.org/v1beta1
|
||||||
|
kind: GrafanaDashboard
|
||||||
|
metadata:
|
||||||
|
name: okd-cluster-overview
|
||||||
|
namespace: observability
|
||||||
|
spec:
|
||||||
|
instanceSelector:
|
||||||
|
matchLabels:
|
||||||
|
dashboards: "grafana"
|
||||||
|
json: |
|
||||||
|
{
|
||||||
|
"title": "Cluster Overview",
|
||||||
|
"uid": "okd-cluster-overview",
|
||||||
|
"schemaVersion": 36,
|
||||||
|
"version": 2,
|
||||||
|
"refresh": "30s",
|
||||||
|
"time": { "from": "now-1h", "to": "now" },
|
||||||
|
"tags": ["okd", "cluster", "overview"],
|
||||||
|
"panels": [
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 1,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Ready Nodes",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"true\"} == 1)",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": ""
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "red", "value": null },
|
||||||
|
{ "color": "green", "value": 1 }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "short",
|
||||||
|
"noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"colorMode": "background",
|
||||||
|
"graphMode": "none",
|
||||||
|
"justifyMode": "center",
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||||
|
"textMode": "auto"
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 2,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Not Ready Nodes",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"false\"} == 1) or vector(0)",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": ""
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "red", "value": 1 }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "short",
|
||||||
|
"noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"colorMode": "background",
|
||||||
|
"graphMode": "none",
|
||||||
|
"justifyMode": "center",
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||||
|
"textMode": "auto"
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 3,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Running Pods",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "count(kube_pod_status_phase{phase=\"Running\"} == 1)",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": ""
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "red", "value": null },
|
||||||
|
{ "color": "green", "value": 1 }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "short",
|
||||||
|
"noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"colorMode": "background",
|
||||||
|
"graphMode": "none",
|
||||||
|
"justifyMode": "center",
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||||
|
"textMode": "auto"
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 4,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Pending Pods",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "count(kube_pod_status_phase{phase=\"Pending\"} == 1) or vector(0)",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": ""
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 1 },
|
||||||
|
{ "color": "red", "value": 5 }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "short",
|
||||||
|
"noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"colorMode": "background",
|
||||||
|
"graphMode": "none",
|
||||||
|
"justifyMode": "center",
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||||
|
"textMode": "auto"
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 5,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Failed Pods",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "count(kube_pod_status_phase{phase=\"Failed\"} == 1) or vector(0)",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": ""
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "red", "value": 1 }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "short",
|
||||||
|
"noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"colorMode": "background",
|
||||||
|
"graphMode": "none",
|
||||||
|
"justifyMode": "center",
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||||
|
"textMode": "auto"
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 6,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "CrashLoopBackOff",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "count(kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\"} == 1) or vector(0)",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": ""
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "red", "value": 1 }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "short",
|
||||||
|
"noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"colorMode": "background",
|
||||||
|
"graphMode": "none",
|
||||||
|
"justifyMode": "center",
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||||
|
"textMode": "auto"
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 7,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Critical Alerts",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "count(ALERTS{alertstate=\"firing\",severity=\"critical\"}) or vector(0)",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": ""
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "red", "value": 1 }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "short",
|
||||||
|
"noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"colorMode": "background",
|
||||||
|
"graphMode": "none",
|
||||||
|
"justifyMode": "center",
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||||
|
"textMode": "auto"
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 8,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Warning Alerts",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "count(ALERTS{alertstate=\"firing\",severity=\"warning\"}) or vector(0)",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": ""
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 1 },
|
||||||
|
{ "color": "red", "value": 10 }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "short",
|
||||||
|
"noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"colorMode": "background",
|
||||||
|
"graphMode": "none",
|
||||||
|
"justifyMode": "center",
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||||
|
"textMode": "auto"
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 9,
|
||||||
|
"type": "gauge",
|
||||||
|
"title": "CPU Usage",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "100 * (1 - avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "CPU"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent",
|
||||||
|
"min": 0,
|
||||||
|
"max": 100,
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 70 },
|
||||||
|
{ "color": "red", "value": 85 }
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||||
|
"showThresholdLabels": false,
|
||||||
|
"showThresholdMarkers": true,
|
||||||
|
"orientation": "auto"
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 6, "w": 5, "x": 0, "y": 4 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 10,
|
||||||
|
"type": "gauge",
|
||||||
|
"title": "Memory Usage",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "100 * (1 - (sum(node_memory_MemAvailable_bytes) / sum(node_memory_MemTotal_bytes)))",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "Memory"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent",
|
||||||
|
"min": 0,
|
||||||
|
"max": 100,
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 75 },
|
||||||
|
{ "color": "red", "value": 90 }
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||||
|
"showThresholdLabels": false,
|
||||||
|
"showThresholdMarkers": true,
|
||||||
|
"orientation": "auto"
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 6, "w": 5, "x": 5, "y": 4 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 11,
|
||||||
|
"type": "gauge",
|
||||||
|
"title": "Root Disk Usage",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "100 * (1 - (sum(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"})))",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "Disk"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent",
|
||||||
|
"min": 0,
|
||||||
|
"max": 100,
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 70 },
|
||||||
|
{ "color": "red", "value": 85 }
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||||
|
"showThresholdLabels": false,
|
||||||
|
"showThresholdMarkers": true,
|
||||||
|
"orientation": "auto"
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 6, "w": 4, "x": 10, "y": 4 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 12,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "etcd Has Leader",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "min(etcd_server_has_leader)",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": ""
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "red", "value": null },
|
||||||
|
{ "color": "green", "value": 1 }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"mappings": [
|
||||||
|
{
|
||||||
|
"type": "value",
|
||||||
|
"options": {
|
||||||
|
"0": { "text": "NO LEADER", "color": "red" },
|
||||||
|
"1": { "text": "LEADER OK", "color": "green" }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"unit": "short",
|
||||||
|
"noValue": "?"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"colorMode": "background",
|
||||||
|
"graphMode": "none",
|
||||||
|
"justifyMode": "center",
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||||
|
"textMode": "auto"
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 3, "w": 5, "x": 14, "y": 4 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 13,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "API Servers Up",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum(up{job=\"apiserver\"})",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": ""
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "red", "value": null },
|
||||||
|
{ "color": "yellow", "value": 1 },
|
||||||
|
{ "color": "green", "value": 2 }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "short",
|
||||||
|
"noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"colorMode": "background",
|
||||||
|
"graphMode": "none",
|
||||||
|
"justifyMode": "center",
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||||
|
"textMode": "auto"
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 3, "w": 5, "x": 19, "y": 4 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 14,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "etcd Members Up",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum(up{job=\"etcd\"})",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": ""
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "red", "value": null },
|
||||||
|
{ "color": "yellow", "value": 2 },
|
||||||
|
{ "color": "green", "value": 3 }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "short",
|
||||||
|
"noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"colorMode": "background",
|
||||||
|
"graphMode": "none",
|
||||||
|
"justifyMode": "center",
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||||
|
"textMode": "auto"
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 3, "w": 5, "x": 14, "y": 7 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 15,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Operators Degraded",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "count(cluster_operator_conditions{condition=\"Degraded\",status=\"True\"} == 1) or vector(0)",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": ""
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "red", "value": 1 }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "short",
|
||||||
|
"noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"colorMode": "background",
|
||||||
|
"graphMode": "none",
|
||||||
|
"justifyMode": "center",
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||||
|
"textMode": "auto"
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 3, "w": 5, "x": 19, "y": 7 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 16,
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "CPU Usage per Node (%)",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "100 * (1 - avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{instance}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent",
|
||||||
|
"min": 0,
|
||||||
|
"max": 100,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": {
|
||||||
|
"lineWidth": 2,
|
||||||
|
"fillOpacity": 10,
|
||||||
|
"spanNulls": false,
|
||||||
|
"showPoints": "never"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": {
|
||||||
|
"displayMode": "list",
|
||||||
|
"placement": "bottom",
|
||||||
|
"calcs": ["mean", "max"]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 10 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 17,
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Memory Usage per Node (%)",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{instance}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent",
|
||||||
|
"min": 0,
|
||||||
|
"max": 100,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": {
|
||||||
|
"lineWidth": 2,
|
||||||
|
"fillOpacity": 10,
|
||||||
|
"spanNulls": false,
|
||||||
|
"showPoints": "never"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": {
|
||||||
|
"displayMode": "list",
|
||||||
|
"placement": "bottom",
|
||||||
|
"calcs": ["mean", "max"]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 10 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 18,
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Network Traffic — Cluster Total",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo|veth.*|tun.*|ovn.*|br-int|br-ex\"}[5m]))",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "Receive"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo|veth.*|tun.*|ovn.*|br-int|br-ex\"}[5m]))",
|
||||||
|
"refId": "B",
|
||||||
|
"legendFormat": "Transmit"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "Bps",
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": {
|
||||||
|
"lineWidth": 2,
|
||||||
|
"fillOpacity": 10,
|
||||||
|
"spanNulls": false,
|
||||||
|
"showPoints": "never"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"overrides": [
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Receive" },
|
||||||
|
"properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Transmit" },
|
||||||
|
"properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "none" },
|
||||||
|
"legend": {
|
||||||
|
"displayMode": "list",
|
||||||
|
"placement": "bottom",
|
||||||
|
"calcs": ["mean", "max"]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 18 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 19,
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Pod Phases Over Time",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "count(kube_pod_status_phase{phase=\"Running\"} == 1)",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "Running"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "count(kube_pod_status_phase{phase=\"Pending\"} == 1) or vector(0)",
|
||||||
|
"refId": "B",
|
||||||
|
"legendFormat": "Pending"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "count(kube_pod_status_phase{phase=\"Failed\"} == 1) or vector(0)",
|
||||||
|
"refId": "C",
|
||||||
|
"legendFormat": "Failed"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "count(kube_pod_status_phase{phase=\"Unknown\"} == 1) or vector(0)",
|
||||||
|
"refId": "D",
|
||||||
|
"legendFormat": "Unknown"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short",
|
||||||
|
"custom": {
|
||||||
|
"lineWidth": 2,
|
||||||
|
"fillOpacity": 15,
|
||||||
|
"spanNulls": false,
|
||||||
|
"showPoints": "never"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"overrides": [
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Running" },
|
||||||
|
"properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Pending" },
|
||||||
|
"properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Failed" },
|
||||||
|
"properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Unknown" },
|
||||||
|
"properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "none" },
|
||||||
|
"legend": {
|
||||||
|
"displayMode": "list",
|
||||||
|
"placement": "bottom",
|
||||||
|
"calcs": ["lastNotNull"]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 18 }
|
||||||
|
}
|
||||||
|
|
||||||
|
]
|
||||||
|
}
|
||||||
@@ -0,0 +1,637 @@
|
|||||||
|
apiVersion: grafana.integreatly.org/v1beta1
|
||||||
|
kind: GrafanaDashboard
|
||||||
|
metadata:
|
||||||
|
name: okd-node-health
|
||||||
|
namespace: observability
|
||||||
|
spec:
|
||||||
|
instanceSelector:
|
||||||
|
matchLabels:
|
||||||
|
dashboards: "grafana"
|
||||||
|
json: |
|
||||||
|
{
|
||||||
|
"title": "Node Health",
|
||||||
|
"uid": "okd-node-health",
|
||||||
|
"schemaVersion": 36,
|
||||||
|
"version": 2,
|
||||||
|
"refresh": "30s",
|
||||||
|
"time": { "from": "now-1h", "to": "now" },
|
||||||
|
"tags": ["okd", "node", "health"],
|
||||||
|
"templating": {
|
||||||
|
"list": [
|
||||||
|
{
|
||||||
|
"name": "node",
|
||||||
|
"type": "query",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"query": { "query": "label_values(kube_node_info, node)", "refId": "A" },
|
||||||
|
"refresh": 2,
|
||||||
|
"includeAll": true,
|
||||||
|
"multi": true,
|
||||||
|
"allValue": ".*",
|
||||||
|
"label": "Node",
|
||||||
|
"sort": 1,
|
||||||
|
"current": {},
|
||||||
|
"options": []
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"panels": [
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 1,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Total Nodes",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "count(kube_node_info{node=~\"$node\"})", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] },
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 2,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Ready Nodes",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"$node\"} == 1)", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] },
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 3,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Not Ready Nodes",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"false\",node=~\"$node\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 4,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Memory Pressure",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"MemoryPressure\",status=\"true\",node=~\"$node\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 5,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Disk Pressure",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"DiskPressure\",status=\"true\",node=~\"$node\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 6,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "PID Pressure",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"PIDPressure\",status=\"true\",node=~\"$node\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "orange", "value": 1 }] },
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 7,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Unschedulable",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "count(kube_node_spec_unschedulable{node=~\"$node\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 1 }] },
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 8,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Kubelet Up",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "count(up{job=\"kubelet\",metrics_path=\"/metrics\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] },
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 9,
|
||||||
|
"type": "table",
|
||||||
|
"title": "Node Conditions",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum by(node) (kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"$node\"})",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{node}}",
|
||||||
|
"instant": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum by(node) (kube_node_status_condition{condition=\"MemoryPressure\",status=\"true\",node=~\"$node\"})",
|
||||||
|
"refId": "B",
|
||||||
|
"legendFormat": "{{node}}",
|
||||||
|
"instant": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum by(node) (kube_node_status_condition{condition=\"DiskPressure\",status=\"true\",node=~\"$node\"})",
|
||||||
|
"refId": "C",
|
||||||
|
"legendFormat": "{{node}}",
|
||||||
|
"instant": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum by(node) (kube_node_status_condition{condition=\"PIDPressure\",status=\"true\",node=~\"$node\"})",
|
||||||
|
"refId": "D",
|
||||||
|
"legendFormat": "{{node}}",
|
||||||
|
"instant": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum by(node) (kube_node_spec_unschedulable{node=~\"$node\"})",
|
||||||
|
"refId": "E",
|
||||||
|
"legendFormat": "{{node}}",
|
||||||
|
"instant": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"transformations": [
|
||||||
|
{
|
||||||
|
"id": "labelsToFields",
|
||||||
|
"options": { "mode": "columns" }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "joinByField",
|
||||||
|
"options": { "byField": "node", "mode": "outer" }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "organize",
|
||||||
|
"options": {
|
||||||
|
"excludeByName": {
|
||||||
|
"Time": true,
|
||||||
|
"Time 1": true,
|
||||||
|
"Time 2": true,
|
||||||
|
"Time 3": true,
|
||||||
|
"Time 4": true,
|
||||||
|
"Time 5": true
|
||||||
|
},
|
||||||
|
"renameByName": {
|
||||||
|
"node": "Node",
|
||||||
|
"Value #A": "Ready",
|
||||||
|
"Value #B": "Mem Pressure",
|
||||||
|
"Value #C": "Disk Pressure",
|
||||||
|
"Value #D": "PID Pressure",
|
||||||
|
"Value #E": "Unschedulable"
|
||||||
|
},
|
||||||
|
"indexByName": {
|
||||||
|
"node": 0,
|
||||||
|
"Value #A": 1,
|
||||||
|
"Value #B": 2,
|
||||||
|
"Value #C": 3,
|
||||||
|
"Value #D": 4,
|
||||||
|
"Value #E": 5
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"custom": { "displayMode": "color-background", "align": "center" }
|
||||||
|
},
|
||||||
|
"overrides": [
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Node" },
|
||||||
|
"properties": [
|
||||||
|
{ "id": "custom.displayMode", "value": "auto" },
|
||||||
|
{ "id": "custom.align", "value": "left" },
|
||||||
|
{ "id": "custom.width", "value": 200 }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Ready" },
|
||||||
|
"properties": [
|
||||||
|
{
|
||||||
|
"id": "thresholds",
|
||||||
|
"value": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] }
|
||||||
|
},
|
||||||
|
{ "id": "custom.displayMode", "value": "color-background" },
|
||||||
|
{
|
||||||
|
"id": "mappings",
|
||||||
|
"value": [
|
||||||
|
{
|
||||||
|
"type": "value",
|
||||||
|
"options": {
|
||||||
|
"0": { "text": "✗ Not Ready", "color": "red", "index": 0 },
|
||||||
|
"1": { "text": "✓ Ready", "color": "green", "index": 1 }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byRegexp", "options": ".*Pressure" },
|
||||||
|
"properties": [
|
||||||
|
{
|
||||||
|
"id": "thresholds",
|
||||||
|
"value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] }
|
||||||
|
},
|
||||||
|
{ "id": "custom.displayMode", "value": "color-background" },
|
||||||
|
{
|
||||||
|
"id": "mappings",
|
||||||
|
"value": [
|
||||||
|
{
|
||||||
|
"type": "value",
|
||||||
|
"options": {
|
||||||
|
"0": { "text": "✓ OK", "color": "green", "index": 0 },
|
||||||
|
"1": { "text": "⚠ Active", "color": "red", "index": 1 }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Unschedulable" },
|
||||||
|
"properties": [
|
||||||
|
{
|
||||||
|
"id": "thresholds",
|
||||||
|
"value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 1 }] }
|
||||||
|
},
|
||||||
|
{ "id": "custom.displayMode", "value": "color-background" },
|
||||||
|
{
|
||||||
|
"id": "mappings",
|
||||||
|
"value": [
|
||||||
|
{
|
||||||
|
"type": "value",
|
||||||
|
"options": {
|
||||||
|
"0": { "text": "✓ Schedulable", "color": "green", "index": 0 },
|
||||||
|
"1": { "text": "⚠ Cordoned", "color": "yellow", "index": 1 }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": { "sortBy": [{ "displayName": "Node", "desc": false }] },
|
||||||
|
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 4 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 10,
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "CPU Usage per Node (%)",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "100 * (1 - avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{instance}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent", "min": 0, "max": 100,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 85 }] }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 16, "x": 0, "y": 12 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 11,
|
||||||
|
"type": "bargauge",
|
||||||
|
"title": "CPU Usage \u2014 Current",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "100 * (1 - avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{instance}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent", "min": 0, "max": 100,
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 85 }] }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"orientation": "horizontal",
|
||||||
|
"displayMode": "gradient",
|
||||||
|
"showUnfilled": true,
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 12 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 12,
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Memory Usage per Node (%)",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{instance}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent", "min": 0, "max": 100,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 75 }, { "color": "red", "value": 90 }] }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 16, "x": 0, "y": 20 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 13,
|
||||||
|
"type": "bargauge",
|
||||||
|
"title": "Memory Usage \u2014 Current",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{instance}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent", "min": 0, "max": 100,
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 75 }, { "color": "red", "value": 90 }] }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"orientation": "horizontal",
|
||||||
|
"displayMode": "gradient",
|
||||||
|
"showUnfilled": true,
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 20 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 14,
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Root Disk Usage per Node (%)",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "100 * (1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"}))",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{instance}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent", "min": 0, "max": 100,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 85 }] }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 16, "x": 0, "y": 28 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 15,
|
||||||
|
"type": "bargauge",
|
||||||
|
"title": "Root Disk Usage \u2014 Current",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "100 * (1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"}))",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{instance}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent", "min": 0, "max": 100,
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 85 }] }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"orientation": "horizontal",
|
||||||
|
"displayMode": "gradient",
|
||||||
|
"showUnfilled": true,
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 28 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 16,
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Network Traffic per Node",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum by(instance) (rate(node_network_receive_bytes_total{device!~\"lo|veth.*|tun.*|ovn.*|br.*\"}[5m]))",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "rx {{instance}}"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum by(instance) (rate(node_network_transmit_bytes_total{device!~\"lo|veth.*|tun.*|ovn.*|br.*\"}[5m]))",
|
||||||
|
"refId": "B",
|
||||||
|
"legendFormat": "tx {{instance}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "Bps",
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 36 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 17,
|
||||||
|
"type": "bargauge",
|
||||||
|
"title": "Pods per Node",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "count by(node) (kube_pod_info{node=~\"$node\"})",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{node}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short",
|
||||||
|
"min": 0,
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 100 },
|
||||||
|
{ "color": "red", "value": 200 }
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"orientation": "horizontal",
|
||||||
|
"displayMode": "gradient",
|
||||||
|
"showUnfilled": true,
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 36 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 18,
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "System Load Average (1m / 5m) per Node",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "node_load1",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "1m \u2014 {{instance}}"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "node_load5",
|
||||||
|
"refId": "B",
|
||||||
|
"legendFormat": "5m \u2014 {{instance}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short",
|
||||||
|
"min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 44 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 19,
|
||||||
|
"type": "bargauge",
|
||||||
|
"title": "Node Uptime",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "time() - node_boot_time_seconds",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{instance}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "s",
|
||||||
|
"min": 0,
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "red", "value": null },
|
||||||
|
{ "color": "yellow", "value": 300 },
|
||||||
|
{ "color": "green", "value": 3600 }
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"orientation": "horizontal",
|
||||||
|
"displayMode": "gradient",
|
||||||
|
"showUnfilled": false,
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 44 }
|
||||||
|
}
|
||||||
|
|
||||||
|
]
|
||||||
|
}
|
||||||
@@ -0,0 +1,783 @@
|
|||||||
|
apiVersion: grafana.integreatly.org/v1beta1
|
||||||
|
kind: GrafanaDashboard
|
||||||
|
metadata:
|
||||||
|
name: okd-workload-health
|
||||||
|
namespace: observability
|
||||||
|
spec:
|
||||||
|
instanceSelector:
|
||||||
|
matchLabels:
|
||||||
|
dashboards: "grafana"
|
||||||
|
json: |
|
||||||
|
{
|
||||||
|
"title": "Workload Health",
|
||||||
|
"uid": "okd-workload-health",
|
||||||
|
"schemaVersion": 36,
|
||||||
|
"version": 3,
|
||||||
|
"refresh": "30s",
|
||||||
|
"time": { "from": "now-1h", "to": "now" },
|
||||||
|
"tags": ["okd", "workload", "health"],
|
||||||
|
"templating": {
|
||||||
|
"list": [
|
||||||
|
{
|
||||||
|
"name": "namespace",
|
||||||
|
"type": "query",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"query": { "query": "label_values(kube_pod_info, namespace)", "refId": "A" },
|
||||||
|
"refresh": 2,
|
||||||
|
"includeAll": true,
|
||||||
|
"multi": true,
|
||||||
|
"allValue": ".*",
|
||||||
|
"label": "Namespace",
|
||||||
|
"sort": 1,
|
||||||
|
"current": {},
|
||||||
|
"options": []
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"panels": [
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 1, "type": "stat", "title": "Total Pods",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "count(kube_pod_info{namespace=~\"$namespace\"})", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] },
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 2, "type": "stat", "title": "Running Pods",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "count(kube_pod_status_phase{phase=\"Running\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] },
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 3, "type": "stat", "title": "Pending Pods",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "count(kube_pod_status_phase{phase=\"Pending\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 1 }] },
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 4, "type": "stat", "title": "Failed Pods",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "count(kube_pod_status_phase{phase=\"Failed\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 5, "type": "stat", "title": "CrashLoopBackOff",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "count(kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 6, "type": "stat", "title": "OOMKilled",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "count(kube_pod_container_status_last_terminated_reason{reason=\"OOMKilled\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "orange", "value": 1 }] },
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 7, "type": "stat", "title": "Deployments Available",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "count(kube_deployment_status_condition{condition=\"Available\",status=\"true\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] },
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 8, "type": "stat", "title": "Deployments Degraded",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "count(kube_deployment_status_replicas_unavailable{namespace=~\"$namespace\"} > 0) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 9, "type": "row", "title": "Deployments", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 10,
|
||||||
|
"type": "table",
|
||||||
|
"title": "Deployment Status",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum by(namespace,deployment)(kube_deployment_spec_replicas{namespace=~\"$namespace\"})",
|
||||||
|
"refId": "A",
|
||||||
|
"instant": true,
|
||||||
|
"format": "table",
|
||||||
|
"legendFormat": ""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum by(namespace,deployment)(kube_deployment_status_replicas_ready{namespace=~\"$namespace\"})",
|
||||||
|
"refId": "B",
|
||||||
|
"instant": true,
|
||||||
|
"format": "table",
|
||||||
|
"legendFormat": ""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum by(namespace,deployment)(kube_deployment_status_replicas_available{namespace=~\"$namespace\"})",
|
||||||
|
"refId": "C",
|
||||||
|
"instant": true,
|
||||||
|
"format": "table",
|
||||||
|
"legendFormat": ""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum by(namespace,deployment)(kube_deployment_status_replicas_unavailable{namespace=~\"$namespace\"})",
|
||||||
|
"refId": "D",
|
||||||
|
"instant": true,
|
||||||
|
"format": "table",
|
||||||
|
"legendFormat": ""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum by(namespace,deployment)(kube_deployment_status_replicas_updated{namespace=~\"$namespace\"})",
|
||||||
|
"refId": "E",
|
||||||
|
"instant": true,
|
||||||
|
"format": "table",
|
||||||
|
"legendFormat": ""
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"transformations": [
|
||||||
|
{
|
||||||
|
"id": "filterFieldsByName",
|
||||||
|
"options": {
|
||||||
|
"include": {
|
||||||
|
"names": ["namespace", "deployment", "Value"]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "joinByField",
|
||||||
|
"options": {
|
||||||
|
"byField": "deployment",
|
||||||
|
"mode": "outer"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "organize",
|
||||||
|
"options": {
|
||||||
|
"excludeByName": {
|
||||||
|
"namespace 1": true,
|
||||||
|
"namespace 2": true,
|
||||||
|
"namespace 3": true,
|
||||||
|
"namespace 4": true
|
||||||
|
},
|
||||||
|
"renameByName": {
|
||||||
|
"namespace": "Namespace",
|
||||||
|
"deployment": "Deployment",
|
||||||
|
"Value": "Desired",
|
||||||
|
"Value 1": "Ready",
|
||||||
|
"Value 2": "Available",
|
||||||
|
"Value 3": "Unavailable",
|
||||||
|
"Value 4": "Up-to-date"
|
||||||
|
},
|
||||||
|
"indexByName": {
|
||||||
|
"namespace": 0,
|
||||||
|
"deployment": 1,
|
||||||
|
"Value": 2,
|
||||||
|
"Value 1": 3,
|
||||||
|
"Value 2": 4,
|
||||||
|
"Value 3": 5,
|
||||||
|
"Value 4": 6
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "sortBy",
|
||||||
|
"options": {
|
||||||
|
"fields": [{ "displayName": "Namespace", "desc": false }]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||||
|
"overrides": [
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Namespace" },
|
||||||
|
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 200 }]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Deployment" },
|
||||||
|
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 220 }]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Unavailable" },
|
||||||
|
"properties": [
|
||||||
|
{ "id": "custom.displayMode", "value": "color-background" },
|
||||||
|
{
|
||||||
|
"id": "thresholds",
|
||||||
|
"value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] }
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Ready" },
|
||||||
|
"properties": [
|
||||||
|
{ "id": "custom.displayMode", "value": "color-background" },
|
||||||
|
{
|
||||||
|
"id": "thresholds",
|
||||||
|
"value": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] }
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": { "sortBy": [{ "displayName": "Namespace", "desc": false }] },
|
||||||
|
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 5 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 11, "type": "row", "title": "StatefulSets & DaemonSets", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 12,
|
||||||
|
"type": "table",
|
||||||
|
"title": "StatefulSet Status",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum by(namespace,statefulset)(kube_statefulset_replicas{namespace=~\"$namespace\"})",
|
||||||
|
"refId": "A",
|
||||||
|
"instant": true,
|
||||||
|
"format": "table",
|
||||||
|
"legendFormat": ""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum by(namespace,statefulset)(kube_statefulset_status_replicas_ready{namespace=~\"$namespace\"})",
|
||||||
|
"refId": "B",
|
||||||
|
"instant": true,
|
||||||
|
"format": "table",
|
||||||
|
"legendFormat": ""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum by(namespace,statefulset)(kube_statefulset_status_replicas_current{namespace=~\"$namespace\"})",
|
||||||
|
"refId": "C",
|
||||||
|
"instant": true,
|
||||||
|
"format": "table",
|
||||||
|
"legendFormat": ""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum by(namespace,statefulset)(kube_statefulset_status_replicas_updated{namespace=~\"$namespace\"})",
|
||||||
|
"refId": "D",
|
||||||
|
"instant": true,
|
||||||
|
"format": "table",
|
||||||
|
"legendFormat": ""
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"transformations": [
|
||||||
|
{
|
||||||
|
"id": "filterFieldsByName",
|
||||||
|
"options": {
|
||||||
|
"include": {
|
||||||
|
"names": ["namespace", "statefulset", "Value"]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "joinByField",
|
||||||
|
"options": {
|
||||||
|
"byField": "statefulset",
|
||||||
|
"mode": "outer"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "organize",
|
||||||
|
"options": {
|
||||||
|
"excludeByName": {
|
||||||
|
"namespace 1": true,
|
||||||
|
"namespace 2": true,
|
||||||
|
"namespace 3": true
|
||||||
|
},
|
||||||
|
"renameByName": {
|
||||||
|
"namespace": "Namespace",
|
||||||
|
"statefulset": "StatefulSet",
|
||||||
|
"Value": "Desired",
|
||||||
|
"Value 1": "Ready",
|
||||||
|
"Value 2": "Current",
|
||||||
|
"Value 3": "Up-to-date"
|
||||||
|
},
|
||||||
|
"indexByName": {
|
||||||
|
"namespace": 0,
|
||||||
|
"statefulset": 1,
|
||||||
|
"Value": 2,
|
||||||
|
"Value 1": 3,
|
||||||
|
"Value 2": 4,
|
||||||
|
"Value 3": 5
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "sortBy",
|
||||||
|
"options": { "fields": [{ "displayName": "Namespace", "desc": false }] }
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||||
|
"overrides": [
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Namespace" },
|
||||||
|
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 180 }]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "StatefulSet" },
|
||||||
|
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 200 }]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Ready" },
|
||||||
|
"properties": [
|
||||||
|
{ "id": "custom.displayMode", "value": "color-background" },
|
||||||
|
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] } }
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 14 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 13,
|
||||||
|
"type": "table",
|
||||||
|
"title": "DaemonSet Status",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum by(namespace,daemonset)(kube_daemonset_status_desired_number_scheduled{namespace=~\"$namespace\"})",
|
||||||
|
"refId": "A",
|
||||||
|
"instant": true,
|
||||||
|
"format": "table",
|
||||||
|
"legendFormat": ""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum by(namespace,daemonset)(kube_daemonset_status_number_ready{namespace=~\"$namespace\"})",
|
||||||
|
"refId": "B",
|
||||||
|
"instant": true,
|
||||||
|
"format": "table",
|
||||||
|
"legendFormat": ""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum by(namespace,daemonset)(kube_daemonset_status_number_unavailable{namespace=~\"$namespace\"})",
|
||||||
|
"refId": "C",
|
||||||
|
"instant": true,
|
||||||
|
"format": "table",
|
||||||
|
"legendFormat": ""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum by(namespace,daemonset)(kube_daemonset_status_number_misscheduled{namespace=~\"$namespace\"})",
|
||||||
|
"refId": "D",
|
||||||
|
"instant": true,
|
||||||
|
"format": "table",
|
||||||
|
"legendFormat": ""
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"transformations": [
|
||||||
|
{
|
||||||
|
"id": "filterFieldsByName",
|
||||||
|
"options": {
|
||||||
|
"include": {
|
||||||
|
"names": ["namespace", "daemonset", "Value"]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "joinByField",
|
||||||
|
"options": {
|
||||||
|
"byField": "daemonset",
|
||||||
|
"mode": "outer"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "organize",
|
||||||
|
"options": {
|
||||||
|
"excludeByName": {
|
||||||
|
"namespace 1": true,
|
||||||
|
"namespace 2": true,
|
||||||
|
"namespace 3": true
|
||||||
|
},
|
||||||
|
"renameByName": {
|
||||||
|
"namespace": "Namespace",
|
||||||
|
"daemonset": "DaemonSet",
|
||||||
|
"Value": "Desired",
|
||||||
|
"Value 1": "Ready",
|
||||||
|
"Value 2": "Unavailable",
|
||||||
|
"Value 3": "Misscheduled"
|
||||||
|
},
|
||||||
|
"indexByName": {
|
||||||
|
"namespace": 0,
|
||||||
|
"daemonset": 1,
|
||||||
|
"Value": 2,
|
||||||
|
"Value 1": 3,
|
||||||
|
"Value 2": 4,
|
||||||
|
"Value 3": 5
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "sortBy",
|
||||||
|
"options": { "fields": [{ "displayName": "Namespace", "desc": false }] }
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||||
|
"overrides": [
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Namespace" },
|
||||||
|
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 180 }]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "DaemonSet" },
|
||||||
|
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 200 }]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Ready" },
|
||||||
|
"properties": [
|
||||||
|
{ "id": "custom.displayMode", "value": "color-background" },
|
||||||
|
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] } }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Unavailable" },
|
||||||
|
"properties": [
|
||||||
|
{ "id": "custom.displayMode", "value": "color-background" },
|
||||||
|
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] } }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Misscheduled" },
|
||||||
|
"properties": [
|
||||||
|
{ "id": "custom.displayMode", "value": "color-background" },
|
||||||
|
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "orange", "value": 1 }] } }
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 14 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 14, "type": "row", "title": "Pods", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 22 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 15,
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Pod Phase over Time",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum by(phase)(kube_pod_status_phase{namespace=~\"$namespace\"})",
|
||||||
|
"refId": "A", "legendFormat": "{{phase}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||||
|
},
|
||||||
|
"overrides": [
|
||||||
|
{ "matcher": { "id": "byName", "options": "Running" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "Pending" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "Failed" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "Succeeded" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "Unknown" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 16, "x": 0, "y": 23 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 16,
|
||||||
|
"type": "piechart",
|
||||||
|
"title": "Pod Phase — Now",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum by(phase)(kube_pod_status_phase{namespace=~\"$namespace\"})",
|
||||||
|
"refId": "A", "instant": true, "legendFormat": "{{phase}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": { "unit": "short", "color": { "mode": "palette-classic" } },
|
||||||
|
"overrides": [
|
||||||
|
{ "matcher": { "id": "byName", "options": "Running" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "Pending" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "Failed" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "Succeeded" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "Unknown" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"pieType": "donut",
|
||||||
|
"tooltip": { "mode": "single" },
|
||||||
|
"legend": { "displayMode": "table", "placement": "right", "values": ["value", "percent"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 23 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 17,
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Container Restarts over Time (total counter, top 10)",
|
||||||
|
"description": "Absolute restart counter — each vertical step = a restart event. Flat line = healthy.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "topk(10,\n sum by(namespace, pod) (\n kube_pod_container_status_restarts_total{namespace=~\"$namespace\"}\n ) > 0\n)",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{namespace}} / {{pod}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "auto", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 31 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 18,
|
||||||
|
"type": "table",
|
||||||
|
"title": "Container Total Restarts (non-zero)",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum by(namespace, pod, container) (kube_pod_container_status_restarts_total{namespace=~\"$namespace\"}) > 0",
|
||||||
|
"refId": "A",
|
||||||
|
"instant": true,
|
||||||
|
"format": "table",
|
||||||
|
"legendFormat": ""
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"transformations": [
|
||||||
|
{
|
||||||
|
"id": "filterFieldsByName",
|
||||||
|
"options": {
|
||||||
|
"include": { "names": ["namespace", "pod", "container", "Value"] }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "organize",
|
||||||
|
"options": {
|
||||||
|
"excludeByName": {},
|
||||||
|
"renameByName": {
|
||||||
|
"namespace": "Namespace",
|
||||||
|
"pod": "Pod",
|
||||||
|
"container": "Container",
|
||||||
|
"Value": "Total Restarts"
|
||||||
|
},
|
||||||
|
"indexByName": { "namespace": 0, "pod": 1, "container": 2, "Value": 3 }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "sortBy",
|
||||||
|
"options": { "fields": [{ "displayName": "Total Restarts", "desc": true }] }
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||||
|
"overrides": [
|
||||||
|
{ "matcher": { "id": "byName", "options": "Namespace" }, "properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 160 }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "Pod" }, "properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 260 }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "Container" }, "properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 160 }] },
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Total Restarts" },
|
||||||
|
"properties": [
|
||||||
|
{ "id": "custom.displayMode", "value": "color-background" },
|
||||||
|
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "yellow", "value": null }, { "color": "orange", "value": 5 }, { "color": "red", "value": 20 }] } }
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 31 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 19, "type": "row", "title": "Resource Usage", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 39 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 20,
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "CPU Usage by Namespace",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum by(namespace)(rate(container_cpu_usage_seconds_total{namespace=~\"$namespace\",container!=\"\",container!=\"POD\"}[5m]))",
|
||||||
|
"refId": "A", "legendFormat": "{{namespace}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "cores", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 40 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 21,
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Memory Usage by Namespace",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum by(namespace)(container_memory_working_set_bytes{namespace=~\"$namespace\",container!=\"\",container!=\"POD\"})",
|
||||||
|
"refId": "A", "legendFormat": "{{namespace}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "bytes", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 40 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 22,
|
||||||
|
"type": "bargauge",
|
||||||
|
"title": "CPU — Actual vs Requested (%)",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum by(namespace)(rate(container_cpu_usage_seconds_total{namespace=~\"$namespace\",container!=\"\",container!=\"POD\"}[5m]))\n/\nsum by(namespace)(kube_pod_container_resource_requests{resource=\"cpu\",namespace=~\"$namespace\",container!=\"\"})\n* 100",
|
||||||
|
"refId": "A", "legendFormat": "{{namespace}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent", "min": 0, "max": 150,
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 80 }, { "color": "red", "value": 100 }] }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"orientation": "horizontal", "displayMode": "gradient", "showUnfilled": true,
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 48 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 23,
|
||||||
|
"type": "bargauge",
|
||||||
|
"title": "Memory — Actual vs Requested (%)",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum by(namespace)(container_memory_working_set_bytes{namespace=~\"$namespace\",container!=\"\",container!=\"POD\"})\n/\nsum by(namespace)(kube_pod_container_resource_requests{resource=\"memory\",namespace=~\"$namespace\",container!=\"\"})\n* 100",
|
||||||
|
"refId": "A", "legendFormat": "{{namespace}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent", "min": 0, "max": 150,
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 80 }, { "color": "red", "value": 100 }] }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"orientation": "horizontal", "displayMode": "gradient", "showUnfilled": true,
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 48 }
|
||||||
|
}
|
||||||
|
|
||||||
|
]
|
||||||
|
}
|
||||||
@@ -0,0 +1,955 @@
|
|||||||
|
apiVersion: grafana.integreatly.org/v1beta1
|
||||||
|
kind: GrafanaDashboard
|
||||||
|
metadata:
|
||||||
|
name: okd-networking
|
||||||
|
namespace: observability
|
||||||
|
spec:
|
||||||
|
instanceSelector:
|
||||||
|
matchLabels:
|
||||||
|
dashboards: "grafana"
|
||||||
|
json: |
|
||||||
|
{
|
||||||
|
"title": "Networking",
|
||||||
|
"uid": "okd-networking",
|
||||||
|
"schemaVersion": 36,
|
||||||
|
"version": 1,
|
||||||
|
"refresh": "30s",
|
||||||
|
"time": { "from": "now-1h", "to": "now" },
|
||||||
|
"tags": ["okd", "networking"],
|
||||||
|
"templating": {
|
||||||
|
"list": [
|
||||||
|
{
|
||||||
|
"name": "namespace",
|
||||||
|
"type": "query",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"query": { "query": "label_values(kube_pod_info, namespace)", "refId": "A" },
|
||||||
|
"refresh": 2,
|
||||||
|
"includeAll": true,
|
||||||
|
"multi": true,
|
||||||
|
"allValue": ".*",
|
||||||
|
"label": "Namespace",
|
||||||
|
"sort": 1,
|
||||||
|
"current": {},
|
||||||
|
"options": []
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"panels": [
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 1, "type": "stat", "title": "Network RX Rate",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum(rate(container_network_receive_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||||
|
"refId": "A", "legendFormat": ""
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] },
|
||||||
|
"unit": "Bps", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 2, "type": "stat", "title": "Network TX Rate",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum(rate(container_network_transmit_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||||
|
"refId": "A", "legendFormat": ""
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] },
|
||||||
|
"unit": "Bps", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 3, "type": "stat", "title": "RX Errors/s",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum(rate(container_network_receive_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||||
|
"refId": "A", "legendFormat": ""
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||||
|
"unit": "pps", "noValue": "0", "decimals": 2
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 4, "type": "stat", "title": "TX Errors/s",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum(rate(container_network_transmit_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||||
|
"refId": "A", "legendFormat": ""
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||||
|
"unit": "pps", "noValue": "0", "decimals": 2
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 5, "type": "stat", "title": "RX Drops/s",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum(rate(container_network_receive_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||||
|
"refId": "A", "legendFormat": ""
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "orange", "value": 1 }] },
|
||||||
|
"unit": "pps", "noValue": "0", "decimals": 2
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 6, "type": "stat", "title": "TX Drops/s",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum(rate(container_network_transmit_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||||
|
"refId": "A", "legendFormat": ""
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "orange", "value": 1 }] },
|
||||||
|
"unit": "pps", "noValue": "0", "decimals": 2
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 7, "type": "stat", "title": "DNS Queries/s",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum(rate(coredns_dns_requests_total[5m]))",
|
||||||
|
"refId": "A", "legendFormat": ""
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] },
|
||||||
|
"unit": "reqps", "noValue": "0", "decimals": 1
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 8, "type": "stat", "title": "DNS Error %",
|
||||||
|
"description": "Percentage of DNS responses with non-NOERROR rcode over the last 5 minutes.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum(rate(coredns_dns_responses_total{rcode!=\"NOERROR\"}[5m])) / sum(rate(coredns_dns_responses_total[5m])) * 100",
|
||||||
|
"refId": "A", "legendFormat": ""
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 1 },
|
||||||
|
{ "color": "red", "value": 5 }
|
||||||
|
]},
|
||||||
|
"unit": "percent", "noValue": "0", "decimals": 2
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 9, "type": "row", "title": "Network I/O", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 10, "type": "timeseries", "title": "Receive Rate by Namespace",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum by(namespace)(rate(container_network_receive_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||||
|
"refId": "A", "legendFormat": "{{namespace}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "Bps", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 5 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 11, "type": "timeseries", "title": "Transmit Rate by Namespace",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum by(namespace)(rate(container_network_transmit_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||||
|
"refId": "A", "legendFormat": "{{namespace}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "Bps", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 5 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 12, "type": "row", "title": "Top Pod Consumers", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 13, "type": "timeseries", "title": "Top 10 Pods — RX Rate",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "topk(10, sum by(namespace,pod)(rate(container_network_receive_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m])))",
|
||||||
|
"refId": "A", "legendFormat": "{{namespace}} / {{pod}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "Bps", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "auto", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 14 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 14, "type": "timeseries", "title": "Top 10 Pods — TX Rate",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "topk(10, sum by(namespace,pod)(rate(container_network_transmit_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m])))",
|
||||||
|
"refId": "A", "legendFormat": "{{namespace}} / {{pod}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "Bps", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "auto", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 14 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 15,
|
||||||
|
"type": "table",
|
||||||
|
"title": "Pod Network I/O Summary",
|
||||||
|
"description": "Current RX/TX rates, errors and drops per pod. Sorted by RX rate descending.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum by(namespace,pod)(rate(container_network_receive_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||||
|
"refId": "A", "instant": true, "format": "table", "legendFormat": ""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum by(namespace,pod)(rate(container_network_transmit_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||||
|
"refId": "B", "instant": true, "format": "table", "legendFormat": ""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum by(namespace,pod)(rate(container_network_receive_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||||
|
"refId": "C", "instant": true, "format": "table", "legendFormat": ""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum by(namespace,pod)(rate(container_network_transmit_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||||
|
"refId": "D", "instant": true, "format": "table", "legendFormat": ""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum by(namespace,pod)(rate(container_network_receive_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||||
|
"refId": "E", "instant": true, "format": "table", "legendFormat": ""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum by(namespace,pod)(rate(container_network_transmit_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||||
|
"refId": "F", "instant": true, "format": "table", "legendFormat": ""
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"transformations": [
|
||||||
|
{
|
||||||
|
"id": "filterFieldsByName",
|
||||||
|
"options": { "include": { "names": ["namespace", "pod", "Value"] } }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "joinByField",
|
||||||
|
"options": { "byField": "pod", "mode": "outer" }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "organize",
|
||||||
|
"options": {
|
||||||
|
"excludeByName": {
|
||||||
|
"namespace 1": true,
|
||||||
|
"namespace 2": true,
|
||||||
|
"namespace 3": true,
|
||||||
|
"namespace 4": true,
|
||||||
|
"namespace 5": true
|
||||||
|
},
|
||||||
|
"renameByName": {
|
||||||
|
"namespace": "Namespace",
|
||||||
|
"pod": "Pod",
|
||||||
|
"Value": "RX Rate",
|
||||||
|
"Value 1": "TX Rate",
|
||||||
|
"Value 2": "RX Errors/s",
|
||||||
|
"Value 3": "TX Errors/s",
|
||||||
|
"Value 4": "RX Drops/s",
|
||||||
|
"Value 5": "TX Drops/s"
|
||||||
|
},
|
||||||
|
"indexByName": {
|
||||||
|
"namespace": 0,
|
||||||
|
"pod": 1,
|
||||||
|
"Value": 2,
|
||||||
|
"Value 1": 3,
|
||||||
|
"Value 2": 4,
|
||||||
|
"Value 3": 5,
|
||||||
|
"Value 4": 6,
|
||||||
|
"Value 5": 7
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "sortBy",
|
||||||
|
"options": { "fields": [{ "displayName": "RX Rate", "desc": true }] }
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||||
|
"overrides": [
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Namespace" },
|
||||||
|
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 160 }]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Pod" },
|
||||||
|
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 260 }]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byRegexp", "options": "^RX Rate$|^TX Rate$" },
|
||||||
|
"properties": [
|
||||||
|
{ "id": "unit", "value": "Bps" },
|
||||||
|
{ "id": "custom.displayMode", "value": "color-background-solid" },
|
||||||
|
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 10000000 },
|
||||||
|
{ "color": "orange", "value": 100000000 },
|
||||||
|
{ "color": "red", "value": 500000000 }
|
||||||
|
]}}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byRegexp", "options": "^RX Errors/s$|^TX Errors/s$" },
|
||||||
|
"properties": [
|
||||||
|
{ "id": "unit", "value": "pps" },
|
||||||
|
{ "id": "decimals", "value": 3 },
|
||||||
|
{ "id": "custom.displayMode", "value": "color-background" },
|
||||||
|
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "red", "value": 0.001 }
|
||||||
|
]}}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byRegexp", "options": "^RX Drops/s$|^TX Drops/s$" },
|
||||||
|
"properties": [
|
||||||
|
{ "id": "unit", "value": "pps" },
|
||||||
|
{ "id": "decimals", "value": 3 },
|
||||||
|
{ "id": "custom.displayMode", "value": "color-background" },
|
||||||
|
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "orange", "value": 0.001 }
|
||||||
|
]}}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {},
|
||||||
|
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 22 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 16, "type": "row", "title": "Errors & Packet Loss", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 30 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 17, "type": "timeseries", "title": "RX Errors by Namespace",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum by(namespace)(rate(container_network_receive_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||||
|
"refId": "A", "legendFormat": "{{namespace}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "pps", "min": 0, "decimals": 3,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 12, "x": 0, "y": 31 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 18, "type": "timeseries", "title": "TX Errors by Namespace",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum by(namespace)(rate(container_network_transmit_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||||
|
"refId": "A", "legendFormat": "{{namespace}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "pps", "min": 0, "decimals": 3,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 12, "x": 12, "y": 31 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 19, "type": "timeseries", "title": "RX Packet Drops by Namespace",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum by(namespace)(rate(container_network_receive_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||||
|
"refId": "A", "legendFormat": "{{namespace}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "pps", "min": 0, "decimals": 3,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 12, "x": 0, "y": 38 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 20, "type": "timeseries", "title": "TX Packet Drops by Namespace",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum by(namespace)(rate(container_network_transmit_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||||
|
"refId": "A", "legendFormat": "{{namespace}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "pps", "min": 0, "decimals": 3,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 12, "x": 12, "y": 38 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 21, "type": "row", "title": "DNS (CoreDNS)", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 45 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 22, "type": "timeseries", "title": "DNS Request Rate by Query Type",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum by(type)(rate(coredns_dns_requests_total[5m]))",
|
||||||
|
"refId": "A", "legendFormat": "{{type}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "reqps", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 46 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 23, "type": "timeseries", "title": "DNS Response Rate by Rcode",
|
||||||
|
"description": "NOERROR = healthy. NXDOMAIN = name not found. SERVFAIL = upstream error.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum by(rcode)(rate(coredns_dns_responses_total[5m]))",
|
||||||
|
"refId": "A", "legendFormat": "{{rcode}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "reqps", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||||
|
},
|
||||||
|
"overrides": [
|
||||||
|
{ "matcher": { "id": "byName", "options": "NOERROR" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "NXDOMAIN" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "SERVFAIL" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "REFUSED" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 46 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 24, "type": "timeseries", "title": "DNS Request Latency (p50 / p95 / p99)",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "histogram_quantile(0.50, sum(rate(coredns_dns_request_duration_seconds_bucket[5m])) by (le))",
|
||||||
|
"refId": "A", "legendFormat": "p50"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "histogram_quantile(0.95, sum(rate(coredns_dns_request_duration_seconds_bucket[5m])) by (le))",
|
||||||
|
"refId": "B", "legendFormat": "p95"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "histogram_quantile(0.99, sum(rate(coredns_dns_request_duration_seconds_bucket[5m])) by (le))",
|
||||||
|
"refId": "C", "legendFormat": "p99"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "s", "min": 0, "decimals": 4,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||||
|
},
|
||||||
|
"overrides": [
|
||||||
|
{ "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 46 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 25, "type": "timeseries", "title": "DNS Cache Hit Ratio (%)",
|
||||||
|
"description": "High hit ratio = CoreDNS is serving responses from cache, reducing upstream load.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum(rate(coredns_cache_hits_total[5m])) / (sum(rate(coredns_cache_hits_total[5m])) + sum(rate(coredns_cache_misses_total[5m]))) * 100",
|
||||||
|
"refId": "A", "legendFormat": "Cache Hit %"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent", "min": 0, "max": 100,
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "red", "value": null },
|
||||||
|
{ "color": "yellow", "value": 50 },
|
||||||
|
{ "color": "green", "value": 80 }
|
||||||
|
]},
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 20, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "single" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "lastNotNull"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 12, "x": 0, "y": 54 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 26, "type": "timeseries", "title": "DNS Forward Request Rate",
|
||||||
|
"description": "Queries CoreDNS is forwarding upstream. Spike here with cache miss spike = upstream DNS pressure.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum(rate(coredns_forward_requests_total[5m]))",
|
||||||
|
"refId": "A", "legendFormat": "Forward Requests/s"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum(rate(coredns_forward_responses_duration_seconds_count[5m]))",
|
||||||
|
"refId": "B", "legendFormat": "Forward Responses/s"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "reqps", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 12, "x": 12, "y": 54 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 27, "type": "row", "title": "Services & Endpoints", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 61 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 28, "type": "stat", "title": "Total Services",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "count(kube_service_info{namespace=~\"$namespace\"})",
|
||||||
|
"refId": "A", "legendFormat": ""
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] },
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 8, "x": 0, "y": 62 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 29, "type": "stat", "title": "Endpoint Addresses Available",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum(kube_endpoint_address_available{namespace=~\"$namespace\"})",
|
||||||
|
"refId": "A", "legendFormat": ""
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] },
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 8, "x": 8, "y": 62 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 30, "type": "stat", "title": "Endpoint Addresses Not Ready",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum(kube_endpoint_address_not_ready{namespace=~\"$namespace\"}) or vector(0)",
|
||||||
|
"refId": "A", "legendFormat": ""
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 8, "x": 16, "y": 62 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 31,
|
||||||
|
"type": "table",
|
||||||
|
"title": "Endpoint Availability",
|
||||||
|
"description": "Per-endpoint available vs not-ready address counts. Red Not Ready = pods backing this service are unhealthy.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum by(namespace,endpoint)(kube_endpoint_address_available{namespace=~\"$namespace\"})",
|
||||||
|
"refId": "A", "instant": true, "format": "table", "legendFormat": ""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum by(namespace,endpoint)(kube_endpoint_address_not_ready{namespace=~\"$namespace\"})",
|
||||||
|
"refId": "B", "instant": true, "format": "table", "legendFormat": ""
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"transformations": [
|
||||||
|
{
|
||||||
|
"id": "filterFieldsByName",
|
||||||
|
"options": { "include": { "names": ["namespace", "endpoint", "Value"] } }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "joinByField",
|
||||||
|
"options": { "byField": "endpoint", "mode": "outer" }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "organize",
|
||||||
|
"options": {
|
||||||
|
"excludeByName": { "namespace 1": true },
|
||||||
|
"renameByName": {
|
||||||
|
"namespace": "Namespace",
|
||||||
|
"endpoint": "Endpoint",
|
||||||
|
"Value": "Available",
|
||||||
|
"Value 1": "Not Ready"
|
||||||
|
},
|
||||||
|
"indexByName": {
|
||||||
|
"namespace": 0,
|
||||||
|
"endpoint": 1,
|
||||||
|
"Value": 2,
|
||||||
|
"Value 1": 3
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "sortBy",
|
||||||
|
"options": { "fields": [{ "displayName": "Not Ready", "desc": true }] }
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||||
|
"overrides": [
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Namespace" },
|
||||||
|
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 180 }]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Endpoint" },
|
||||||
|
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 220 }]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Available" },
|
||||||
|
"properties": [
|
||||||
|
{ "id": "custom.displayMode", "value": "color-background" },
|
||||||
|
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] } }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Not Ready" },
|
||||||
|
"properties": [
|
||||||
|
{ "id": "custom.displayMode", "value": "color-background" },
|
||||||
|
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] } }
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {},
|
||||||
|
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 66 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 32, "type": "row", "title": "OKD Router / Ingress (HAProxy)", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 74 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 33, "type": "timeseries", "title": "Router HTTP Request Rate by Code",
|
||||||
|
"description": "Requires HAProxy router metrics to be scraped (port 1936). OKD exposes these via the openshift-ingress ServiceMonitor.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum by(code)(rate(haproxy_backend_http_responses_total[5m]))",
|
||||||
|
"refId": "A", "legendFormat": "HTTP {{code}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "reqps", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||||
|
},
|
||||||
|
"overrides": [
|
||||||
|
{ "matcher": { "id": "byName", "options": "HTTP 2xx" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "HTTP 4xx" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "HTTP 5xx" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 75 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 34, "type": "timeseries", "title": "Router 4xx + 5xx Error Rate (%)",
|
||||||
|
"description": "Client error (4xx) and server error (5xx) rates as a percentage of all requests.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum(rate(haproxy_backend_http_responses_total{code=\"4xx\"}[5m])) / sum(rate(haproxy_backend_http_responses_total[5m])) * 100",
|
||||||
|
"refId": "A", "legendFormat": "4xx %"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum(rate(haproxy_backend_http_responses_total{code=\"5xx\"}[5m])) / sum(rate(haproxy_backend_http_responses_total[5m])) * 100",
|
||||||
|
"refId": "B", "legendFormat": "5xx %"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 1 },
|
||||||
|
{ "color": "red", "value": 5 }
|
||||||
|
]}
|
||||||
|
},
|
||||||
|
"overrides": [
|
||||||
|
{ "matcher": { "id": "byName", "options": "4xx %" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "5xx %" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 75 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 35, "type": "timeseries", "title": "Router Bytes In / Out",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum(rate(haproxy_frontend_bytes_in_total[5m]))",
|
||||||
|
"refId": "A", "legendFormat": "Bytes In"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum(rate(haproxy_frontend_bytes_out_total[5m]))",
|
||||||
|
"refId": "B", "legendFormat": "Bytes Out"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "Bps", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||||
|
},
|
||||||
|
"overrides": [
|
||||||
|
{ "matcher": { "id": "byName", "options": "Bytes In" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "Bytes Out" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 83 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 36,
|
||||||
|
"type": "table",
|
||||||
|
"title": "Router Backend Server Status",
|
||||||
|
"description": "HAProxy backend servers (routes). Value 0 = DOWN, 1 = UP.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "haproxy_server_up",
|
||||||
|
"refId": "A", "instant": true, "format": "table", "legendFormat": ""
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"transformations": [
|
||||||
|
{
|
||||||
|
"id": "filterFieldsByName",
|
||||||
|
"options": { "include": { "names": ["proxy", "server", "Value"] } }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "organize",
|
||||||
|
"options": {
|
||||||
|
"excludeByName": {},
|
||||||
|
"renameByName": {
|
||||||
|
"proxy": "Backend",
|
||||||
|
"server": "Server",
|
||||||
|
"Value": "Status"
|
||||||
|
},
|
||||||
|
"indexByName": { "proxy": 0, "server": 1, "Value": 2 }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "sortBy",
|
||||||
|
"options": { "fields": [{ "displayName": "Status", "desc": false }] }
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||||
|
"overrides": [
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Backend" },
|
||||||
|
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 260 }]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Server" },
|
||||||
|
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 180 }]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Status" },
|
||||||
|
"properties": [
|
||||||
|
{ "id": "custom.displayMode", "value": "color-background" },
|
||||||
|
{ "id": "mappings", "value": [
|
||||||
|
{ "type": "value", "options": { "0": { "text": "DOWN", "color": "red" } } },
|
||||||
|
{ "type": "value", "options": { "1": { "text": "UP", "color": "green" } } }
|
||||||
|
]},
|
||||||
|
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "red", "value": null },
|
||||||
|
{ "color": "green", "value": 1 }
|
||||||
|
]}}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 83 }
|
||||||
|
}
|
||||||
|
|
||||||
|
]
|
||||||
|
}
|
||||||
@@ -0,0 +1,607 @@
|
|||||||
|
apiVersion: grafana.integreatly.org/v1beta1
|
||||||
|
kind: GrafanaDashboard
|
||||||
|
metadata:
|
||||||
|
name: storage-health
|
||||||
|
namespace: observability
|
||||||
|
spec:
|
||||||
|
instanceSelector:
|
||||||
|
matchLabels:
|
||||||
|
dashboards: "grafana"
|
||||||
|
|
||||||
|
json: |
|
||||||
|
{
|
||||||
|
"title": "Storage Health",
|
||||||
|
"uid": "storage-health",
|
||||||
|
"schemaVersion": 36,
|
||||||
|
"version": 1,
|
||||||
|
"refresh": "30s",
|
||||||
|
"time": { "from": "now-1h", "to": "now" },
|
||||||
|
"panels": [
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "row",
|
||||||
|
"id": 1,
|
||||||
|
"title": "PVC / PV Status",
|
||||||
|
"collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "stat",
|
||||||
|
"id": 2,
|
||||||
|
"title": "Bound PVCs",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Bound\"}) or vector(0)",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [{ "color": "green", "value": null }]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||||
|
"colorMode": "background",
|
||||||
|
"graphMode": "none",
|
||||||
|
"textMode": "auto"
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 5, "w": 4, "x": 0, "y": 1 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "stat",
|
||||||
|
"id": 3,
|
||||||
|
"title": "Pending PVCs",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Pending\"}) or vector(0)",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 1 }
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||||
|
"colorMode": "background",
|
||||||
|
"graphMode": "none",
|
||||||
|
"textMode": "auto"
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 5, "w": 4, "x": 4, "y": 1 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "stat",
|
||||||
|
"id": 4,
|
||||||
|
"title": "Lost PVCs",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Lost\"}) or vector(0)",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "red", "value": 1 }
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||||
|
"colorMode": "background",
|
||||||
|
"graphMode": "none",
|
||||||
|
"textMode": "auto"
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 5, "w": 4, "x": 8, "y": 1 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "stat",
|
||||||
|
"id": 5,
|
||||||
|
"title": "Bound PVs / Available PVs",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum(kube_persistentvolume_status_phase{phase=\"Bound\"}) or vector(0)",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "Bound"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum(kube_persistentvolume_status_phase{phase=\"Available\"}) or vector(0)",
|
||||||
|
"refId": "B",
|
||||||
|
"legendFormat": "Available"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [{ "color": "blue", "value": null }]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||||
|
"colorMode": "background",
|
||||||
|
"graphMode": "none",
|
||||||
|
"textMode": "auto"
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 5, "w": 4, "x": 12, "y": 1 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "stat",
|
||||||
|
"id": 6,
|
||||||
|
"title": "Ceph Cluster Health",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "ceph_health_status",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 1 },
|
||||||
|
{ "color": "red", "value": 2 }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"mappings": [
|
||||||
|
{
|
||||||
|
"type": "value",
|
||||||
|
"options": {
|
||||||
|
"0": { "text": "HEALTH_OK", "index": 0 },
|
||||||
|
"1": { "text": "HEALTH_WARN", "index": 1 },
|
||||||
|
"2": { "text": "HEALTH_ERR", "index": 2 }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||||
|
"colorMode": "background",
|
||||||
|
"graphMode": "none",
|
||||||
|
"textMode": "value"
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 5, "w": 4, "x": 16, "y": 1 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "stat",
|
||||||
|
"id": 7,
|
||||||
|
"title": "OSDs Up / Total",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum(ceph_osd_up) or vector(0)",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "Up"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "count(ceph_osd_metadata) or vector(0)",
|
||||||
|
"refId": "B",
|
||||||
|
"legendFormat": "Total"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [{ "color": "green", "value": null }]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||||
|
"colorMode": "background",
|
||||||
|
"graphMode": "none",
|
||||||
|
"textMode": "auto"
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 5, "w": 4, "x": 20, "y": 1 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "row",
|
||||||
|
"id": 8,
|
||||||
|
"title": "Cluster Capacity",
|
||||||
|
"collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 6 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "gauge",
|
||||||
|
"id": 9,
|
||||||
|
"title": "Ceph Cluster Used (%)",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "100 * (ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes) / ceph_cluster_total_bytes",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent",
|
||||||
|
"min": 0,
|
||||||
|
"max": 100,
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 70 },
|
||||||
|
{ "color": "red", "value": 85 }
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||||
|
"showThresholdLabels": true,
|
||||||
|
"showThresholdMarkers": true
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 5, "x": 0, "y": 7 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "stat",
|
||||||
|
"id": 10,
|
||||||
|
"title": "Ceph Capacity — Total / Available",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "ceph_cluster_total_bytes",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "Total"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "ceph_cluster_total_bytes - (ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes)",
|
||||||
|
"refId": "B",
|
||||||
|
"legendFormat": "Available"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "bytes",
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [{ "color": "blue", "value": null }]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||||
|
"colorMode": "value",
|
||||||
|
"graphMode": "none",
|
||||||
|
"textMode": "auto",
|
||||||
|
"orientation": "vertical"
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 4, "x": 5, "y": 7 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "bargauge",
|
||||||
|
"id": 11,
|
||||||
|
"title": "PV Allocated Capacity by Storage Class (Bound)",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum by (storageclass) (\n kube_persistentvolume_capacity_bytes\n * on(persistentvolume) group_left(storageclass)\n kube_persistentvolume_status_phase{phase=\"Bound\"}\n)",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{storageclass}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "bytes",
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [{ "color": "blue", "value": null }]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"orientation": "horizontal",
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||||
|
"displayMode": "gradient",
|
||||||
|
"showUnfilled": true
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 7, "x": 9, "y": 7 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "piechart",
|
||||||
|
"id": 12,
|
||||||
|
"title": "PVC Phase Distribution",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Bound\"}) or vector(0)",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "Bound"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Pending\"}) or vector(0)",
|
||||||
|
"refId": "B",
|
||||||
|
"legendFormat": "Pending"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Lost\"}) or vector(0)",
|
||||||
|
"refId": "C",
|
||||||
|
"legendFormat": "Lost"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": { "color": { "mode": "palette-classic" } }
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||||
|
"pieType": "pie",
|
||||||
|
"legend": {
|
||||||
|
"displayMode": "table",
|
||||||
|
"placement": "right",
|
||||||
|
"values": ["value", "percent"]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 7 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "row",
|
||||||
|
"id": 13,
|
||||||
|
"title": "Ceph Performance",
|
||||||
|
"collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 15 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "timeseries",
|
||||||
|
"id": 14,
|
||||||
|
"title": "Ceph Pool IOPS (Read / Write)",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "rate(ceph_pool_rd[5m])",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "Read — pool {{pool_id}}"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "rate(ceph_pool_wr[5m])",
|
||||||
|
"refId": "B",
|
||||||
|
"legendFormat": "Write — pool {{pool_id}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "ops",
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 8 }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "timeseries",
|
||||||
|
"id": 15,
|
||||||
|
"title": "Ceph Pool Throughput (Read / Write)",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "rate(ceph_pool_rd_bytes[5m])",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "Read — pool {{pool_id}}"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "rate(ceph_pool_wr_bytes[5m])",
|
||||||
|
"refId": "B",
|
||||||
|
"legendFormat": "Write — pool {{pool_id}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "Bps",
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 8 }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "row",
|
||||||
|
"id": 16,
|
||||||
|
"title": "Ceph OSD & Pool Details",
|
||||||
|
"collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 24 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "timeseries",
|
||||||
|
"id": 17,
|
||||||
|
"title": "Ceph Pool Space Used (%)",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "100 * ceph_pool_bytes_used / (ceph_pool_bytes_used + ceph_pool_max_avail)",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "Pool {{pool_id}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent",
|
||||||
|
"min": 0,
|
||||||
|
"max": 100,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 70 },
|
||||||
|
{ "color": "red", "value": 85 }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10 }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 25 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "bargauge",
|
||||||
|
"id": 18,
|
||||||
|
"title": "OSD Status per Daemon (green = Up, red = Down)",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "ceph_osd_up",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{ceph_daemon}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"min": 0,
|
||||||
|
"max": 1,
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "red", "value": null },
|
||||||
|
{ "color": "green", "value": 1 }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"mappings": [
|
||||||
|
{
|
||||||
|
"type": "value",
|
||||||
|
"options": {
|
||||||
|
"0": { "text": "DOWN", "index": 0 },
|
||||||
|
"1": { "text": "UP", "index": 1 }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"orientation": "horizontal",
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||||
|
"displayMode": "basic",
|
||||||
|
"showUnfilled": true
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 25 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "row",
|
||||||
|
"id": 19,
|
||||||
|
"title": "Node Disk Usage",
|
||||||
|
"collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 33 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "timeseries",
|
||||||
|
"id": 20,
|
||||||
|
"title": "Node Root Disk Usage Over Time (%)",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} * 100)",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{instance}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent",
|
||||||
|
"min": 0,
|
||||||
|
"max": 100,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 70 },
|
||||||
|
{ "color": "red", "value": 85 }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10 }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 34 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "bargauge",
|
||||||
|
"id": 21,
|
||||||
|
"title": "Current Disk Usage — All Nodes & Mountpoints",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "100 - (node_filesystem_avail_bytes{fstype!~\"tmpfs|overlay|squashfs\"} / node_filesystem_size_bytes{fstype!~\"tmpfs|overlay|squashfs\"} * 100)",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{instance}} — {{mountpoint}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent",
|
||||||
|
"min": 0,
|
||||||
|
"max": 100,
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 70 },
|
||||||
|
{ "color": "red", "value": 85 }
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"orientation": "horizontal",
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||||
|
"displayMode": "gradient",
|
||||||
|
"showUnfilled": true
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 34 }
|
||||||
|
}
|
||||||
|
|
||||||
|
]
|
||||||
|
}
|
||||||
@@ -0,0 +1,744 @@
|
|||||||
|
apiVersion: grafana.integreatly.org/v1beta1
|
||||||
|
kind: GrafanaDashboard
|
||||||
|
metadata:
|
||||||
|
name: okd-etcd
|
||||||
|
namespace: observability
|
||||||
|
spec:
|
||||||
|
instanceSelector:
|
||||||
|
matchLabels:
|
||||||
|
dashboards: "grafana"
|
||||||
|
json: |
|
||||||
|
{
|
||||||
|
"title": "etcd",
|
||||||
|
"uid": "okd-etcd",
|
||||||
|
"schemaVersion": 36,
|
||||||
|
"version": 1,
|
||||||
|
"refresh": "30s",
|
||||||
|
"time": { "from": "now-1h", "to": "now" },
|
||||||
|
"tags": ["okd", "etcd"],
|
||||||
|
"templating": {
|
||||||
|
"list": [
|
||||||
|
{
|
||||||
|
"name": "instance",
|
||||||
|
"type": "query",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"query": { "query": "label_values(etcd_server_has_leader, instance)", "refId": "A" },
|
||||||
|
"refresh": 2,
|
||||||
|
"includeAll": true,
|
||||||
|
"multi": true,
|
||||||
|
"allValue": ".*",
|
||||||
|
"label": "Instance",
|
||||||
|
"sort": 1,
|
||||||
|
"current": {},
|
||||||
|
"options": []
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"panels": [
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 1, "type": "stat", "title": "Cluster Members",
|
||||||
|
"description": "Total number of etcd members currently reporting metrics.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "count(etcd_server_has_leader)", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "red", "value": null },
|
||||||
|
{ "color": "yellow", "value": 1 },
|
||||||
|
{ "color": "green", "value": 3 }
|
||||||
|
]},
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 2, "type": "stat", "title": "Has Leader",
|
||||||
|
"description": "min() across all members. 0 = at least one member currently sees no leader — cluster is degraded.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "min(etcd_server_has_leader)", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "red", "value": null },
|
||||||
|
{ "color": "green", "value": 1 }
|
||||||
|
]},
|
||||||
|
"unit": "short", "noValue": "0",
|
||||||
|
"mappings": [
|
||||||
|
{ "type": "value", "options": {
|
||||||
|
"0": { "text": "NO LEADER", "color": "red" },
|
||||||
|
"1": { "text": "OK", "color": "green" }
|
||||||
|
}}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 3, "type": "stat", "title": "Leader Changes (1h)",
|
||||||
|
"description": "Leader changes observed in the last hour, summed across all members, so a single election is counted once per member that saw it. ≥3 indicates cluster instability.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "sum(changes(etcd_server_leader_changes_seen_total[1h]))", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 1 },
|
||||||
|
{ "color": "red", "value": 3 }
|
||||||
|
]},
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 4, "type": "stat", "title": "DB Size (Max)",
|
||||||
|
"description": "Largest boltdb file size across all members. Default etcd quota is 8 GiB.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "max(etcd_mvcc_db_total_size_in_bytes)", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 2147483648 },
|
||||||
|
{ "color": "orange", "value": 5368709120 },
|
||||||
|
{ "color": "red", "value": 7516192768 }
|
||||||
|
]},
|
||||||
|
"unit": "bytes", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 5, "type": "stat", "title": "DB Fragmentation (Max)",
|
||||||
|
"description": "% of DB space that is allocated but unused. >50% → run etcdctl defrag.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "max((etcd_mvcc_db_total_size_in_bytes - etcd_mvcc_db_total_size_in_use_in_bytes) / etcd_mvcc_db_total_size_in_bytes * 100)",
|
||||||
|
"refId": "A", "legendFormat": ""
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 25 },
|
||||||
|
{ "color": "orange", "value": 50 },
|
||||||
|
{ "color": "red", "value": 75 }
|
||||||
|
]},
|
||||||
|
"unit": "percent", "noValue": "0", "decimals": 1
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 6, "type": "stat", "title": "Failed Proposals/s",
|
||||||
|
"description": "Rate of rejected Raft proposals. Any sustained non-zero value = cluster health problem.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "sum(rate(etcd_server_proposals_failed_total[5m]))", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "red", "value": 0.001 }
|
||||||
|
]},
|
||||||
|
"unit": "short", "noValue": "0", "decimals": 3
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 7, "type": "stat", "title": "WAL Fsync p99",
|
||||||
|
"description": "99th percentile WAL flush-to-disk time. >10ms is concerning; >100ms = serious I/O bottleneck.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) by (le))",
|
||||||
|
"refId": "A", "legendFormat": ""
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 0.01 },
|
||||||
|
{ "color": "orange", "value": 0.1 },
|
||||||
|
{ "color": "red", "value": 0.5 }
|
||||||
|
]},
|
||||||
|
"unit": "s", "noValue": "0", "decimals": 4
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 8, "type": "stat", "title": "Backend Commit p99",
|
||||||
|
"description": "99th percentile boltdb commit time. >25ms = warning; >100ms = critical backend I/O pressure.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) by (le))",
|
||||||
|
"refId": "A", "legendFormat": ""
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 0.025 },
|
||||||
|
{ "color": "orange", "value": 0.1 },
|
||||||
|
{ "color": "red", "value": 0.25 }
|
||||||
|
]},
|
||||||
|
"unit": "s", "noValue": "0", "decimals": 4
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 9, "type": "row", "title": "Cluster Health", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 10, "type": "timeseries", "title": "Has Leader per Instance",
|
||||||
|
"description": "1 = member has a leader; 0 = member lost quorum. A dip to 0 marks the exact moment of a leader election.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "etcd_server_has_leader{instance=~\"$instance\"}",
|
||||||
|
"refId": "A", "legendFormat": "{{instance}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short", "min": 0, "max": 1.1,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 20, "showPoints": "never", "spanNulls": false },
|
||||||
|
"mappings": [
|
||||||
|
{ "type": "value", "options": {
|
||||||
|
"0": { "text": "0 — no leader" },
|
||||||
|
"1": { "text": "1 — ok" }
|
||||||
|
}}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "none" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": [] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 6, "w": 8, "x": 0, "y": 5 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 11, "type": "timeseries", "title": "Leader Changes (cumulative)",
|
||||||
|
"description": "Monotonically increasing counter per member. A step jump = one leader election. Correlated jumps across members = cluster-wide event.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "etcd_server_leader_changes_seen_total{instance=~\"$instance\"}",
|
||||||
|
"refId": "A", "legendFormat": "{{instance}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "auto", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "none" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 6, "w": 8, "x": 8, "y": 5 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 12, "type": "timeseries", "title": "Slow Operations",
|
||||||
|
"description": "slow_apply: proposals applied slower than expected. slow_read_index: linearizable reads timing out. heartbeat_failures: Raft heartbeat send errors (network partition indicator).",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "rate(etcd_server_slow_apply_total{instance=~\"$instance\"}[5m])", "refId": "A", "legendFormat": "Slow Apply — {{instance}}" },
|
||||||
|
{ "expr": "rate(etcd_server_slow_read_indexes_total{instance=~\"$instance\"}[5m])", "refId": "B", "legendFormat": "Slow Read Index — {{instance}}" },
|
||||||
|
{ "expr": "rate(etcd_server_heartbeat_send_failures_total{instance=~\"$instance\"}[5m])", "refId": "C", "legendFormat": "Heartbeat Failures — {{instance}}" }
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short", "min": 0, "decimals": 3,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 6, "w": 8, "x": 16, "y": 5 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 13, "type": "row", "title": "gRPC Traffic", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 11 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 14, "type": "timeseries", "title": "gRPC Request Rate by Method",
|
||||||
|
"description": "Unary calls/s per RPC method. High Put/Txn = heavy write load. High Range = heavy read load. Streaming RPCs such as Watch are not shown because the query selects unary calls only.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum by(grpc_method)(rate(grpc_server_started_total{job=~\".*etcd.*\",grpc_type=\"unary\"}[5m]))",
|
||||||
|
"refId": "A", "legendFormat": "{{grpc_method}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "reqps", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 12 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 15, "type": "timeseries", "title": "gRPC Error Rate by Status Code",
|
||||||
|
"description": "Non-OK responses by gRPC status code. RESOURCE_EXHAUSTED = overloaded. UNAVAILABLE = leader election. DEADLINE_EXCEEDED = latency spike.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum by(grpc_code)(rate(grpc_server_handled_total{job=~\".*etcd.*\",grpc_code!=\"OK\"}[5m]))",
|
||||||
|
"refId": "A", "legendFormat": "{{grpc_code}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "reqps", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 12 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 16, "type": "timeseries", "title": "gRPC Request Latency (p50 / p95 / p99)",
|
||||||
|
"description": "Unary call handling duration. p99 > 100ms for Put/Txn indicates disk or CPU pressure. p99 > 500ms will cause kube-apiserver timeouts.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "histogram_quantile(0.50, sum(rate(grpc_server_handling_seconds_bucket{job=~\".*etcd.*\",grpc_type=\"unary\"}[5m])) by (le))", "refId": "A", "legendFormat": "p50" },
|
||||||
|
{ "expr": "histogram_quantile(0.95, sum(rate(grpc_server_handling_seconds_bucket{job=~\".*etcd.*\",grpc_type=\"unary\"}[5m])) by (le))", "refId": "B", "legendFormat": "p95" },
|
||||||
|
{ "expr": "histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~\".*etcd.*\",grpc_type=\"unary\"}[5m])) by (le))", "refId": "C", "legendFormat": "p99" }
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "s", "min": 0, "decimals": 4,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||||
|
},
|
||||||
|
"overrides": [
|
||||||
|
{ "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 12 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 17, "type": "row", "title": "Raft Proposals", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 20 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 18, "type": "timeseries", "title": "Proposals Committed vs Applied",
|
||||||
|
"description": "Committed = agreed by Raft quorum. Applied = persisted to boltdb. A widening gap between the two = backend apply backlog (disk too slow to keep up).",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "rate(etcd_server_proposals_committed_total{instance=~\"$instance\"}[5m])", "refId": "A", "legendFormat": "Committed — {{instance}}" },
|
||||||
|
{ "expr": "rate(etcd_server_proposals_applied_total{instance=~\"$instance\"}[5m])", "refId": "B", "legendFormat": "Applied — {{instance}}" }
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 8, "x": 0, "y": 21 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 19, "type": "timeseries", "title": "Proposals Pending",
|
||||||
|
"description": "In-flight Raft proposals not yet committed. Consistently high (>5) = cluster cannot keep up with write throughput.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "etcd_server_proposals_pending{instance=~\"$instance\"}",
|
||||||
|
"refId": "A", "legendFormat": "{{instance}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": {
|
||||||
|
"lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false,
|
||||||
|
"thresholdsStyle": { "mode": "line+area" }
|
||||||
|
},
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 5 },
|
||||||
|
{ "color": "red", "value": 10 }
|
||||||
|
]}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 8, "x": 8, "y": 21 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 20, "type": "timeseries", "title": "Failed Proposals Rate",
|
||||||
|
"description": "Raft proposals that were rejected. Root causes: quorum loss, leader timeout, network partition between members.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "rate(etcd_server_proposals_failed_total{instance=~\"$instance\"}[5m])",
|
||||||
|
"refId": "A", "legendFormat": "{{instance}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short", "min": 0, "decimals": 3,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": {
|
||||||
|
"lineWidth": 2, "fillOpacity": 20, "showPoints": "never", "spanNulls": false,
|
||||||
|
"thresholdsStyle": { "mode": "line" }
|
||||||
|
},
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "red", "value": 0.001 }
|
||||||
|
]}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 8, "x": 16, "y": 21 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 21, "type": "row", "title": "Disk I/O", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 28 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 22, "type": "timeseries", "title": "WAL Fsync Duration (p50 / p95 / p99) per Instance",
|
||||||
|
"description": "Time to flush the write-ahead log to disk. etcd is extremely sensitive to WAL latency. >10ms p99 = storage is the bottleneck. Correlates directly with Raft commit latency.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "histogram_quantile(0.50, sum by(le,instance)(rate(etcd_disk_wal_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "A", "legendFormat": "p50 — {{instance}}" },
|
||||||
|
{ "expr": "histogram_quantile(0.95, sum by(le,instance)(rate(etcd_disk_wal_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "B", "legendFormat": "p95 — {{instance}}" },
|
||||||
|
{ "expr": "histogram_quantile(0.99, sum by(le,instance)(rate(etcd_disk_wal_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "C", "legendFormat": "p99 — {{instance}}" }
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "s", "min": 0, "decimals": 4,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 29 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 23, "type": "timeseries", "title": "Backend Commit Duration (p50 / p95 / p99) per Instance",
|
||||||
|
"description": "Time for boltdb to commit a batch transaction. A spike here while WAL is healthy = backend I/O saturation or boltdb lock contention. Triggers apply backlog.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "histogram_quantile(0.50, sum by(le,instance)(rate(etcd_disk_backend_commit_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "A", "legendFormat": "p50 — {{instance}}" },
|
||||||
|
{ "expr": "histogram_quantile(0.95, sum by(le,instance)(rate(etcd_disk_backend_commit_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "B", "legendFormat": "p95 — {{instance}}" },
|
||||||
|
{ "expr": "histogram_quantile(0.99, sum by(le,instance)(rate(etcd_disk_backend_commit_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "C", "legendFormat": "p99 — {{instance}}" }
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "s", "min": 0, "decimals": 4,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 29 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 24, "type": "row", "title": "Network (Peer & Client)", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 37 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 25, "type": "timeseries", "title": "Peer RX Rate",
|
||||||
|
"description": "Bytes received from Raft peers (log replication + heartbeats). A burst during a quiet period = large snapshot being streamed to a recovering member.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "rate(etcd_network_peer_received_bytes_total{instance=~\"$instance\"}[5m])",
|
||||||
|
"refId": "A", "legendFormat": "{{instance}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "Bps", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 6, "x": 0, "y": 38 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 26, "type": "timeseries", "title": "Peer TX Rate",
|
||||||
|
"description": "Bytes sent to Raft peers. Leader will have higher TX than followers (it replicates entries to all members).",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "rate(etcd_network_peer_sent_bytes_total{instance=~\"$instance\"}[5m])",
|
||||||
|
"refId": "A", "legendFormat": "{{instance}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "Bps", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 6, "x": 6, "y": 38 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 27, "type": "timeseries", "title": "Client gRPC Received",
|
||||||
|
"description": "Bytes received from API clients (kube-apiserver, operators). Spike = large write burst from controllers or kubectl apply.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "rate(etcd_network_client_grpc_received_bytes_total{instance=~\"$instance\"}[5m])",
|
||||||
|
"refId": "A", "legendFormat": "{{instance}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "Bps", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 6, "x": 12, "y": 38 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 28, "type": "timeseries", "title": "Client gRPC Sent",
|
||||||
|
"description": "Bytes sent to API clients (responses + watch events). Persistently high = many active Watch streams or large objects being served.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "rate(etcd_network_client_grpc_sent_bytes_total{instance=~\"$instance\"}[5m])",
|
||||||
|
"refId": "A", "legendFormat": "{{instance}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "Bps", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 6, "x": 18, "y": 38 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 29, "type": "row", "title": "DB Size & Process Resources", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 45 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 30, "type": "timeseries", "title": "DB Total vs In-Use Size per Instance",
|
||||||
|
"description": "Total = allocated boltdb file size. In Use = live key data. The gap between them = fragmentation. Steady growth of Total = compaction not keeping up with key churn.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "etcd_mvcc_db_total_size_in_bytes{instance=~\"$instance\"}", "refId": "A", "legendFormat": "Total — {{instance}}" },
|
||||||
|
{ "expr": "etcd_mvcc_db_total_size_in_use_in_bytes{instance=~\"$instance\"}", "refId": "B", "legendFormat": "In Use — {{instance}}" }
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "bytes", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 46 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 31, "type": "timeseries", "title": "Process Resident Memory (RSS)",
|
||||||
|
"description": "Physical RAM consumed by the etcd process. Monotonically growing RSS = memory leak or oversized watch cache. Typical healthy range: 500 MiB–2 GiB depending on cluster size.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "etcd_process_resident_memory_bytes{instance=~\"$instance\"}",
|
||||||
|
"refId": "A", "legendFormat": "{{instance}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "bytes", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 46 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 32, "type": "timeseries", "title": "Open File Descriptors vs Limit",
|
||||||
|
"description": "Open FD count (solid) and process FD limit (dashed). Approaching the limit will cause WAL file creation and new client connections to fail.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "etcd_process_open_fds{instance=~\"$instance\"}", "refId": "A", "legendFormat": "Open — {{instance}}" },
|
||||||
|
{ "expr": "etcd_process_max_fds{instance=~\"$instance\"}", "refId": "B", "legendFormat": "Limit — {{instance}}" }
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||||
|
},
|
||||||
|
"overrides": [
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byRegexp", "options": "^Limit.*" },
|
||||||
|
"properties": [
|
||||||
|
{ "id": "custom.lineWidth", "value": 1 },
|
||||||
|
{ "id": "custom.lineStyle", "value": { "fill": "dash", "dash": [6, 4] } },
|
||||||
|
{ "id": "custom.fillOpacity","value": 0 }
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 46 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 33, "type": "row", "title": "Snapshots", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 54 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 34, "type": "timeseries", "title": "Snapshot Save Duration (p50 / p95 / p99)",
|
||||||
|
"description": "Time to write a full snapshot of the boltdb to disk. Slow saves delay Raft log compaction, causing the WAL to grow unboundedly and members to fall further behind.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "histogram_quantile(0.50, sum by(le)(rate(etcd_debugging_snap_save_total_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "A", "legendFormat": "p50" },
|
||||||
|
{ "expr": "histogram_quantile(0.95, sum by(le)(rate(etcd_debugging_snap_save_total_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "B", "legendFormat": "p95" },
|
||||||
|
{ "expr": "histogram_quantile(0.99, sum by(le)(rate(etcd_debugging_snap_save_total_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "C", "legendFormat": "p99" }
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "s", "min": 0, "decimals": 3,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||||
|
},
|
||||||
|
"overrides": [
|
||||||
|
{ "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 12, "x": 0, "y": 55 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 35, "type": "timeseries", "title": "Snapshot DB Fsync Duration (p50 / p95 / p99)",
|
||||||
|
"description": "Time to fsync the snapshot file itself. Distinct from WAL fsync: this is flushing the entire boltdb copy to disk after a snapshot is taken.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "histogram_quantile(0.50, sum by(le)(rate(etcd_snap_db_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "A", "legendFormat": "p50" },
|
||||||
|
{ "expr": "histogram_quantile(0.95, sum by(le)(rate(etcd_snap_db_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "B", "legendFormat": "p95" },
|
||||||
|
{ "expr": "histogram_quantile(0.99, sum by(le)(rate(etcd_snap_db_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "C", "legendFormat": "p99" }
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "s", "min": 0, "decimals": 3,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||||
|
},
|
||||||
|
"overrides": [
|
||||||
|
{ "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 12, "x": 12, "y": 55 }
|
||||||
|
}
|
||||||
|
|
||||||
|
]
|
||||||
|
}
|
||||||
@@ -0,0 +1,752 @@
|
|||||||
|
apiVersion: grafana.integreatly.org/v1beta1
|
||||||
|
kind: GrafanaDashboard
|
||||||
|
metadata:
|
||||||
|
name: okd-control-plane-health
|
||||||
|
namespace: observability
|
||||||
|
spec:
|
||||||
|
instanceSelector:
|
||||||
|
matchLabels:
|
||||||
|
dashboards: "grafana"
|
||||||
|
json: |
|
||||||
|
{
|
||||||
|
"title": "Control Plane Health",
|
||||||
|
"uid": "okd-control-plane",
|
||||||
|
"schemaVersion": 36,
|
||||||
|
"version": 1,
|
||||||
|
"refresh": "30s",
|
||||||
|
"time": { "from": "now-1h", "to": "now" },
|
||||||
|
"tags": ["okd", "control-plane"],
|
||||||
|
"templating": {
|
||||||
|
"list": [
|
||||||
|
{
|
||||||
|
"name": "instance",
|
||||||
|
"type": "query",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"query": { "query": "label_values(apiserver_request_total, instance)", "refId": "A" },
|
||||||
|
"refresh": 2,
|
||||||
|
"includeAll": true,
|
||||||
|
"multi": true,
|
||||||
|
"allValue": ".*",
|
||||||
|
"label": "API Server Instance",
|
||||||
|
"sort": 1,
|
||||||
|
"current": {},
|
||||||
|
"options": []
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"panels": [
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 1, "type": "stat", "title": "API Servers Up",
|
||||||
|
"description": "Number of kube-apiserver instances currently scraped and up. Healthy HA cluster = 3.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "count(up{job=~\".*apiserver.*\"} == 1)", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "red", "value": null },
|
||||||
|
{ "color": "yellow", "value": 1 },
|
||||||
|
{ "color": "green", "value": 3 }
|
||||||
|
]},
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 2, "type": "stat", "title": "Controller Managers Up",
|
||||||
|
"description": "kube-controller-manager instances up. In OKD only one holds the leader lease at a time; others are hot standbys.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "count(up{job=~\".*controller-manager.*\"} == 1)", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "red", "value": null },
|
||||||
|
{ "color": "yellow", "value": 1 },
|
||||||
|
{ "color": "green", "value": 3 }
|
||||||
|
]},
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 3, "type": "stat", "title": "Schedulers Up",
|
||||||
|
"description": "kube-scheduler instances up. One holds the leader lease; rest are standbys. 0 = no scheduling of new pods.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "count(up{job=~\".*scheduler.*\"} == 1)", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "red", "value": null },
|
||||||
|
{ "color": "yellow", "value": 1 },
|
||||||
|
{ "color": "green", "value": 3 }
|
||||||
|
]},
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 4, "type": "stat", "title": "API 5xx Rate",
|
||||||
|
"description": "Server-side errors (5xx) across all apiserver instances per second. Any sustained non-zero value = apiserver internal fault.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "sum(rate(apiserver_request_total{code=~\"5..\"}[5m]))", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 0.01 },
|
||||||
|
{ "color": "red", "value": 1 }
|
||||||
|
]},
|
||||||
|
"unit": "reqps", "noValue": "0", "decimals": 3
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 5, "type": "stat", "title": "Inflight — Mutating",
|
||||||
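|
"description": "Current in-flight mutating requests (POST/PUT/PATCH/DELETE), summed across all apiserver instances. Default OKD limit is ~1000 per apiserver instance; because this value is a cluster-wide sum, it can exceed 1000 without any single instance being throttled. Hitting the limit on an instance = 429 errors for writes.",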
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "sum(apiserver_current_inflight_requests{request_kind=\"mutating\"})", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 500 },
|
||||||
|
{ "color": "orange", "value": 750 },
|
||||||
|
{ "color": "red", "value": 900 }
|
||||||
|
]},
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 6, "type": "stat", "title": "Inflight — Read-Only",
|
||||||
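|
"description": "Current in-flight non-mutating requests (GET/LIST/WATCH), summed across all apiserver instances. Default OKD limit is ~3000 per apiserver instance; because this value is a cluster-wide sum, it can exceed 3000 without any single instance being throttled. Hitting the limit on an instance = 429 for reads, impacting controllers and kubectl.",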
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "sum(apiserver_current_inflight_requests{request_kind=\"readOnly\"})", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 1500 },
|
||||||
|
{ "color": "orange", "value": 2200 },
|
||||||
|
{ "color": "red", "value": 2700 }
|
||||||
|
]},
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 7, "type": "stat", "title": "API Request p99 (non-WATCH)",
|
||||||
|
"description": "Overall p99 latency for all non-streaming verbs. >1s = noticeable kubectl sluggishness. >10s = controllers timing out on LIST/GET.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{verb!~\"WATCH|CONNECT\"}[5m])) by (le))",
|
||||||
|
"refId": "A", "legendFormat": ""
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 0.5 },
|
||||||
|
{ "color": "orange", "value": 1 },
|
||||||
|
{ "color": "red", "value": 5 }
|
||||||
|
]},
|
||||||
|
"unit": "s", "noValue": "0", "decimals": 3
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 8, "type": "stat", "title": "APIServer → etcd p99",
|
||||||
|
"description": "p99 time apiserver spends waiting on etcd calls. Spike here while WAL fsync is healthy = serialization or large object overhead.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "histogram_quantile(0.99, sum(rate(apiserver_storage_request_duration_seconds_bucket[5m])) by (le))",
|
||||||
|
"refId": "A", "legendFormat": ""
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 0.05 },
|
||||||
|
{ "color": "orange", "value": 0.2 },
|
||||||
|
{ "color": "red", "value": 0.5 }
|
||||||
|
]},
|
||||||
|
"unit": "s", "noValue": "0", "decimals": 4
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 9, "type": "row", "title": "API Server — Request Rates & Errors", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 10, "type": "timeseries", "title": "Request Rate by Verb",
|
||||||
|
"description": "Non-streaming calls per second broken down by verb. GET/LIST = read load from controllers. POST/PUT/PATCH/DELETE = write throughput. A sudden LIST spike = controller cache resync storm.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum by(verb)(rate(apiserver_request_total{instance=~\"$instance\",verb!~\"WATCH|CONNECT\"}[5m]))",
|
||||||
|
"refId": "A", "legendFormat": "{{verb}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "reqps", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 5 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 11, "type": "timeseries", "title": "Error Rate by HTTP Status Code",
|
||||||
|
"description": "4xx/5xx responses per second by code. 429 = inflight limit hit (throttling). 422 = admission rejection or invalid object. 500/503 = internal apiserver fault or etcd unavailability.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum by(code)(rate(apiserver_request_total{instance=~\"$instance\",code=~\"[45]..\"}[5m]))",
|
||||||
|
"refId": "A", "legendFormat": "HTTP {{code}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "reqps", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 5 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 12, "type": "timeseries", "title": "In-Flight Requests — Mutating vs Read-Only",
|
||||||
|
"description": "Instantaneous count of requests being actively handled. The two series correspond to the two inflight limit buckets enforced by the apiserver's Priority and Fairness (APF) or legacy inflight settings.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "sum by(request_kind)(apiserver_current_inflight_requests{instance=~\"$instance\"})", "refId": "A", "legendFormat": "{{request_kind}}" }
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 20, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 5 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 13, "type": "row", "title": "API Server — Latency", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 14, "type": "timeseries", "title": "Request Latency — p50 / p95 / p99 (non-WATCH)",
|
||||||
|
"description": "Aggregated end-to-end request duration across all verbs except WATCH/CONNECT (which are unbounded streaming). A rising p99 without a matching rise in etcd latency = CPU saturation, admission webhook slowness, or serialization overhead.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "histogram_quantile(0.50, sum(rate(apiserver_request_duration_seconds_bucket{instance=~\"$instance\",verb!~\"WATCH|CONNECT\"}[5m])) by (le))", "refId": "A", "legendFormat": "p50" },
|
||||||
|
{ "expr": "histogram_quantile(0.95, sum(rate(apiserver_request_duration_seconds_bucket{instance=~\"$instance\",verb!~\"WATCH|CONNECT\"}[5m])) by (le))", "refId": "B", "legendFormat": "p95" },
|
||||||
|
{ "expr": "histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{instance=~\"$instance\",verb!~\"WATCH|CONNECT\"}[5m])) by (le))", "refId": "C", "legendFormat": "p99" }
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "s", "min": 0, "decimals": 4,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||||
|
},
|
||||||
|
"overrides": [
|
||||||
|
{ "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 14 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 15, "type": "timeseries", "title": "Request p99 Latency by Verb",
|
||||||
|
"description": "p99 latency broken out per verb. LIST is inherently slower than GET due to serializing full collections. A POST/PUT spike = heavy admission webhook chain or large object writes. DELETE spikes are usually caused by cascading GC finalizer storms.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "histogram_quantile(0.99, sum by(verb,le)(rate(apiserver_request_duration_seconds_bucket{instance=~\"$instance\",verb!~\"WATCH|CONNECT\"}[5m])))",
|
||||||
|
"refId": "A", "legendFormat": "{{verb}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "s", "min": 0, "decimals": 4,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 14 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 16, "type": "timeseries", "title": "APIServer → etcd Latency by Operation",
|
||||||
|
"description": "Time apiserver spends waiting on etcd, split by operation type (get, list, create, update, delete, watch). Elevated get/list = etcd read pressure. Elevated create/update = write bottleneck, likely correlated with WAL fsync latency.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "histogram_quantile(0.50, sum by(operation,le)(rate(apiserver_storage_request_duration_seconds_bucket[5m])))", "refId": "A", "legendFormat": "p50 — {{operation}}" },
|
||||||
|
{ "expr": "histogram_quantile(0.99, sum by(operation,le)(rate(apiserver_storage_request_duration_seconds_bucket[5m])))", "refId": "B", "legendFormat": "p99 — {{operation}}" }
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "s", "min": 0, "decimals": 4,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 14 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 17, "type": "row", "title": "API Server — Watches & Long-Running Requests", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 22 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 18, "type": "timeseries", "title": "Active Long-Running Requests (Watches) by Resource",
|
||||||
|
"description": "Instantaneous count of open WATCH streams grouped by resource. Each controller typically holds one WATCH per resource type per apiserver instance. A sudden drop = controller restart; a runaway climb = operator creating watches without cleanup.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum by(resource)(apiserver_longrunning_requests{instance=~\"$instance\",verb=\"WATCH\"})",
|
||||||
|
"refId": "A", "legendFormat": "{{resource}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 8, "x": 0, "y": 23 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 19, "type": "timeseries", "title": "Watch Events Dispatched Rate by Kind",
|
||||||
|
"description": "Watch events sent to all active watchers per second, by object kind. Persistent high rate for a specific kind = that resource type is churning heavily, increasing etcd load and controller reconcile frequency.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum by(kind)(rate(apiserver_watch_events_total{instance=~\"$instance\"}[5m]))",
|
||||||
|
"refId": "A", "legendFormat": "{{kind}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 8, "x": 8, "y": 23 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 20, "type": "timeseries", "title": "Watch Event Size — p50 / p99 by Kind",
|
||||||
|
"description": "Size of individual watch events dispatched to clients. Large events (MiB-scale) for Secrets or ConfigMaps = objects being stored with oversized data. Contributes to apiserver memory pressure and network saturation.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "histogram_quantile(0.50, sum by(kind,le)(rate(apiserver_watch_events_sizes_bucket{instance=~\"$instance\"}[5m])))", "refId": "A", "legendFormat": "p50 — {{kind}}" },
|
||||||
|
{ "expr": "histogram_quantile(0.99, sum by(kind,le)(rate(apiserver_watch_events_sizes_bucket{instance=~\"$instance\"}[5m])))", "refId": "B", "legendFormat": "p99 — {{kind}}" }
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "bytes", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 8, "x": 16, "y": 23 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 21, "type": "row", "title": "Admission Webhooks", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 30 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 22, "type": "timeseries", "title": "Webhook Call Rate by Name",
|
||||||
|
"description": "Mutating and validating admission webhook invocations per second by webhook name. A webhook invoked on every write (e.g., a mutating webhook with no object selector) can be a major source of write latency amplification.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum by(name,type)(rate(apiserver_admission_webhook_request_total{instance=~\"$instance\"}[5m]))",
|
||||||
|
"refId": "A", "legendFormat": "{{type}} — {{name}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "reqps", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 8, "x": 0, "y": 31 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 23, "type": "timeseries", "title": "Webhook Latency p99 by Name",
|
||||||
|
"description": "p99 round-trip time per webhook call (network + webhook server processing). Default apiserver timeout is 10s; a webhook consistently near that limit causes cascading write latency for all resources it intercepts.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "histogram_quantile(0.99, sum by(name,le)(rate(apiserver_admission_webhook_admission_duration_seconds_bucket{instance=~\"$instance\"}[5m])))",
|
||||||
|
"refId": "A", "legendFormat": "{{name}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "s", "min": 0, "decimals": 4,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 0.5 },
|
||||||
|
{ "color": "red", "value": 2.0 }
|
||||||
|
]}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 8, "x": 8, "y": 31 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 24, "type": "timeseries", "title": "Webhook Rejection Rate by Name",
|
||||||
|
"description": "Rate of admission denials per webhook. A validating webhook rejecting requests is expected behaviour; a sudden surge indicates either a newly enforced policy or a misbehaving webhook rejecting valid objects.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum by(name,error_type)(rate(apiserver_admission_webhook_rejection_count{instance=~\"$instance\"}[5m]))",
|
||||||
|
"refId": "A", "legendFormat": "{{name}} ({{error_type}})"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "reqps", "min": 0, "decimals": 3,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 8, "x": 16, "y": 31 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 25, "type": "row", "title": "kube-controller-manager", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 38 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 26, "type": "timeseries", "title": "Work Queue Depth by Controller",
|
||||||
|
"description": "Items waiting to be reconciled in each controller's work queue. Persistent non-zero depth = controller cannot keep up with the event rate. Identifies which specific controller is the bottleneck during overload incidents.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "topk(15, sum by(name)(workqueue_depth{job=~\".*controller-manager.*\"}))",
|
||||||
|
"refId": "A", "legendFormat": "{{name}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": {
|
||||||
|
"lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false,
|
||||||
|
"thresholdsStyle": { "mode": "line" }
|
||||||
|
},
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 10 },
|
||||||
|
{ "color": "red", "value": 50 }
|
||||||
|
]}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 39 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 27, "type": "timeseries", "title": "Work Queue Item Processing Duration p99 by Controller",
|
||||||
|
"description": "p99 time a work item spends being actively reconciled (inside the reconcile loop, excludes queue wait time). A slow reconcile = either the controller is doing expensive API calls or the etcd write path is slow.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "histogram_quantile(0.99, sum by(name,le)(rate(workqueue_work_duration_seconds_bucket{job=~\".*controller-manager.*\"}[5m])))",
|
||||||
|
"refId": "A", "legendFormat": "{{name}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "s", "min": 0, "decimals": 4,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 39 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 28, "type": "timeseries", "title": "Work Queue Retry Rate by Controller",
|
||||||
|
"description": "Rate of items being re-queued after a failed reconciliation. A persistently high retry rate for a controller = it is encountering recurring errors on the same objects (e.g., API permission errors, webhook rejections, or resource conflicts).",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "topk(15, sum by(name)(rate(workqueue_retries_total{job=~\".*controller-manager.*\"}[5m])))",
|
||||||
|
"refId": "A", "legendFormat": "{{name}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short", "min": 0, "decimals": 3,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 39 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 29, "type": "row", "title": "kube-scheduler", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 47 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 30, "type": "timeseries", "title": "Scheduling Attempt Rate by Result",
|
||||||
|
"description": "Outcomes of scheduling cycles per second. scheduled = pod successfully bound to a node. unschedulable = no node met the pod's constraints. error = scheduler internal failure (API error, timeout). Persistent unschedulable = cluster capacity or taints/affinity misconfiguration.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum by(result)(rate(scheduler_schedule_attempts_total[5m]))",
|
||||||
|
"refId": "A", "legendFormat": "{{result}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||||
|
},
|
||||||
|
"overrides": [
|
||||||
|
{ "matcher": { "id": "byName", "options": "scheduled" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "unschedulable" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "error" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 8, "x": 0, "y": 48 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 31, "type": "timeseries", "title": "Scheduling Latency — p50 / p95 / p99",
|
||||||
|
"description": "Latency of a single scheduling attempt as recorded by scheduler_scheduling_attempt_duration_seconds, which covers the scheduling algorithm (filter, score, and reserve plugin execution) plus binding. It is measured per attempt, not as the pod's total time in the scheduling queue. Spike = expensive affinity rules, large number of nodes, slow extender webhooks, or slow bind calls.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "histogram_quantile(0.50, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket[5m])) by (le))", "refId": "A", "legendFormat": "p50" },
|
||||||
|
{ "expr": "histogram_quantile(0.95, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket[5m])) by (le))", "refId": "B", "legendFormat": "p95" },
|
||||||
|
{ "expr": "histogram_quantile(0.99, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket[5m])) by (le))", "refId": "C", "legendFormat": "p99" }
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "s", "min": 0, "decimals": 4,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||||
|
},
|
||||||
|
"overrides": [
|
||||||
|
{ "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 8, "x": 8, "y": 48 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 32, "type": "timeseries", "title": "Pending Pods by Queue",
|
||||||
|
"description": "Pods waiting to be scheduled, split by internal queue. active = ready to be attempted now. backoff = recently failed, in exponential back-off. unschedulable = parked until cluster state changes. A growing unschedulable queue = systemic capacity or constraint problem.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum by(queue)(scheduler_pending_pods)",
|
||||||
|
"refId": "A", "legendFormat": "{{queue}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": {
|
||||||
|
"lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false,
|
||||||
|
"thresholdsStyle": { "mode": "line" }
|
||||||
|
},
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 10 },
|
||||||
|
{ "color": "red", "value": 50 }
|
||||||
|
]}
|
||||||
|
},
|
||||||
|
"overrides": [
|
||||||
|
{ "matcher": { "id": "byName", "options": "unschedulable" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "backoff" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "active" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 8, "x": 16, "y": 48 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 33, "type": "row", "title": "Process Resources — All Control Plane Components", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 55 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 34, "type": "timeseries", "title": "CPU Usage by Component",
|
||||||
|
"description": "Rate of CPU seconds consumed by each control plane process. apiserver CPU spike = surge in request volume or list serialization. controller-manager CPU spike = reconcile storm. scheduler CPU spike = large node count with complex affinity.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "sum by(job)(rate(process_cpu_seconds_total{job=~\".*apiserver.*\"}[5m]))", "refId": "A", "legendFormat": "apiserver — {{job}}" },
|
||||||
|
{ "expr": "sum by(job)(rate(process_cpu_seconds_total{job=~\".*controller-manager.*\"}[5m]))", "refId": "B", "legendFormat": "controller-manager — {{job}}" },
|
||||||
|
{ "expr": "sum by(job)(rate(process_cpu_seconds_total{job=~\".*scheduler.*\"}[5m]))", "refId": "C", "legendFormat": "scheduler — {{job}}" }
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percentunit", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 8, "x": 0, "y": 56 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 35, "type": "timeseries", "title": "RSS Memory by Component",
|
||||||
|
"description": "Resident set size of each control plane process. apiserver memory is dominated by the watch cache size and serialisation buffers. controller-manager memory = informer caches. Monotonically growing RSS without restarts = memory leak or unbounded cache growth.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "sum by(job)(process_resident_memory_bytes{job=~\".*apiserver.*\"})", "refId": "A", "legendFormat": "apiserver — {{job}}" },
|
||||||
|
{ "expr": "sum by(job)(process_resident_memory_bytes{job=~\".*controller-manager.*\"})", "refId": "B", "legendFormat": "controller-manager — {{job}}" },
|
||||||
|
{ "expr": "sum by(job)(process_resident_memory_bytes{job=~\".*scheduler.*\"})", "refId": "C", "legendFormat": "scheduler — {{job}}" }
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "bytes", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 8, "x": 8, "y": 56 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 36, "type": "timeseries", "title": "Goroutines by Component",
|
||||||
|
"description": "Number of live goroutines in each control plane process. Gradual upward drift = goroutine leak (often tied to unclosed watch streams or context leaks). A step-down = process restart. apiserver typically runs 200–600 goroutines; spikes above 1000 warrant investigation.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "sum by(job)(go_goroutines{job=~\".*apiserver.*\"})", "refId": "A", "legendFormat": "apiserver — {{job}}" },
|
||||||
|
{ "expr": "sum by(job)(go_goroutines{job=~\".*controller-manager.*\"})", "refId": "B", "legendFormat": "controller-manager — {{job}}" },
|
||||||
|
{ "expr": "sum by(job)(go_goroutines{job=~\".*scheduler.*\"})", "refId": "C", "legendFormat": "scheduler — {{job}}" }
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 8, "x": 16, "y": 56 }
|
||||||
|
}
|
||||||
|
|
||||||
|
]
|
||||||
|
}
|
||||||
@@ -0,0 +1,741 @@
|
|||||||
|
apiVersion: grafana.integreatly.org/v1beta1
|
||||||
|
kind: GrafanaDashboard
|
||||||
|
metadata:
|
||||||
|
name: okd-alerts-events
|
||||||
|
namespace: observability
|
||||||
|
spec:
|
||||||
|
instanceSelector:
|
||||||
|
matchLabels:
|
||||||
|
dashboards: "grafana"
|
||||||
|
json: |
|
||||||
|
{
|
||||||
|
"title": "Alerts & Events — Active Problems",
|
||||||
|
"uid": "okd-alerts-events",
|
||||||
|
"schemaVersion": 36,
|
||||||
|
"version": 1,
|
||||||
|
"refresh": "30s",
|
||||||
|
"time": { "from": "now-3h", "to": "now" },
|
||||||
|
"tags": ["okd", "alerts", "events"],
|
||||||
|
"templating": {
|
||||||
|
"list": [
|
||||||
|
{
|
||||||
|
"name": "severity",
|
||||||
|
"type": "custom",
|
||||||
|
"label": "Severity Filter",
|
||||||
|
"query": "critical,warning,info",
|
||||||
|
"current": { "selected": true, "text": "All", "value": "$__all" },
|
||||||
|
"includeAll": true,
|
||||||
|
"allValue": "critical|warning|info",
|
||||||
|
"multi": false,
|
||||||
|
"options": [
|
||||||
|
{ "selected": true, "text": "All", "value": "$__all" },
|
||||||
|
{ "selected": false, "text": "Critical", "value": "critical" },
|
||||||
|
{ "selected": false, "text": "Warning", "value": "warning" },
|
||||||
|
{ "selected": false, "text": "Info", "value": "info" }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "namespace",
|
||||||
|
"type": "query",
|
||||||
|
"label": "Namespace",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"query": { "query": "label_values(ALERTS{alertstate=\"firing\"}, namespace)", "refId": "A" },
|
||||||
|
"refresh": 2,
|
||||||
|
"includeAll": true,
|
||||||
|
"allValue": ".*",
|
||||||
|
"multi": true,
|
||||||
|
"sort": 1,
|
||||||
|
"current": {},
|
||||||
|
"options": []
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"panels": [
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 1, "type": "stat", "title": "Critical Alerts Firing",
|
||||||
|
"description": "Alerting rule instances currently in the firing state with severity=\"critical\". Any non-zero value represents a breached SLO or infrastructure condition requiring immediate on-call response. The ALERTS metric is generated by Prometheus directly from your alerting rules — it reflects what Prometheus knows, before Alertmanager routing or silencing.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "count(ALERTS{alertstate=\"firing\",severity=\"critical\"}) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "red", "value": 1 }
|
||||||
|
]},
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 2, "type": "stat", "title": "Warning Alerts Firing",
|
||||||
|
"description": "Firing alerts at severity=\"warning\". Warnings indicate a degraded or elevated-risk condition that has not yet crossed the critical threshold. A sustained or growing warning count often precedes a critical fire — treat them as early-warning signals, not background noise.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "count(ALERTS{alertstate=\"firing\",severity=\"warning\"}) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 1 },
|
||||||
|
{ "color": "orange", "value": 5 }
|
||||||
|
]},
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 3, "type": "stat", "title": "Info / Unclassified Alerts Firing",
|
||||||
|
"description": "Firing alerts whose severity is anything other than \"critical\" or \"warning\": severity=\"info\", any custom severity value (for example \"none\"), or no severity label at all. These are informational and do not normally require immediate action. A sudden large jump may reveal noisy alerting rules generating alert fatigue — rules worth reviewing for threshold tuning or adding inhibition rules.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "count(ALERTS{alertstate=\"firing\",severity!~\"critical|warning\"}) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "blue", "value": 1 },
|
||||||
|
{ "color": "blue", "value": 25 }
|
||||||
|
]},
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 4, "type": "stat", "title": "Alerts Silenced (Suppressed)",
|
||||||
|
"description": "Alerts that Alertmanager currently holds in the suppressed state — matched by an active silence or muted by an inhibition rule — and therefore not routed to receivers. Silences are intentional during maintenance windows, but a large suppressed count outside of planned maintenance = an overly broad silence or inhibition rule masking real problems. Zero suppressed alerts while a maintenance window is active (and alerts are firing) = the silence has expired or was misconfigured.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "sum(alertmanager_alerts{state=\"suppressed\"}) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 1 },
|
||||||
|
{ "color": "red", "value": 20 }
|
||||||
|
]},
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 5, "type": "stat", "title": "CrashLoopBackOff Pods",
|
||||||
|
"description": "Container instances currently waiting in the CrashLoopBackOff state — the container crashed and Kubernetes is retrying with exponential back-off. Each instance is a pod that cannot stay running. Common root causes: OOM kill, bad entrypoint, missing Secret or ConfigMap, an unavailable init dependency, or a broken image layer.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "count(kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 1 },
|
||||||
|
{ "color": "red", "value": 3 }
|
||||||
|
]},
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 6, "type": "stat", "title": "OOMKilled Containers",
|
||||||
|
"description": "Containers whose most recent termination reason was OOMKilled. This is a current-state snapshot: a container that was OOMKilled, restarted, and is now Running will still appear here until its next termination occurs for a different reason. Non-zero and stable = recurring OOM, likely a workload memory leak or under-provisioned memory limit.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "count(kube_pod_container_status_last_terminated_reason{reason=\"OOMKilled\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "orange", "value": 1 },
|
||||||
|
{ "color": "red", "value": 5 }
|
||||||
|
]},
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 7, "type": "stat", "title": "NotReady Nodes",
|
||||||
|
"description": "Nodes where the Ready condition is currently not True (False or Unknown). A NotReady node stops receiving new pod scheduling and, after the node eviction timeout (~5 min default), pods on it will be evicted. Control plane nodes going NotReady simultaneously = potential quorum loss. Any non-zero value is a tier-1 incident signal.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"true\"} == 0) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "red", "value": 1 }
|
||||||
|
]},
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 8, "type": "stat", "title": "Degraded Cluster Operators (OKD)",
|
||||||
|
"description": "OKD ClusterOperators currently reporting Degraded=True. Each ClusterOperator owns a core platform component — authentication, networking, image-registry, monitoring, ingress, storage, etc. A degraded operator means its managed component is impaired or unavailable. Zero is the only acceptable steady-state value outside of an active upgrade.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "count(cluster_operator_conditions{condition=\"Degraded\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "red", "value": 1 }
|
||||||
|
]},
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 9, "type": "row", "title": "Alert Overview", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 10, "type": "timeseries", "title": "Firing Alert Count by Severity Over Time",
|
||||||
|
"description": "Instantaneous count of firing ALERTS series grouped by severity over the selected window. A vertical rise = new alerting condition emerged. A horizontal plateau = a persistent, unresolved problem. A step-down = alert resolved or Prometheus rule evaluation stopped matching. Use the Severity Filter variable to narrow scope during triage.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "count by(severity)(ALERTS{alertstate=\"firing\",severity=~\"$severity\",namespace=~\"$namespace\"})",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{severity}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||||
|
},
|
||||||
|
"overrides": [
|
||||||
|
{ "matcher": { "id": "byName", "options": "critical" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "warning" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "info" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["max", "lastNotNull"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 5 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 11, "type": "timeseries", "title": "Alertmanager Notification Rate by Integration",
|
||||||
|
"description": "Rate of notification delivery attempts from Alertmanager per second, split by integration type (slack, pagerduty, email, webhook, etc.). Solid lines (✓) = all delivery attempts, because alertmanager_notifications_total counts failed attempts as well as successful ones; dashed red lines (✗) = the failed subset. Successful deliveries = the gap between the two lines for the same integration. A drop to zero on all integrations = Alertmanager is not processing or the cluster is completely quiet. Persistent failures on one integration = check that receiver's credentials or endpoint availability.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "sum by(integration)(rate(alertmanager_notifications_total[5m]))", "refId": "A", "legendFormat": "✓ {{integration}}" },
|
||||||
|
{ "expr": "sum by(integration)(rate(alertmanager_notifications_failed_total[5m]))", "refId": "B", "legendFormat": "✗ {{integration}}" }
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "reqps", "min": 0, "decimals": 3,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||||
|
},
|
||||||
|
"overrides": [
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byFrameRefID", "options": "B" },
|
||||||
|
"properties": [
|
||||||
|
{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } },
|
||||||
|
{ "id": "custom.lineStyle", "value": { "dash": [6, 4], "fill": "dash" } },
|
||||||
|
{ "id": "custom.lineWidth", "value": 1 }
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 5 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 12, "type": "bargauge", "title": "Longest-Firing Active Alerts",
|
||||||
|
"description": "Duration (now - ALERTS_FOR_STATE timestamp) for each currently firing alert, sorted descending. Alerts at the top have been firing longest and are the most likely candidates for known-but-unresolved issues, stale firing conditions, or alerts that should have a silence applied. Red bars (> 2 hours) strongly suggest a problem that has been acknowledged but not resolved.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sort_desc(time() - ALERTS_FOR_STATE{alertstate=\"firing\",severity=~\"$severity\",namespace=~\"$namespace\"})",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{alertname}} · {{severity}} · {{namespace}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "s", "min": 0,
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 300 },
|
||||||
|
{ "color": "orange", "value": 1800 },
|
||||||
|
{ "color": "red", "value": 7200 }
|
||||||
|
]}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"orientation": "horizontal",
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||||
|
"displayMode": "gradient",
|
||||||
|
"showUnfilled": true,
|
||||||
|
"valueMode": "color"
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 5 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 13, "type": "row", "title": "Active Firing Alerts — Full Detail", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 14, "type": "table", "title": "All Firing Alerts",
|
||||||
|
"description": "Instant-query table of every currently firing alert visible to Prometheus, filtered by the Namespace and Severity variables above. Each row is one alert instance (unique label combination). The value column is omitted — by definition every row here is firing. Use the built-in column filter (funnel icon) to further narrow to a specific alertname, pod, or node. Columns are sparse: labels not defined in a given alert rule will show '—'.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "ALERTS{alertstate=\"firing\",severity=~\"$severity\",namespace=~\"$namespace\"}",
|
||||||
|
"refId": "A",
|
||||||
|
"instant": true,
|
||||||
|
"legendFormat": ""
|
||||||
|
}],
|
||||||
|
"transformations": [
|
||||||
|
{ "id": "labelsToFields", "options": { "mode": "columns" } },
|
||||||
|
{
|
||||||
|
"id": "organize",
|
||||||
|
"options": {
|
||||||
|
"excludeByName": {
|
||||||
|
"alertstate": true,
|
||||||
|
"__name__": true,
|
||||||
|
"Value": true,
|
||||||
|
"Time": true
|
||||||
|
},
|
||||||
|
"renameByName": {
|
||||||
|
"alertname": "Alert Name",
|
||||||
|
"severity": "Severity",
|
||||||
|
"namespace": "Namespace",
|
||||||
|
"pod": "Pod",
|
||||||
|
"node": "Node",
|
||||||
|
"container": "Container",
|
||||||
|
"job": "Job",
|
||||||
|
"service": "Service",
|
||||||
|
"reason": "Reason",
|
||||||
|
"instance": "Instance"
|
||||||
|
},
|
||||||
|
"indexByName": {
|
||||||
|
"severity": 0,
|
||||||
|
"alertname": 1,
|
||||||
|
"namespace": 2,
|
||||||
|
"pod": 3,
|
||||||
|
"node": 4,
|
||||||
|
"container": 5,
|
||||||
|
"job": 6,
|
||||||
|
"service": 7,
|
||||||
|
"reason": 8,
|
||||||
|
"instance": 9
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"custom": { "align": "left", "filterable": true },
|
||||||
|
"noValue": "—"
|
||||||
|
},
|
||||||
|
"overrides": [
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Severity" },
|
||||||
|
"properties": [
|
||||||
|
{ "id": "custom.displayMode", "value": "color-background" },
|
||||||
|
{ "id": "custom.width", "value": 110 },
|
||||||
|
{
|
||||||
|
"id": "mappings",
|
||||||
|
"value": [{
|
||||||
|
"type": "value",
|
||||||
|
"options": {
|
||||||
|
"critical": { "text": "CRITICAL", "color": "dark-red", "index": 0 },
|
||||||
|
"warning": { "text": "WARNING", "color": "dark-yellow", "index": 1 },
|
||||||
|
"info": { "text": "INFO", "color": "dark-blue", "index": 2 }
|
||||||
|
}
|
||||||
|
}]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{ "matcher": { "id": "byName", "options": "Alert Name" }, "properties": [{ "id": "custom.width", "value": 300 }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "Namespace" }, "properties": [{ "id": "custom.width", "value": 180 }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "Pod" }, "properties": [{ "id": "custom.width", "value": 200 }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "Node" }, "properties": [{ "id": "custom.width", "value": 200 }] }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"sortBy": [{ "desc": false, "displayName": "Severity" }],
|
||||||
|
"footer": { "show": false }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 12, "w": 24, "x": 0, "y": 14 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 15, "type": "row", "title": "Kubernetes Warning Events", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 26 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 16, "type": "timeseries", "title": "Warning Event Rate by Reason",
|
||||||
|
"description": "Rate of Kubernetes Warning-type events per second grouped by reason code. BackOff = container is CrashLooping. FailedScheduling = no node satisfies pod constraints. FailedMount = volume attachment or CSI failure. Evicted = kubelet evicted a pod due to memory or disk pressure. NodeNotReady = node lost contact. A spike in a single reason narrows the incident root-cause immediately without needing to read raw event logs. Requires kube-state-metrics with --resources=events.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "topk(10, sum by(reason)(rate(kube_event_count{type=\"Warning\",namespace=~\"$namespace\"}[5m])))",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{reason}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "reqps", "min": 0, "decimals": 4,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 27 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 17, "type": "bargauge", "title": "Warning Events — Top Namespaces (Accumulated Count)",
|
||||||
|
"description": "Total accumulated Warning event count (the count field on the Kubernetes Event object) per namespace, showing the top 15 most active. A namespace dominating this chart is generating significantly more abnormal conditions than its peers, useful for identifying noisy tenants, misconfigured deployments, or namespaces experiencing a persistent infrastructure problem. Note this is the raw Event.count field — it resets if the event object is deleted and recreated.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "topk(15, sum by(namespace)(kube_event_count{type=\"Warning\"}))",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{namespace}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short", "min": 0,
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 10 },
|
||||||
|
{ "color": "orange", "value": 50 },
|
||||||
|
{ "color": "red", "value": 200 }
|
||||||
|
]}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"orientation": "horizontal",
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||||
|
"displayMode": "gradient",
|
||||||
|
"showUnfilled": true
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 27 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 18, "type": "timeseries", "title": "Warning Events — Accumulated Count by Reason Over Time",
|
||||||
|
"description": "Raw accumulated event count gauge over time, split by reason. Unlike the rate panel this shows total volume and slope simultaneously. A line that climbs steeply = events are occurring frequently right now. A line that plateaus = the condition causing that reason has stopped. A line that drops to zero = the event object was deleted and recreated or the condition fully resolved.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "topk(10, sum by(reason)(kube_event_count{type=\"Warning\",namespace=~\"$namespace\"}))",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{reason}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 8, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 27 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 19, "type": "row", "title": "Pod Problems", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 35 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 20, "type": "timeseries", "title": "CrashLoopBackOff Pods by Namespace",
|
||||||
|
"description": "Count of container instances in CrashLoopBackOff waiting state over time, broken down by namespace. A sudden rise in one namespace = a workload deployment is failing. A persistent baseline across many namespaces = a shared dependency (Secret, ConfigMap, network policy, or an upstream service) has become unavailable. Unlike restart rate, this panel shows the steady-state count of pods currently stuck — not flapping.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum by(namespace)(kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\",namespace=~\"$namespace\"} == 1)",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{namespace}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": {
|
||||||
|
"lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false,
|
||||||
|
"thresholdsStyle": { "mode": "line" }
|
||||||
|
},
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 1 },
|
||||||
|
{ "color": "red", "value": 5 }
|
||||||
|
]}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 8, "x": 0, "y": 36 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 21, "type": "timeseries", "title": "Container Restart Rate by Namespace",
|
||||||
|
"description": "Rate of container restarts per second across all reasons (OOMKill, liveness probe failure, process exit) grouped by namespace. A namespace with a rising restart rate that has not yet entered CrashLoopBackOff is in the early failure window before the exponential back-off penalty kicks in. Cross-reference with the OOMKilled stat tile and the last-terminated-reason to separate crash types.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "topk(10, sum by(namespace)(rate(kube_pod_container_status_restarts_total{namespace=~\"$namespace\"}[5m])))",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{namespace}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short", "min": 0, "decimals": 4,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 8, "x": 8, "y": 36 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 22, "type": "timeseries", "title": "Pods by Problem Phase (Failed / Pending / Unknown)",
|
||||||
|
"description": "Count of pods in Failed, Pending, or Unknown phase over time. Failed = container terminated with a non-zero exit code or was evicted and not rescheduled. Pending for more than a few minutes = scheduler unable to bind the pod (check FailedScheduling events, node capacity, and taint/toleration mismatches). Unknown = kubelet is not reporting to the apiserver, typically indicating a node network partition or kubelet crash.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "sum by(phase)(kube_pod_status_phase{phase=~\"Failed|Unknown\",namespace=~\"$namespace\"} == 1)", "refId": "A", "legendFormat": "{{phase}}" },
|
||||||
|
{ "expr": "sum(kube_pod_status_phase{phase=\"Pending\",namespace=~\"$namespace\"} == 1)", "refId": "B", "legendFormat": "Pending" }
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": {
|
||||||
|
"lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false,
|
||||||
|
"thresholdsStyle": { "mode": "line" }
|
||||||
|
},
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "red", "value": 1 }
|
||||||
|
]}
|
||||||
|
},
|
||||||
|
"overrides": [
|
||||||
|
{ "matcher": { "id": "byName", "options": "Failed" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "Pending" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "Unknown" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 8, "x": 16, "y": 36 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 23, "type": "row", "title": "Node & Cluster Operator Conditions", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 43 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 24, "type": "table", "title": "Node Condition Status Matrix",
|
||||||
|
"description": "Instant snapshot of every active node condition across all nodes. Each row is one (node, condition, status) triple where value=1, meaning that combination is currently true. Ready=true is the normal healthy state; MemoryPressure=true, DiskPressure=true, PIDPressure=true, and NetworkUnavailable=true all indicate problem states that will affect pod scheduling on that node. Use the column filter to show only conditions where status=\"true\" and condition != \"Ready\" to isolate problems quickly.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "kube_node_status_condition == 1",
|
||||||
|
"refId": "A",
|
||||||
|
"instant": true,
|
||||||
|
"legendFormat": ""
|
||||||
|
}],
|
||||||
|
"transformations": [
|
||||||
|
{ "id": "labelsToFields", "options": { "mode": "columns" } },
|
||||||
|
{
|
||||||
|
"id": "organize",
|
||||||
|
"options": {
|
||||||
|
"excludeByName": {
|
||||||
|
"Time": true,
|
||||||
|
"Value": true,
|
||||||
|
"__name__": true,
|
||||||
|
"endpoint": true,
|
||||||
|
"job": true,
|
||||||
|
"service": true,
|
||||||
|
"instance": true
|
||||||
|
},
|
||||||
|
"renameByName": {
|
||||||
|
"node": "Node",
|
||||||
|
"condition": "Condition",
|
||||||
|
"status": "Status"
|
||||||
|
},
|
||||||
|
"indexByName": { "node": 0, "condition": 1, "status": 2 }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"custom": { "align": "left", "filterable": true },
|
||||||
|
"noValue": "—"
|
||||||
|
},
|
||||||
|
"overrides": [
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Status" },
|
||||||
|
"properties": [
|
||||||
|
{ "id": "custom.displayMode", "value": "color-background" },
|
||||||
|
{ "id": "custom.width", "value": 90 },
|
||||||
|
{
|
||||||
|
"id": "mappings",
|
||||||
|
"value": [{
|
||||||
|
"type": "value",
|
||||||
|
"options": {
|
||||||
|
"true": { "text": "true", "color": "green", "index": 0 },
|
||||||
|
"false": { "text": "false", "color": "dark-red", "index": 1 },
|
||||||
|
"unknown": { "text": "unknown", "color": "dark-orange", "index": 2 }
|
||||||
|
}
|
||||||
|
}]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Condition" },
|
||||||
|
"properties": [
|
||||||
|
{ "id": "custom.width", "value": 190 },
|
||||||
|
{ "id": "custom.displayMode", "value": "color-text" },
|
||||||
|
{
|
||||||
|
"id": "mappings",
|
||||||
|
"value": [{
|
||||||
|
"type": "value",
|
||||||
|
"options": {
|
||||||
|
"Ready": { "color": "green", "index": 0 },
|
||||||
|
"MemoryPressure": { "color": "red", "index": 1 },
|
||||||
|
"DiskPressure": { "color": "red", "index": 2 },
|
||||||
|
"PIDPressure": { "color": "red", "index": 3 },
|
||||||
|
"NetworkUnavailable": { "color": "red", "index": 4 }
|
||||||
|
}
|
||||||
|
}]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{ "matcher": { "id": "byName", "options": "Node" }, "properties": [{ "id": "custom.width", "value": 230 }] }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"sortBy": [{ "desc": false, "displayName": "Node" }],
|
||||||
|
"footer": { "show": false }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 44 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 25, "type": "table", "title": "Cluster Operator Conditions — Degraded & Progressing (OKD)",
|
||||||
|
"description": "Shows only ClusterOperator conditions that indicate a problem state: Degraded=True (operator has failed to achieve its desired state) or Progressing=True (operator is actively reconciling — normal during upgrades but alarming in steady state). Operators not appearing in this table are healthy. The reason column gives the operator's own explanation for the condition, which maps directly to the relevant operator log stream and OpenShift runbook.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "cluster_operator_conditions{condition=\"Degraded\"} == 1",
|
||||||
|
"refId": "A",
|
||||||
|
"instant": true,
|
||||||
|
"legendFormat": ""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "cluster_operator_conditions{condition=\"Progressing\"} == 1",
|
||||||
|
"refId": "B",
|
||||||
|
"instant": true,
|
||||||
|
"legendFormat": ""
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"transformations": [
|
||||||
|
{ "id": "labelsToFields", "options": { "mode": "columns" } },
|
||||||
|
{
|
||||||
|
"id": "organize",
|
||||||
|
"options": {
|
||||||
|
"excludeByName": {
|
||||||
|
"Time": true,
|
||||||
|
"Value": true,
|
||||||
|
"__name__": true,
|
||||||
|
"endpoint": true,
|
||||||
|
"job": true,
|
||||||
|
"service": true,
|
||||||
|
"instance": true,
|
||||||
|
"namespace": true
|
||||||
|
},
|
||||||
|
"renameByName": {
|
||||||
|
"name": "Operator",
|
||||||
|
"condition": "Condition",
|
||||||
|
"reason": "Reason"
|
||||||
|
},
|
||||||
|
"indexByName": { "name": 0, "condition": 1, "reason": 2 }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"custom": { "align": "left", "filterable": true },
|
||||||
|
"noValue": "—"
|
||||||
|
},
|
||||||
|
"overrides": [
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Condition" },
|
||||||
|
"properties": [
|
||||||
|
{ "id": "custom.displayMode", "value": "color-background" },
|
||||||
|
{ "id": "custom.width", "value": 140 },
|
||||||
|
{
|
||||||
|
"id": "mappings",
|
||||||
|
"value": [{
|
||||||
|
"type": "value",
|
||||||
|
"options": {
|
||||||
|
"Degraded": { "text": "Degraded", "color": "dark-red", "index": 0 },
|
||||||
|
"Progressing": { "text": "Progressing", "color": "dark-yellow", "index": 1 }
|
||||||
|
}
|
||||||
|
}]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{ "matcher": { "id": "byName", "options": "Operator" }, "properties": [{ "id": "custom.width", "value": 240 }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "Reason" }, "properties": [{ "id": "custom.width", "value": 220 }] }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"sortBy": [{ "desc": false, "displayName": "Condition" }],
|
||||||
|
"footer": { "show": false }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 44 }
|
||||||
|
}
|
||||||
|
|
||||||
|
]
|
||||||
|
}
|
||||||
@@ -0,0 +1,49 @@
|
|||||||
|
# These are probably already created by the rook-ceph operator; this still needs to be validated.
|
||||||
|
# In fact, this is certain for the second one (rook-ceph-exporter).
|
||||||
|
# I overwrote the first one (rook-ceph-mgr) with what is here; it was probably already working.
|
||||||
|
# All that was missing was a label on the rook-ceph namespace to tell Prometheus to look for monitors in this namespace.
|
||||||
|
apiVersion: monitoring.coreos.com/v1
|
||||||
|
kind: ServiceMonitor
|
||||||
|
metadata:
|
||||||
|
name: rook-ceph-mgr
|
||||||
|
namespace: rook-ceph
|
||||||
|
labels:
|
||||||
|
# This specific label is what tells OKD's Prometheus to pick this up
|
||||||
|
openshift.io/cluster-monitoring: "true"
|
||||||
|
spec:
|
||||||
|
namespaceSelector:
|
||||||
|
matchNames:
|
||||||
|
- rook-ceph
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
# This matches your 'rook-ceph-mgr' service
|
||||||
|
app: rook-ceph-mgr
|
||||||
|
endpoints:
|
||||||
|
- port: ""
|
||||||
|
# The port in your service has no name (it is only an integer), so we use targetPort
|
||||||
|
targetPort: 9283
|
||||||
|
path: /metrics
|
||||||
|
interval: 30s
|
||||||
|
---
|
||||||
|
apiVersion: monitoring.coreos.com/v1
|
||||||
|
kind: ServiceMonitor
|
||||||
|
metadata:
|
||||||
|
name: rook-ceph-exporter
|
||||||
|
namespace: rook-ceph
|
||||||
|
labels:
|
||||||
|
# This label is required for OKD cluster-wide monitoring to pick it up
|
||||||
|
openshift.io/cluster-monitoring: "true"
|
||||||
|
team: rook
|
||||||
|
spec:
|
||||||
|
endpoints:
|
||||||
|
- honorLabels: true
|
||||||
|
interval: 10s
|
||||||
|
path: /metrics
|
||||||
|
port: ceph-exporter-http-metrics
|
||||||
|
namespaceSelector:
|
||||||
|
matchNames:
|
||||||
|
- rook-ceph
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
app: rook-ceph-exporter
|
||||||
|
rook_cluster: rook-ceph
|
||||||
@@ -0,0 +1,23 @@
|
|||||||
|
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
kind: Role
|
||||||
|
metadata:
|
||||||
|
name: rook-ceph-metrics-viewer
|
||||||
|
namespace: rook-ceph
|
||||||
|
rules:
|
||||||
|
- apiGroups: [""]
|
||||||
|
resources: ["services", "endpoints", "pods"]
|
||||||
|
verbs: ["get", "list", "watch"]
|
||||||
|
---
|
||||||
|
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
kind: RoleBinding
|
||||||
|
metadata:
|
||||||
|
name: rook-ceph-metrics-viewer
|
||||||
|
namespace: rook-ceph
|
||||||
|
roleRef:
|
||||||
|
apiGroup: rbac.authorization.k8s.io
|
||||||
|
kind: Role
|
||||||
|
name: rook-ceph-metrics-viewer
|
||||||
|
subjects:
|
||||||
|
- kind: ServiceAccount
|
||||||
|
name: prometheus-k8s
|
||||||
|
namespace: openshift-monitoring
|
||||||
@@ -0,0 +1,7 @@
|
|||||||
|
apiVersion: v1
|
||||||
|
kind: Namespace
|
||||||
|
metadata:
|
||||||
|
name: rook-ceph
|
||||||
|
labels:
|
||||||
|
# This is the critical label that allows OKD Prometheus to see the namespace
|
||||||
|
openshift.io/cluster-monitoring: "true"
|
||||||
@@ -0,0 +1,731 @@
|
|||||||
|
{
|
||||||
|
"title": "Alerts & Events — Active Problems",
|
||||||
|
"uid": "okd-alerts-events",
|
||||||
|
"schemaVersion": 36,
|
||||||
|
"version": 1,
|
||||||
|
"refresh": "30s",
|
||||||
|
"time": { "from": "now-3h", "to": "now" },
|
||||||
|
"tags": ["okd", "alerts", "events"],
|
||||||
|
"templating": {
|
||||||
|
"list": [
|
||||||
|
{
|
||||||
|
"name": "severity",
|
||||||
|
"type": "custom",
|
||||||
|
"label": "Severity Filter",
|
||||||
|
"query": "critical,warning,info",
|
||||||
|
"current": { "selected": true, "text": "All", "value": "$__all" },
|
||||||
|
"includeAll": true,
|
||||||
|
"allValue": "critical|warning|info",
|
||||||
|
"multi": false,
|
||||||
|
"options": [
|
||||||
|
{ "selected": true, "text": "All", "value": "$__all" },
|
||||||
|
{ "selected": false, "text": "Critical", "value": "critical" },
|
||||||
|
{ "selected": false, "text": "Warning", "value": "warning" },
|
||||||
|
{ "selected": false, "text": "Info", "value": "info" }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "namespace",
|
||||||
|
"type": "query",
|
||||||
|
"label": "Namespace",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"query": { "query": "label_values(ALERTS{alertstate=\"firing\"}, namespace)", "refId": "A" },
|
||||||
|
"refresh": 2,
|
||||||
|
"includeAll": true,
|
||||||
|
"allValue": ".*",
|
||||||
|
"multi": true,
|
||||||
|
"sort": 1,
|
||||||
|
"current": {},
|
||||||
|
"options": []
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"panels": [
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 1, "type": "stat", "title": "Critical Alerts Firing",
|
||||||
|
"description": "Alerting rule instances currently in the firing state with severity=\"critical\". Any non-zero value represents a breached SLO or infrastructure condition requiring immediate on-call response. The ALERTS metric is generated by Prometheus directly from your alerting rules — it reflects what Prometheus knows, before Alertmanager routing or silencing.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "count(ALERTS{alertstate=\"firing\",severity=\"critical\"}) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "red", "value": 1 }
|
||||||
|
]},
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 2, "type": "stat", "title": "Warning Alerts Firing",
|
||||||
|
"description": "Firing alerts at severity=\"warning\". Warnings indicate a degraded or elevated-risk condition that has not yet crossed the critical threshold. A sustained or growing warning count often precedes a critical fire — treat them as early-warning signals, not background noise.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "count(ALERTS{alertstate=\"firing\",severity=\"warning\"}) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 1 },
|
||||||
|
{ "color": "orange", "value": 5 }
|
||||||
|
]},
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 3, "type": "stat", "title": "Info / Unclassified Alerts Firing",
|
||||||
|
"description": "Firing alerts with severity=\"info\" or no severity label. These are informational and do not normally require immediate action. A sudden large jump may reveal noisy alerting rules generating alert fatigue — rules worth reviewing for threshold tuning or adding inhibition rules.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "count(ALERTS{alertstate=\"firing\",severity!~\"critical|warning\"}) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "blue", "value": 1 },
|
||||||
|
{ "color": "blue", "value": 25 }
|
||||||
|
]},
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 4, "type": "stat", "title": "Alerts Silenced (Suppressed)",
|
||||||
|
"description": "Alerts currently matched by an active Alertmanager silence rule and therefore not routed to receivers. Silences are intentional during maintenance windows, but a large suppressed count outside of planned maintenance = an overly broad silence masking real problems. Zero silences when a maintenance window is active = the silence has expired or was misconfigured.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "sum(alertmanager_alerts{state=\"suppressed\"}) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 1 },
|
||||||
|
{ "color": "red", "value": 20 }
|
||||||
|
]},
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 5, "type": "stat", "title": "CrashLoopBackOff Pods",
|
||||||
|
"description": "Container instances currently waiting in the CrashLoopBackOff state — the container crashed and Kubernetes is retrying with exponential back-off. Each instance is a pod that cannot stay running. Common root causes: OOM kill, bad entrypoint, missing Secret or ConfigMap, an unavailable init dependency, or a broken image layer.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "count(kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 1 },
|
||||||
|
{ "color": "red", "value": 3 }
|
||||||
|
]},
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 6, "type": "stat", "title": "OOMKilled Containers",
|
||||||
|
"description": "Containers whose most recent termination reason was OOMKilled. This is a current-state snapshot: a container that was OOMKilled, restarted, and is now Running will still appear here until its next termination occurs for a different reason. Non-zero and stable = recurring OOM, likely a workload memory leak or under-provisioned memory limit.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "count(kube_pod_container_status_last_terminated_reason{reason=\"OOMKilled\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "orange", "value": 1 },
|
||||||
|
{ "color": "red", "value": 5 }
|
||||||
|
]},
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 7, "type": "stat", "title": "NotReady Nodes",
|
||||||
|
"description": "Nodes where the Ready condition is currently not True (False or Unknown). A NotReady node stops receiving new pod scheduling and, after the node eviction timeout (~5 min default), pods on it will be evicted. Control plane nodes going NotReady simultaneously = potential quorum loss. Any non-zero value is a tier-1 incident signal.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"true\"} == 0) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "red", "value": 1 }
|
||||||
|
]},
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 8, "type": "stat", "title": "Degraded Cluster Operators (OKD)",
|
||||||
|
"description": "OKD ClusterOperators currently reporting Degraded=True. Each ClusterOperator owns a core platform component — authentication, networking, image-registry, monitoring, ingress, storage, etc. A degraded operator means its managed component is impaired or unavailable. Zero is the only acceptable steady-state value outside of an active upgrade.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "count(cluster_operator_conditions{condition=\"Degraded\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "red", "value": 1 }
|
||||||
|
]},
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 9, "type": "row", "title": "Alert Overview", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 10, "type": "timeseries", "title": "Firing Alert Count by Severity Over Time",
|
||||||
|
"description": "Instantaneous count of firing ALERTS series grouped by severity over the selected window. A vertical rise = new alerting condition emerged. A horizontal plateau = a persistent, unresolved problem. A step-down = alert resolved or Prometheus rule evaluation stopped matching. Use the Severity Filter variable to narrow scope during triage.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "count by(severity)(ALERTS{alertstate=\"firing\",severity=~\"$severity\",namespace=~\"$namespace\"})",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{severity}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||||
|
},
|
||||||
|
"overrides": [
|
||||||
|
{ "matcher": { "id": "byName", "options": "critical" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "warning" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "info" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["max", "lastNotNull"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 5 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 11, "type": "timeseries", "title": "Alertmanager Notification Rate by Integration",
|
||||||
|
"description": "Rate of notification delivery attempts from Alertmanager per second, split by integration type (slack, pagerduty, email, webhook, etc.). Solid lines = successful deliveries; dashed red lines = failed deliveries. A drop to zero on all integrations = Alertmanager is not processing or the cluster is completely quiet. Persistent failures on one integration = check that receiver's credentials or endpoint availability.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "sum by(integration)(rate(alertmanager_notifications_total[5m]))", "refId": "A", "legendFormat": "✓ {{integration}}" },
|
||||||
|
{ "expr": "sum by(integration)(rate(alertmanager_notifications_failed_total[5m]))", "refId": "B", "legendFormat": "✗ {{integration}}" }
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "reqps", "min": 0, "decimals": 3,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||||
|
},
|
||||||
|
"overrides": [
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byFrameRefID", "options": "B" },
|
||||||
|
"properties": [
|
||||||
|
{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } },
|
||||||
|
{ "id": "custom.lineStyle", "value": { "dash": [6, 4], "fill": "dash" } },
|
||||||
|
{ "id": "custom.lineWidth", "value": 1 }
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 5 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 12, "type": "bargauge", "title": "Longest-Firing Active Alerts",
|
||||||
|
"description": "Duration (now - ALERTS_FOR_STATE timestamp) for each currently firing alert, sorted descending. Alerts at the top have been firing longest and are the most likely candidates for known-but-unresolved issues, stale firing conditions, or alerts that should have a silence applied. Red bars (> 2 hours) strongly suggest a problem that has been acknowledged but not resolved.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sort_desc(time() - ALERTS_FOR_STATE{alertstate=\"firing\",severity=~\"$severity\",namespace=~\"$namespace\"})",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{alertname}} · {{severity}} · {{namespace}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "s", "min": 0,
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 300 },
|
||||||
|
{ "color": "orange", "value": 1800 },
|
||||||
|
{ "color": "red", "value": 7200 }
|
||||||
|
]}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"orientation": "horizontal",
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||||
|
"displayMode": "gradient",
|
||||||
|
"showUnfilled": true,
|
||||||
|
"valueMode": "color"
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 5 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 13, "type": "row", "title": "Active Firing Alerts — Full Detail", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 14, "type": "table", "title": "All Firing Alerts",
|
||||||
|
"description": "Instant-query table of every currently firing alert visible to Prometheus, filtered by the Namespace and Severity variables above. Each row is one alert instance (unique label combination). The value column is omitted — by definition every row here is firing. Use the built-in column filter (funnel icon) to further narrow to a specific alertname, pod, or node. Columns are sparse: labels not defined in a given alert rule will show '—'.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "ALERTS{alertstate=\"firing\",severity=~\"$severity\",namespace=~\"$namespace\"}",
|
||||||
|
"refId": "A",
|
||||||
|
"instant": true,
|
||||||
|
"legendFormat": ""
|
||||||
|
}],
|
||||||
|
"transformations": [
|
||||||
|
{ "id": "labelsToFields", "options": { "mode": "columns" } },
|
||||||
|
{
|
||||||
|
"id": "organize",
|
||||||
|
"options": {
|
||||||
|
"excludeByName": {
|
||||||
|
"alertstate": true,
|
||||||
|
"__name__": true,
|
||||||
|
"Value": true,
|
||||||
|
"Time": true
|
||||||
|
},
|
||||||
|
"renameByName": {
|
||||||
|
"alertname": "Alert Name",
|
||||||
|
"severity": "Severity",
|
||||||
|
"namespace": "Namespace",
|
||||||
|
"pod": "Pod",
|
||||||
|
"node": "Node",
|
||||||
|
"container": "Container",
|
||||||
|
"job": "Job",
|
||||||
|
"service": "Service",
|
||||||
|
"reason": "Reason",
|
||||||
|
"instance": "Instance"
|
||||||
|
},
|
||||||
|
"indexByName": {
|
||||||
|
"severity": 0,
|
||||||
|
"alertname": 1,
|
||||||
|
"namespace": 2,
|
||||||
|
"pod": 3,
|
||||||
|
"node": 4,
|
||||||
|
"container": 5,
|
||||||
|
"job": 6,
|
||||||
|
"service": 7,
|
||||||
|
"reason": 8,
|
||||||
|
"instance": 9
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"custom": { "align": "left", "filterable": true },
|
||||||
|
"noValue": "—"
|
||||||
|
},
|
||||||
|
"overrides": [
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Severity" },
|
||||||
|
"properties": [
|
||||||
|
{ "id": "custom.displayMode", "value": "color-background" },
|
||||||
|
{ "id": "custom.width", "value": 110 },
|
||||||
|
{
|
||||||
|
"id": "mappings",
|
||||||
|
"value": [{
|
||||||
|
"type": "value",
|
||||||
|
"options": {
|
||||||
|
"critical": { "text": "CRITICAL", "color": "dark-red", "index": 0 },
|
||||||
|
"warning": { "text": "WARNING", "color": "dark-yellow", "index": 1 },
|
||||||
|
"info": { "text": "INFO", "color": "dark-blue", "index": 2 }
|
||||||
|
}
|
||||||
|
}]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{ "matcher": { "id": "byName", "options": "Alert Name" }, "properties": [{ "id": "custom.width", "value": 300 }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "Namespace" }, "properties": [{ "id": "custom.width", "value": 180 }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "Pod" }, "properties": [{ "id": "custom.width", "value": 200 }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "Node" }, "properties": [{ "id": "custom.width", "value": 200 }] }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"sortBy": [{ "desc": false, "displayName": "Severity" }],
|
||||||
|
"footer": { "show": false }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 12, "w": 24, "x": 0, "y": 14 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 15, "type": "row", "title": "Kubernetes Warning Events", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 26 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 16, "type": "timeseries", "title": "Warning Event Rate by Reason",
|
||||||
|
"description": "Rate of Kubernetes Warning-type events per second grouped by reason code. BackOff = container is CrashLooping. FailedScheduling = no node satisfies pod constraints. FailedMount = volume attachment or CSI failure. Evicted = kubelet evicted a pod due to memory or disk pressure. NodeNotReady = node lost contact. A spike in a single reason narrows the incident root-cause immediately without needing to read raw event logs. Requires kube-state-metrics with --resources=events.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "topk(10, sum by(reason)(rate(kube_event_count{type=\"Warning\",namespace=~\"$namespace\"}[5m])))",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{reason}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "reqps", "min": 0, "decimals": 4,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 27 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 17, "type": "bargauge", "title": "Warning Events — Top Namespaces (Accumulated Count)",
|
||||||
|
"description": "Total accumulated Warning event count (the count field on the Kubernetes Event object) per namespace, showing the top 15 most active. A namespace dominating this chart is generating significantly more abnormal conditions than its peers, useful for identifying noisy tenants, misconfigured deployments, or namespaces experiencing a persistent infrastructure problem. Note this is the raw Event.count field — it resets if the event object is deleted and recreated.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "topk(15, sum by(namespace)(kube_event_count{type=\"Warning\"}))",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{namespace}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short", "min": 0,
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 10 },
|
||||||
|
{ "color": "orange", "value": 50 },
|
||||||
|
{ "color": "red", "value": 200 }
|
||||||
|
]}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"orientation": "horizontal",
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||||
|
"displayMode": "gradient",
|
||||||
|
"showUnfilled": true
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 27 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 18, "type": "timeseries", "title": "Warning Events — Accumulated Count by Reason Over Time",
|
||||||
|
"description": "Raw accumulated event count gauge over time, split by reason. Unlike the rate panel this shows total volume and slope simultaneously. A line that climbs steeply = events are occurring frequently right now. A line that plateaus = the condition causing that reason has stopped. A line that drops to zero = the event object was deleted and recreated or the condition fully resolved.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "topk(10, sum by(reason)(kube_event_count{type=\"Warning\",namespace=~\"$namespace\"}))",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{reason}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 8, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 27 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 19, "type": "row", "title": "Pod Problems", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 35 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 20, "type": "timeseries", "title": "CrashLoopBackOff Pods by Namespace",
|
||||||
|
"description": "Count of container instances in CrashLoopBackOff waiting state over time, broken down by namespace. A sudden rise in one namespace = a workload deployment is failing. A persistent baseline across many namespaces = a shared dependency (Secret, ConfigMap, network policy, or an upstream service) has become unavailable. Unlike restart rate, this panel shows the steady-state count of pods currently stuck — not flapping.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum by(namespace)(kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\",namespace=~\"$namespace\"} == 1)",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{namespace}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": {
|
||||||
|
"lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false,
|
||||||
|
"thresholdsStyle": { "mode": "line" }
|
||||||
|
},
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 1 },
|
||||||
|
{ "color": "red", "value": 5 }
|
||||||
|
]}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 8, "x": 0, "y": 36 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 21, "type": "timeseries", "title": "Container Restart Rate by Namespace",
|
||||||
|
"description": "Rate of container restarts per second across all reasons (OOMKill, liveness probe failure, process exit) grouped by namespace. A namespace with a rising restart rate that has not yet entered CrashLoopBackOff is in the early failure window before the exponential back-off penalty kicks in. Cross-reference with the OOMKilled stat tile and the last-terminated-reason to separate crash types.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "topk(10, sum by(namespace)(rate(kube_pod_container_status_restarts_total{namespace=~\"$namespace\"}[5m])))",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{namespace}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short", "min": 0, "decimals": 4,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 8, "x": 8, "y": 36 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 22, "type": "timeseries", "title": "Pods by Problem Phase (Failed / Pending / Unknown)",
|
||||||
|
"description": "Count of pods in Failed, Pending, or Unknown phase over time. Failed = container terminated with a non-zero exit code or was evicted and not rescheduled. Pending for more than a few minutes = scheduler unable to bind the pod (check FailedScheduling events, node capacity, and taint/toleration mismatches). Unknown = kubelet is not reporting to the apiserver, typically indicating a node network partition or kubelet crash.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "sum by(phase)(kube_pod_status_phase{phase=~\"Failed|Unknown\",namespace=~\"$namespace\"} == 1)", "refId": "A", "legendFormat": "{{phase}}" },
|
||||||
|
{ "expr": "sum(kube_pod_status_phase{phase=\"Pending\",namespace=~\"$namespace\"} == 1)", "refId": "B", "legendFormat": "Pending" }
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": {
|
||||||
|
"lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false,
|
||||||
|
"thresholdsStyle": { "mode": "line" }
|
||||||
|
},
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "red", "value": 1 }
|
||||||
|
]}
|
||||||
|
},
|
||||||
|
"overrides": [
|
||||||
|
{ "matcher": { "id": "byName", "options": "Failed" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "Pending" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "Unknown" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 8, "x": 16, "y": 36 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 23, "type": "row", "title": "Node & Cluster Operator Conditions", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 43 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 24, "type": "table", "title": "Node Condition Status Matrix",
|
||||||
|
"description": "Instant snapshot of every active node condition across all nodes. Each row is one (node, condition, status) triple where value=1, meaning that combination is currently true. Ready=true is the normal healthy state; MemoryPressure=true, DiskPressure=true, PIDPressure=true, and NetworkUnavailable=true all indicate problem states that will affect pod scheduling on that node. Use the column filter to show only conditions where status=\"true\" and condition != \"Ready\" to isolate problems quickly.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "kube_node_status_condition == 1",
|
||||||
|
"refId": "A",
|
||||||
|
"instant": true,
|
||||||
|
"legendFormat": ""
|
||||||
|
}],
|
||||||
|
"transformations": [
|
||||||
|
{ "id": "labelsToFields", "options": { "mode": "columns" } },
|
||||||
|
{
|
||||||
|
"id": "organize",
|
||||||
|
"options": {
|
||||||
|
"excludeByName": {
|
||||||
|
"Time": true,
|
||||||
|
"Value": true,
|
||||||
|
"__name__": true,
|
||||||
|
"endpoint": true,
|
||||||
|
"job": true,
|
||||||
|
"service": true,
|
||||||
|
"instance": true
|
||||||
|
},
|
||||||
|
"renameByName": {
|
||||||
|
"node": "Node",
|
||||||
|
"condition": "Condition",
|
||||||
|
"status": "Status"
|
||||||
|
},
|
||||||
|
"indexByName": { "node": 0, "condition": 1, "status": 2 }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"custom": { "align": "left", "filterable": true },
|
||||||
|
"noValue": "—"
|
||||||
|
},
|
||||||
|
"overrides": [
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Status" },
|
||||||
|
"properties": [
|
||||||
|
{ "id": "custom.displayMode", "value": "color-background" },
|
||||||
|
{ "id": "custom.width", "value": 90 },
|
||||||
|
{
|
||||||
|
"id": "mappings",
|
||||||
|
"value": [{
|
||||||
|
"type": "value",
|
||||||
|
"options": {
|
||||||
|
"true": { "text": "true", "color": "green", "index": 0 },
|
||||||
|
"false": { "text": "false", "color": "dark-red", "index": 1 },
|
||||||
|
"unknown": { "text": "unknown", "color": "dark-orange", "index": 2 }
|
||||||
|
}
|
||||||
|
}]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Condition" },
|
||||||
|
"properties": [
|
||||||
|
{ "id": "custom.width", "value": 190 },
|
||||||
|
{ "id": "custom.displayMode", "value": "color-text" },
|
||||||
|
{
|
||||||
|
"id": "mappings",
|
||||||
|
"value": [{
|
||||||
|
"type": "value",
|
||||||
|
"options": {
|
||||||
|
"Ready": { "color": "green", "index": 0 },
|
||||||
|
"MemoryPressure": { "color": "red", "index": 1 },
|
||||||
|
"DiskPressure": { "color": "red", "index": 2 },
|
||||||
|
"PIDPressure": { "color": "red", "index": 3 },
|
||||||
|
"NetworkUnavailable": { "color": "red", "index": 4 }
|
||||||
|
}
|
||||||
|
}]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{ "matcher": { "id": "byName", "options": "Node" }, "properties": [{ "id": "custom.width", "value": 230 }] }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"sortBy": [{ "desc": false, "displayName": "Node" }],
|
||||||
|
"footer": { "show": false }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 44 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 25, "type": "table", "title": "Cluster Operator Conditions — Degraded & Progressing (OKD)",
|
||||||
|
"description": "Shows only ClusterOperator conditions that indicate a problem state: Degraded=True (operator has failed to achieve its desired state) or Progressing=True (operator is actively reconciling — normal during upgrades but alarming in steady state). Operators not appearing in this table are healthy. The reason column gives the operator's own explanation for the condition, which maps directly to the relevant operator log stream and OpenShift runbook.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "cluster_operator_conditions{condition=\"Degraded\"} == 1",
|
||||||
|
"refId": "A",
|
||||||
|
"instant": true,
|
||||||
|
"legendFormat": ""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "cluster_operator_conditions{condition=\"Progressing\"} == 1",
|
||||||
|
"refId": "B",
|
||||||
|
"instant": true,
|
||||||
|
"legendFormat": ""
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"transformations": [
|
||||||
|
{ "id": "labelsToFields", "options": { "mode": "columns" } },
|
||||||
|
{
|
||||||
|
"id": "organize",
|
||||||
|
"options": {
|
||||||
|
"excludeByName": {
|
||||||
|
"Time": true,
|
||||||
|
"Value": true,
|
||||||
|
"__name__": true,
|
||||||
|
"endpoint": true,
|
||||||
|
"job": true,
|
||||||
|
"service": true,
|
||||||
|
"instance": true,
|
||||||
|
"namespace": true
|
||||||
|
},
|
||||||
|
"renameByName": {
|
||||||
|
"name": "Operator",
|
||||||
|
"condition": "Condition",
|
||||||
|
"reason": "Reason"
|
||||||
|
},
|
||||||
|
"indexByName": { "name": 0, "condition": 1, "reason": 2 }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"custom": { "align": "left", "filterable": true },
|
||||||
|
"noValue": "—"
|
||||||
|
},
|
||||||
|
"overrides": [
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Condition" },
|
||||||
|
"properties": [
|
||||||
|
{ "id": "custom.displayMode", "value": "color-background" },
|
||||||
|
{ "id": "custom.width", "value": 140 },
|
||||||
|
{
|
||||||
|
"id": "mappings",
|
||||||
|
"value": [{
|
||||||
|
"type": "value",
|
||||||
|
"options": {
|
||||||
|
"Degraded": { "text": "Degraded", "color": "dark-red", "index": 0 },
|
||||||
|
"Progressing": { "text": "Progressing", "color": "dark-yellow", "index": 1 }
|
||||||
|
}
|
||||||
|
}]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{ "matcher": { "id": "byName", "options": "Operator" }, "properties": [{ "id": "custom.width", "value": 240 }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "Reason" }, "properties": [{ "id": "custom.width", "value": 220 }] }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"sortBy": [{ "desc": false, "displayName": "Condition" }],
|
||||||
|
"footer": { "show": false }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 44 }
|
||||||
|
}
|
||||||
|
|
||||||
|
]
|
||||||
|
}
|
||||||
@@ -0,0 +1,739 @@
|
|||||||
|
{
|
||||||
|
"title": "Cluster Overview",
|
||||||
|
"uid": "okd-cluster-overview",
|
||||||
|
"schemaVersion": 36,
|
||||||
|
"version": 2,
|
||||||
|
"refresh": "30s",
|
||||||
|
"time": { "from": "now-1h", "to": "now" },
|
||||||
|
"tags": ["okd", "cluster", "overview"],
|
||||||
|
"panels": [
|
||||||
|
{
|
||||||
|
"id": 1,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Ready Nodes",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"true\"} == 1)",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": ""
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "red", "value": null },
|
||||||
|
{ "color": "green", "value": 1 }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "short",
|
||||||
|
"noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"colorMode": "background",
|
||||||
|
"graphMode": "none",
|
||||||
|
"justifyMode": "center",
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||||
|
"textMode": "auto"
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 2,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Not Ready Nodes",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"false\"} == 1) or vector(0)",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": ""
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "red", "value": 1 }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "short",
|
||||||
|
"noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"colorMode": "background",
|
||||||
|
"graphMode": "none",
|
||||||
|
"justifyMode": "center",
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||||
|
"textMode": "auto"
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 3,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Running Pods",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "count(kube_pod_status_phase{phase=\"Running\"} == 1)",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": ""
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "red", "value": null },
|
||||||
|
{ "color": "green", "value": 1 }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "short",
|
||||||
|
"noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"colorMode": "background",
|
||||||
|
"graphMode": "none",
|
||||||
|
"justifyMode": "center",
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||||
|
"textMode": "auto"
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 4,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Pending Pods",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "count(kube_pod_status_phase{phase=\"Pending\"} == 1) or vector(0)",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": ""
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 1 },
|
||||||
|
{ "color": "red", "value": 5 }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "short",
|
||||||
|
"noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"colorMode": "background",
|
||||||
|
"graphMode": "none",
|
||||||
|
"justifyMode": "center",
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||||
|
"textMode": "auto"
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 5,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Failed Pods",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "count(kube_pod_status_phase{phase=\"Failed\"} == 1) or vector(0)",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": ""
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "red", "value": 1 }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "short",
|
||||||
|
"noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"colorMode": "background",
|
||||||
|
"graphMode": "none",
|
||||||
|
"justifyMode": "center",
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||||
|
"textMode": "auto"
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 6,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "CrashLoopBackOff",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "count(kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\"} == 1) or vector(0)",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": ""
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "red", "value": 1 }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "short",
|
||||||
|
"noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"colorMode": "background",
|
||||||
|
"graphMode": "none",
|
||||||
|
"justifyMode": "center",
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||||
|
"textMode": "auto"
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 7,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Critical Alerts",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "count(ALERTS{alertstate=\"firing\",severity=\"critical\"}) or vector(0)",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": ""
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "red", "value": 1 }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "short",
|
||||||
|
"noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"colorMode": "background",
|
||||||
|
"graphMode": "none",
|
||||||
|
"justifyMode": "center",
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||||
|
"textMode": "auto"
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 8,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Warning Alerts",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "count(ALERTS{alertstate=\"firing\",severity=\"warning\"}) or vector(0)",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": ""
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 1 },
|
||||||
|
{ "color": "red", "value": 10 }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "short",
|
||||||
|
"noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"colorMode": "background",
|
||||||
|
"graphMode": "none",
|
||||||
|
"justifyMode": "center",
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||||
|
"textMode": "auto"
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 9,
|
||||||
|
"type": "gauge",
|
||||||
|
"title": "CPU Usage",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "100 * (1 - avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "CPU"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent",
|
||||||
|
"min": 0,
|
||||||
|
"max": 100,
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 70 },
|
||||||
|
{ "color": "red", "value": 85 }
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||||
|
"showThresholdLabels": false,
|
||||||
|
"showThresholdMarkers": true,
|
||||||
|
"orientation": "auto"
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 6, "w": 5, "x": 0, "y": 4 }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 10,
|
||||||
|
"type": "gauge",
|
||||||
|
"title": "Memory Usage",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "100 * (1 - (sum(node_memory_MemAvailable_bytes) / sum(node_memory_MemTotal_bytes)))",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "Memory"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent",
|
||||||
|
"min": 0,
|
||||||
|
"max": 100,
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 75 },
|
||||||
|
{ "color": "red", "value": 90 }
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||||
|
"showThresholdLabels": false,
|
||||||
|
"showThresholdMarkers": true,
|
||||||
|
"orientation": "auto"
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 6, "w": 5, "x": 5, "y": 4 }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 11,
|
||||||
|
"type": "gauge",
|
||||||
|
"title": "Root Disk Usage",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "100 * (1 - (sum(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"})))",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "Disk"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent",
|
||||||
|
"min": 0,
|
||||||
|
"max": 100,
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 70 },
|
||||||
|
{ "color": "red", "value": 85 }
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||||
|
"showThresholdLabels": false,
|
||||||
|
"showThresholdMarkers": true,
|
||||||
|
"orientation": "auto"
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 6, "w": 4, "x": 10, "y": 4 }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 12,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "etcd Has Leader",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "min(etcd_server_has_leader)",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": ""
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "red", "value": null },
|
||||||
|
{ "color": "green", "value": 1 }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"mappings": [
|
||||||
|
{
|
||||||
|
"type": "value",
|
||||||
|
"options": {
|
||||||
|
"0": { "text": "NO LEADER", "color": "red" },
|
||||||
|
"1": { "text": "LEADER OK", "color": "green" }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"unit": "short",
|
||||||
|
"noValue": "?"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"colorMode": "background",
|
||||||
|
"graphMode": "none",
|
||||||
|
"justifyMode": "center",
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||||
|
"textMode": "auto"
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 3, "w": 5, "x": 14, "y": 4 }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 13,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "API Servers Up",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum(up{job=\"apiserver\"})",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": ""
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "red", "value": null },
|
||||||
|
{ "color": "yellow", "value": 1 },
|
||||||
|
{ "color": "green", "value": 2 }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "short",
|
||||||
|
"noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"colorMode": "background",
|
||||||
|
"graphMode": "none",
|
||||||
|
"justifyMode": "center",
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||||
|
"textMode": "auto"
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 3, "w": 5, "x": 19, "y": 4 }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 14,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "etcd Members Up",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum(up{job=\"etcd\"})",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": ""
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "red", "value": null },
|
||||||
|
{ "color": "yellow", "value": 2 },
|
||||||
|
{ "color": "green", "value": 3 }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "short",
|
||||||
|
"noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"colorMode": "background",
|
||||||
|
"graphMode": "none",
|
||||||
|
"justifyMode": "center",
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||||
|
"textMode": "auto"
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 3, "w": 5, "x": 14, "y": 7 }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 15,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Operators Degraded",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "count(cluster_operator_conditions{condition=\"Degraded\"} == 1) or vector(0)",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": ""
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "red", "value": 1 }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "short",
|
||||||
|
"noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"colorMode": "background",
|
||||||
|
"graphMode": "none",
|
||||||
|
"justifyMode": "center",
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||||
|
"textMode": "auto"
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 3, "w": 5, "x": 19, "y": 7 }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 16,
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "CPU Usage per Node (%)",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "100 * (1 - avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{instance}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent",
|
||||||
|
"min": 0,
|
||||||
|
"max": 100,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": {
|
||||||
|
"lineWidth": 2,
|
||||||
|
"fillOpacity": 10,
|
||||||
|
"spanNulls": false,
|
||||||
|
"showPoints": "never"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": {
|
||||||
|
"displayMode": "list",
|
||||||
|
"placement": "bottom",
|
||||||
|
"calcs": ["mean", "max"]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 10 }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 17,
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Memory Usage per Node (%)",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{instance}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent",
|
||||||
|
"min": 0,
|
||||||
|
"max": 100,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": {
|
||||||
|
"lineWidth": 2,
|
||||||
|
"fillOpacity": 10,
|
||||||
|
"spanNulls": false,
|
||||||
|
"showPoints": "never"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": {
|
||||||
|
"displayMode": "list",
|
||||||
|
"placement": "bottom",
|
||||||
|
"calcs": ["mean", "max"]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 10 }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 18,
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Network Traffic — Cluster Total",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo|veth.*|tun.*|ovn.*|br-int|br-ex\"}[5m]))",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "Receive"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo|veth.*|tun.*|ovn.*|br-int|br-ex\"}[5m]))",
|
||||||
|
"refId": "B",
|
||||||
|
"legendFormat": "Transmit"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "Bps",
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": {
|
||||||
|
"lineWidth": 2,
|
||||||
|
"fillOpacity": 10,
|
||||||
|
"spanNulls": false,
|
||||||
|
"showPoints": "never"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"overrides": [
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Receive" },
|
||||||
|
"properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Transmit" },
|
||||||
|
"properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "none" },
|
||||||
|
"legend": {
|
||||||
|
"displayMode": "list",
|
||||||
|
"placement": "bottom",
|
||||||
|
"calcs": ["mean", "max"]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 18 }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 19,
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Pod Phases Over Time",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "count(kube_pod_status_phase{phase=\"Running\"} == 1)",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "Running"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "count(kube_pod_status_phase{phase=\"Pending\"} == 1) or vector(0)",
|
||||||
|
"refId": "B",
|
||||||
|
"legendFormat": "Pending"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "count(kube_pod_status_phase{phase=\"Failed\"} == 1) or vector(0)",
|
||||||
|
"refId": "C",
|
||||||
|
"legendFormat": "Failed"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "count(kube_pod_status_phase{phase=\"Unknown\"} == 1) or vector(0)",
|
||||||
|
"refId": "D",
|
||||||
|
"legendFormat": "Unknown"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short",
|
||||||
|
"custom": {
|
||||||
|
"lineWidth": 2,
|
||||||
|
"fillOpacity": 15,
|
||||||
|
"spanNulls": false,
|
||||||
|
"showPoints": "never"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"overrides": [
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Running" },
|
||||||
|
"properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Pending" },
|
||||||
|
"properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Failed" },
|
||||||
|
"properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Unknown" },
|
||||||
|
"properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "none" },
|
||||||
|
"legend": {
|
||||||
|
"displayMode": "list",
|
||||||
|
"placement": "bottom",
|
||||||
|
"calcs": ["lastNotNull"]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 18 }
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
@@ -0,0 +1,742 @@
|
|||||||
|
{
|
||||||
|
"title": "Control Plane Health",
|
||||||
|
"uid": "okd-control-plane",
|
||||||
|
"schemaVersion": 36,
|
||||||
|
"version": 1,
|
||||||
|
"refresh": "30s",
|
||||||
|
"time": { "from": "now-1h", "to": "now" },
|
||||||
|
"tags": ["okd", "control-plane"],
|
||||||
|
"templating": {
|
||||||
|
"list": [
|
||||||
|
{
|
||||||
|
"name": "instance",
|
||||||
|
"type": "query",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"query": { "query": "label_values(apiserver_request_total, instance)", "refId": "A" },
|
||||||
|
"refresh": 2,
|
||||||
|
"includeAll": true,
|
||||||
|
"multi": true,
|
||||||
|
"allValue": ".*",
|
||||||
|
"label": "API Server Instance",
|
||||||
|
"sort": 1,
|
||||||
|
"current": {},
|
||||||
|
"options": []
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"panels": [
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 1, "type": "stat", "title": "API Servers Up",
|
||||||
|
"description": "Number of kube-apiserver instances currently scraped and up. Healthy HA cluster = 3.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "count(up{job=~\".*apiserver.*\"} == 1)", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "red", "value": null },
|
||||||
|
{ "color": "yellow", "value": 1 },
|
||||||
|
{ "color": "green", "value": 3 }
|
||||||
|
]},
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 2, "type": "stat", "title": "Controller Managers Up",
|
||||||
|
"description": "kube-controller-manager instances up. In OKD only one holds the leader lease at a time; others are hot standbys.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "count(up{job=~\".*controller-manager.*\"} == 1)", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "red", "value": null },
|
||||||
|
{ "color": "yellow", "value": 1 },
|
||||||
|
{ "color": "green", "value": 3 }
|
||||||
|
]},
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 3, "type": "stat", "title": "Schedulers Up",
|
||||||
|
"description": "kube-scheduler instances up. One holds the leader lease; rest are standbys. 0 = no scheduling of new pods.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "count(up{job=~\".*scheduler.*\"} == 1)", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "red", "value": null },
|
||||||
|
{ "color": "yellow", "value": 1 },
|
||||||
|
{ "color": "green", "value": 3 }
|
||||||
|
]},
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 4, "type": "stat", "title": "API 5xx Rate",
|
||||||
|
"description": "Server-side errors (5xx) across all apiserver instances per second. Any sustained non-zero value = apiserver internal fault.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "sum(rate(apiserver_request_total{code=~\"5..\"}[5m]))", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 0.01 },
|
||||||
|
{ "color": "red", "value": 1 }
|
||||||
|
]},
|
||||||
|
"unit": "reqps", "noValue": "0", "decimals": 3
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 5, "type": "stat", "title": "Inflight — Mutating",
|
||||||
|
"description": "Current in-flight mutating requests (POST/PUT/PATCH/DELETE). Default OKD limit is ~1000. Hitting the limit = 429 errors for writes.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "sum(apiserver_current_inflight_requests{request_kind=\"mutating\"})", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 500 },
|
||||||
|
{ "color": "orange", "value": 750 },
|
||||||
|
{ "color": "red", "value": 900 }
|
||||||
|
]},
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 6, "type": "stat", "title": "Inflight — Read-Only",
|
||||||
|
"description": "Current in-flight non-mutating requests (GET/LIST/WATCH). Default OKD limit is ~3000. Hitting it = 429 for reads, impacting controllers and kubectl.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "sum(apiserver_current_inflight_requests{request_kind=\"readOnly\"})", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 1500 },
|
||||||
|
{ "color": "orange", "value": 2200 },
|
||||||
|
{ "color": "red", "value": 2700 }
|
||||||
|
]},
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 7, "type": "stat", "title": "API Request p99 (non-WATCH)",
|
||||||
|
"description": "Overall p99 latency for all non-streaming verbs. >1s = noticeable kubectl sluggishness. >10s = controllers timing out on LIST/GET.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{verb!~\"WATCH|CONNECT\"}[5m])) by (le))",
|
||||||
|
"refId": "A", "legendFormat": ""
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 0.5 },
|
||||||
|
{ "color": "orange", "value": 1 },
|
||||||
|
{ "color": "red", "value": 5 }
|
||||||
|
]},
|
||||||
|
"unit": "s", "noValue": "0", "decimals": 3
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 8, "type": "stat", "title": "APIServer → etcd p99",
|
||||||
|
"description": "p99 time apiserver spends waiting on etcd calls. Spike here while WAL fsync is healthy = serialization or large object overhead.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "histogram_quantile(0.99, sum(rate(apiserver_storage_request_duration_seconds_bucket[5m])) by (le))",
|
||||||
|
"refId": "A", "legendFormat": ""
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 0.05 },
|
||||||
|
{ "color": "orange", "value": 0.2 },
|
||||||
|
{ "color": "red", "value": 0.5 }
|
||||||
|
]},
|
||||||
|
"unit": "s", "noValue": "0", "decimals": 4
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 9, "type": "row", "title": "API Server — Request Rates & Errors", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 10, "type": "timeseries", "title": "Request Rate by Verb",
|
||||||
|
"description": "Non-streaming calls per second broken down by verb. GET/LIST = read load from controllers. POST/PUT/PATCH/DELETE = write throughput. A sudden LIST spike = controller cache resync storm.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum by(verb)(rate(apiserver_request_total{instance=~\"$instance\",verb!~\"WATCH|CONNECT\"}[5m]))",
|
||||||
|
"refId": "A", "legendFormat": "{{verb}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "reqps", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 5 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 11, "type": "timeseries", "title": "Error Rate by HTTP Status Code",
|
||||||
|
"description": "4xx/5xx responses per second by code. 429 = inflight limit hit (throttling). 422 = admission rejection or invalid object. 500/503 = internal apiserver fault or etcd unavailability.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum by(code)(rate(apiserver_request_total{instance=~\"$instance\",code=~\"[45]..\"}[5m]))",
|
||||||
|
"refId": "A", "legendFormat": "HTTP {{code}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "reqps", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 5 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 12, "type": "timeseries", "title": "In-Flight Requests — Mutating vs Read-Only",
|
||||||
|
"description": "Instantaneous count of requests being actively handled. The two series correspond to the two inflight limit buckets enforced by the apiserver's Priority and Fairness (APF) or legacy inflight settings.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "sum by(request_kind)(apiserver_current_inflight_requests{instance=~\"$instance\"})", "refId": "A", "legendFormat": "{{request_kind}}" }
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 20, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 5 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 13, "type": "row", "title": "API Server — Latency", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 14, "type": "timeseries", "title": "Request Latency — p50 / p95 / p99 (non-WATCH)",
|
||||||
|
"description": "Aggregated end-to-end request duration across all verbs except WATCH/CONNECT (which are unbounded streaming). A rising p99 without a matching rise in etcd latency = CPU saturation, admission webhook slowness, or serialization overhead.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "histogram_quantile(0.50, sum(rate(apiserver_request_duration_seconds_bucket{instance=~\"$instance\",verb!~\"WATCH|CONNECT\"}[5m])) by (le))", "refId": "A", "legendFormat": "p50" },
|
||||||
|
{ "expr": "histogram_quantile(0.95, sum(rate(apiserver_request_duration_seconds_bucket{instance=~\"$instance\",verb!~\"WATCH|CONNECT\"}[5m])) by (le))", "refId": "B", "legendFormat": "p95" },
|
||||||
|
{ "expr": "histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{instance=~\"$instance\",verb!~\"WATCH|CONNECT\"}[5m])) by (le))", "refId": "C", "legendFormat": "p99" }
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "s", "min": 0, "decimals": 4,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||||
|
},
|
||||||
|
"overrides": [
|
||||||
|
{ "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 14 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 15, "type": "timeseries", "title": "Request p99 Latency by Verb",
|
||||||
|
"description": "p99 latency broken out per verb. LIST is inherently slower than GET due to serializing full collections. A POST/PUT spike = heavy admission webhook chain or large object writes. DELETE spikes are usually caused by cascading GC finalizer storms.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "histogram_quantile(0.99, sum by(verb,le)(rate(apiserver_request_duration_seconds_bucket{instance=~\"$instance\",verb!~\"WATCH|CONNECT\"}[5m])))",
|
||||||
|
"refId": "A", "legendFormat": "{{verb}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "s", "min": 0, "decimals": 4,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 14 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 16, "type": "timeseries", "title": "APIServer → etcd Latency by Operation",
|
||||||
|
"description": "Time apiserver spends waiting on etcd, split by operation type (get, list, create, update, delete, watch). Elevated get/list = etcd read pressure. Elevated create/update = write bottleneck, likely correlated with WAL fsync latency.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "histogram_quantile(0.50, sum by(operation,le)(rate(apiserver_storage_request_duration_seconds_bucket[5m])))", "refId": "A", "legendFormat": "p50 — {{operation}}" },
|
||||||
|
{ "expr": "histogram_quantile(0.99, sum by(operation,le)(rate(apiserver_storage_request_duration_seconds_bucket[5m])))", "refId": "B", "legendFormat": "p99 — {{operation}}" }
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "s", "min": 0, "decimals": 4,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 14 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 17, "type": "row", "title": "API Server — Watches & Long-Running Requests", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 22 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 18, "type": "timeseries", "title": "Active Long-Running Requests (Watches) by Resource",
|
||||||
|
"description": "Instantaneous count of open WATCH streams grouped by resource. Each controller typically holds one WATCH per resource type per apiserver instance. A sudden drop = controller restart; a runaway climb = operator creating watches without cleanup.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum by(resource)(apiserver_longrunning_requests{instance=~\"$instance\",verb=\"WATCH\"})",
|
||||||
|
"refId": "A", "legendFormat": "{{resource}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 8, "x": 0, "y": 23 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 19, "type": "timeseries", "title": "Watch Events Dispatched Rate by Kind",
|
||||||
|
"description": "Watch events sent to all active watchers per second, by object kind. Persistent high rate for a specific kind = that resource type is churning heavily, increasing etcd load and controller reconcile frequency.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum by(kind)(rate(apiserver_watch_events_total{instance=~\"$instance\"}[5m]))",
|
||||||
|
"refId": "A", "legendFormat": "{{kind}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 8, "x": 8, "y": 23 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 20, "type": "timeseries", "title": "Watch Event Size — p50 / p99 by Kind",
|
||||||
|
"description": "Size of individual watch events dispatched to clients. Large events (MiB-scale) for Secrets or ConfigMaps = objects being stored with oversized data. Contributes to apiserver memory pressure and network saturation.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "histogram_quantile(0.50, sum by(kind,le)(rate(apiserver_watch_events_sizes_bucket{instance=~\"$instance\"}[5m])))", "refId": "A", "legendFormat": "p50 — {{kind}}" },
|
||||||
|
{ "expr": "histogram_quantile(0.99, sum by(kind,le)(rate(apiserver_watch_events_sizes_bucket{instance=~\"$instance\"}[5m])))", "refId": "B", "legendFormat": "p99 — {{kind}}" }
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "bytes", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 8, "x": 16, "y": 23 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 21, "type": "row", "title": "Admission Webhooks", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 30 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 22, "type": "timeseries", "title": "Webhook Call Rate by Name",
|
||||||
|
"description": "Mutating and validating admission webhook invocations per second by webhook name. A webhook invoked on every write (e.g., a mutating webhook with no object selector) can be a major source of write latency amplification.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum by(name,type)(rate(apiserver_admission_webhook_request_total{instance=~\"$instance\"}[5m]))",
|
||||||
|
"refId": "A", "legendFormat": "{{type}} — {{name}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "reqps", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 8, "x": 0, "y": 31 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 23, "type": "timeseries", "title": "Webhook Latency p99 by Name",
|
||||||
|
"description": "p99 round-trip time per webhook call (network + webhook server processing). Default apiserver timeout is 10s; a webhook consistently near that limit causes cascading write latency for all resources it intercepts.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "histogram_quantile(0.99, sum by(name,le)(rate(apiserver_admission_webhook_admission_duration_seconds_bucket{instance=~\"$instance\"}[5m])))",
|
||||||
|
"refId": "A", "legendFormat": "{{name}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "s", "min": 0, "decimals": 4,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 0.5 },
|
||||||
|
{ "color": "red", "value": 2.0 }
|
||||||
|
]}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 8, "x": 8, "y": 31 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 24, "type": "timeseries", "title": "Webhook Rejection Rate by Name",
|
||||||
|
"description": "Rate of admission denials per webhook. A validating webhook rejecting requests is expected behaviour; a sudden surge indicates either a newly enforced policy or a misbehaving webhook rejecting valid objects.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum by(name,error_type)(rate(apiserver_admission_webhook_rejection_count{instance=~\"$instance\"}[5m]))",
|
||||||
|
"refId": "A", "legendFormat": "{{name}} ({{error_type}})"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "reqps", "min": 0, "decimals": 3,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 8, "x": 16, "y": 31 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 25, "type": "row", "title": "kube-controller-manager", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 38 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 26, "type": "timeseries", "title": "Work Queue Depth by Controller",
|
||||||
|
"description": "Items waiting to be reconciled in each controller's work queue. Persistent non-zero depth = controller cannot keep up with the event rate. Identifies which specific controller is the bottleneck during overload incidents.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "topk(15, sum by(name)(workqueue_depth{job=~\".*controller-manager.*\"}))",
|
||||||
|
"refId": "A", "legendFormat": "{{name}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": {
|
||||||
|
"lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false,
|
||||||
|
"thresholdsStyle": { "mode": "line" }
|
||||||
|
},
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 10 },
|
||||||
|
{ "color": "red", "value": 50 }
|
||||||
|
]}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 39 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 27, "type": "timeseries", "title": "Work Queue Item Processing Duration p99 by Controller",
|
||||||
|
"description": "p99 time a work item spends being actively reconciled (inside the reconcile loop, excludes queue wait time). A slow reconcile = either the controller is doing expensive API calls or the etcd write path is slow.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "histogram_quantile(0.99, sum by(name,le)(rate(workqueue_work_duration_seconds_bucket{job=~\".*controller-manager.*\"}[5m])))",
|
||||||
|
"refId": "A", "legendFormat": "{{name}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "s", "min": 0, "decimals": 4,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 39 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 28, "type": "timeseries", "title": "Work Queue Retry Rate by Controller",
|
||||||
|
"description": "Rate of items being re-queued after a failed reconciliation. A persistently high retry rate for a controller = it is encountering recurring errors on the same objects (e.g., API permission errors, webhook rejections, or resource conflicts).",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "topk(15, sum by(name)(rate(workqueue_retries_total{job=~\".*controller-manager.*\"}[5m])))",
|
||||||
|
"refId": "A", "legendFormat": "{{name}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short", "min": 0, "decimals": 3,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 39 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 29, "type": "row", "title": "kube-scheduler", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 47 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 30, "type": "timeseries", "title": "Scheduling Attempt Rate by Result",
|
||||||
|
"description": "Outcomes of scheduling cycles per second. scheduled = pod successfully bound to a node. unschedulable = no node met the pod's constraints. error = scheduler internal failure (API error, timeout). Persistent unschedulable = cluster capacity or taints/affinity misconfiguration.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum by(result)(rate(scheduler_schedule_attempts_total[5m]))",
|
||||||
|
"refId": "A", "legendFormat": "{{result}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||||
|
},
|
||||||
|
"overrides": [
|
||||||
|
{ "matcher": { "id": "byName", "options": "scheduled" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "unschedulable" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "error" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 8, "x": 0, "y": 48 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 31, "type": "timeseries", "title": "Scheduling Latency — p50 / p95 / p99",
|
||||||
|
"description": "Time from when a pod enters the active queue to when a binding decision is made (does not include bind API call time). Includes filter, score, and reserve plugin execution time. Spike = expensive affinity rules, large number of nodes, or slow extender webhooks.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "histogram_quantile(0.50, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket[5m])) by (le))", "refId": "A", "legendFormat": "p50" },
|
||||||
|
{ "expr": "histogram_quantile(0.95, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket[5m])) by (le))", "refId": "B", "legendFormat": "p95" },
|
||||||
|
{ "expr": "histogram_quantile(0.99, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket[5m])) by (le))", "refId": "C", "legendFormat": "p99" }
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "s", "min": 0, "decimals": 4,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||||
|
},
|
||||||
|
"overrides": [
|
||||||
|
{ "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 8, "x": 8, "y": 48 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 32, "type": "timeseries", "title": "Pending Pods by Queue",
|
||||||
|
"description": "Pods waiting to be scheduled, split by internal queue. active = ready to be attempted now. backoff = recently failed, in exponential back-off. unschedulable = parked until cluster state changes. A growing unschedulable queue = systemic capacity or constraint problem.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum by(queue)(scheduler_pending_pods)",
|
||||||
|
"refId": "A", "legendFormat": "{{queue}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": {
|
||||||
|
"lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false,
|
||||||
|
"thresholdsStyle": { "mode": "line" }
|
||||||
|
},
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 10 },
|
||||||
|
{ "color": "red", "value": 50 }
|
||||||
|
]}
|
||||||
|
},
|
||||||
|
"overrides": [
|
||||||
|
{ "matcher": { "id": "byName", "options": "unschedulable" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "backoff" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "active" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 8, "x": 16, "y": 48 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 33, "type": "row", "title": "Process Resources — All Control Plane Components", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 55 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 34, "type": "timeseries", "title": "CPU Usage by Component",
|
||||||
|
"description": "Rate of CPU seconds consumed by each control plane process. apiserver CPU spike = surge in request volume or list serialization. controller-manager CPU spike = reconcile storm. scheduler CPU spike = large node count with complex affinity.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "sum by(job)(rate(process_cpu_seconds_total{job=~\".*apiserver.*\"}[5m]))", "refId": "A", "legendFormat": "apiserver — {{job}}" },
|
||||||
|
{ "expr": "sum by(job)(rate(process_cpu_seconds_total{job=~\".*controller-manager.*\"}[5m]))", "refId": "B", "legendFormat": "controller-manager — {{job}}" },
|
||||||
|
{ "expr": "sum by(job)(rate(process_cpu_seconds_total{job=~\".*scheduler.*\"}[5m]))", "refId": "C", "legendFormat": "scheduler — {{job}}" }
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percentunit", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 8, "x": 0, "y": 56 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 35, "type": "timeseries", "title": "RSS Memory by Component",
|
||||||
|
"description": "Resident set size of each control plane process. apiserver memory is dominated by the watch cache size and serialisation buffers. controller-manager memory = informer caches. Monotonically growing RSS without restarts = memory leak or unbounded cache growth.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "sum by(job)(process_resident_memory_bytes{job=~\".*apiserver.*\"})", "refId": "A", "legendFormat": "apiserver — {{job}}" },
|
||||||
|
{ "expr": "sum by(job)(process_resident_memory_bytes{job=~\".*controller-manager.*\"})", "refId": "B", "legendFormat": "controller-manager — {{job}}" },
|
||||||
|
{ "expr": "sum by(job)(process_resident_memory_bytes{job=~\".*scheduler.*\"})", "refId": "C", "legendFormat": "scheduler — {{job}}" }
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "bytes", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 8, "x": 8, "y": 56 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 36, "type": "timeseries", "title": "Goroutines by Component",
|
||||||
|
"description": "Number of live goroutines in each control plane process. Gradual upward drift = goroutine leak (often tied to unclosed watch streams or context leaks). A step-down = process restart. apiserver typically runs 200–600 goroutines; spikes above 1000 warrant investigation.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "sum by(job)(go_goroutines{job=~\".*apiserver.*\"})", "refId": "A", "legendFormat": "apiserver — {{job}}" },
|
||||||
|
{ "expr": "sum by(job)(go_goroutines{job=~\".*controller-manager.*\"})", "refId": "B", "legendFormat": "controller-manager — {{job}}" },
|
||||||
|
{ "expr": "sum by(job)(go_goroutines{job=~\".*scheduler.*\"})", "refId": "C", "legendFormat": "scheduler — {{job}}" }
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 8, "x": 16, "y": 56 }
|
||||||
|
}
|
||||||
|
|
||||||
|
]
|
||||||
|
}
|
||||||
@@ -0,0 +1,734 @@
|
|||||||
|
{
|
||||||
|
"title": "etcd",
|
||||||
|
"uid": "okd-etcd",
|
||||||
|
"schemaVersion": 36,
|
||||||
|
"version": 1,
|
||||||
|
"refresh": "30s",
|
||||||
|
"time": { "from": "now-1h", "to": "now" },
|
||||||
|
"tags": ["okd", "etcd"],
|
||||||
|
"templating": {
|
||||||
|
"list": [
|
||||||
|
{
|
||||||
|
"name": "instance",
|
||||||
|
"type": "query",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"query": { "query": "label_values(etcd_server_has_leader, instance)", "refId": "A" },
|
||||||
|
"refresh": 2,
|
||||||
|
"includeAll": true,
|
||||||
|
"multi": true,
|
||||||
|
"allValue": ".*",
|
||||||
|
"label": "Instance",
|
||||||
|
"sort": 1,
|
||||||
|
"current": {},
|
||||||
|
"options": []
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"panels": [
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 1, "type": "stat", "title": "Cluster Members",
|
||||||
|
"description": "Total number of etcd members currently reporting metrics.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "count(etcd_server_has_leader)", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "red", "value": null },
|
||||||
|
{ "color": "yellow", "value": 1 },
|
||||||
|
{ "color": "green", "value": 3 }
|
||||||
|
]},
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 2, "type": "stat", "title": "Has Leader",
|
||||||
|
"description": "min() across all members. 0 = at least one member has no quorum — cluster is degraded.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "min(etcd_server_has_leader)", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "red", "value": null },
|
||||||
|
{ "color": "green", "value": 1 }
|
||||||
|
]},
|
||||||
|
"unit": "short", "noValue": "0",
|
||||||
|
"mappings": [
|
||||||
|
{ "type": "value", "options": {
|
||||||
|
"0": { "text": "NO LEADER", "color": "red" },
|
||||||
|
"1": { "text": "OK", "color": "green" }
|
||||||
|
}}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 3, "type": "stat", "title": "Leader Changes (1h)",
|
||||||
|
"description": "Number of leader elections in the last hour. ≥3 indicates cluster instability.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "sum(changes(etcd_server_leader_changes_seen_total[1h]))", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 1 },
|
||||||
|
{ "color": "red", "value": 3 }
|
||||||
|
]},
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 4, "type": "stat", "title": "DB Size (Max)",
|
||||||
|
"description": "Largest boltdb file size across all members. Default etcd quota is 8 GiB.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "max(etcd_mvcc_db_total_size_in_bytes)", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 2147483648 },
|
||||||
|
{ "color": "orange", "value": 5368709120 },
|
||||||
|
{ "color": "red", "value": 7516192768 }
|
||||||
|
]},
|
||||||
|
"unit": "bytes", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 5, "type": "stat", "title": "DB Fragmentation (Max)",
|
||||||
|
"description": "% of DB space that is allocated but unused. >50% → run etcdctl defrag.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "max((etcd_mvcc_db_total_size_in_bytes - etcd_mvcc_db_total_size_in_use_in_bytes) / etcd_mvcc_db_total_size_in_bytes * 100)",
|
||||||
|
"refId": "A", "legendFormat": ""
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 25 },
|
||||||
|
{ "color": "orange", "value": 50 },
|
||||||
|
{ "color": "red", "value": 75 }
|
||||||
|
]},
|
||||||
|
"unit": "percent", "noValue": "0", "decimals": 1
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 6, "type": "stat", "title": "Failed Proposals/s",
|
||||||
|
"description": "Rate of rejected Raft proposals. Any sustained non-zero value = cluster health problem.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "sum(rate(etcd_server_proposals_failed_total[5m]))", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "red", "value": 0.001 }
|
||||||
|
]},
|
||||||
|
"unit": "short", "noValue": "0", "decimals": 3
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 7, "type": "stat", "title": "WAL Fsync p99",
|
||||||
|
"description": "99th percentile WAL flush-to-disk time. >10ms is concerning; >100ms = serious I/O bottleneck.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) by (le))",
|
||||||
|
"refId": "A", "legendFormat": ""
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 0.01 },
|
||||||
|
{ "color": "orange", "value": 0.1 },
|
||||||
|
{ "color": "red", "value": 0.5 }
|
||||||
|
]},
|
||||||
|
"unit": "s", "noValue": "0", "decimals": 4
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 8, "type": "stat", "title": "Backend Commit p99",
|
||||||
|
"description": "99th percentile boltdb commit time. >25ms = warning; >100ms = critical backend I/O pressure.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) by (le))",
|
||||||
|
"refId": "A", "legendFormat": ""
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 0.025 },
|
||||||
|
{ "color": "orange", "value": 0.1 },
|
||||||
|
{ "color": "red", "value": 0.25 }
|
||||||
|
]},
|
||||||
|
"unit": "s", "noValue": "0", "decimals": 4
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 9, "type": "row", "title": "Cluster Health", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 10, "type": "timeseries", "title": "Has Leader per Instance",
|
||||||
|
"description": "1 = member has a leader; 0 = member lost quorum. A dip to 0 marks the exact moment of a leader election.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "etcd_server_has_leader{instance=~\"$instance\"}",
|
||||||
|
"refId": "A", "legendFormat": "{{instance}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short", "min": 0, "max": 1.1,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 20, "showPoints": "never", "spanNulls": false },
|
||||||
|
"mappings": [
|
||||||
|
{ "type": "value", "options": {
|
||||||
|
"0": { "text": "0 — no leader" },
|
||||||
|
"1": { "text": "1 — ok" }
|
||||||
|
}}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "none" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": [] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 6, "w": 8, "x": 0, "y": 5 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 11, "type": "timeseries", "title": "Leader Changes (cumulative)",
|
||||||
|
"description": "Monotonically increasing counter per member. A step jump = one leader election. Correlated jumps across members = cluster-wide event.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "etcd_server_leader_changes_seen_total{instance=~\"$instance\"}",
|
||||||
|
"refId": "A", "legendFormat": "{{instance}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "auto", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "none" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 6, "w": 8, "x": 8, "y": 5 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 12, "type": "timeseries", "title": "Slow Operations",
|
||||||
|
"description": "slow_apply: proposals applied slower than expected. slow_read_index: linearizable reads timing out. heartbeat_failures: Raft heartbeat send errors (network partition indicator).",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "rate(etcd_server_slow_apply_total{instance=~\"$instance\"}[5m])", "refId": "A", "legendFormat": "Slow Apply — {{instance}}" },
|
||||||
|
{ "expr": "rate(etcd_server_slow_read_indexes_total{instance=~\"$instance\"}[5m])", "refId": "B", "legendFormat": "Slow Read Index — {{instance}}" },
|
||||||
|
{ "expr": "rate(etcd_server_heartbeat_send_failures_total{instance=~\"$instance\"}[5m])", "refId": "C", "legendFormat": "Heartbeat Failures — {{instance}}" }
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short", "min": 0, "decimals": 3,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 6, "w": 8, "x": 16, "y": 5 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 13, "type": "row", "title": "gRPC Traffic", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 11 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 14, "type": "timeseries", "title": "gRPC Request Rate by Method",
|
||||||
|
"description": "Unary calls/s per RPC method. High Put/Txn = heavy write load. High Range = heavy read load. High Watch = many controller watchers.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum by(grpc_method)(rate(grpc_server_started_total{job=~\".*etcd.*\",grpc_type=\"unary\"}[5m]))",
|
||||||
|
"refId": "A", "legendFormat": "{{grpc_method}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "reqps", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 12 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 15, "type": "timeseries", "title": "gRPC Error Rate by Status Code",
|
||||||
|
"description": "Non-OK responses by gRPC status code. RESOURCE_EXHAUSTED = overloaded. UNAVAILABLE = leader election. DEADLINE_EXCEEDED = latency spike.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum by(grpc_code)(rate(grpc_server_handled_total{job=~\".*etcd.*\",grpc_code!=\"OK\"}[5m]))",
|
||||||
|
"refId": "A", "legendFormat": "{{grpc_code}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "reqps", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 12 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 16, "type": "timeseries", "title": "gRPC Request Latency (p50 / p95 / p99)",
|
||||||
|
"description": "Unary call handling duration. p99 > 100ms for Put/Txn indicates disk or CPU pressure. p99 > 500ms will cause kube-apiserver timeouts.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "histogram_quantile(0.50, sum(rate(grpc_server_handling_seconds_bucket{job=~\".*etcd.*\",grpc_type=\"unary\"}[5m])) by (le))", "refId": "A", "legendFormat": "p50" },
|
||||||
|
{ "expr": "histogram_quantile(0.95, sum(rate(grpc_server_handling_seconds_bucket{job=~\".*etcd.*\",grpc_type=\"unary\"}[5m])) by (le))", "refId": "B", "legendFormat": "p95" },
|
||||||
|
{ "expr": "histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~\".*etcd.*\",grpc_type=\"unary\"}[5m])) by (le))", "refId": "C", "legendFormat": "p99" }
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "s", "min": 0, "decimals": 4,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||||
|
},
|
||||||
|
"overrides": [
|
||||||
|
{ "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 12 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 17, "type": "row", "title": "Raft Proposals", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 20 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 18, "type": "timeseries", "title": "Proposals Committed vs Applied",
|
||||||
|
"description": "Committed = agreed by Raft quorum. Applied = persisted to boltdb. A widening gap between the two = backend apply backlog (disk too slow to keep up).",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "rate(etcd_server_proposals_committed_total{instance=~\"$instance\"}[5m])", "refId": "A", "legendFormat": "Committed — {{instance}}" },
|
||||||
|
{ "expr": "rate(etcd_server_proposals_applied_total{instance=~\"$instance\"}[5m])", "refId": "B", "legendFormat": "Applied — {{instance}}" }
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 8, "x": 0, "y": 21 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 19, "type": "timeseries", "title": "Proposals Pending",
|
||||||
|
"description": "In-flight Raft proposals not yet committed. Consistently high (>5) = cluster cannot keep up with write throughput.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "etcd_server_proposals_pending{instance=~\"$instance\"}",
|
||||||
|
"refId": "A", "legendFormat": "{{instance}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": {
|
||||||
|
"lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false,
|
||||||
|
"thresholdsStyle": { "mode": "line+area" }
|
||||||
|
},
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 5 },
|
||||||
|
{ "color": "red", "value": 10 }
|
||||||
|
]}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 8, "x": 8, "y": 21 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 20, "type": "timeseries", "title": "Failed Proposals Rate",
|
||||||
|
"description": "Raft proposals that were rejected. Root causes: quorum loss, leader timeout, network partition between members.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "rate(etcd_server_proposals_failed_total{instance=~\"$instance\"}[5m])",
|
||||||
|
"refId": "A", "legendFormat": "{{instance}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short", "min": 0, "decimals": 3,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": {
|
||||||
|
"lineWidth": 2, "fillOpacity": 20, "showPoints": "never", "spanNulls": false,
|
||||||
|
"thresholdsStyle": { "mode": "line" }
|
||||||
|
},
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "red", "value": 0.001 }
|
||||||
|
]}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 8, "x": 16, "y": 21 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 21, "type": "row", "title": "Disk I/O", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 28 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 22, "type": "timeseries", "title": "WAL Fsync Duration (p50 / p95 / p99) per Instance",
|
||||||
|
"description": "Time to flush the write-ahead log to disk. etcd is extremely sensitive to WAL latency. >10ms p99 = storage is the bottleneck. Correlates directly with Raft commit latency.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "histogram_quantile(0.50, sum by(le,instance)(rate(etcd_disk_wal_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "A", "legendFormat": "p50 — {{instance}}" },
|
||||||
|
{ "expr": "histogram_quantile(0.95, sum by(le,instance)(rate(etcd_disk_wal_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "B", "legendFormat": "p95 — {{instance}}" },
|
||||||
|
{ "expr": "histogram_quantile(0.99, sum by(le,instance)(rate(etcd_disk_wal_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "C", "legendFormat": "p99 — {{instance}}" }
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "s", "min": 0, "decimals": 4,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 29 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 23, "type": "timeseries", "title": "Backend Commit Duration (p50 / p95 / p99) per Instance",
|
||||||
|
"description": "Time for boltdb to commit a batch transaction. A spike here while WAL is healthy = backend I/O saturation or boltdb lock contention. Triggers apply backlog.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "histogram_quantile(0.50, sum by(le,instance)(rate(etcd_disk_backend_commit_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "A", "legendFormat": "p50 — {{instance}}" },
|
||||||
|
{ "expr": "histogram_quantile(0.95, sum by(le,instance)(rate(etcd_disk_backend_commit_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "B", "legendFormat": "p95 — {{instance}}" },
|
||||||
|
{ "expr": "histogram_quantile(0.99, sum by(le,instance)(rate(etcd_disk_backend_commit_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "C", "legendFormat": "p99 — {{instance}}" }
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "s", "min": 0, "decimals": 4,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 29 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 24, "type": "row", "title": "Network (Peer & Client)", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 37 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 25, "type": "timeseries", "title": "Peer RX Rate",
|
||||||
|
"description": "Bytes received from Raft peers (log replication + heartbeats). A burst during a quiet period = large snapshot being streamed to a recovering member.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "rate(etcd_network_peer_received_bytes_total{instance=~\"$instance\"}[5m])",
|
||||||
|
"refId": "A", "legendFormat": "{{instance}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "Bps", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 6, "x": 0, "y": 38 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 26, "type": "timeseries", "title": "Peer TX Rate",
|
||||||
|
"description": "Bytes sent to Raft peers. Leader will have higher TX than followers (it replicates entries to all members).",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "rate(etcd_network_peer_sent_bytes_total{instance=~\"$instance\"}[5m])",
|
||||||
|
"refId": "A", "legendFormat": "{{instance}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "Bps", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 6, "x": 6, "y": 38 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 27, "type": "timeseries", "title": "Client gRPC Received",
|
||||||
|
"description": "Bytes received from API clients (kube-apiserver, operators). Spike = large write burst from controllers or kubectl apply.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "rate(etcd_network_client_grpc_received_bytes_total{instance=~\"$instance\"}[5m])",
|
||||||
|
"refId": "A", "legendFormat": "{{instance}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "Bps", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 6, "x": 12, "y": 38 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 28, "type": "timeseries", "title": "Client gRPC Sent",
|
||||||
|
"description": "Bytes sent to API clients (responses + watch events). Persistently high = many active Watch streams or large objects being served.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "rate(etcd_network_client_grpc_sent_bytes_total{instance=~\"$instance\"}[5m])",
|
||||||
|
"refId": "A", "legendFormat": "{{instance}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "Bps", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 6, "x": 18, "y": 38 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 29, "type": "row", "title": "DB Size & Process Resources", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 45 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 30, "type": "timeseries", "title": "DB Total vs In-Use Size per Instance",
|
||||||
|
"description": "Total = allocated boltdb file size. In Use = live key data. The gap between them = fragmentation. Steady growth of Total = compaction not keeping up with key churn.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "etcd_mvcc_db_total_size_in_bytes{instance=~\"$instance\"}", "refId": "A", "legendFormat": "Total — {{instance}}" },
|
||||||
|
{ "expr": "etcd_mvcc_db_total_size_in_use_in_bytes{instance=~\"$instance\"}", "refId": "B", "legendFormat": "In Use — {{instance}}" }
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "bytes", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 46 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 31, "type": "timeseries", "title": "Process Resident Memory (RSS)",
|
||||||
|
"description": "Physical RAM consumed by the etcd process. Monotonically growing RSS = memory leak or oversized watch cache. Typical healthy range: 500 MiB–2 GiB depending on cluster size.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "etcd_process_resident_memory_bytes{instance=~\"$instance\"}",
|
||||||
|
"refId": "A", "legendFormat": "{{instance}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "bytes", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 46 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 32, "type": "timeseries", "title": "Open File Descriptors vs Limit",
|
||||||
|
"description": "Open FD count (solid) and process FD limit (dashed). Approaching the limit will cause WAL file creation and new client connections to fail.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "etcd_process_open_fds{instance=~\"$instance\"}", "refId": "A", "legendFormat": "Open — {{instance}}" },
|
||||||
|
{ "expr": "etcd_process_max_fds{instance=~\"$instance\"}", "refId": "B", "legendFormat": "Limit — {{instance}}" }
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||||
|
},
|
||||||
|
"overrides": [
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byRegexp", "options": "^Limit.*" },
|
||||||
|
"properties": [
|
||||||
|
{ "id": "custom.lineWidth", "value": 1 },
|
||||||
|
{ "id": "custom.lineStyle", "value": { "fill": "dash", "dash": [6, 4] } },
|
||||||
|
{ "id": "custom.fillOpacity","value": 0 }
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 46 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 33, "type": "row", "title": "Snapshots", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 54 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 34, "type": "timeseries", "title": "Snapshot Save Duration (p50 / p95 / p99)",
|
||||||
|
"description": "Time to write a full snapshot of the boltdb to disk. Slow saves delay Raft log compaction, causing the WAL to grow unboundedly and members to fall further behind.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "histogram_quantile(0.50, sum by(le)(rate(etcd_debugging_snap_save_total_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "A", "legendFormat": "p50" },
|
||||||
|
{ "expr": "histogram_quantile(0.95, sum by(le)(rate(etcd_debugging_snap_save_total_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "B", "legendFormat": "p95" },
|
||||||
|
{ "expr": "histogram_quantile(0.99, sum by(le)(rate(etcd_debugging_snap_save_total_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "C", "legendFormat": "p99" }
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "s", "min": 0, "decimals": 3,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||||
|
},
|
||||||
|
"overrides": [
|
||||||
|
{ "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 12, "x": 0, "y": 55 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 35, "type": "timeseries", "title": "Snapshot DB Fsync Duration (p50 / p95 / p99)",
|
||||||
|
"description": "Time to fsync the snapshot file itself. Distinct from WAL fsync: this is flushing the entire boltdb copy to disk after a snapshot is taken.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "histogram_quantile(0.50, sum by(le)(rate(etcd_snap_db_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "A", "legendFormat": "p50" },
|
||||||
|
{ "expr": "histogram_quantile(0.95, sum by(le)(rate(etcd_snap_db_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "B", "legendFormat": "p95" },
|
||||||
|
{ "expr": "histogram_quantile(0.99, sum by(le)(rate(etcd_snap_db_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "C", "legendFormat": "p99" }
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "s", "min": 0, "decimals": 3,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||||
|
},
|
||||||
|
"overrides": [
|
||||||
|
{ "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 12, "x": 12, "y": 55 }
|
||||||
|
}
|
||||||
|
|
||||||
|
]
|
||||||
|
}
|
||||||
@@ -0,0 +1,945 @@
|
|||||||
|
{
|
||||||
|
"title": "Networking",
|
||||||
|
"uid": "okd-networking",
|
||||||
|
"schemaVersion": 36,
|
||||||
|
"version": 1,
|
||||||
|
"refresh": "30s",
|
||||||
|
"time": { "from": "now-1h", "to": "now" },
|
||||||
|
"tags": ["okd", "networking"],
|
||||||
|
"templating": {
|
||||||
|
"list": [
|
||||||
|
{
|
||||||
|
"name": "namespace",
|
||||||
|
"type": "query",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"query": { "query": "label_values(kube_pod_info, namespace)", "refId": "A" },
|
||||||
|
"refresh": 2,
|
||||||
|
"includeAll": true,
|
||||||
|
"multi": true,
|
||||||
|
"allValue": ".*",
|
||||||
|
"label": "Namespace",
|
||||||
|
"sort": 1,
|
||||||
|
"current": {},
|
||||||
|
"options": []
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"panels": [
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 1, "type": "stat", "title": "Network RX Rate",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum(rate(container_network_receive_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||||
|
"refId": "A", "legendFormat": ""
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] },
|
||||||
|
"unit": "Bps", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 2, "type": "stat", "title": "Network TX Rate",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum(rate(container_network_transmit_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||||
|
"refId": "A", "legendFormat": ""
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] },
|
||||||
|
"unit": "Bps", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 3, "type": "stat", "title": "RX Errors/s",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum(rate(container_network_receive_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||||
|
"refId": "A", "legendFormat": ""
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||||
|
"unit": "pps", "noValue": "0", "decimals": 2
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 4, "type": "stat", "title": "TX Errors/s",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum(rate(container_network_transmit_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||||
|
"refId": "A", "legendFormat": ""
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||||
|
"unit": "pps", "noValue": "0", "decimals": 2
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 5, "type": "stat", "title": "RX Drops/s",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum(rate(container_network_receive_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||||
|
"refId": "A", "legendFormat": ""
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "orange", "value": 1 }] },
|
||||||
|
"unit": "pps", "noValue": "0", "decimals": 2
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 6, "type": "stat", "title": "TX Drops/s",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum(rate(container_network_transmit_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||||
|
"refId": "A", "legendFormat": ""
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "orange", "value": 1 }] },
|
||||||
|
"unit": "pps", "noValue": "0", "decimals": 2
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 7, "type": "stat", "title": "DNS Queries/s",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum(rate(coredns_dns_requests_total[5m]))",
|
||||||
|
"refId": "A", "legendFormat": ""
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] },
|
||||||
|
"unit": "reqps", "noValue": "0", "decimals": 1
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 8, "type": "stat", "title": "DNS Error %",
|
||||||
|
"description": "Percentage of DNS responses with non-NOERROR rcode over the last 5 minutes.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum(rate(coredns_dns_responses_total{rcode!=\"NOERROR\"}[5m])) / sum(rate(coredns_dns_responses_total[5m])) * 100",
|
||||||
|
"refId": "A", "legendFormat": ""
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 1 },
|
||||||
|
{ "color": "red", "value": 5 }
|
||||||
|
]},
|
||||||
|
"unit": "percent", "noValue": "0", "decimals": 2
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 9, "type": "row", "title": "Network I/O", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 10, "type": "timeseries", "title": "Receive Rate by Namespace",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum by(namespace)(rate(container_network_receive_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||||
|
"refId": "A", "legendFormat": "{{namespace}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "Bps", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 5 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 11, "type": "timeseries", "title": "Transmit Rate by Namespace",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum by(namespace)(rate(container_network_transmit_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||||
|
"refId": "A", "legendFormat": "{{namespace}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "Bps", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 5 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 12, "type": "row", "title": "Top Pod Consumers", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 13, "type": "timeseries", "title": "Top 10 Pods — RX Rate",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "topk(10, sum by(namespace,pod)(rate(container_network_receive_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m])))",
|
||||||
|
"refId": "A", "legendFormat": "{{namespace}} / {{pod}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "Bps", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "auto", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 14 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 14, "type": "timeseries", "title": "Top 10 Pods — TX Rate",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "topk(10, sum by(namespace,pod)(rate(container_network_transmit_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m])))",
|
||||||
|
"refId": "A", "legendFormat": "{{namespace}} / {{pod}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "Bps", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "auto", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 14 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 15,
|
||||||
|
"type": "table",
|
||||||
|
"title": "Pod Network I/O Summary",
|
||||||
|
"description": "Current RX/TX rates, errors and drops per pod. Sorted by RX rate descending.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum by(namespace,pod)(rate(container_network_receive_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||||
|
"refId": "A", "instant": true, "format": "table", "legendFormat": ""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum by(namespace,pod)(rate(container_network_transmit_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||||
|
"refId": "B", "instant": true, "format": "table", "legendFormat": ""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum by(namespace,pod)(rate(container_network_receive_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||||
|
"refId": "C", "instant": true, "format": "table", "legendFormat": ""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum by(namespace,pod)(rate(container_network_transmit_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||||
|
"refId": "D", "instant": true, "format": "table", "legendFormat": ""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum by(namespace,pod)(rate(container_network_receive_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||||
|
"refId": "E", "instant": true, "format": "table", "legendFormat": ""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum by(namespace,pod)(rate(container_network_transmit_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||||
|
"refId": "F", "instant": true, "format": "table", "legendFormat": ""
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"transformations": [
|
||||||
|
{
|
||||||
|
"id": "filterFieldsByName",
|
||||||
|
"options": { "include": { "names": ["namespace", "pod", "Value"] } }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "joinByField",
|
||||||
|
"options": { "byField": "pod", "mode": "outer" }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "organize",
|
||||||
|
"options": {
|
||||||
|
"excludeByName": {
|
||||||
|
"namespace 1": true,
|
||||||
|
"namespace 2": true,
|
||||||
|
"namespace 3": true,
|
||||||
|
"namespace 4": true,
|
||||||
|
"namespace 5": true
|
||||||
|
},
|
||||||
|
"renameByName": {
|
||||||
|
"namespace": "Namespace",
|
||||||
|
"pod": "Pod",
|
||||||
|
"Value": "RX Rate",
|
||||||
|
"Value 1": "TX Rate",
|
||||||
|
"Value 2": "RX Errors/s",
|
||||||
|
"Value 3": "TX Errors/s",
|
||||||
|
"Value 4": "RX Drops/s",
|
||||||
|
"Value 5": "TX Drops/s"
|
||||||
|
},
|
||||||
|
"indexByName": {
|
||||||
|
"namespace": 0,
|
||||||
|
"pod": 1,
|
||||||
|
"Value": 2,
|
||||||
|
"Value 1": 3,
|
||||||
|
"Value 2": 4,
|
||||||
|
"Value 3": 5,
|
||||||
|
"Value 4": 6,
|
||||||
|
"Value 5": 7
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "sortBy",
|
||||||
|
"options": { "fields": [{ "displayName": "RX Rate", "desc": true }] }
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||||
|
"overrides": [
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Namespace" },
|
||||||
|
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 160 }]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Pod" },
|
||||||
|
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 260 }]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byRegexp", "options": "^RX Rate$|^TX Rate$" },
|
||||||
|
"properties": [
|
||||||
|
{ "id": "unit", "value": "Bps" },
|
||||||
|
{ "id": "custom.displayMode", "value": "color-background-solid" },
|
||||||
|
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 10000000 },
|
||||||
|
{ "color": "orange", "value": 100000000 },
|
||||||
|
{ "color": "red", "value": 500000000 }
|
||||||
|
]}}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byRegexp", "options": "^RX Errors/s$|^TX Errors/s$" },
|
||||||
|
"properties": [
|
||||||
|
{ "id": "unit", "value": "pps" },
|
||||||
|
{ "id": "decimals", "value": 3 },
|
||||||
|
{ "id": "custom.displayMode", "value": "color-background" },
|
||||||
|
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "red", "value": 0.001 }
|
||||||
|
]}}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byRegexp", "options": "^RX Drops/s$|^TX Drops/s$" },
|
||||||
|
"properties": [
|
||||||
|
{ "id": "unit", "value": "pps" },
|
||||||
|
{ "id": "decimals", "value": 3 },
|
||||||
|
{ "id": "custom.displayMode", "value": "color-background" },
|
||||||
|
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "orange", "value": 0.001 }
|
||||||
|
]}}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {},
|
||||||
|
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 22 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 16, "type": "row", "title": "Errors & Packet Loss", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 30 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 17, "type": "timeseries", "title": "RX Errors by Namespace",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum by(namespace)(rate(container_network_receive_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||||
|
"refId": "A", "legendFormat": "{{namespace}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "pps", "min": 0, "decimals": 3,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 12, "x": 0, "y": 31 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 18, "type": "timeseries", "title": "TX Errors by Namespace",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum by(namespace)(rate(container_network_transmit_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||||
|
"refId": "A", "legendFormat": "{{namespace}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "pps", "min": 0, "decimals": 3,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 12, "x": 12, "y": 31 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 19, "type": "timeseries", "title": "RX Packet Drops by Namespace",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum by(namespace)(rate(container_network_receive_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||||
|
"refId": "A", "legendFormat": "{{namespace}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "pps", "min": 0, "decimals": 3,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 12, "x": 0, "y": 38 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 20, "type": "timeseries", "title": "TX Packet Drops by Namespace",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum by(namespace)(rate(container_network_transmit_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||||
|
"refId": "A", "legendFormat": "{{namespace}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "pps", "min": 0, "decimals": 3,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 12, "x": 12, "y": 38 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 21, "type": "row", "title": "DNS (CoreDNS)", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 45 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 22, "type": "timeseries", "title": "DNS Request Rate by Query Type",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum by(type)(rate(coredns_dns_requests_total[5m]))",
|
||||||
|
"refId": "A", "legendFormat": "{{type}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "reqps", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 46 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 23, "type": "timeseries", "title": "DNS Response Rate by Rcode",
|
||||||
|
"description": "NOERROR = healthy. NXDOMAIN = name not found. SERVFAIL = upstream error.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum by(rcode)(rate(coredns_dns_responses_total[5m]))",
|
||||||
|
"refId": "A", "legendFormat": "{{rcode}}"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "reqps", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||||
|
},
|
||||||
|
"overrides": [
|
||||||
|
{ "matcher": { "id": "byName", "options": "NOERROR" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "NXDOMAIN" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "SERVFAIL" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "REFUSED" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 46 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 24, "type": "timeseries", "title": "DNS Request Latency (p50 / p95 / p99)",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "histogram_quantile(0.50, sum(rate(coredns_dns_request_duration_seconds_bucket[5m])) by (le))",
|
||||||
|
"refId": "A", "legendFormat": "p50"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "histogram_quantile(0.95, sum(rate(coredns_dns_request_duration_seconds_bucket[5m])) by (le))",
|
||||||
|
"refId": "B", "legendFormat": "p95"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "histogram_quantile(0.99, sum(rate(coredns_dns_request_duration_seconds_bucket[5m])) by (le))",
|
||||||
|
"refId": "C", "legendFormat": "p99"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "s", "min": 0, "decimals": 4,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||||
|
},
|
||||||
|
"overrides": [
|
||||||
|
{ "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 46 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 25, "type": "timeseries", "title": "DNS Cache Hit Ratio (%)",
|
||||||
|
"description": "High hit ratio = CoreDNS is serving responses from cache, reducing upstream load.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum(rate(coredns_cache_hits_total[5m])) / (sum(rate(coredns_cache_hits_total[5m])) + sum(rate(coredns_cache_misses_total[5m]))) * 100",
|
||||||
|
"refId": "A", "legendFormat": "Cache Hit %"
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent", "min": 0, "max": 100,
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "red", "value": null },
|
||||||
|
{ "color": "yellow", "value": 50 },
|
||||||
|
{ "color": "green", "value": 80 }
|
||||||
|
]},
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 20, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "single" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "lastNotNull"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 12, "x": 0, "y": 54 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 26, "type": "timeseries", "title": "DNS Forward Request Rate",
|
||||||
|
"description": "Queries CoreDNS is forwarding upstream. Spike here with cache miss spike = upstream DNS pressure.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum(rate(coredns_forward_requests_total[5m]))",
|
||||||
|
"refId": "A", "legendFormat": "Forward Requests/s"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum(rate(coredns_forward_responses_duration_seconds_count[5m]))",
|
||||||
|
"refId": "B", "legendFormat": "Forward Responses/s"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "reqps", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 7, "w": 12, "x": 12, "y": 54 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 27, "type": "row", "title": "Services & Endpoints", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 61 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 28, "type": "stat", "title": "Total Services",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "count(kube_service_info{namespace=~\"$namespace\"})",
|
||||||
|
"refId": "A", "legendFormat": ""
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] },
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 8, "x": 0, "y": 62 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 29, "type": "stat", "title": "Endpoint Addresses Available",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum(kube_endpoint_address_available{namespace=~\"$namespace\"})",
|
||||||
|
"refId": "A", "legendFormat": ""
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] },
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 8, "x": 8, "y": 62 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 30, "type": "stat", "title": "Endpoint Addresses Not Ready",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{
|
||||||
|
"expr": "sum(kube_endpoint_address_not_ready{namespace=~\"$namespace\"}) or vector(0)",
|
||||||
|
"refId": "A", "legendFormat": ""
|
||||||
|
}],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 8, "x": 16, "y": 62 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 31,
|
||||||
|
"type": "table",
|
||||||
|
"title": "Endpoint Availability",
|
||||||
|
"description": "Per-endpoint available vs not-ready address counts. Red Not Ready = pods backing this service are unhealthy.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum by(namespace,endpoint)(kube_endpoint_address_available{namespace=~\"$namespace\"})",
|
||||||
|
"refId": "A", "instant": true, "format": "table", "legendFormat": ""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum by(namespace,endpoint)(kube_endpoint_address_not_ready{namespace=~\"$namespace\"})",
|
||||||
|
"refId": "B", "instant": true, "format": "table", "legendFormat": ""
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"transformations": [
|
||||||
|
{
|
||||||
|
"id": "filterFieldsByName",
|
||||||
|
"options": { "include": { "names": ["namespace", "endpoint", "Value"] } }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "joinByField",
|
||||||
|
"options": { "byField": "endpoint", "mode": "outer" }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "organize",
|
||||||
|
"options": {
|
||||||
|
"excludeByName": { "namespace 1": true },
|
||||||
|
"renameByName": {
|
||||||
|
"namespace": "Namespace",
|
||||||
|
"endpoint": "Endpoint",
|
||||||
|
"Value": "Available",
|
||||||
|
"Value 1": "Not Ready"
|
||||||
|
},
|
||||||
|
"indexByName": {
|
||||||
|
"namespace": 0,
|
||||||
|
"endpoint": 1,
|
||||||
|
"Value": 2,
|
||||||
|
"Value 1": 3
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "sortBy",
|
||||||
|
"options": { "fields": [{ "displayName": "Not Ready", "desc": true }] }
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||||
|
"overrides": [
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Namespace" },
|
||||||
|
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 180 }]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Endpoint" },
|
||||||
|
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 220 }]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Available" },
|
||||||
|
"properties": [
|
||||||
|
{ "id": "custom.displayMode", "value": "color-background" },
|
||||||
|
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] } }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Not Ready" },
|
||||||
|
"properties": [
|
||||||
|
{ "id": "custom.displayMode", "value": "color-background" },
|
||||||
|
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] } }
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {},
|
||||||
|
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 66 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 32, "type": "row", "title": "OKD Router / Ingress (HAProxy)", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 74 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 33, "type": "timeseries", "title": "Router HTTP Request Rate by Code",
|
||||||
|
"description": "Requires HAProxy router metrics to be scraped (port 1936). OKD exposes these via the openshift-ingress ServiceMonitor.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum by(code)(rate(haproxy_backend_http_responses_total[5m]))",
|
||||||
|
"refId": "A", "legendFormat": "HTTP {{code}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "reqps", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||||
|
},
|
||||||
|
"overrides": [
|
||||||
|
{ "matcher": { "id": "byName", "options": "HTTP 2xx" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "HTTP 4xx" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "HTTP 5xx" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 75 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 34, "type": "timeseries", "title": "Router 4xx + 5xx Error Rate (%)",
|
||||||
|
"description": "Client error (4xx) and server error (5xx) rates as a percentage of all requests.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum(rate(haproxy_backend_http_responses_total{code=\"4xx\"}[5m])) / sum(rate(haproxy_backend_http_responses_total[5m])) * 100",
|
||||||
|
"refId": "A", "legendFormat": "4xx %"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum(rate(haproxy_backend_http_responses_total{code=\"5xx\"}[5m])) / sum(rate(haproxy_backend_http_responses_total[5m])) * 100",
|
||||||
|
"refId": "B", "legendFormat": "5xx %"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 1 },
|
||||||
|
{ "color": "red", "value": 5 }
|
||||||
|
]}
|
||||||
|
},
|
||||||
|
"overrides": [
|
||||||
|
{ "matcher": { "id": "byName", "options": "4xx %" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "5xx %" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 75 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 35, "type": "timeseries", "title": "Router Bytes In / Out",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum(rate(haproxy_frontend_bytes_in_total[5m]))",
|
||||||
|
"refId": "A", "legendFormat": "Bytes In"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum(rate(haproxy_frontend_bytes_out_total[5m]))",
|
||||||
|
"refId": "B", "legendFormat": "Bytes Out"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "Bps", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||||
|
},
|
||||||
|
"overrides": [
|
||||||
|
{ "matcher": { "id": "byName", "options": "Bytes In" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "Bytes Out" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 83 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 36,
|
||||||
|
"type": "table",
|
||||||
|
"title": "Router Backend Server Status",
|
||||||
|
"description": "HAProxy backend servers (routes). Value 0 = DOWN, 1 = UP.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "haproxy_server_up",
|
||||||
|
"refId": "A", "instant": true, "format": "table", "legendFormat": ""
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"transformations": [
|
||||||
|
{
|
||||||
|
"id": "filterFieldsByName",
|
||||||
|
"options": { "include": { "names": ["proxy", "server", "Value"] } }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "organize",
|
||||||
|
"options": {
|
||||||
|
"excludeByName": {},
|
||||||
|
"renameByName": {
|
||||||
|
"proxy": "Backend",
|
||||||
|
"server": "Server",
|
||||||
|
"Value": "Status"
|
||||||
|
},
|
||||||
|
"indexByName": { "proxy": 0, "server": 1, "Value": 2 }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "sortBy",
|
||||||
|
"options": { "fields": [{ "displayName": "Status", "desc": false }] }
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||||
|
"overrides": [
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Backend" },
|
||||||
|
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 260 }]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Server" },
|
||||||
|
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 180 }]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Status" },
|
||||||
|
"properties": [
|
||||||
|
{ "id": "custom.displayMode", "value": "color-background" },
|
||||||
|
{ "id": "mappings", "value": [
|
||||||
|
{ "type": "value", "options": { "0": { "text": "DOWN", "color": "red" } } },
|
||||||
|
{ "type": "value", "options": { "1": { "text": "UP", "color": "green" } } }
|
||||||
|
]},
|
||||||
|
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [
|
||||||
|
{ "color": "red", "value": null },
|
||||||
|
{ "color": "green", "value": 1 }
|
||||||
|
]}}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 83 }
|
||||||
|
}
|
||||||
|
|
||||||
|
]
|
||||||
|
}
|
||||||
@@ -0,0 +1,627 @@
|
|||||||
|
{
|
||||||
|
"title": "Node Health",
|
||||||
|
"uid": "okd-node-health",
|
||||||
|
"schemaVersion": 36,
|
||||||
|
"version": 2,
|
||||||
|
"refresh": "30s",
|
||||||
|
"time": { "from": "now-1h", "to": "now" },
|
||||||
|
"tags": ["okd", "node", "health"],
|
||||||
|
"templating": {
|
||||||
|
"list": [
|
||||||
|
{
|
||||||
|
"name": "node",
|
||||||
|
"type": "query",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"query": { "query": "label_values(kube_node_info, node)", "refId": "A" },
|
||||||
|
"refresh": 2,
|
||||||
|
"includeAll": true,
|
||||||
|
"multi": true,
|
||||||
|
"allValue": ".*",
|
||||||
|
"label": "Node",
|
||||||
|
"sort": 1,
|
||||||
|
"current": {},
|
||||||
|
"options": []
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"panels": [
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 1,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Total Nodes",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "count(kube_node_info{node=~\"$node\"})", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] },
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 2,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Ready Nodes",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"$node\"} == 1)", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] },
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 3,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Not Ready Nodes",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"false\",node=~\"$node\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 4,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Memory Pressure",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"MemoryPressure\",status=\"true\",node=~\"$node\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 5,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Disk Pressure",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"DiskPressure\",status=\"true\",node=~\"$node\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 6,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "PID Pressure",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"PIDPressure\",status=\"true\",node=~\"$node\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "orange", "value": 1 }] },
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 7,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Unschedulable",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "count(kube_node_spec_unschedulable{node=~\"$node\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 1 }] },
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 8,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Kubelet Up",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "count(up{job=\"kubelet\",metrics_path=\"/metrics\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] },
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 9,
|
||||||
|
"type": "table",
|
||||||
|
"title": "Node Conditions",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum by(node) (kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"$node\"})",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{node}}",
|
||||||
|
"instant": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum by(node) (kube_node_status_condition{condition=\"MemoryPressure\",status=\"true\",node=~\"$node\"})",
|
||||||
|
"refId": "B",
|
||||||
|
"legendFormat": "{{node}}",
|
||||||
|
"instant": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum by(node) (kube_node_status_condition{condition=\"DiskPressure\",status=\"true\",node=~\"$node\"})",
|
||||||
|
"refId": "C",
|
||||||
|
"legendFormat": "{{node}}",
|
||||||
|
"instant": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum by(node) (kube_node_status_condition{condition=\"PIDPressure\",status=\"true\",node=~\"$node\"})",
|
||||||
|
"refId": "D",
|
||||||
|
"legendFormat": "{{node}}",
|
||||||
|
"instant": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum by(node) (kube_node_spec_unschedulable{node=~\"$node\"})",
|
||||||
|
"refId": "E",
|
||||||
|
"legendFormat": "{{node}}",
|
||||||
|
"instant": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"transformations": [
|
||||||
|
{
|
||||||
|
"id": "labelsToFields",
|
||||||
|
"options": { "mode": "columns" }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "joinByField",
|
||||||
|
"options": { "byField": "node", "mode": "outer" }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "organize",
|
||||||
|
"options": {
|
||||||
|
"excludeByName": {
|
||||||
|
"Time": true,
|
||||||
|
"Time 1": true,
|
||||||
|
"Time 2": true,
|
||||||
|
"Time 3": true,
|
||||||
|
"Time 4": true,
|
||||||
|
"Time 5": true
|
||||||
|
},
|
||||||
|
"renameByName": {
|
||||||
|
"node": "Node",
|
||||||
|
"Value #A": "Ready",
|
||||||
|
"Value #B": "Mem Pressure",
|
||||||
|
"Value #C": "Disk Pressure",
|
||||||
|
"Value #D": "PID Pressure",
|
||||||
|
"Value #E": "Unschedulable"
|
||||||
|
},
|
||||||
|
"indexByName": {
|
||||||
|
"node": 0,
|
||||||
|
"Value #A": 1,
|
||||||
|
"Value #B": 2,
|
||||||
|
"Value #C": 3,
|
||||||
|
"Value #D": 4,
|
||||||
|
"Value #E": 5
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"custom": { "displayMode": "color-background", "align": "center" }
|
||||||
|
},
|
||||||
|
"overrides": [
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Node" },
|
||||||
|
"properties": [
|
||||||
|
{ "id": "custom.displayMode", "value": "auto" },
|
||||||
|
{ "id": "custom.align", "value": "left" },
|
||||||
|
{ "id": "custom.width", "value": 200 }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Ready" },
|
||||||
|
"properties": [
|
||||||
|
{
|
||||||
|
"id": "thresholds",
|
||||||
|
"value": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] }
|
||||||
|
},
|
||||||
|
{ "id": "custom.displayMode", "value": "color-background" },
|
||||||
|
{
|
||||||
|
"id": "mappings",
|
||||||
|
"value": [
|
||||||
|
{
|
||||||
|
"type": "value",
|
||||||
|
"options": {
|
||||||
|
"0": { "text": "✗ Not Ready", "color": "red", "index": 0 },
|
||||||
|
"1": { "text": "✓ Ready", "color": "green", "index": 1 }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byRegexp", "options": ".*Pressure" },
|
||||||
|
"properties": [
|
||||||
|
{
|
||||||
|
"id": "thresholds",
|
||||||
|
"value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] }
|
||||||
|
},
|
||||||
|
{ "id": "custom.displayMode", "value": "color-background" },
|
||||||
|
{
|
||||||
|
"id": "mappings",
|
||||||
|
"value": [
|
||||||
|
{
|
||||||
|
"type": "value",
|
||||||
|
"options": {
|
||||||
|
"0": { "text": "✓ OK", "color": "green", "index": 0 },
|
||||||
|
"1": { "text": "⚠ Active", "color": "red", "index": 1 }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Unschedulable" },
|
||||||
|
"properties": [
|
||||||
|
{
|
||||||
|
"id": "thresholds",
|
||||||
|
"value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 1 }] }
|
||||||
|
},
|
||||||
|
{ "id": "custom.displayMode", "value": "color-background" },
|
||||||
|
{
|
||||||
|
"id": "mappings",
|
||||||
|
"value": [
|
||||||
|
{
|
||||||
|
"type": "value",
|
||||||
|
"options": {
|
||||||
|
"0": { "text": "✓ Schedulable", "color": "green", "index": 0 },
|
||||||
|
"1": { "text": "⚠ Cordoned", "color": "yellow", "index": 1 }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": { "sortBy": [{ "displayName": "Node", "desc": false }] },
|
||||||
|
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 4 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 10,
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "CPU Usage per Node (%)",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "100 * (1 - avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{instance}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent", "min": 0, "max": 100,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 85 }] }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 16, "x": 0, "y": 12 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 11,
|
||||||
|
"type": "bargauge",
|
||||||
|
"title": "CPU Usage \u2014 Current",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "100 * (1 - avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{instance}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent", "min": 0, "max": 100,
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 85 }] }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"orientation": "horizontal",
|
||||||
|
"displayMode": "gradient",
|
||||||
|
"showUnfilled": true,
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 12 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 12,
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Memory Usage per Node (%)",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{instance}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent", "min": 0, "max": 100,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 75 }, { "color": "red", "value": 90 }] }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 16, "x": 0, "y": 20 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 13,
|
||||||
|
"type": "bargauge",
|
||||||
|
"title": "Memory Usage \u2014 Current",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{instance}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent", "min": 0, "max": 100,
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 75 }, { "color": "red", "value": 90 }] }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"orientation": "horizontal",
|
||||||
|
"displayMode": "gradient",
|
||||||
|
"showUnfilled": true,
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 20 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 14,
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Root Disk Usage per Node (%)",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "100 * (1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"}))",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{instance}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent", "min": 0, "max": 100,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 85 }] }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 16, "x": 0, "y": 28 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 15,
|
||||||
|
"type": "bargauge",
|
||||||
|
"title": "Root Disk Usage \u2014 Current",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "100 * (1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"}))",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{instance}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent", "min": 0, "max": 100,
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 85 }] }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"orientation": "horizontal",
|
||||||
|
"displayMode": "gradient",
|
||||||
|
"showUnfilled": true,
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 28 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 16,
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Network Traffic per Node",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum by(instance) (rate(node_network_receive_bytes_total{device!~\"lo|veth.*|tun.*|ovn.*|br.*\"}[5m]))",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "rx {{instance}}"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum by(instance) (rate(node_network_transmit_bytes_total{device!~\"lo|veth.*|tun.*|ovn.*|br.*\"}[5m]))",
|
||||||
|
"refId": "B",
|
||||||
|
"legendFormat": "tx {{instance}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "Bps",
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 36 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 17,
|
||||||
|
"type": "bargauge",
|
||||||
|
"title": "Pods per Node",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "count by(node) (kube_pod_info{node=~\"$node\"})",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{node}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short",
|
||||||
|
"min": 0,
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 100 },
|
||||||
|
{ "color": "red", "value": 200 }
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"orientation": "horizontal",
|
||||||
|
"displayMode": "gradient",
|
||||||
|
"showUnfilled": true,
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 36 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 18,
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "System Load Average (1m / 5m) per Node",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "node_load1",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "1m \u2014 {{instance}}"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "node_load5",
|
||||||
|
"refId": "B",
|
||||||
|
"legendFormat": "5m \u2014 {{instance}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short",
|
||||||
|
"min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 44 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 19,
|
||||||
|
"type": "bargauge",
|
||||||
|
"title": "Node Uptime",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "time() - node_boot_time_seconds",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{instance}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "s",
|
||||||
|
"min": 0,
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "red", "value": null },
|
||||||
|
{ "color": "yellow", "value": 300 },
|
||||||
|
{ "color": "green", "value": 3600 }
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"orientation": "horizontal",
|
||||||
|
"displayMode": "gradient",
|
||||||
|
"showUnfilled": false,
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 44 }
|
||||||
|
}
|
||||||
|
|
||||||
|
]
|
||||||
|
}
|
||||||
@@ -0,0 +1,596 @@
|
|||||||
|
{
|
||||||
|
"title": "Storage Health",
|
||||||
|
"uid": "storage-health",
|
||||||
|
"schemaVersion": 36,
|
||||||
|
"version": 1,
|
||||||
|
"refresh": "30s",
|
||||||
|
"time": { "from": "now-1h", "to": "now" },
|
||||||
|
"panels": [
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "row",
|
||||||
|
"id": 1,
|
||||||
|
"title": "PVC / PV Status",
|
||||||
|
"collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "stat",
|
||||||
|
"id": 2,
|
||||||
|
"title": "Bound PVCs",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Bound\"}) or vector(0)",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [{ "color": "green", "value": null }]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||||
|
"colorMode": "background",
|
||||||
|
"graphMode": "none",
|
||||||
|
"textMode": "auto"
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 5, "w": 4, "x": 0, "y": 1 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "stat",
|
||||||
|
"id": 3,
|
||||||
|
"title": "Pending PVCs",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Pending\"}) or vector(0)",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 1 }
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||||
|
"colorMode": "background",
|
||||||
|
"graphMode": "none",
|
||||||
|
"textMode": "auto"
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 5, "w": 4, "x": 4, "y": 1 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "stat",
|
||||||
|
"id": 4,
|
||||||
|
"title": "Lost PVCs",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Lost\"}) or vector(0)",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "red", "value": 1 }
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||||
|
"colorMode": "background",
|
||||||
|
"graphMode": "none",
|
||||||
|
"textMode": "auto"
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 5, "w": 4, "x": 8, "y": 1 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "stat",
|
||||||
|
"id": 5,
|
||||||
|
"title": "Bound PVs / Available PVs",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum(kube_persistentvolume_status_phase{phase=\"Bound\"}) or vector(0)",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "Bound"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum(kube_persistentvolume_status_phase{phase=\"Available\"}) or vector(0)",
|
||||||
|
"refId": "B",
|
||||||
|
"legendFormat": "Available"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [{ "color": "blue", "value": null }]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||||
|
"colorMode": "background",
|
||||||
|
"graphMode": "none",
|
||||||
|
"textMode": "auto"
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 5, "w": 4, "x": 12, "y": 1 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "stat",
|
||||||
|
"id": 6,
|
||||||
|
"title": "Ceph Cluster Health",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "ceph_health_status",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 1 },
|
||||||
|
{ "color": "red", "value": 2 }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"mappings": [
|
||||||
|
{
|
||||||
|
"type": "value",
|
||||||
|
"options": {
|
||||||
|
"0": { "text": "HEALTH_OK", "index": 0 },
|
||||||
|
"1": { "text": "HEALTH_WARN", "index": 1 },
|
||||||
|
"2": { "text": "HEALTH_ERR", "index": 2 }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||||
|
"colorMode": "background",
|
||||||
|
"graphMode": "none",
|
||||||
|
"textMode": "value"
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 5, "w": 4, "x": 16, "y": 1 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "stat",
|
||||||
|
"id": 7,
|
||||||
|
"title": "OSDs Up / Total",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum(ceph_osd_up) or vector(0)",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "Up"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "count(ceph_osd_metadata) or vector(0)",
|
||||||
|
"refId": "B",
|
||||||
|
"legendFormat": "Total"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [{ "color": "green", "value": null }]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||||
|
"colorMode": "background",
|
||||||
|
"graphMode": "none",
|
||||||
|
"textMode": "auto"
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 5, "w": 4, "x": 20, "y": 1 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "row",
|
||||||
|
"id": 8,
|
||||||
|
"title": "Cluster Capacity",
|
||||||
|
"collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 6 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "gauge",
|
||||||
|
"id": 9,
|
||||||
|
"title": "Ceph Cluster Used (%)",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "100 * (ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes) / ceph_cluster_total_bytes",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent",
|
||||||
|
"min": 0,
|
||||||
|
"max": 100,
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 70 },
|
||||||
|
{ "color": "red", "value": 85 }
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||||
|
"showThresholdLabels": true,
|
||||||
|
"showThresholdMarkers": true
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 5, "x": 0, "y": 7 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "stat",
|
||||||
|
"id": 10,
|
||||||
|
"title": "Ceph Capacity — Total / Available",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "ceph_cluster_total_bytes",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "Total"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "ceph_cluster_total_bytes - (ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes)",
|
||||||
|
"refId": "B",
|
||||||
|
"legendFormat": "Available"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "bytes",
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [{ "color": "blue", "value": null }]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||||
|
"colorMode": "value",
|
||||||
|
"graphMode": "none",
|
||||||
|
"textMode": "auto",
|
||||||
|
"orientation": "vertical"
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 4, "x": 5, "y": 7 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "bargauge",
|
||||||
|
"id": 11,
|
||||||
|
"title": "PV Allocated Capacity by Storage Class (Bound)",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum by (storageclass) (\n kube_persistentvolume_capacity_bytes\n * on(persistentvolume) group_left(storageclass)\n kube_persistentvolume_status_phase{phase=\"Bound\"}\n)",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{storageclass}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "bytes",
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [{ "color": "blue", "value": null }]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"orientation": "horizontal",
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||||
|
"displayMode": "gradient",
|
||||||
|
"showUnfilled": true
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 7, "x": 9, "y": 7 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "piechart",
|
||||||
|
"id": 12,
|
||||||
|
"title": "PVC Phase Distribution",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Bound\"}) or vector(0)",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "Bound"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Pending\"}) or vector(0)",
|
||||||
|
"refId": "B",
|
||||||
|
"legendFormat": "Pending"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Lost\"}) or vector(0)",
|
||||||
|
"refId": "C",
|
||||||
|
"legendFormat": "Lost"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": { "color": { "mode": "palette-classic" } }
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||||
|
"pieType": "pie",
|
||||||
|
"legend": {
|
||||||
|
"displayMode": "table",
|
||||||
|
"placement": "right",
|
||||||
|
"values": ["value", "percent"]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 7 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "row",
|
||||||
|
"id": 13,
|
||||||
|
"title": "Ceph Performance",
|
||||||
|
"collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 15 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "timeseries",
|
||||||
|
"id": 14,
|
||||||
|
"title": "Ceph Pool IOPS (Read / Write)",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "rate(ceph_pool_rd[5m])",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "Read — pool {{pool_id}}"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "rate(ceph_pool_wr[5m])",
|
||||||
|
"refId": "B",
|
||||||
|
"legendFormat": "Write — pool {{pool_id}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "ops",
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 8 }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "timeseries",
|
||||||
|
"id": 15,
|
||||||
|
"title": "Ceph Pool Throughput (Read / Write)",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "rate(ceph_pool_rd_bytes[5m])",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "Read — pool {{pool_id}}"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "rate(ceph_pool_wr_bytes[5m])",
|
||||||
|
"refId": "B",
|
||||||
|
"legendFormat": "Write — pool {{pool_id}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "Bps",
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 8 }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "row",
|
||||||
|
"id": 16,
|
||||||
|
"title": "Ceph OSD & Pool Details",
|
||||||
|
"collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 24 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "timeseries",
|
||||||
|
"id": 17,
|
||||||
|
"title": "Ceph Pool Space Used (%)",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "100 * ceph_pool_bytes_used / (ceph_pool_bytes_used + ceph_pool_max_avail)",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "Pool {{pool_id}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent",
|
||||||
|
"min": 0,
|
||||||
|
"max": 100,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 70 },
|
||||||
|
{ "color": "red", "value": 85 }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10 }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 25 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "bargauge",
|
||||||
|
"id": 18,
|
||||||
|
"title": "OSD Status per Daemon (green = Up, red = Down)",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "ceph_osd_up",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{ceph_daemon}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"min": 0,
|
||||||
|
"max": 1,
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "red", "value": null },
|
||||||
|
{ "color": "green", "value": 1 }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"mappings": [
|
||||||
|
{
|
||||||
|
"type": "value",
|
||||||
|
"options": {
|
||||||
|
"0": { "text": "DOWN", "index": 0 },
|
||||||
|
"1": { "text": "UP", "index": 1 }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"orientation": "horizontal",
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||||
|
"displayMode": "basic",
|
||||||
|
"showUnfilled": true
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 25 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "row",
|
||||||
|
"id": 19,
|
||||||
|
"title": "Node Disk Usage",
|
||||||
|
"collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 33 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "timeseries",
|
||||||
|
"id": 20,
|
||||||
|
"title": "Node Root Disk Usage Over Time (%)",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} * 100)",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{instance}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent",
|
||||||
|
"min": 0,
|
||||||
|
"max": 100,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 70 },
|
||||||
|
{ "color": "red", "value": 85 }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10 }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 34 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"type": "bargauge",
|
||||||
|
"id": 21,
|
||||||
|
"title": "Current Disk Usage — All Nodes & Mountpoints",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "100 - (node_filesystem_avail_bytes{fstype!~\"tmpfs|overlay|squashfs\"} / node_filesystem_size_bytes{fstype!~\"tmpfs|overlay|squashfs\"} * 100)",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{instance}} — {{mountpoint}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent",
|
||||||
|
"min": 0,
|
||||||
|
"max": 100,
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{ "color": "green", "value": null },
|
||||||
|
{ "color": "yellow", "value": 70 },
|
||||||
|
{ "color": "red", "value": 85 }
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"orientation": "horizontal",
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||||
|
"displayMode": "gradient",
|
||||||
|
"showUnfilled": true
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 34 }
|
||||||
|
}
|
||||||
|
|
||||||
|
]
|
||||||
|
}
|
||||||
@@ -0,0 +1,773 @@
|
|||||||
|
{
|
||||||
|
"title": "Workload Health",
|
||||||
|
"uid": "okd-workload-health",
|
||||||
|
"schemaVersion": 36,
|
||||||
|
"version": 3,
|
||||||
|
"refresh": "30s",
|
||||||
|
"time": { "from": "now-1h", "to": "now" },
|
||||||
|
"tags": ["okd", "workload", "health"],
|
||||||
|
"templating": {
|
||||||
|
"list": [
|
||||||
|
{
|
||||||
|
"name": "namespace",
|
||||||
|
"type": "query",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"query": { "query": "label_values(kube_pod_info, namespace)", "refId": "A" },
|
||||||
|
"refresh": 2,
|
||||||
|
"includeAll": true,
|
||||||
|
"multi": true,
|
||||||
|
"allValue": ".*",
|
||||||
|
"label": "Namespace",
|
||||||
|
"sort": 1,
|
||||||
|
"current": {},
|
||||||
|
"options": []
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"panels": [
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 1, "type": "stat", "title": "Total Pods",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "count(kube_pod_info{namespace=~\"$namespace\"})", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] },
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 2, "type": "stat", "title": "Running Pods",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "count(kube_pod_status_phase{phase=\"Running\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] },
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 3, "type": "stat", "title": "Pending Pods",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "count(kube_pod_status_phase{phase=\"Pending\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 1 }] },
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 4, "type": "stat", "title": "Failed Pods",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "count(kube_pod_status_phase{phase=\"Failed\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 5, "type": "stat", "title": "CrashLoopBackOff",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "count(kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 6, "type": "stat", "title": "OOMKilled",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "count(kube_pod_container_status_last_terminated_reason{reason=\"OOMKilled\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "orange", "value": 1 }] },
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 7, "type": "stat", "title": "Deployments Available",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "count(kube_deployment_status_condition{condition=\"Available\",status=\"true\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] },
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 8, "type": "stat", "title": "Deployments Degraded",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [{ "expr": "count(kube_deployment_status_replicas_unavailable{namespace=~\"$namespace\"} > 0) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||||
|
"unit": "short", "noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||||
|
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 9, "type": "row", "title": "Deployments", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 10,
|
||||||
|
"type": "table",
|
||||||
|
"title": "Deployment Status",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum by(namespace,deployment)(kube_deployment_spec_replicas{namespace=~\"$namespace\"})",
|
||||||
|
"refId": "A",
|
||||||
|
"instant": true,
|
||||||
|
"format": "table",
|
||||||
|
"legendFormat": ""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum by(namespace,deployment)(kube_deployment_status_replicas_ready{namespace=~\"$namespace\"})",
|
||||||
|
"refId": "B",
|
||||||
|
"instant": true,
|
||||||
|
"format": "table",
|
||||||
|
"legendFormat": ""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum by(namespace,deployment)(kube_deployment_status_replicas_available{namespace=~\"$namespace\"})",
|
||||||
|
"refId": "C",
|
||||||
|
"instant": true,
|
||||||
|
"format": "table",
|
||||||
|
"legendFormat": ""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum by(namespace,deployment)(kube_deployment_status_replicas_unavailable{namespace=~\"$namespace\"})",
|
||||||
|
"refId": "D",
|
||||||
|
"instant": true,
|
||||||
|
"format": "table",
|
||||||
|
"legendFormat": ""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum by(namespace,deployment)(kube_deployment_status_replicas_updated{namespace=~\"$namespace\"})",
|
||||||
|
"refId": "E",
|
||||||
|
"instant": true,
|
||||||
|
"format": "table",
|
||||||
|
"legendFormat": ""
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"transformations": [
|
||||||
|
{
|
||||||
|
"id": "filterFieldsByName",
|
||||||
|
"options": {
|
||||||
|
"include": {
|
||||||
|
"names": ["namespace", "deployment", "Value"]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "joinByField",
|
||||||
|
"options": {
|
||||||
|
"byField": "deployment",
|
||||||
|
"mode": "outer"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "organize",
|
||||||
|
"options": {
|
||||||
|
"excludeByName": {
|
||||||
|
"namespace 1": true,
|
||||||
|
"namespace 2": true,
|
||||||
|
"namespace 3": true,
|
||||||
|
"namespace 4": true
|
||||||
|
},
|
||||||
|
"renameByName": {
|
||||||
|
"namespace": "Namespace",
|
||||||
|
"deployment": "Deployment",
|
||||||
|
"Value": "Desired",
|
||||||
|
"Value 1": "Ready",
|
||||||
|
"Value 2": "Available",
|
||||||
|
"Value 3": "Unavailable",
|
||||||
|
"Value 4": "Up-to-date"
|
||||||
|
},
|
||||||
|
"indexByName": {
|
||||||
|
"namespace": 0,
|
||||||
|
"deployment": 1,
|
||||||
|
"Value": 2,
|
||||||
|
"Value 1": 3,
|
||||||
|
"Value 2": 4,
|
||||||
|
"Value 3": 5,
|
||||||
|
"Value 4": 6
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "sortBy",
|
||||||
|
"options": {
|
||||||
|
"fields": [{ "displayName": "Namespace", "desc": false }]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||||
|
"overrides": [
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Namespace" },
|
||||||
|
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 200 }]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Deployment" },
|
||||||
|
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 220 }]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Unavailable" },
|
||||||
|
"properties": [
|
||||||
|
{ "id": "custom.displayMode", "value": "color-background" },
|
||||||
|
{
|
||||||
|
"id": "thresholds",
|
||||||
|
"value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] }
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Ready" },
|
||||||
|
"properties": [
|
||||||
|
{ "id": "custom.displayMode", "value": "color-background" },
|
||||||
|
{
|
||||||
|
"id": "thresholds",
|
||||||
|
"value": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] }
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": { "sortBy": [{ "displayName": "Namespace", "desc": false }] },
|
||||||
|
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 5 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 11, "type": "row", "title": "StatefulSets & DaemonSets", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 12,
|
||||||
|
"type": "table",
|
||||||
|
"title": "StatefulSet Status",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum by(namespace,statefulset)(kube_statefulset_replicas{namespace=~\"$namespace\"})",
|
||||||
|
"refId": "A",
|
||||||
|
"instant": true,
|
||||||
|
"format": "table",
|
||||||
|
"legendFormat": ""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum by(namespace,statefulset)(kube_statefulset_status_replicas_ready{namespace=~\"$namespace\"})",
|
||||||
|
"refId": "B",
|
||||||
|
"instant": true,
|
||||||
|
"format": "table",
|
||||||
|
"legendFormat": ""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum by(namespace,statefulset)(kube_statefulset_status_replicas_current{namespace=~\"$namespace\"})",
|
||||||
|
"refId": "C",
|
||||||
|
"instant": true,
|
||||||
|
"format": "table",
|
||||||
|
"legendFormat": ""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum by(namespace,statefulset)(kube_statefulset_status_replicas_updated{namespace=~\"$namespace\"})",
|
||||||
|
"refId": "D",
|
||||||
|
"instant": true,
|
||||||
|
"format": "table",
|
||||||
|
"legendFormat": ""
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"transformations": [
|
||||||
|
{
|
||||||
|
"id": "filterFieldsByName",
|
||||||
|
"options": {
|
||||||
|
"include": {
|
||||||
|
"names": ["namespace", "statefulset", "Value"]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "joinByField",
|
||||||
|
"options": {
|
||||||
|
"byField": "statefulset",
|
||||||
|
"mode": "outer"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "organize",
|
||||||
|
"options": {
|
||||||
|
"excludeByName": {
|
||||||
|
"namespace 1": true,
|
||||||
|
"namespace 2": true,
|
||||||
|
"namespace 3": true
|
||||||
|
},
|
||||||
|
"renameByName": {
|
||||||
|
"namespace": "Namespace",
|
||||||
|
"statefulset": "StatefulSet",
|
||||||
|
"Value": "Desired",
|
||||||
|
"Value 1": "Ready",
|
||||||
|
"Value 2": "Current",
|
||||||
|
"Value 3": "Up-to-date"
|
||||||
|
},
|
||||||
|
"indexByName": {
|
||||||
|
"namespace": 0,
|
||||||
|
"statefulset": 1,
|
||||||
|
"Value": 2,
|
||||||
|
"Value 1": 3,
|
||||||
|
"Value 2": 4,
|
||||||
|
"Value 3": 5
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "sortBy",
|
||||||
|
"options": { "fields": [{ "displayName": "Namespace", "desc": false }] }
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||||
|
"overrides": [
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Namespace" },
|
||||||
|
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 180 }]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "StatefulSet" },
|
||||||
|
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 200 }]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Ready" },
|
||||||
|
"properties": [
|
||||||
|
{ "id": "custom.displayMode", "value": "color-background" },
|
||||||
|
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] } }
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 14 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 13,
|
||||||
|
"type": "table",
|
||||||
|
"title": "DaemonSet Status",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum by(namespace,daemonset)(kube_daemonset_status_desired_number_scheduled{namespace=~\"$namespace\"})",
|
||||||
|
"refId": "A",
|
||||||
|
"instant": true,
|
||||||
|
"format": "table",
|
||||||
|
"legendFormat": ""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum by(namespace,daemonset)(kube_daemonset_status_number_ready{namespace=~\"$namespace\"})",
|
||||||
|
"refId": "B",
|
||||||
|
"instant": true,
|
||||||
|
"format": "table",
|
||||||
|
"legendFormat": ""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum by(namespace,daemonset)(kube_daemonset_status_number_unavailable{namespace=~\"$namespace\"})",
|
||||||
|
"refId": "C",
|
||||||
|
"instant": true,
|
||||||
|
"format": "table",
|
||||||
|
"legendFormat": ""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum by(namespace,daemonset)(kube_daemonset_status_number_misscheduled{namespace=~\"$namespace\"})",
|
||||||
|
"refId": "D",
|
||||||
|
"instant": true,
|
||||||
|
"format": "table",
|
||||||
|
"legendFormat": ""
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"transformations": [
|
||||||
|
{
|
||||||
|
"id": "filterFieldsByName",
|
||||||
|
"options": {
|
||||||
|
"include": {
|
||||||
|
"names": ["namespace", "daemonset", "Value"]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "joinByField",
|
||||||
|
"options": {
|
||||||
|
"byField": "daemonset",
|
||||||
|
"mode": "outer"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "organize",
|
||||||
|
"options": {
|
||||||
|
"excludeByName": {
|
||||||
|
"namespace 1": true,
|
||||||
|
"namespace 2": true,
|
||||||
|
"namespace 3": true
|
||||||
|
},
|
||||||
|
"renameByName": {
|
||||||
|
"namespace": "Namespace",
|
||||||
|
"daemonset": "DaemonSet",
|
||||||
|
"Value": "Desired",
|
||||||
|
"Value 1": "Ready",
|
||||||
|
"Value 2": "Unavailable",
|
||||||
|
"Value 3": "Misscheduled"
|
||||||
|
},
|
||||||
|
"indexByName": {
|
||||||
|
"namespace": 0,
|
||||||
|
"daemonset": 1,
|
||||||
|
"Value": 2,
|
||||||
|
"Value 1": 3,
|
||||||
|
"Value 2": 4,
|
||||||
|
"Value 3": 5
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "sortBy",
|
||||||
|
"options": { "fields": [{ "displayName": "Namespace", "desc": false }] }
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||||
|
"overrides": [
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Namespace" },
|
||||||
|
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 180 }]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "DaemonSet" },
|
||||||
|
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 200 }]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Ready" },
|
||||||
|
"properties": [
|
||||||
|
{ "id": "custom.displayMode", "value": "color-background" },
|
||||||
|
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] } }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Unavailable" },
|
||||||
|
"properties": [
|
||||||
|
{ "id": "custom.displayMode", "value": "color-background" },
|
||||||
|
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] } }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Misscheduled" },
|
||||||
|
"properties": [
|
||||||
|
{ "id": "custom.displayMode", "value": "color-background" },
|
||||||
|
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "orange", "value": 1 }] } }
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 14 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 14, "type": "row", "title": "Pods", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 22 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 15,
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Pod Phase over Time",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum by(phase)(kube_pod_status_phase{namespace=~\"$namespace\"})",
|
||||||
|
"refId": "A", "legendFormat": "{{phase}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||||
|
},
|
||||||
|
"overrides": [
|
||||||
|
{ "matcher": { "id": "byName", "options": "Running" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "Pending" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "Failed" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "Succeeded" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "Unknown" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 16, "x": 0, "y": 23 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 16,
|
||||||
|
"type": "piechart",
|
||||||
|
"title": "Pod Phase — Now",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum by(phase)(kube_pod_status_phase{namespace=~\"$namespace\"})",
|
||||||
|
"refId": "A", "instant": true, "legendFormat": "{{phase}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": { "unit": "short", "color": { "mode": "palette-classic" } },
|
||||||
|
"overrides": [
|
||||||
|
{ "matcher": { "id": "byName", "options": "Running" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "Pending" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "Failed" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "Succeeded" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "Unknown" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"pieType": "donut",
|
||||||
|
"tooltip": { "mode": "single" },
|
||||||
|
"legend": { "displayMode": "table", "placement": "right", "values": ["value", "percent"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 23 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 17,
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Container Restarts over Time (total counter, top 10)",
|
||||||
|
"description": "Absolute restart counter — each vertical step = a restart event. Flat line = healthy.",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "topk(10,\n sum by(namespace, pod) (\n kube_pod_container_status_restarts_total{namespace=~\"$namespace\"}\n ) > 0\n)",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{namespace}} / {{pod}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "auto", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 31 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 18,
|
||||||
|
"type": "table",
|
||||||
|
"title": "Container Total Restarts (non-zero)",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum by(namespace, pod, container) (kube_pod_container_status_restarts_total{namespace=~\"$namespace\"}) > 0",
|
||||||
|
"refId": "A",
|
||||||
|
"instant": true,
|
||||||
|
"format": "table",
|
||||||
|
"legendFormat": ""
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"transformations": [
|
||||||
|
{
|
||||||
|
"id": "filterFieldsByName",
|
||||||
|
"options": {
|
||||||
|
"include": { "names": ["namespace", "pod", "container", "Value"] }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "organize",
|
||||||
|
"options": {
|
||||||
|
"excludeByName": {},
|
||||||
|
"renameByName": {
|
||||||
|
"namespace": "Namespace",
|
||||||
|
"pod": "Pod",
|
||||||
|
"container": "Container",
|
||||||
|
"Value": "Total Restarts"
|
||||||
|
},
|
||||||
|
"indexByName": { "namespace": 0, "pod": 1, "container": 2, "Value": 3 }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "sortBy",
|
||||||
|
"options": { "fields": [{ "displayName": "Total Restarts", "desc": true }] }
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||||
|
"overrides": [
|
||||||
|
{ "matcher": { "id": "byName", "options": "Namespace" }, "properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 160 }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "Pod" }, "properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 260 }] },
|
||||||
|
{ "matcher": { "id": "byName", "options": "Container" }, "properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 160 }] },
|
||||||
|
{
|
||||||
|
"matcher": { "id": "byName", "options": "Total Restarts" },
|
||||||
|
"properties": [
|
||||||
|
{ "id": "custom.displayMode", "value": "color-background" },
|
||||||
|
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "yellow", "value": null }, { "color": "orange", "value": 5 }, { "color": "red", "value": 20 }] } }
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 31 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 19, "type": "row", "title": "Resource Usage", "collapsed": false,
|
||||||
|
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 39 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 20,
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "CPU Usage by Namespace",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum by(namespace)(rate(container_cpu_usage_seconds_total{namespace=~\"$namespace\",container!=\"\",container!=\"POD\"}[5m]))",
|
||||||
|
"refId": "A", "legendFormat": "{{namespace}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "cores", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 40 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 21,
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Memory Usage by Namespace",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum by(namespace)(container_memory_working_set_bytes{namespace=~\"$namespace\",container!=\"\",container!=\"POD\"})",
|
||||||
|
"refId": "A", "legendFormat": "{{namespace}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "bytes", "min": 0,
|
||||||
|
"color": { "mode": "palette-classic" },
|
||||||
|
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||||
|
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 40 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 22,
|
||||||
|
"type": "bargauge",
|
||||||
|
"title": "CPU — Actual vs Requested (%)",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum by(namespace)(rate(container_cpu_usage_seconds_total{namespace=~\"$namespace\",container!=\"\",container!=\"POD\"}[5m]))\n/\nsum by(namespace)(kube_pod_container_resource_requests{resource=\"cpu\",namespace=~\"$namespace\",container!=\"\"})\n* 100",
|
||||||
|
"refId": "A", "legendFormat": "{{namespace}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent", "min": 0, "max": 150,
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 80 }, { "color": "red", "value": 100 }] }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"orientation": "horizontal", "displayMode": "gradient", "showUnfilled": true,
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 48 }
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 23,
|
||||||
|
"type": "bargauge",
|
||||||
|
"title": "Memory — Actual vs Requested (%)",
|
||||||
|
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum by(namespace)(container_memory_working_set_bytes{namespace=~\"$namespace\",container!=\"\",container!=\"POD\"})\n/\nsum by(namespace)(kube_pod_container_resource_requests{resource=\"memory\",namespace=~\"$namespace\",container!=\"\"})\n* 100",
|
||||||
|
"refId": "A", "legendFormat": "{{namespace}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent", "min": 0, "max": 150,
|
||||||
|
"color": { "mode": "thresholds" },
|
||||||
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 80 }, { "color": "red", "value": 100 }] }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"orientation": "horizontal", "displayMode": "gradient", "showUnfilled": true,
|
||||||
|
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||||
|
},
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 48 }
|
||||||
|
}
|
||||||
|
|
||||||
|
]
|
||||||
|
}
|
||||||
2
harmony/src/modules/monitoring/cluster_dashboards/mod.rs
Normal file
2
harmony/src/modules/monitoring/cluster_dashboards/mod.rs
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
mod score;
|
||||||
|
pub use score::ClusterDashboardsScore;
|
||||||
507
harmony/src/modules/monitoring/cluster_dashboards/score.rs
Normal file
507
harmony/src/modules/monitoring/cluster_dashboards/score.rs
Normal file
@@ -0,0 +1,507 @@
|
|||||||
|
use async_trait::async_trait;
|
||||||
|
use harmony_types::id::Id;
|
||||||
|
use k8s_openapi::api::core::v1::{Namespace, Secret};
|
||||||
|
use kube::{api::ObjectMeta, api::DynamicObject};
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use serde_yaml;
|
||||||
|
use std::collections::BTreeMap;
|
||||||
|
|
||||||
|
use crate::{
|
||||||
|
data::Version,
|
||||||
|
interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome},
|
||||||
|
inventory::Inventory,
|
||||||
|
modules::k8s::resource::K8sResourceScore,
|
||||||
|
modules::okd::crd::route::Route,
|
||||||
|
score::Score,
|
||||||
|
topology::{K8sclient, Topology},
|
||||||
|
};
|
||||||
|
|
||||||
|
#[derive(Clone, Debug, Serialize)]
|
||||||
|
pub struct ClusterDashboardsScore {
|
||||||
|
pub namespace: String,
|
||||||
|
pub grafana_admin_user: String,
|
||||||
|
pub grafana_admin_password: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for ClusterDashboardsScore {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self {
|
||||||
|
namespace: "harmony-observability".to_string(),
|
||||||
|
grafana_admin_user: "admin".to_string(),
|
||||||
|
grafana_admin_password: "password".to_string(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl ClusterDashboardsScore {
|
||||||
|
pub fn new(namespace: &str) -> Self {
|
||||||
|
Self {
|
||||||
|
namespace: namespace.to_string(),
|
||||||
|
grafana_admin_user: "admin".to_string(),
|
||||||
|
grafana_admin_password: "password".to_string(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn with_credentials(namespace: &str, admin_user: &str, admin_password: &str) -> Self {
|
||||||
|
Self {
|
||||||
|
namespace: namespace.to_string(),
|
||||||
|
grafana_admin_user: admin_user.to_string(),
|
||||||
|
grafana_admin_password: admin_password.to_string(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T: Topology + K8sclient> Score<T> for ClusterDashboardsScore {
|
||||||
|
fn name(&self) -> String {
|
||||||
|
format!("ClusterDashboardsScore({})", self.namespace)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[doc(hidden)]
|
||||||
|
fn create_interpret(&self) -> Box<dyn Interpret<T>> {
|
||||||
|
Box::new(ClusterDashboardsInterpret {
|
||||||
|
namespace: self.namespace.clone(),
|
||||||
|
grafana_admin_user: self.grafana_admin_user.clone(),
|
||||||
|
grafana_admin_password: self.grafana_admin_password.clone(),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Interpret produced by `ClusterDashboardsScore`; holds a copy of the score's
/// namespace and Grafana administrator credentials.
#[derive(Clone, Debug)]
pub struct ClusterDashboardsInterpret {
    /// Namespace the resources are created in.
    namespace: String,
    /// Grafana administrator user name.
    grafana_admin_user: String,
    /// Grafana administrator password.
    grafana_admin_password: String,
}
|
||||||
|
|
||||||
|
#[async_trait]
|
||||||
|
impl<T: Topology + K8sclient> Interpret<T> for ClusterDashboardsInterpret {
|
||||||
|
async fn execute(
|
||||||
|
&self,
|
||||||
|
inventory: &Inventory,
|
||||||
|
topology: &T,
|
||||||
|
) -> Result<Outcome, InterpretError> {
|
||||||
|
self.create_namespace(inventory, topology).await?;
|
||||||
|
self.create_rbac_resources(inventory, topology).await?;
|
||||||
|
self.create_secret(inventory, topology).await?;
|
||||||
|
self.create_grafana(inventory, topology).await?;
|
||||||
|
self.create_datasource(inventory, topology).await?;
|
||||||
|
self.create_dashboards(inventory, topology).await?;
|
||||||
|
self.create_route(inventory, topology).await?;
|
||||||
|
|
||||||
|
Ok(Outcome::success(format!(
|
||||||
|
"Cluster dashboards resources in namespace '{}' with {} dashboards successfully created",
|
||||||
|
self.namespace,
|
||||||
|
8
|
||||||
|
)))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_name(&self) -> InterpretName {
|
||||||
|
InterpretName::Custom("ClusterDashboards")
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_version(&self) -> Version {
|
||||||
|
todo!()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_status(&self) -> InterpretStatus {
|
||||||
|
todo!()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_children(&self) -> Vec<Id> {
|
||||||
|
todo!()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl ClusterDashboardsInterpret {
|
||||||
|
async fn create_namespace(
|
||||||
|
&self,
|
||||||
|
inventory: &Inventory,
|
||||||
|
topology: &(impl Topology + K8sclient),
|
||||||
|
) -> Result<(), InterpretError> {
|
||||||
|
let mut labels = BTreeMap::new();
|
||||||
|
labels.insert(
|
||||||
|
"openshift.io/cluster-monitoring".to_string(),
|
||||||
|
"true".to_string(),
|
||||||
|
);
|
||||||
|
|
||||||
|
let namespace = Namespace {
|
||||||
|
metadata: ObjectMeta {
|
||||||
|
name: Some(self.namespace.clone()),
|
||||||
|
labels: Some(labels),
|
||||||
|
..ObjectMeta::default()
|
||||||
|
},
|
||||||
|
..Namespace::default()
|
||||||
|
};
|
||||||
|
|
||||||
|
K8sResourceScore::single(namespace, None)
|
||||||
|
.interpret(inventory, topology)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn create_rbac_resources(
|
||||||
|
&self,
|
||||||
|
inventory: &Inventory,
|
||||||
|
topology: &(impl Topology + K8sclient),
|
||||||
|
) -> Result<(), InterpretError> {
|
||||||
|
let service_account_name = "cluster-grafana-sa".to_string();
|
||||||
|
let rbac_namespace = self.namespace.clone();
|
||||||
|
|
||||||
|
let service_account = {
|
||||||
|
use k8s_openapi::api::core::v1::ServiceAccount;
|
||||||
|
ServiceAccount {
|
||||||
|
metadata: ObjectMeta {
|
||||||
|
name: Some(service_account_name.clone()),
|
||||||
|
namespace: Some(rbac_namespace.clone()),
|
||||||
|
..ObjectMeta::default()
|
||||||
|
},
|
||||||
|
..ServiceAccount::default()
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let cluster_role = {
|
||||||
|
use k8s_openapi::api::rbac::v1::{ClusterRole, PolicyRule};
|
||||||
|
ClusterRole {
|
||||||
|
metadata: ObjectMeta {
|
||||||
|
name: Some("grafana-prometheus-api-access".to_string()),
|
||||||
|
..ObjectMeta::default()
|
||||||
|
},
|
||||||
|
rules: Some(vec![PolicyRule {
|
||||||
|
api_groups: Some(vec!["monitoring.coreos.com".to_string()]),
|
||||||
|
resources: Some(vec!["prometheuses/api".to_string()]),
|
||||||
|
verbs: vec!["get".to_string()],
|
||||||
|
..PolicyRule::default()
|
||||||
|
}]),
|
||||||
|
..ClusterRole::default()
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let cluster_role_binding = {
|
||||||
|
use k8s_openapi::api::rbac::v1::{ClusterRoleBinding, RoleRef, Subject};
|
||||||
|
ClusterRoleBinding {
|
||||||
|
metadata: ObjectMeta {
|
||||||
|
name: Some("grafana-prometheus-api-access-binding".to_string()),
|
||||||
|
..ObjectMeta::default()
|
||||||
|
},
|
||||||
|
subjects: Some(vec![Subject {
|
||||||
|
kind: "ServiceAccount".to_string(),
|
||||||
|
name: service_account_name.clone(),
|
||||||
|
namespace: Some(rbac_namespace.clone()),
|
||||||
|
..Subject::default()
|
||||||
|
}]),
|
||||||
|
role_ref: RoleRef {
|
||||||
|
api_group: "rbac.authorization.k8s.io".to_string(),
|
||||||
|
kind: "ClusterRole".to_string(),
|
||||||
|
name: "grafana-prometheus-api-access".to_string(),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let cluster_role_binding_cluster_monitoring = {
|
||||||
|
use k8s_openapi::api::rbac::v1::{ClusterRoleBinding, RoleRef, Subject};
|
||||||
|
ClusterRoleBinding {
|
||||||
|
metadata: ObjectMeta {
|
||||||
|
name: Some("grafana-cluster-monitoring-view".to_string()),
|
||||||
|
..ObjectMeta::default()
|
||||||
|
},
|
||||||
|
subjects: Some(vec![Subject {
|
||||||
|
kind: "ServiceAccount".to_string(),
|
||||||
|
name: service_account_name.clone(),
|
||||||
|
namespace: Some(rbac_namespace.clone()),
|
||||||
|
..Subject::default()
|
||||||
|
}]),
|
||||||
|
role_ref: RoleRef {
|
||||||
|
api_group: "rbac.authorization.k8s.io".to_string(),
|
||||||
|
kind: "ClusterRole".to_string(),
|
||||||
|
name: "cluster-monitoring-view".to_string(),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
K8sResourceScore::single(service_account, Some(rbac_namespace.clone()))
|
||||||
|
.interpret(inventory, topology)
|
||||||
|
.await?;
|
||||||
|
K8sResourceScore::single(cluster_role, None)
|
||||||
|
.interpret(inventory, topology)
|
||||||
|
.await?;
|
||||||
|
K8sResourceScore::single(cluster_role_binding, None)
|
||||||
|
.interpret(inventory, topology)
|
||||||
|
.await?;
|
||||||
|
K8sResourceScore::single(cluster_role_binding_cluster_monitoring, None)
|
||||||
|
.interpret(inventory, topology)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn create_secret(
|
||||||
|
&self,
|
||||||
|
inventory: &Inventory,
|
||||||
|
topology: &(impl Topology + K8sclient),
|
||||||
|
) -> Result<(), InterpretError> {
|
||||||
|
let service_account_name = "cluster-grafana-sa".to_string();
|
||||||
|
let secret_name = "grafana-prometheus-token".to_string();
|
||||||
|
let secret_namespace = self.namespace.clone();
|
||||||
|
|
||||||
|
let secret = Secret {
|
||||||
|
metadata: ObjectMeta {
|
||||||
|
name: Some(secret_name),
|
||||||
|
namespace: Some(secret_namespace),
|
||||||
|
annotations: Some({
|
||||||
|
let mut ann = BTreeMap::new();
|
||||||
|
ann.insert(
|
||||||
|
"kubernetes.io/service-account.name".to_string(),
|
||||||
|
service_account_name,
|
||||||
|
);
|
||||||
|
ann
|
||||||
|
}),
|
||||||
|
..ObjectMeta::default()
|
||||||
|
},
|
||||||
|
type_: Some("kubernetes.io/service-account-token".to_string()),
|
||||||
|
..Secret::default()
|
||||||
|
};
|
||||||
|
|
||||||
|
K8sResourceScore::single(secret, Some(self.namespace.clone()))
|
||||||
|
.interpret(inventory, topology)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn create_grafana(
|
||||||
|
&self,
|
||||||
|
inventory: &Inventory,
|
||||||
|
topology: &(impl Topology + K8sclient),
|
||||||
|
) -> Result<(), InterpretError> {
|
||||||
|
let labels: BTreeMap<String, String> = vec![
|
||||||
|
("dashboards".to_string(), "grafana".to_string()),
|
||||||
|
]
|
||||||
|
.into_iter()
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
let client = topology
|
||||||
|
.k8s_client()
|
||||||
|
.await
|
||||||
|
.map_err(|e| InterpretError::new(format!("Failed to get k8s client: {e}")))?;
|
||||||
|
|
||||||
|
let mut annotations = BTreeMap::new();
|
||||||
|
annotations.insert(
|
||||||
|
"kubectl.kubernetes.io/last-applied-configuration".to_string(),
|
||||||
|
"".to_string(),
|
||||||
|
);
|
||||||
|
|
||||||
|
let grafana_yaml = format!(r#"
|
||||||
|
apiVersion: grafana.integreatly.org/v1beta1
|
||||||
|
kind: Grafana
|
||||||
|
metadata:
|
||||||
|
name: cluster-grafana
|
||||||
|
namespace: {}
|
||||||
|
labels:
|
||||||
|
dashboards: "grafana"
|
||||||
|
spec:
|
||||||
|
config:
|
||||||
|
log:
|
||||||
|
mode: console
|
||||||
|
security:
|
||||||
|
admin_user: {}
|
||||||
|
admin_password: {}
|
||||||
|
users:
|
||||||
|
viewers_can_edit: "false"
|
||||||
|
auth:
|
||||||
|
disable_login_form: "false"
|
||||||
|
"auth.anonymous":
|
||||||
|
enabled: "true"
|
||||||
|
org_role: "Viewer"
|
||||||
|
deployment:
|
||||||
|
spec:
|
||||||
|
replicas: 1
|
||||||
|
template:
|
||||||
|
spec:
|
||||||
|
containers:
|
||||||
|
- name: grafana
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
cpu: 500m
|
||||||
|
memory: 1Gi
|
||||||
|
limits:
|
||||||
|
cpu: "1"
|
||||||
|
memory: 2Gi
|
||||||
|
"#, self.namespace, self.grafana_admin_user, self.grafana_admin_password);
|
||||||
|
|
||||||
|
let grafana_value: serde_json::Value = serde_yaml::from_str(grafana_yaml.as_str())
|
||||||
|
.map_err(|e| InterpretError::new(format!("Failed to parse Grafana YAML: {e}")))?;
|
||||||
|
|
||||||
|
let grafana: DynamicObject = serde_json::from_value(grafana_value)
|
||||||
|
.map_err(|e| InterpretError::new(format!("Failed to create DynamicObject: {e}")))?;
|
||||||
|
|
||||||
|
client.apply_dynamic(&grafana, Some(&self.namespace), false).await
|
||||||
|
.map_err(|e| InterpretError::new(format!("Failed to apply Grafana: {e}")))?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn create_datasource(
|
||||||
|
&self,
|
||||||
|
inventory: &Inventory,
|
||||||
|
topology: &(impl Topology + K8sclient),
|
||||||
|
) -> Result<(), InterpretError> {
|
||||||
|
let labels: BTreeMap<String, String> = vec![
|
||||||
|
("datasource".to_string(), "prometheus".to_string()),
|
||||||
|
]
|
||||||
|
.into_iter()
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
let client = topology
|
||||||
|
.k8s_client()
|
||||||
|
.await
|
||||||
|
.map_err(|e| InterpretError::new(format!("Failed to get k8s client: {e}")))?;
|
||||||
|
|
||||||
|
let secure_json_data_value = "Bearer ${token}";
|
||||||
|
|
||||||
|
let datasource_yaml = format!(r#"
|
||||||
|
apiVersion: grafana.integreatly.org/v1beta1
|
||||||
|
kind: GrafanaDatasource
|
||||||
|
metadata:
|
||||||
|
name: prometheus-cluster
|
||||||
|
namespace: {}
|
||||||
|
labels:
|
||||||
|
datasource: "prometheus"
|
||||||
|
spec:
|
||||||
|
instanceSelector:
|
||||||
|
matchLabels:
|
||||||
|
dashboards: "grafana"
|
||||||
|
valuesFrom:
|
||||||
|
- targetPath: "secureJsonData.httpHeaderValue1"
|
||||||
|
valueFrom:
|
||||||
|
secretKeyRef:
|
||||||
|
name: grafana-prometheus-token
|
||||||
|
key: token
|
||||||
|
datasource:
|
||||||
|
name: Prometheus-Cluster
|
||||||
|
type: prometheus
|
||||||
|
access: proxy
|
||||||
|
url: https://prometheus-k8s.openshift-monitoring.svc:9091
|
||||||
|
isDefault: true
|
||||||
|
jsonData:
|
||||||
|
httpHeaderName1: "Authorization"
|
||||||
|
tlsSkipVerify: true
|
||||||
|
timeInterval: "30s"
|
||||||
|
secureJsonData:
|
||||||
|
httpHeaderValue1: "{}"
|
||||||
|
"#, self.namespace, secure_json_data_value);
|
||||||
|
|
||||||
|
let datasource_value: serde_json::Value = serde_yaml::from_str(datasource_yaml.as_str())
|
||||||
|
.map_err(|e| InterpretError::new(format!("Failed to parse Datasource YAML: {e}")))?;
|
||||||
|
|
||||||
|
let datasource: DynamicObject = serde_json::from_value(datasource_value)
|
||||||
|
.map_err(|e| InterpretError::new(format!("Failed to create DynamicObject: {e}")))?;
|
||||||
|
|
||||||
|
client.apply_dynamic(&datasource, Some(&self.namespace), false).await
|
||||||
|
.map_err(|e| InterpretError::new(format!("Failed to apply Datasource: {e}")))?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn create_dashboards(
|
||||||
|
&self,
|
||||||
|
inventory: &Inventory,
|
||||||
|
topology: &(impl Topology + K8sclient),
|
||||||
|
) -> Result<(), InterpretError> {
|
||||||
|
let client = topology
|
||||||
|
.k8s_client()
|
||||||
|
.await
|
||||||
|
.map_err(|e| InterpretError::new(format!("Failed to get k8s client: {e}")))?;
|
||||||
|
|
||||||
|
let dashboards: &[(&str, &str)] = &[
|
||||||
|
("okd-cluster-overview", include_str!("dashboards/cluster-overview.json")),
|
||||||
|
("okd-node-health", include_str!("dashboards/nodes-health.json")),
|
||||||
|
("okd-workload-health", include_str!("dashboards/workloads-health.json")),
|
||||||
|
("okd-networking", include_str!("dashboards/networking.json")),
|
||||||
|
("storage-health", include_str!("dashboards/storage.json")),
|
||||||
|
("okd-etcd", include_str!("dashboards/etcd.json")),
|
||||||
|
("okd-control-plane", include_str!("dashboards/control-plane.json")),
|
||||||
|
("okd-alerts-events", include_str!("dashboards/alerts-events-problems.json")),
|
||||||
|
];
|
||||||
|
|
||||||
|
for (dashboard_name, json_content) in dashboards {
|
||||||
|
let dashboard: DynamicObject = serde_json::from_value(serde_json::json!({
|
||||||
|
"apiVersion": "grafana.integreatly.org/v1beta1",
|
||||||
|
"kind": "GrafanaDashboard",
|
||||||
|
"metadata": {
|
||||||
|
"name": dashboard_name,
|
||||||
|
"namespace": self.namespace,
|
||||||
|
"labels": {
|
||||||
|
"dashboard": dashboard_name
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"spec": {
|
||||||
|
"instanceSelector": {
|
||||||
|
"matchLabels": {
|
||||||
|
"dashboards": "grafana"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"json": json_content
|
||||||
|
}
|
||||||
|
})).map_err(|e| InterpretError::new(format!("Failed to create Dashboard {} DynamicObject: {e}", dashboard_name)))?;
|
||||||
|
|
||||||
|
client.apply_dynamic(&dashboard, Some(&self.namespace), false).await
|
||||||
|
.map_err(|e| InterpretError::new(format!("Failed to apply Dashboard {}: {e}", dashboard_name)))?;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn create_route(
|
||||||
|
&self,
|
||||||
|
inventory: &Inventory,
|
||||||
|
topology: &(impl Topology + K8sclient),
|
||||||
|
) -> Result<(), InterpretError> {
|
||||||
|
let route = Route {
|
||||||
|
metadata: ObjectMeta {
|
||||||
|
name: Some("grafana".to_string()),
|
||||||
|
namespace: Some(self.namespace.clone()),
|
||||||
|
..ObjectMeta::default()
|
||||||
|
},
|
||||||
|
spec: crate::modules::okd::crd::route::RouteSpec {
|
||||||
|
to: crate::modules::okd::crd::route::RouteTargetReference {
|
||||||
|
kind: "Service".to_string(),
|
||||||
|
name: "cluster-grafana-service".to_string(),
|
||||||
|
weight: None,
|
||||||
|
},
|
||||||
|
port: Some(crate::modules::okd::crd::route::RoutePort {
|
||||||
|
target_port: 3000,
|
||||||
|
}),
|
||||||
|
tls: Some(crate::modules::okd::crd::route::TLSConfig {
|
||||||
|
termination: "edge".to_string(),
|
||||||
|
insecure_edge_termination_policy: Some("Redirect".to_string()),
|
||||||
|
..crate::modules::okd::crd::route::TLSConfig::default()
|
||||||
|
}),
|
||||||
|
..crate::modules::okd::crd::route::RouteSpec::default()
|
||||||
|
},
|
||||||
|
..crate::modules::okd::crd::route::Route::default()
|
||||||
|
};
|
||||||
|
|
||||||
|
K8sResourceScore::single(route, Some(self.namespace.clone()))
|
||||||
|
.interpret(inventory, topology)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_name(&self) -> InterpretName {
|
||||||
|
InterpretName::Custom("ClusterDashboards")
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Not implemented; calling it panics.
///
/// NOTE(review): this inherent method repeats `get_version` from the
/// `Interpret` trait impl for this type and no caller is visible in this file
/// — confirm and consider removing it.
fn get_version(&self) -> Version {
    todo!()
}
|
||||||
|
|
||||||
|
/// Not implemented; calling it panics.
///
/// NOTE(review): this inherent method repeats `get_status` from the
/// `Interpret` trait impl for this type and no caller is visible in this file
/// — confirm and consider removing it.
fn get_status(&self) -> InterpretStatus {
    todo!()
}
|
||||||
|
|
||||||
|
/// Not implemented; calling it panics.
///
/// NOTE(review): this inherent method repeats `get_children` from the
/// `Interpret` trait impl for this type and no caller is visible in this file
/// — confirm and consider removing it.
fn get_children(&self) -> Vec<Id> {
    todo!()
}
|
||||||
|
}
|
||||||
@@ -6,7 +6,7 @@ use schemars::JsonSchema;
|
|||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
interpret::{InterpretError, Outcome},
|
interpret::InterpretError,
|
||||||
inventory::Inventory,
|
inventory::Inventory,
|
||||||
modules::{
|
modules::{
|
||||||
monitoring::{
|
monitoring::{
|
||||||
@@ -17,10 +17,10 @@ use crate::{
|
|||||||
topology::{
|
topology::{
|
||||||
K8sclient, Topology,
|
K8sclient, Topology,
|
||||||
installable::Installable,
|
installable::Installable,
|
||||||
k8s::K8sClient,
|
|
||||||
oberservability::monitoring::{AlertReceiver, AlertSender, ScrapeTarget},
|
oberservability::monitoring::{AlertReceiver, AlertSender, ScrapeTarget},
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
use harmony_k8s::K8sClient;
|
||||||
|
|
||||||
#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)]
|
#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)]
|
||||||
#[kube(
|
#[kube(
|
||||||
|
|||||||
@@ -4,10 +4,8 @@ use kube::CustomResource;
|
|||||||
use schemars::JsonSchema;
|
use schemars::JsonSchema;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
use crate::topology::{
|
use crate::topology::oberservability::monitoring::{AlertReceiver, AlertSender};
|
||||||
k8s::K8sClient,
|
use harmony_k8s::K8sClient;
|
||||||
oberservability::monitoring::{AlertReceiver, AlertSender},
|
|
||||||
};
|
|
||||||
|
|
||||||
#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)]
|
#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)]
|
||||||
#[kube(
|
#[kube(
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
pub mod alert_channel;
|
pub mod alert_channel;
|
||||||
pub mod alert_rule;
|
pub mod alert_rule;
|
||||||
pub mod application_monitoring;
|
pub mod application_monitoring;
|
||||||
|
pub mod cluster_dashboards;
|
||||||
pub mod grafana;
|
pub mod grafana;
|
||||||
pub mod kube_prometheus;
|
pub mod kube_prometheus;
|
||||||
pub mod ntfy;
|
pub mod ntfy;
|
||||||
|
|||||||
@@ -11,8 +11,9 @@ use crate::{
|
|||||||
inventory::Inventory,
|
inventory::Inventory,
|
||||||
modules::monitoring::ntfy::helm::ntfy_helm_chart::ntfy_helm_chart_score,
|
modules::monitoring::ntfy::helm::ntfy_helm_chart::ntfy_helm_chart_score,
|
||||||
score::Score,
|
score::Score,
|
||||||
topology::{HelmCommand, K8sclient, MultiTargetTopology, Topology, k8s::K8sClient},
|
topology::{HelmCommand, K8sclient, MultiTargetTopology, Topology},
|
||||||
};
|
};
|
||||||
|
use harmony_k8s::K8sClient;
|
||||||
use harmony_types::id::Id;
|
use harmony_types::id::Id;
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize)]
|
#[derive(Debug, Clone, Serialize)]
|
||||||
|
|||||||
@@ -1,9 +1,7 @@
|
|||||||
use std::{collections::BTreeMap, sync::Arc};
|
use std::{collections::BTreeMap, sync::Arc};
|
||||||
|
|
||||||
use crate::{
|
use crate::interpret::{InterpretError, Outcome};
|
||||||
interpret::{InterpretError, Outcome},
|
use harmony_k8s::K8sClient;
|
||||||
topology::k8s::K8sClient,
|
|
||||||
};
|
|
||||||
use k8s_openapi::api::core::v1::ConfigMap;
|
use k8s_openapi::api::core::v1::ConfigMap;
|
||||||
use kube::api::ObjectMeta;
|
use kube::api::ObjectMeta;
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
use std::{collections::BTreeMap, str::FromStr};
|
use std::{collections::BTreeMap, str::FromStr};
|
||||||
|
|
||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
|
use harmony_k8s::KubernetesDistribution;
|
||||||
use harmony_macros::hurl;
|
use harmony_macros::hurl;
|
||||||
use harmony_secret::{Secret, SecretManager};
|
use harmony_secret::{Secret, SecretManager};
|
||||||
use harmony_types::id::Id;
|
use harmony_types::id::Id;
|
||||||
@@ -25,7 +26,7 @@ use crate::{
|
|||||||
},
|
},
|
||||||
},
|
},
|
||||||
score::Score,
|
score::Score,
|
||||||
topology::{HelmCommand, K8sclient, KubernetesDistribution, TlsRouter, Topology},
|
topology::{HelmCommand, K8sclient, TlsRouter, Topology},
|
||||||
};
|
};
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize)]
|
#[derive(Debug, Clone, Serialize)]
|
||||||
|
|||||||
260
harmony/src/modules/node_health/mod.rs
Normal file
260
harmony/src/modules/node_health/mod.rs
Normal file
@@ -0,0 +1,260 @@
|
|||||||
|
use async_trait::async_trait;
|
||||||
|
use harmony_types::id::Id;
|
||||||
|
use k8s_openapi::api::{
|
||||||
|
apps::v1::{DaemonSet, DaemonSetSpec},
|
||||||
|
core::v1::{
|
||||||
|
Container, ContainerPort, EnvVar, EnvVarSource, Namespace, ObjectFieldSelector, PodSpec,
|
||||||
|
PodTemplateSpec, ResourceRequirements, ServiceAccount, Toleration,
|
||||||
|
},
|
||||||
|
rbac::v1::{ClusterRole, ClusterRoleBinding, PolicyRule, Role, RoleBinding, RoleRef, Subject},
|
||||||
|
};
|
||||||
|
use k8s_openapi::apimachinery::pkg::api::resource::Quantity;
|
||||||
|
use k8s_openapi::apimachinery::pkg::apis::meta::v1::LabelSelector;
|
||||||
|
use kube::api::ObjectMeta;
|
||||||
|
use serde::Serialize;
|
||||||
|
use std::collections::BTreeMap;
|
||||||
|
|
||||||
|
use crate::{
|
||||||
|
data::Version,
|
||||||
|
interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome},
|
||||||
|
inventory::Inventory,
|
||||||
|
modules::k8s::resource::K8sResourceScore,
|
||||||
|
score::Score,
|
||||||
|
topology::{K8sclient, Topology},
|
||||||
|
};
|
||||||
|
|
||||||
|
#[derive(Clone, Debug, Serialize)]
|
||||||
|
pub struct NodeHealthScore {}
|
||||||
|
|
||||||
|
impl<T: Topology + K8sclient> Score<T> for NodeHealthScore {
|
||||||
|
fn name(&self) -> String {
|
||||||
|
format!("NodeHealthScore")
|
||||||
|
}
|
||||||
|
|
||||||
|
#[doc(hidden)]
|
||||||
|
fn create_interpret(&self) -> Box<dyn Interpret<T>> {
|
||||||
|
Box::new(NodeHealthInterpret {})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
pub struct NodeHealthInterpret {}
|
||||||
|
|
||||||
|
#[async_trait]
|
||||||
|
impl<T: Topology + K8sclient> Interpret<T> for NodeHealthInterpret {
|
||||||
|
async fn execute(
|
||||||
|
&self,
|
||||||
|
inventory: &Inventory,
|
||||||
|
topology: &T,
|
||||||
|
) -> Result<Outcome, InterpretError> {
|
||||||
|
let namespace_name = "harmony-node-healthcheck".to_string();
|
||||||
|
|
||||||
|
// Namespace
|
||||||
|
let mut labels = BTreeMap::new();
|
||||||
|
labels.insert("name".to_string(), namespace_name.clone());
|
||||||
|
|
||||||
|
let namespace = Namespace {
|
||||||
|
metadata: ObjectMeta {
|
||||||
|
name: Some(namespace_name.clone()),
|
||||||
|
labels: Some(labels),
|
||||||
|
..ObjectMeta::default()
|
||||||
|
},
|
||||||
|
..Namespace::default()
|
||||||
|
};
|
||||||
|
|
||||||
|
// ServiceAccount
|
||||||
|
let service_account_name = "node-healthcheck-sa".to_string();
|
||||||
|
let service_account = ServiceAccount {
|
||||||
|
metadata: ObjectMeta {
|
||||||
|
name: Some(service_account_name.clone()),
|
||||||
|
namespace: Some(namespace_name.clone()),
|
||||||
|
..ObjectMeta::default()
|
||||||
|
},
|
||||||
|
..ServiceAccount::default()
|
||||||
|
};
|
||||||
|
|
||||||
|
// ClusterRole
|
||||||
|
let cluster_role = ClusterRole {
|
||||||
|
metadata: ObjectMeta {
|
||||||
|
name: Some("node-healthcheck-role".to_string()),
|
||||||
|
..ObjectMeta::default()
|
||||||
|
},
|
||||||
|
rules: Some(vec![PolicyRule {
|
||||||
|
api_groups: Some(vec!["".to_string()]),
|
||||||
|
resources: Some(vec!["nodes".to_string()]),
|
||||||
|
verbs: vec!["get".to_string(), "list".to_string()],
|
||||||
|
..PolicyRule::default()
|
||||||
|
}]),
|
||||||
|
..ClusterRole::default()
|
||||||
|
};
|
||||||
|
|
||||||
|
// Role
|
||||||
|
let role = Role {
|
||||||
|
metadata: ObjectMeta {
|
||||||
|
name: Some("allow-hostnetwork-scc".to_string()),
|
||||||
|
namespace: Some(namespace_name.clone()),
|
||||||
|
..ObjectMeta::default()
|
||||||
|
},
|
||||||
|
rules: Some(vec![PolicyRule {
|
||||||
|
api_groups: Some(vec!["security.openshift.io".to_string()]),
|
||||||
|
resources: Some(vec!["securitycontextconstraints".to_string()]),
|
||||||
|
resource_names: Some(vec!["hostnetwork".to_string()]),
|
||||||
|
verbs: vec!["use".to_string()],
|
||||||
|
..PolicyRule::default()
|
||||||
|
}]),
|
||||||
|
..Role::default()
|
||||||
|
};
|
||||||
|
|
||||||
|
// RoleBinding
|
||||||
|
let role_binding = RoleBinding {
|
||||||
|
metadata: ObjectMeta {
|
||||||
|
name: Some("node-status-querier-scc-binding".to_string()),
|
||||||
|
namespace: Some(namespace_name.clone()),
|
||||||
|
..ObjectMeta::default()
|
||||||
|
},
|
||||||
|
subjects: Some(vec![Subject {
|
||||||
|
kind: "ServiceAccount".to_string(),
|
||||||
|
name: service_account_name.clone(),
|
||||||
|
namespace: Some(namespace_name.clone()),
|
||||||
|
..Subject::default()
|
||||||
|
}]),
|
||||||
|
role_ref: RoleRef {
|
||||||
|
api_group: "rbac.authorization.k8s.io".to_string(),
|
||||||
|
kind: "Role".to_string(),
|
||||||
|
name: "allow-hostnetwork-scc".to_string(),
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
|
// ClusterRoleBinding
|
||||||
|
let cluster_role_binding = ClusterRoleBinding {
|
||||||
|
metadata: ObjectMeta {
|
||||||
|
name: Some("read-nodes-binding".to_string()),
|
||||||
|
..ObjectMeta::default()
|
||||||
|
},
|
||||||
|
subjects: Some(vec![Subject {
|
||||||
|
kind: "ServiceAccount".to_string(),
|
||||||
|
name: service_account_name.clone(),
|
||||||
|
namespace: Some(namespace_name.clone()),
|
||||||
|
..Subject::default()
|
||||||
|
}]),
|
||||||
|
role_ref: RoleRef {
|
||||||
|
api_group: "rbac.authorization.k8s.io".to_string(),
|
||||||
|
kind: "ClusterRole".to_string(),
|
||||||
|
name: "node-healthcheck-role".to_string(),
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
|
// DaemonSet
|
||||||
|
let mut daemonset_labels = BTreeMap::new();
|
||||||
|
daemonset_labels.insert("app".to_string(), "node-healthcheck".to_string());
|
||||||
|
|
||||||
|
let daemon_set = DaemonSet {
|
||||||
|
metadata: ObjectMeta {
|
||||||
|
name: Some("node-healthcheck".to_string()),
|
||||||
|
namespace: Some(namespace_name.clone()),
|
||||||
|
labels: Some(daemonset_labels.clone()),
|
||||||
|
..ObjectMeta::default()
|
||||||
|
},
|
||||||
|
spec: Some(DaemonSetSpec {
|
||||||
|
selector: LabelSelector {
|
||||||
|
match_labels: Some(daemonset_labels.clone()),
|
||||||
|
..LabelSelector::default()
|
||||||
|
},
|
||||||
|
template: PodTemplateSpec {
|
||||||
|
metadata: Some(ObjectMeta {
|
||||||
|
labels: Some(daemonset_labels),
|
||||||
|
..ObjectMeta::default()
|
||||||
|
}),
|
||||||
|
spec: Some(PodSpec {
|
||||||
|
service_account_name: Some(service_account_name.clone()),
|
||||||
|
host_network: Some(true),
|
||||||
|
tolerations: Some(vec![Toleration {
|
||||||
|
operator: Some("Exists".to_string()),
|
||||||
|
..Toleration::default()
|
||||||
|
}]),
|
||||||
|
containers: vec![Container {
|
||||||
|
name: "checker".to_string(),
|
||||||
|
image: Some(
|
||||||
|
"hub.nationtech.io/harmony/harmony-node-readiness-endpoint:latest"
|
||||||
|
.to_string(),
|
||||||
|
),
|
||||||
|
env: Some(vec![EnvVar {
|
||||||
|
name: "NODE_NAME".to_string(),
|
||||||
|
value_from: Some(EnvVarSource {
|
||||||
|
field_ref: Some(ObjectFieldSelector {
|
||||||
|
field_path: "spec.nodeName".to_string(),
|
||||||
|
..ObjectFieldSelector::default()
|
||||||
|
}),
|
||||||
|
..EnvVarSource::default()
|
||||||
|
}),
|
||||||
|
..EnvVar::default()
|
||||||
|
}]),
|
||||||
|
ports: Some(vec![ContainerPort {
|
||||||
|
container_port: 25001,
|
||||||
|
host_port: Some(25001),
|
||||||
|
name: Some("health-port".to_string()),
|
||||||
|
..ContainerPort::default()
|
||||||
|
}]),
|
||||||
|
resources: Some(ResourceRequirements {
|
||||||
|
requests: Some({
|
||||||
|
let mut requests = BTreeMap::new();
|
||||||
|
requests.insert("cpu".to_string(), Quantity("10m".to_string()));
|
||||||
|
requests
|
||||||
|
.insert("memory".to_string(), Quantity("50Mi".to_string()));
|
||||||
|
requests
|
||||||
|
}),
|
||||||
|
..ResourceRequirements::default()
|
||||||
|
}),
|
||||||
|
..Container::default()
|
||||||
|
}],
|
||||||
|
..PodSpec::default()
|
||||||
|
}),
|
||||||
|
},
|
||||||
|
..DaemonSetSpec::default()
|
||||||
|
}),
|
||||||
|
..DaemonSet::default()
|
||||||
|
};
|
||||||
|
|
||||||
|
K8sResourceScore::single(namespace, None)
|
||||||
|
.interpret(inventory, topology)
|
||||||
|
.await?;
|
||||||
|
K8sResourceScore::single(service_account, Some(namespace_name.clone()))
|
||||||
|
.interpret(inventory, topology)
|
||||||
|
.await?;
|
||||||
|
K8sResourceScore::single(cluster_role, None)
|
||||||
|
.interpret(inventory, topology)
|
||||||
|
.await?;
|
||||||
|
K8sResourceScore::single(role, Some(namespace_name.clone()))
|
||||||
|
.interpret(inventory, topology)
|
||||||
|
.await?;
|
||||||
|
K8sResourceScore::single(role_binding, Some(namespace_name.clone()))
|
||||||
|
.interpret(inventory, topology)
|
||||||
|
.await?;
|
||||||
|
K8sResourceScore::single(cluster_role_binding, None)
|
||||||
|
.interpret(inventory, topology)
|
||||||
|
.await?;
|
||||||
|
K8sResourceScore::single(daemon_set, Some(namespace_name.clone()))
|
||||||
|
.interpret(inventory, topology)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
Ok(Outcome::success(
|
||||||
|
"Harmony node health successfully deployed".to_string(),
|
||||||
|
))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_name(&self) -> InterpretName {
|
||||||
|
InterpretName::Custom("NodeHealth")
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_version(&self) -> Version {
|
||||||
|
todo!()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_status(&self) -> InterpretStatus {
|
||||||
|
todo!()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_children(&self) -> Vec<Id> {
|
||||||
|
todo!()
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -41,6 +41,7 @@ impl OKDBootstrapLoadBalancerScore {
|
|||||||
backend_servers: Self::topology_to_backend_server(topology, 6443),
|
backend_servers: Self::topology_to_backend_server(topology, 6443),
|
||||||
listening_port: SocketAddr::new(private_ip, 6443),
|
listening_port: SocketAddr::new(private_ip, 6443),
|
||||||
health_check: Some(HealthCheck::HTTP(
|
health_check: Some(HealthCheck::HTTP(
|
||||||
|
None,
|
||||||
"/readyz".to_string(),
|
"/readyz".to_string(),
|
||||||
HttpMethod::GET,
|
HttpMethod::GET,
|
||||||
HttpStatusCode::Success2xx,
|
HttpStatusCode::Success2xx,
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ use crate::{
|
|||||||
score::Score,
|
score::Score,
|
||||||
topology::{
|
topology::{
|
||||||
BackendServer, HAClusterTopology, HealthCheck, HttpMethod, HttpStatusCode, LoadBalancer,
|
BackendServer, HAClusterTopology, HealthCheck, HttpMethod, HttpStatusCode, LoadBalancer,
|
||||||
LoadBalancerService, SSL, Topology,
|
LoadBalancerService, LogicalHost, Router, SSL, Topology,
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -23,32 +23,72 @@ pub struct OKDLoadBalancerScore {
|
|||||||
load_balancer_score: LoadBalancerScore,
|
load_balancer_score: LoadBalancerScore,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// OKD Load Balancer Score configuration
|
||||||
|
///
|
||||||
|
/// This module configures the load balancer for OKD (OpenShift Kubernetes Distribution)
|
||||||
|
/// bare metal installations.
|
||||||
|
///
|
||||||
|
/// # Backend Server Configuration
|
||||||
|
///
|
||||||
|
/// For ports 80 and 443 (ingress traffic), the load balancer includes both control plane
|
||||||
|
/// and worker nodes in the backend pool. This is consistent with OKD's requirement that
|
||||||
|
/// ingress traffic should be load balanced across all nodes that may run ingress router pods.
|
||||||
|
///
|
||||||
|
/// For ports 22623 (Ignition API) and 6443 (Kubernetes API), only control plane nodes
|
||||||
|
/// are included as backends, as these services are control plane specific.
|
||||||
|
///
|
||||||
|
/// # References
|
||||||
|
///
|
||||||
|
/// - [OKD Bare Metal Installation - External Load Balancer Configuration]
|
||||||
|
/// (<https://docs.okd.io/latest/installing/installing_bare_metal/ipi/ipi-install-installation-workflow.html#nw-osp-configuring-external-load-balancer_ipi-install-installation-workflow>)
|
||||||
|
///
|
||||||
|
/// # Example
|
||||||
|
///
|
||||||
|
/// ```ignore
|
||||||
|
/// use harmony::topology::HAClusterTopology;
|
||||||
|
/// use harmony::modules::okd::OKDLoadBalancerScore;
|
||||||
|
///
|
||||||
|
/// let topology: HAClusterTopology = /* get topology from your infrastructure */;
|
||||||
|
/// let score = OKDLoadBalancerScore::new(&topology);
|
||||||
|
/// ```
|
||||||
impl OKDLoadBalancerScore {
|
impl OKDLoadBalancerScore {
|
||||||
pub fn new(topology: &HAClusterTopology) -> Self {
|
pub fn new(topology: &HAClusterTopology) -> Self {
|
||||||
let public_ip = topology.router.get_gateway();
|
let public_ip = topology.router.get_gateway();
|
||||||
let public_services = vec![
|
let public_services = vec![
|
||||||
LoadBalancerService {
|
LoadBalancerService {
|
||||||
backend_servers: Self::control_plane_to_backend_server(topology, 80),
|
backend_servers: Self::nodes_to_backend_server(topology, 80),
|
||||||
listening_port: SocketAddr::new(public_ip, 80),
|
listening_port: SocketAddr::new(public_ip, 80),
|
||||||
health_check: Some(HealthCheck::TCP(None)),
|
health_check: None,
|
||||||
},
|
},
|
||||||
LoadBalancerService {
|
LoadBalancerService {
|
||||||
backend_servers: Self::control_plane_to_backend_server(topology, 443),
|
backend_servers: Self::nodes_to_backend_server(topology, 443),
|
||||||
listening_port: SocketAddr::new(public_ip, 443),
|
listening_port: SocketAddr::new(public_ip, 443),
|
||||||
health_check: Some(HealthCheck::TCP(None)),
|
health_check: None,
|
||||||
},
|
},
|
||||||
];
|
];
|
||||||
|
|
||||||
let private_services = vec![
|
let private_services = vec![
|
||||||
LoadBalancerService {
|
LoadBalancerService {
|
||||||
backend_servers: Self::control_plane_to_backend_server(topology, 80),
|
backend_servers: Self::nodes_to_backend_server(topology, 80),
|
||||||
listening_port: SocketAddr::new(public_ip, 80),
|
listening_port: SocketAddr::new(public_ip, 80),
|
||||||
health_check: Some(HealthCheck::TCP(None)),
|
health_check: Some(HealthCheck::HTTP(
|
||||||
|
Some(25001),
|
||||||
|
"/health?check=okd_router_1936,node_ready".to_string(),
|
||||||
|
HttpMethod::GET,
|
||||||
|
HttpStatusCode::Success2xx,
|
||||||
|
SSL::Default,
|
||||||
|
)),
|
||||||
},
|
},
|
||||||
LoadBalancerService {
|
LoadBalancerService {
|
||||||
backend_servers: Self::control_plane_to_backend_server(topology, 443),
|
backend_servers: Self::nodes_to_backend_server(topology, 443),
|
||||||
listening_port: SocketAddr::new(public_ip, 443),
|
listening_port: SocketAddr::new(public_ip, 443),
|
||||||
health_check: Some(HealthCheck::TCP(None)),
|
health_check: Some(HealthCheck::HTTP(
|
||||||
|
Some(25001),
|
||||||
|
"/health?check=okd_router_1936,node_ready".to_string(),
|
||||||
|
HttpMethod::GET,
|
||||||
|
HttpStatusCode::Success2xx,
|
||||||
|
SSL::Default,
|
||||||
|
)),
|
||||||
},
|
},
|
||||||
LoadBalancerService {
|
LoadBalancerService {
|
||||||
backend_servers: Self::control_plane_to_backend_server(topology, 22623),
|
backend_servers: Self::control_plane_to_backend_server(topology, 22623),
|
||||||
@@ -59,6 +99,7 @@ impl OKDLoadBalancerScore {
|
|||||||
backend_servers: Self::control_plane_to_backend_server(topology, 6443),
|
backend_servers: Self::control_plane_to_backend_server(topology, 6443),
|
||||||
listening_port: SocketAddr::new(public_ip, 6443),
|
listening_port: SocketAddr::new(public_ip, 6443),
|
||||||
health_check: Some(HealthCheck::HTTP(
|
health_check: Some(HealthCheck::HTTP(
|
||||||
|
None,
|
||||||
"/readyz".to_string(),
|
"/readyz".to_string(),
|
||||||
HttpMethod::GET,
|
HttpMethod::GET,
|
||||||
HttpStatusCode::Success2xx,
|
HttpStatusCode::Success2xx,
|
||||||
@@ -74,6 +115,11 @@ impl OKDLoadBalancerScore {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Creates backend servers list for control plane nodes only
|
||||||
|
///
|
||||||
|
/// Use this for control plane-specific services like:
|
||||||
|
/// - Port 22623: Ignition API (machine configuration during bootstrap)
|
||||||
|
/// - Port 6443: Kubernetes API server
|
||||||
fn control_plane_to_backend_server(
|
fn control_plane_to_backend_server(
|
||||||
topology: &HAClusterTopology,
|
topology: &HAClusterTopology,
|
||||||
port: u16,
|
port: u16,
|
||||||
@@ -87,6 +133,194 @@ impl OKDLoadBalancerScore {
|
|||||||
})
|
})
|
||||||
.collect()
|
.collect()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Creates backend servers list for all nodes (control plane + workers)
|
||||||
|
///
|
||||||
|
/// Use this for ingress traffic that should be distributed across all nodes:
|
||||||
|
/// - Port 80: HTTP ingress traffic
|
||||||
|
/// - Port 443: HTTPS ingress traffic
|
||||||
|
///
|
||||||
|
/// In OKD, ingress router pods can run on any node, so both control plane
|
||||||
|
/// and worker nodes should be included in the load balancer backend pool.
|
||||||
|
fn nodes_to_backend_server(topology: &HAClusterTopology, port: u16) -> Vec<BackendServer> {
|
||||||
|
let mut nodes = Vec::new();
|
||||||
|
for cp in &topology.control_plane {
|
||||||
|
nodes.push(BackendServer {
|
||||||
|
address: cp.ip.to_string(),
|
||||||
|
port,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
for worker in &topology.workers {
|
||||||
|
nodes.push(BackendServer {
|
||||||
|
address: worker.ip.to_string(),
|
||||||
|
port,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
nodes
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use std::sync::{Arc, OnceLock};
|
||||||
|
|
||||||
|
use super::*;
|
||||||
|
use crate::topology::DummyInfra;
|
||||||
|
use harmony_macros::ip;
|
||||||
|
use harmony_types::net::IpAddress;
|
||||||
|
|
||||||
|
fn create_test_topology() -> HAClusterTopology {
|
||||||
|
let router = Arc::new(DummyRouter {
|
||||||
|
gateway: ip!("192.168.1.1"),
|
||||||
|
});
|
||||||
|
|
||||||
|
HAClusterTopology {
|
||||||
|
domain_name: "test.example.com".to_string(),
|
||||||
|
router,
|
||||||
|
load_balancer: Arc::new(DummyInfra),
|
||||||
|
firewall: Arc::new(DummyInfra),
|
||||||
|
dhcp_server: Arc::new(DummyInfra),
|
||||||
|
tftp_server: Arc::new(DummyInfra),
|
||||||
|
http_server: Arc::new(DummyInfra),
|
||||||
|
dns_server: Arc::new(DummyInfra),
|
||||||
|
node_exporter: Arc::new(DummyInfra),
|
||||||
|
switch_client: Arc::new(DummyInfra),
|
||||||
|
bootstrap_host: LogicalHost {
|
||||||
|
ip: ip!("192.168.1.100"),
|
||||||
|
name: "bootstrap".to_string(),
|
||||||
|
},
|
||||||
|
control_plane: vec![
|
||||||
|
LogicalHost {
|
||||||
|
ip: ip!("192.168.1.10"),
|
||||||
|
name: "control-plane-0".to_string(),
|
||||||
|
},
|
||||||
|
LogicalHost {
|
||||||
|
ip: ip!("192.168.1.11"),
|
||||||
|
name: "control-plane-1".to_string(),
|
||||||
|
},
|
||||||
|
LogicalHost {
|
||||||
|
ip: ip!("192.168.1.12"),
|
||||||
|
name: "control-plane-2".to_string(),
|
||||||
|
},
|
||||||
|
],
|
||||||
|
workers: vec![
|
||||||
|
LogicalHost {
|
||||||
|
ip: ip!("192.168.1.20"),
|
||||||
|
name: "worker-0".to_string(),
|
||||||
|
},
|
||||||
|
LogicalHost {
|
||||||
|
ip: ip!("192.168.1.21"),
|
||||||
|
name: "worker-1".to_string(),
|
||||||
|
},
|
||||||
|
],
|
||||||
|
kubeconfig: None,
|
||||||
|
network_manager: OnceLock::new(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
struct DummyRouter {
|
||||||
|
gateway: IpAddress,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Router for DummyRouter {
|
||||||
|
fn get_gateway(&self) -> IpAddress {
|
||||||
|
self.gateway
|
||||||
|
}
|
||||||
|
fn get_cidr(&self) -> cidr::Ipv4Cidr {
|
||||||
|
let ipv4 = match self.gateway {
|
||||||
|
IpAddress::V4(ip) => ip,
|
||||||
|
IpAddress::V6(_) => panic!("IPv6 not supported"),
|
||||||
|
};
|
||||||
|
cidr::Ipv4Cidr::new(ipv4, 24).unwrap()
|
||||||
|
}
|
||||||
|
fn get_host(&self) -> LogicalHost {
|
||||||
|
LogicalHost {
|
||||||
|
ip: self.gateway,
|
||||||
|
name: "router".to_string(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_nodes_to_backend_server_includes_control_plane_and_workers() {
|
||||||
|
let topology = create_test_topology();
|
||||||
|
|
||||||
|
let backend_servers = OKDLoadBalancerScore::nodes_to_backend_server(&topology, 80);
|
||||||
|
|
||||||
|
assert_eq!(backend_servers.len(), 5);
|
||||||
|
|
||||||
|
let addresses: Vec<&str> = backend_servers.iter().map(|s| s.address.as_str()).collect();
|
||||||
|
assert!(addresses.contains(&"192.168.1.10"));
|
||||||
|
assert!(addresses.contains(&"192.168.1.11"));
|
||||||
|
assert!(addresses.contains(&"192.168.1.12"));
|
||||||
|
assert!(addresses.contains(&"192.168.1.20"));
|
||||||
|
assert!(addresses.contains(&"192.168.1.21"));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_control_plane_to_backend_server_only_includes_control_plane() {
|
||||||
|
let topology = create_test_topology();
|
||||||
|
|
||||||
|
let backend_servers = OKDLoadBalancerScore::control_plane_to_backend_server(&topology, 80);
|
||||||
|
|
||||||
|
assert_eq!(backend_servers.len(), 3);
|
||||||
|
|
||||||
|
let addresses: Vec<&str> = backend_servers.iter().map(|s| s.address.as_str()).collect();
|
||||||
|
assert!(addresses.contains(&"192.168.1.10"));
|
||||||
|
assert!(addresses.contains(&"192.168.1.11"));
|
||||||
|
assert!(addresses.contains(&"192.168.1.12"));
|
||||||
|
assert!(!addresses.contains(&"192.168.1.20"));
|
||||||
|
assert!(!addresses.contains(&"192.168.1.21"));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_public_services_include_all_nodes_on_port_80_and_443() {
|
||||||
|
let topology = create_test_topology();
|
||||||
|
let score = OKDLoadBalancerScore::new(&topology);
|
||||||
|
|
||||||
|
let public_service_80 = score
|
||||||
|
.load_balancer_score
|
||||||
|
.public_services
|
||||||
|
.iter()
|
||||||
|
.find(|s| s.listening_port.port() == 80)
|
||||||
|
.expect("Public service on port 80 not found");
|
||||||
|
|
||||||
|
let public_service_443 = score
|
||||||
|
.load_balancer_score
|
||||||
|
.public_services
|
||||||
|
.iter()
|
||||||
|
.find(|s| s.listening_port.port() == 443)
|
||||||
|
.expect("Public service on port 443 not found");
|
||||||
|
|
||||||
|
assert_eq!(public_service_80.backend_servers.len(), 5);
|
||||||
|
assert_eq!(public_service_443.backend_servers.len(), 5);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_private_service_port_22623_only_control_plane() {
|
||||||
|
let topology = create_test_topology();
|
||||||
|
let score = OKDLoadBalancerScore::new(&topology);
|
||||||
|
|
||||||
|
let private_service_22623 = score
|
||||||
|
.load_balancer_score
|
||||||
|
.private_services
|
||||||
|
.iter()
|
||||||
|
.find(|s| s.listening_port.port() == 22623)
|
||||||
|
.expect("Private service on port 22623 not found");
|
||||||
|
|
||||||
|
assert_eq!(private_service_22623.backend_servers.len(), 3);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_all_backend_servers_have_correct_port() {
|
||||||
|
let topology = create_test_topology();
|
||||||
|
|
||||||
|
let backend_servers = OKDLoadBalancerScore::nodes_to_backend_server(&topology, 443);
|
||||||
|
|
||||||
|
for server in backend_servers {
|
||||||
|
assert_eq!(server.port, 443);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<T: Topology + LoadBalancer> Score<T> for OKDLoadBalancerScore {
|
impl<T: Topology + LoadBalancer> Score<T> for OKDLoadBalancerScore {
|
||||||
|
|||||||
88
harmony/src/modules/openbao/mod.rs
Normal file
88
harmony/src/modules/openbao/mod.rs
Normal file
@@ -0,0 +1,88 @@
|
|||||||
|
use std::str::FromStr;
|
||||||
|
|
||||||
|
use harmony_macros::hurl;
|
||||||
|
use non_blank_string_rs::NonBlankString;
|
||||||
|
use serde::Serialize;
|
||||||
|
|
||||||
|
use crate::{
|
||||||
|
interpret::Interpret,
|
||||||
|
modules::helm::chart::{HelmChartScore, HelmRepository},
|
||||||
|
score::Score,
|
||||||
|
topology::{HelmCommand, K8sclient, Topology},
|
||||||
|
};
|
||||||
|
|
||||||
|
#[derive(Debug, Serialize, Clone)]
|
||||||
|
pub struct OpenbaoScore {
|
||||||
|
/// Host used for external access (ingress)
|
||||||
|
pub host: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T: Topology + K8sclient + HelmCommand> Score<T> for OpenbaoScore {
|
||||||
|
fn name(&self) -> String {
|
||||||
|
"OpenbaoScore".to_string()
|
||||||
|
}
|
||||||
|
|
||||||
|
#[doc(hidden)]
|
||||||
|
fn create_interpret(&self) -> Box<dyn Interpret<T>> {
|
||||||
|
// TODO exec pod commands to initialize secret store if not already done
|
||||||
|
let host = &self.host;
|
||||||
|
|
||||||
|
let values_yaml = Some(format!(
|
||||||
|
r#"global:
|
||||||
|
openshift: true
|
||||||
|
server:
|
||||||
|
standalone:
|
||||||
|
enabled: true
|
||||||
|
config: |
|
||||||
|
ui = true
|
||||||
|
|
||||||
|
listener "tcp" {{
|
||||||
|
tls_disable = true
|
||||||
|
address = "[::]:8200"
|
||||||
|
cluster_address = "[::]:8201"
|
||||||
|
}}
|
||||||
|
|
||||||
|
storage "file" {{
|
||||||
|
path = "/openbao/data"
|
||||||
|
}}
|
||||||
|
|
||||||
|
service:
|
||||||
|
enabled: true
|
||||||
|
|
||||||
|
ingress:
|
||||||
|
enabled: true
|
||||||
|
hosts:
|
||||||
|
- host: {host}
|
||||||
|
dataStorage:
|
||||||
|
enabled: true
|
||||||
|
size: 10Gi
|
||||||
|
storageClass: null
|
||||||
|
accessMode: ReadWriteOnce
|
||||||
|
|
||||||
|
auditStorage:
|
||||||
|
enabled: true
|
||||||
|
size: 10Gi
|
||||||
|
storageClass: null
|
||||||
|
accessMode: ReadWriteOnce
|
||||||
|
ui:
|
||||||
|
enabled: true"#
|
||||||
|
));
|
||||||
|
|
||||||
|
HelmChartScore {
|
||||||
|
namespace: Some(NonBlankString::from_str("openbao").unwrap()),
|
||||||
|
release_name: NonBlankString::from_str("openbao").unwrap(),
|
||||||
|
chart_name: NonBlankString::from_str("openbao/openbao").unwrap(),
|
||||||
|
chart_version: None,
|
||||||
|
values_overrides: None,
|
||||||
|
values_yaml,
|
||||||
|
create_namespace: true,
|
||||||
|
install_only: false,
|
||||||
|
repository: Some(HelmRepository::new(
|
||||||
|
"openbao".to_string(),
|
||||||
|
hurl!("https://openbao.github.io/openbao-helm"),
|
||||||
|
true,
|
||||||
|
)),
|
||||||
|
}
|
||||||
|
.create_interpret()
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,3 +1,5 @@
|
|||||||
|
use std::collections::BTreeMap;
|
||||||
|
|
||||||
use kube::{CustomResource, api::ObjectMeta};
|
use kube::{CustomResource, api::ObjectMeta};
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
@@ -13,9 +15,14 @@ use serde::{Deserialize, Serialize};
|
|||||||
#[serde(rename_all = "camelCase")]
|
#[serde(rename_all = "camelCase")]
|
||||||
pub struct ClusterSpec {
|
pub struct ClusterSpec {
|
||||||
pub instances: u32,
|
pub instances: u32,
|
||||||
|
#[serde(skip_serializing_if = "Option::is_none")]
|
||||||
pub image_name: Option<String>,
|
pub image_name: Option<String>,
|
||||||
pub storage: Storage,
|
pub storage: Storage,
|
||||||
pub bootstrap: Bootstrap,
|
pub bootstrap: Bootstrap,
|
||||||
|
/// This must be set to None if you want cnpg to generate a superuser secret
|
||||||
|
#[serde(skip_serializing_if = "Option::is_none")]
|
||||||
|
pub superuser_secret: Option<BTreeMap<String, String>>,
|
||||||
|
pub enable_superuser_access: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Default for Cluster {
|
impl Default for Cluster {
|
||||||
@@ -34,6 +41,8 @@ impl Default for ClusterSpec {
|
|||||||
image_name: None,
|
image_name: None,
|
||||||
storage: Storage::default(),
|
storage: Storage::default(),
|
||||||
bootstrap: Bootstrap::default(),
|
bootstrap: Bootstrap::default(),
|
||||||
|
superuser_secret: None,
|
||||||
|
enable_superuser_access: false,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user