Compare commits
40 Commits
feat/zitad
...
feat/clust
| Author | SHA1 | Date | |
|---|---|---|---|
| b1ff4e4a0f | |||
| ee8f033143 | |||
| 1298ac9a18 | |||
| 53e361e84e | |||
| 220e0c2bb8 | |||
| 82e47d22a2 | |||
| fb17d7ed40 | |||
| d4bf80779e | |||
| 28dadf3a70 | |||
| 15c454aa65 | |||
| f9a3e51529 | |||
| d10598d01e | |||
| 61ba7257d0 | |||
| b0e9594d92 | |||
| 2a7fa466cc | |||
| f463cd1e94 | |||
| e1da7949ec | |||
| d0a1a73710 | |||
| bc2b328296 | |||
| a93896707f | |||
| 0e9b23a320 | |||
| f532ba2b40 | |||
| fafca31798 | |||
| 5412c34957 | |||
| 787cc8feab | |||
| ce041f495b | |||
| bfb86f63ce | |||
| 55de206523 | |||
| 64893a84f5 | |||
| f941672662 | |||
| a98113dd40 | |||
| 5db1a31d33 | |||
| f5aac67af8 | |||
| d7e5bf11d5 | |||
| 2e1f1b8447 | |||
| 2b157ad7fd | |||
| d920de34cf | |||
| 4276b9137b | |||
| 6ab88ab8d9 | |||
| 53d0704a35 |
2493
Cargo.lock
generated
2493
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -2,7 +2,6 @@
|
||||
resolver = "2"
|
||||
members = [
|
||||
"private_repos/*",
|
||||
"examples/*",
|
||||
"harmony",
|
||||
"harmony_types",
|
||||
"harmony_macros",
|
||||
@@ -19,7 +18,7 @@ members = [
|
||||
"adr/agent_discovery/mdns",
|
||||
"brocade",
|
||||
"harmony_agent",
|
||||
"harmony_agent/deploy", "harmony_node_readiness",
|
||||
"harmony_agent/deploy", "harmony_node_readiness", "harmony-k8s",
|
||||
]
|
||||
|
||||
[workspace.package]
|
||||
@@ -38,6 +37,8 @@ tokio = { version = "1.40", features = [
|
||||
"macros",
|
||||
"rt-multi-thread",
|
||||
] }
|
||||
tokio-retry = "0.3.0"
|
||||
tokio-util = "0.7.15"
|
||||
cidr = { features = ["serde"], version = "0.2" }
|
||||
russh = "0.45"
|
||||
russh-keys = "0.45"
|
||||
|
||||
@@ -1,8 +1,7 @@
|
||||
use super::BrocadeClient;
|
||||
use crate::{
|
||||
BrocadeInfo, Error, ExecutionMode, InterSwitchLink, InterfaceInfo, MacAddressEntry,
|
||||
PortChannelId, PortOperatingMode, SecurityLevel, parse_brocade_mac_address,
|
||||
shell::BrocadeShell,
|
||||
PortChannelId, PortOperatingMode, parse_brocade_mac_address, shell::BrocadeShell,
|
||||
};
|
||||
|
||||
use async_trait::async_trait;
|
||||
|
||||
@@ -8,7 +8,7 @@ use regex::Regex;
|
||||
use crate::{
|
||||
BrocadeClient, BrocadeInfo, Error, ExecutionMode, InterSwitchLink, InterfaceInfo,
|
||||
InterfaceStatus, InterfaceType, MacAddressEntry, PortChannelId, PortOperatingMode,
|
||||
SecurityLevel, parse_brocade_mac_address, shell::BrocadeShell,
|
||||
parse_brocade_mac_address, shell::BrocadeShell,
|
||||
};
|
||||
|
||||
#[derive(Debug)]
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
use harmony::{
|
||||
inventory::Inventory,
|
||||
modules::cert_manager::{
|
||||
capability::CertificateManagementConfig, score_cert_management::CertificateManagementScore,
|
||||
score_certificate::CertificateScore, score_issuer::CertificateIssuerScore,
|
||||
capability::CertificateManagementConfig, score_certificate::CertificateScore,
|
||||
score_issuer::CertificateIssuerScore,
|
||||
},
|
||||
topology::K8sAnywhereTopology,
|
||||
};
|
||||
|
||||
16
examples/cluster_dashboards/Cargo.toml
Normal file
16
examples/cluster_dashboards/Cargo.toml
Normal file
@@ -0,0 +1,16 @@
|
||||
[workspace]
|
||||
|
||||
[package]
|
||||
name = "example-cluster-dashboards"
|
||||
edition = "2021"
|
||||
version = "0.1.0"
|
||||
license = "GNU AGPL v3"
|
||||
publish = false
|
||||
|
||||
[dependencies]
|
||||
harmony = { path = "../../harmony" }
|
||||
harmony_cli = { path = "../../harmony_cli" }
|
||||
harmony_types = { path = "../../harmony_types" }
|
||||
tokio = { version = "1.40", features = ["macros", "rt-multi-thread"] }
|
||||
log = "0.4"
|
||||
env_logger = "0.11"
|
||||
21
examples/cluster_dashboards/src/main.rs
Normal file
21
examples/cluster_dashboards/src/main.rs
Normal file
@@ -0,0 +1,21 @@
|
||||
use harmony::{
|
||||
inventory::Inventory,
|
||||
modules::monitoring::cluster_dashboards::ClusterDashboardsScore,
|
||||
topology::K8sAnywhereTopology,
|
||||
};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() {
|
||||
harmony_cli::cli_logger::init();
|
||||
|
||||
let cluster_dashboards_score = ClusterDashboardsScore::default();
|
||||
|
||||
harmony_cli::run(
|
||||
Inventory::autoload(),
|
||||
K8sAnywhereTopology::from_env(),
|
||||
vec![Box::new(cluster_dashboards_score)],
|
||||
None,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
@@ -10,9 +10,10 @@ publish = false
|
||||
harmony = { path = "../../harmony" }
|
||||
harmony_cli = { path = "../../harmony_cli" }
|
||||
harmony_types = { path = "../../harmony_types" }
|
||||
harmony_macros = { path = "../../harmony_macros" }
|
||||
harmony-k8s = { path = "../../harmony-k8s" }
|
||||
cidr.workspace = true
|
||||
tokio.workspace = true
|
||||
harmony_macros = { path = "../../harmony_macros" }
|
||||
log.workspace = true
|
||||
env_logger.workspace = true
|
||||
url.workspace = true
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use std::time::Duration;
|
||||
|
||||
use harmony::topology::k8s::{DrainOptions, K8sClient};
|
||||
use harmony_k8s::{DrainOptions, K8sClient};
|
||||
use log::{info, trace};
|
||||
|
||||
#[tokio::main]
|
||||
|
||||
@@ -10,9 +10,10 @@ publish = false
|
||||
harmony = { path = "../../harmony" }
|
||||
harmony_cli = { path = "../../harmony_cli" }
|
||||
harmony_types = { path = "../../harmony_types" }
|
||||
harmony_macros = { path = "../../harmony_macros" }
|
||||
harmony-k8s = { path = "../../harmony-k8s" }
|
||||
cidr.workspace = true
|
||||
tokio.workspace = true
|
||||
harmony_macros = { path = "../../harmony_macros" }
|
||||
log.workspace = true
|
||||
env_logger.workspace = true
|
||||
url.workspace = true
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use harmony::topology::k8s::{DrainOptions, K8sClient, NodeFile};
|
||||
use harmony_k8s::{K8sClient, NodeFile};
|
||||
use log::{info, trace};
|
||||
|
||||
#[tokio::main]
|
||||
|
||||
@@ -6,7 +6,10 @@ use harmony::{
|
||||
data::{FileContent, FilePath},
|
||||
modules::{
|
||||
inventory::HarmonyDiscoveryStrategy,
|
||||
okd::{installation::OKDInstallationPipeline, ipxe::OKDIpxeScore},
|
||||
okd::{
|
||||
installation::OKDInstallationPipeline, ipxe::OKDIpxeScore,
|
||||
load_balancer::OKDLoadBalancerScore,
|
||||
},
|
||||
},
|
||||
score::Score,
|
||||
topology::HAClusterTopology,
|
||||
@@ -32,6 +35,7 @@ async fn main() {
|
||||
scores
|
||||
.append(&mut OKDInstallationPipeline::get_all_scores(HarmonyDiscoveryStrategy::MDNS).await);
|
||||
|
||||
scores.push(Box::new(OKDLoadBalancerScore::new(&topology)));
|
||||
harmony_cli::run(inventory, topology, scores, None)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
@@ -5,7 +5,7 @@ use harmony::{
|
||||
#[tokio::main]
|
||||
async fn main() {
|
||||
let openbao = OpenbaoScore {
|
||||
host: String::new(),
|
||||
host: "openbao.sebastien.sto1.nationtech.io".to_string(),
|
||||
};
|
||||
|
||||
harmony_cli::run(
|
||||
|
||||
@@ -1,5 +1,3 @@
|
||||
use std::str::FromStr;
|
||||
|
||||
use harmony::{
|
||||
inventory::Inventory,
|
||||
modules::{k8s::apps::OperatorHubCatalogSourceScore, postgresql::CloudNativePgOperatorScore},
|
||||
@@ -9,7 +7,7 @@ use harmony::{
|
||||
#[tokio::main]
|
||||
async fn main() {
|
||||
let operatorhub_catalog = OperatorHubCatalogSourceScore::default();
|
||||
let cnpg_operator = CloudNativePgOperatorScore::default();
|
||||
let cnpg_operator = CloudNativePgOperatorScore::default_openshift();
|
||||
|
||||
harmony_cli::run(
|
||||
Inventory::autoload(),
|
||||
|
||||
@@ -1,22 +1,13 @@
|
||||
use std::{
|
||||
net::{IpAddr, Ipv4Addr},
|
||||
sync::Arc,
|
||||
};
|
||||
use std::sync::Arc;
|
||||
|
||||
use async_trait::async_trait;
|
||||
use cidr::Ipv4Cidr;
|
||||
use harmony::{
|
||||
executors::ExecutorError,
|
||||
hardware::{HostCategory, Location, PhysicalHost, SwitchGroup},
|
||||
infra::opnsense::OPNSenseManagementInterface,
|
||||
inventory::Inventory,
|
||||
modules::opnsense::node_exporter::NodeExporterScore,
|
||||
topology::{
|
||||
HAClusterTopology, LogicalHost, PreparationError, PreparationOutcome, Topology,
|
||||
UnmanagedRouter, node_exporter::NodeExporter,
|
||||
},
|
||||
topology::{PreparationError, PreparationOutcome, Topology, node_exporter::NodeExporter},
|
||||
};
|
||||
use harmony_macros::{ip, ipv4, mac_address};
|
||||
use harmony_macros::ip;
|
||||
|
||||
#[derive(Debug)]
|
||||
struct OpnSenseTopology {
|
||||
|
||||
@@ -1,8 +1,7 @@
|
||||
use harmony::{
|
||||
inventory::Inventory,
|
||||
modules::postgresql::{
|
||||
K8sPostgreSQLScore, PostgreSQLConnectionScore, PublicPostgreSQLScore,
|
||||
capability::PostgreSQLConfig,
|
||||
PostgreSQLConnectionScore, PublicPostgreSQLScore, capability::PostgreSQLConfig,
|
||||
},
|
||||
topology::K8sAnywhereTopology,
|
||||
};
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use std::{collections::HashMap, path::PathBuf, sync::Arc};
|
||||
use std::{path::PathBuf, sync::Arc};
|
||||
|
||||
use harmony::{
|
||||
inventory::Inventory,
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use std::{collections::HashMap, path::PathBuf, sync::Arc};
|
||||
use std::{path::PathBuf, sync::Arc};
|
||||
|
||||
use harmony::{
|
||||
inventory::Inventory,
|
||||
|
||||
@@ -44,6 +44,7 @@ fn build_large_score() -> LoadBalancerScore {
|
||||
],
|
||||
listening_port: SocketAddr::V4(SocketAddrV4::new(ipv4!("192.168.0.0"), 49387)),
|
||||
health_check: Some(HealthCheck::HTTP(
|
||||
Some(1993),
|
||||
"/some_long_ass_path_to_see_how_it_is_displayed_but_it_has_to_be_even_longer"
|
||||
.to_string(),
|
||||
HttpMethod::GET,
|
||||
|
||||
14
examples/zitadel/Cargo.toml
Normal file
14
examples/zitadel/Cargo.toml
Normal file
@@ -0,0 +1,14 @@
|
||||
[package]
|
||||
name = "example-zitadel"
|
||||
edition = "2024"
|
||||
version.workspace = true
|
||||
readme.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
harmony = { path = "../../harmony" }
|
||||
harmony_cli = { path = "../../harmony_cli" }
|
||||
harmony_macros = { path = "../../harmony_macros" }
|
||||
harmony_types = { path = "../../harmony_types" }
|
||||
tokio.workspace = true
|
||||
url.workspace = true
|
||||
20
examples/zitadel/src/main.rs
Normal file
20
examples/zitadel/src/main.rs
Normal file
@@ -0,0 +1,20 @@
|
||||
use harmony::{
|
||||
inventory::Inventory, modules::zitadel::ZitadelScore, topology::K8sAnywhereTopology,
|
||||
};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() {
|
||||
let zitadel = ZitadelScore {
|
||||
host: "sso.sto1.nationtech.io".to_string(),
|
||||
zitadel_version: "v4.12.1".to_string(),
|
||||
};
|
||||
|
||||
harmony_cli::run(
|
||||
Inventory::autoload(),
|
||||
K8sAnywhereTopology::from_env(),
|
||||
vec![Box::new(zitadel)],
|
||||
None,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
BIN
examples/zitadel/zitadel-9.24.0.tgz
Normal file
BIN
examples/zitadel/zitadel-9.24.0.tgz
Normal file
Binary file not shown.
23
harmony-k8s/Cargo.toml
Normal file
23
harmony-k8s/Cargo.toml
Normal file
@@ -0,0 +1,23 @@
|
||||
[package]
|
||||
name = "harmony-k8s"
|
||||
edition = "2024"
|
||||
version.workspace = true
|
||||
readme.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
kube.workspace = true
|
||||
k8s-openapi.workspace = true
|
||||
tokio.workspace = true
|
||||
tokio-retry.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
serde_yaml.workspace = true
|
||||
log.workspace = true
|
||||
similar.workspace = true
|
||||
reqwest.workspace = true
|
||||
url.workspace = true
|
||||
inquire.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
pretty_assertions.workspace = true
|
||||
593
harmony-k8s/src/apply.rs
Normal file
593
harmony-k8s/src/apply.rs
Normal file
@@ -0,0 +1,593 @@
|
||||
use kube::{
|
||||
Client, Error, Resource,
|
||||
api::{
|
||||
Api, ApiResource, DynamicObject, GroupVersionKind, Patch, PatchParams, PostParams,
|
||||
ResourceExt,
|
||||
},
|
||||
core::ErrorResponse,
|
||||
discovery::Scope,
|
||||
error::DiscoveryError,
|
||||
};
|
||||
use log::{debug, error, trace, warn};
|
||||
use serde::{Serialize, de::DeserializeOwned};
|
||||
use serde_json::Value;
|
||||
use similar::TextDiff;
|
||||
use url::Url;
|
||||
|
||||
use crate::client::K8sClient;
|
||||
use crate::helper;
|
||||
use crate::types::WriteMode;
|
||||
|
||||
/// The field-manager token sent with every server-side apply request.
|
||||
pub const FIELD_MANAGER: &str = "harmony-k8s";
|
||||
|
||||
// ── Private helpers ──────────────────────────────────────────────────────────
|
||||
|
||||
/// Serialise any `Serialize` payload to a [`DynamicObject`] via JSON.
|
||||
fn to_dynamic<T: Serialize>(payload: &T) -> Result<DynamicObject, Error> {
|
||||
serde_json::from_value(serde_json::to_value(payload).map_err(Error::SerdeError)?)
|
||||
.map_err(Error::SerdeError)
|
||||
}
|
||||
|
||||
/// Fetch the current resource, display a unified diff against `payload`, and
|
||||
/// return `()`. All output goes to stdout (same behaviour as before).
|
||||
///
|
||||
/// A 404 is treated as "resource would be created" — not an error.
|
||||
async fn show_dry_run<T: Serialize>(
|
||||
api: &Api<DynamicObject>,
|
||||
name: &str,
|
||||
payload: &T,
|
||||
) -> Result<(), Error> {
|
||||
let new_yaml = serde_yaml::to_string(payload)
|
||||
.unwrap_or_else(|_| "Failed to serialize new resource".to_string());
|
||||
|
||||
match api.get(name).await {
|
||||
Ok(current) => {
|
||||
println!("\nDry-run for resource: '{name}'");
|
||||
let mut current_val = serde_yaml::to_value(¤t).unwrap_or(serde_yaml::Value::Null);
|
||||
if let Some(map) = current_val.as_mapping_mut() {
|
||||
map.remove(&serde_yaml::Value::String("status".to_string()));
|
||||
}
|
||||
let current_yaml = serde_yaml::to_string(¤t_val)
|
||||
.unwrap_or_else(|_| "Failed to serialize current resource".to_string());
|
||||
|
||||
if current_yaml == new_yaml {
|
||||
println!("No changes detected.");
|
||||
} else {
|
||||
println!("Changes detected:");
|
||||
let diff = TextDiff::from_lines(¤t_yaml, &new_yaml);
|
||||
for change in diff.iter_all_changes() {
|
||||
let sign = match change.tag() {
|
||||
similar::ChangeTag::Delete => "-",
|
||||
similar::ChangeTag::Insert => "+",
|
||||
similar::ChangeTag::Equal => " ",
|
||||
};
|
||||
print!("{sign}{change}");
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
Err(Error::Api(ErrorResponse { code: 404, .. })) => {
|
||||
println!("\nDry-run for new resource: '{name}'");
|
||||
println!("Resource does not exist. Would be created:");
|
||||
for line in new_yaml.lines() {
|
||||
println!("+{line}");
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Failed to fetch resource '{name}' for dry-run: {e}");
|
||||
Err(e)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Execute the real (non-dry-run) apply, respecting [`WriteMode`].
|
||||
async fn do_apply<T: Serialize + std::fmt::Debug>(
|
||||
api: &Api<DynamicObject>,
|
||||
name: &str,
|
||||
payload: &T,
|
||||
patch_params: &PatchParams,
|
||||
write_mode: &WriteMode,
|
||||
) -> Result<DynamicObject, Error> {
|
||||
match write_mode {
|
||||
WriteMode::CreateOrUpdate => {
|
||||
// TODO refactor this arm to perform self.update and if fail with 404 self.create
|
||||
// This will avoid the repetition of the api.patch and api.create calls within this
|
||||
// function body. This makes the code more maintainable
|
||||
match api.patch(name, patch_params, &Patch::Apply(payload)).await {
|
||||
Ok(obj) => Ok(obj),
|
||||
Err(Error::Api(ErrorResponse { code: 404, .. })) => {
|
||||
debug!("Resource '{name}' not found via SSA, falling back to POST");
|
||||
let dyn_obj = to_dynamic(payload)?;
|
||||
api.create(&PostParams::default(), &dyn_obj)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
error!("Failed to create '{name}': {e}");
|
||||
e
|
||||
})
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Failed to apply '{name}': {e}");
|
||||
Err(e)
|
||||
}
|
||||
}
|
||||
}
|
||||
WriteMode::Create => {
|
||||
let dyn_obj = to_dynamic(payload)?;
|
||||
api.create(&PostParams::default(), &dyn_obj)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
error!("Failed to create '{name}': {e}");
|
||||
e
|
||||
})
|
||||
}
|
||||
WriteMode::Update => match api.patch(name, patch_params, &Patch::Apply(payload)).await {
|
||||
Ok(obj) => Ok(obj),
|
||||
Err(Error::Api(ErrorResponse { code: 404, .. })) => Err(Error::Api(ErrorResponse {
|
||||
code: 404,
|
||||
message: format!("Resource '{name}' not found and WriteMode is UpdateOnly"),
|
||||
reason: "NotFound".to_string(),
|
||||
status: "Failure".to_string(),
|
||||
})),
|
||||
Err(e) => {
|
||||
error!("Failed to update '{name}': {e}");
|
||||
Err(e)
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// ── Public API ───────────────────────────────────────────────────────────────
|
||||
|
||||
impl K8sClient {
|
||||
/// Server-side apply: create if absent, update if present.
|
||||
/// Equivalent to `kubectl apply`.
|
||||
pub async fn apply<K>(&self, resource: &K, namespace: Option<&str>) -> Result<K, Error>
|
||||
where
|
||||
K: Resource + Clone + std::fmt::Debug + DeserializeOwned + Serialize,
|
||||
<K as Resource>::DynamicType: Default,
|
||||
{
|
||||
self.apply_with_strategy(resource, namespace, WriteMode::CreateOrUpdate)
|
||||
.await
|
||||
}
|
||||
|
||||
/// POST only — returns an error if the resource already exists.
|
||||
pub async fn create<K>(&self, resource: &K, namespace: Option<&str>) -> Result<K, Error>
|
||||
where
|
||||
K: Resource + Clone + std::fmt::Debug + DeserializeOwned + Serialize,
|
||||
<K as Resource>::DynamicType: Default,
|
||||
{
|
||||
self.apply_with_strategy(resource, namespace, WriteMode::Create)
|
||||
.await
|
||||
}
|
||||
|
||||
/// Server-side apply only — returns an error if the resource does not exist.
|
||||
pub async fn update<K>(&self, resource: &K, namespace: Option<&str>) -> Result<K, Error>
|
||||
where
|
||||
K: Resource + Clone + std::fmt::Debug + DeserializeOwned + Serialize,
|
||||
<K as Resource>::DynamicType: Default,
|
||||
{
|
||||
self.apply_with_strategy(resource, namespace, WriteMode::Update)
|
||||
.await
|
||||
}
|
||||
|
||||
pub async fn apply_with_strategy<K>(
|
||||
&self,
|
||||
resource: &K,
|
||||
namespace: Option<&str>,
|
||||
write_mode: WriteMode,
|
||||
) -> Result<K, Error>
|
||||
where
|
||||
K: Resource + Clone + std::fmt::Debug + DeserializeOwned + Serialize,
|
||||
<K as Resource>::DynamicType: Default,
|
||||
{
|
||||
debug!(
|
||||
"apply_with_strategy: {:?} ns={:?}",
|
||||
resource.meta().name,
|
||||
namespace
|
||||
);
|
||||
trace!("{:#}", serde_json::to_value(resource).unwrap_or_default());
|
||||
|
||||
let dyntype = K::DynamicType::default();
|
||||
let gvk = GroupVersionKind {
|
||||
group: K::group(&dyntype).to_string(),
|
||||
version: K::version(&dyntype).to_string(),
|
||||
kind: K::kind(&dyntype).to_string(),
|
||||
};
|
||||
|
||||
let discovery = self.discovery().await?;
|
||||
let (ar, caps) = discovery.resolve_gvk(&gvk).ok_or_else(|| {
|
||||
Error::Discovery(DiscoveryError::MissingResource(format!(
|
||||
"Cannot resolve GVK: {gvk:?}"
|
||||
)))
|
||||
})?;
|
||||
|
||||
let effective_ns = if caps.scope == Scope::Cluster {
|
||||
None
|
||||
} else {
|
||||
namespace.or_else(|| resource.meta().namespace.as_deref())
|
||||
};
|
||||
|
||||
let api: Api<DynamicObject> =
|
||||
get_dynamic_api(ar, caps, self.client.clone(), effective_ns, false);
|
||||
|
||||
let name = resource
|
||||
.meta()
|
||||
.name
|
||||
.as_deref()
|
||||
.expect("Kubernetes resource must have a name");
|
||||
|
||||
if self.dry_run {
|
||||
show_dry_run(&api, name, resource).await?;
|
||||
return Ok(resource.clone());
|
||||
}
|
||||
|
||||
let patch_params = PatchParams::apply(FIELD_MANAGER);
|
||||
do_apply(&api, name, resource, &patch_params, &write_mode)
|
||||
.await
|
||||
.and_then(helper::dyn_to_typed)
|
||||
}
|
||||
|
||||
/// Applies resources in order, one at a time
|
||||
pub async fn apply_many<K>(&self, resources: &[K], ns: Option<&str>) -> Result<Vec<K>, Error>
|
||||
where
|
||||
K: Resource + Clone + std::fmt::Debug + DeserializeOwned + Serialize,
|
||||
<K as Resource>::DynamicType: Default,
|
||||
{
|
||||
let mut result = Vec::new();
|
||||
for r in resources.iter() {
|
||||
let res = self.apply(r, ns).await;
|
||||
if res.is_err() {
|
||||
// NOTE: this may log sensitive data; downgrade to debug if needed.
|
||||
warn!(
|
||||
"Failed to apply k8s resource: {}",
|
||||
serde_json::to_string_pretty(r).map_err(Error::SerdeError)?
|
||||
);
|
||||
}
|
||||
result.push(res?);
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
/// Apply a [`DynamicObject`] resource using server-side apply.
|
||||
pub async fn apply_dynamic(
|
||||
&self,
|
||||
resource: &DynamicObject,
|
||||
namespace: Option<&str>,
|
||||
force_conflicts: bool,
|
||||
) -> Result<DynamicObject, Error> {
|
||||
trace!("apply_dynamic {resource:#?} ns={namespace:?} force={force_conflicts}");
|
||||
|
||||
let discovery = self.discovery().await?;
|
||||
let type_meta = resource.types.as_ref().ok_or_else(|| {
|
||||
Error::BuildRequest(kube::core::request::Error::Validation(
|
||||
"DynamicObject must have types (apiVersion and kind)".to_string(),
|
||||
))
|
||||
})?;
|
||||
|
||||
let gvk = GroupVersionKind::try_from(type_meta).map_err(|_| {
|
||||
Error::BuildRequest(kube::core::request::Error::Validation(format!(
|
||||
"Invalid GVK in DynamicObject: {type_meta:?}"
|
||||
)))
|
||||
})?;
|
||||
|
||||
let (ar, caps) = discovery.resolve_gvk(&gvk).ok_or_else(|| {
|
||||
Error::Discovery(DiscoveryError::MissingResource(format!(
|
||||
"Cannot resolve GVK: {gvk:?}"
|
||||
)))
|
||||
})?;
|
||||
|
||||
let effective_ns = if caps.scope == Scope::Cluster {
|
||||
None
|
||||
} else {
|
||||
namespace.or_else(|| resource.metadata.namespace.as_deref())
|
||||
};
|
||||
|
||||
let api = get_dynamic_api(ar, caps, self.client.clone(), effective_ns, false);
|
||||
let name = resource.metadata.name.as_deref().ok_or_else(|| {
|
||||
Error::BuildRequest(kube::core::request::Error::Validation(
|
||||
"DynamicObject must have metadata.name".to_string(),
|
||||
))
|
||||
})?;
|
||||
|
||||
debug!(
|
||||
"apply_dynamic kind={:?} name='{name}' ns={effective_ns:?}",
|
||||
resource.types.as_ref().map(|t| &t.kind),
|
||||
);
|
||||
|
||||
// NOTE would be nice to improve cohesion between the dynamic and typed apis and avoid copy
|
||||
// pasting the dry_run and some more logic
|
||||
if self.dry_run {
|
||||
show_dry_run(&api, name, resource).await?;
|
||||
return Ok(resource.clone());
|
||||
}
|
||||
|
||||
let mut patch_params = PatchParams::apply(FIELD_MANAGER);
|
||||
patch_params.force = force_conflicts;
|
||||
|
||||
do_apply(
|
||||
&api,
|
||||
name,
|
||||
resource,
|
||||
&patch_params,
|
||||
&WriteMode::CreateOrUpdate,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
pub async fn apply_dynamic_many(
|
||||
&self,
|
||||
resources: &[DynamicObject],
|
||||
namespace: Option<&str>,
|
||||
force_conflicts: bool,
|
||||
) -> Result<Vec<DynamicObject>, Error> {
|
||||
let mut result = Vec::new();
|
||||
for r in resources.iter() {
|
||||
result.push(self.apply_dynamic(r, namespace, force_conflicts).await?);
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
pub async fn apply_yaml_many(
|
||||
&self,
|
||||
#[allow(clippy::ptr_arg)] yaml: &Vec<serde_yaml::Value>,
|
||||
ns: Option<&str>,
|
||||
) -> Result<(), Error> {
|
||||
for y in yaml.iter() {
|
||||
self.apply_yaml(y, ns).await?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn apply_yaml(
|
||||
&self,
|
||||
yaml: &serde_yaml::Value,
|
||||
ns: Option<&str>,
|
||||
) -> Result<(), Error> {
|
||||
// NOTE wouldn't it be possible to parse this into a DynamicObject and simply call
|
||||
// apply_dynamic instead of reimplementing api interactions?
|
||||
let obj: DynamicObject =
|
||||
serde_yaml::from_value(yaml.clone()).expect("YAML must deserialise to DynamicObject");
|
||||
let name = obj.metadata.name.as_ref().expect("YAML must have a name");
|
||||
|
||||
let api_version = yaml["apiVersion"].as_str().expect("missing apiVersion");
|
||||
let kind = yaml["kind"].as_str().expect("missing kind");
|
||||
|
||||
let mut it = api_version.splitn(2, '/');
|
||||
let first = it.next().unwrap();
|
||||
let (g, v) = match it.next() {
|
||||
Some(second) => (first, second),
|
||||
None => ("", first),
|
||||
};
|
||||
|
||||
let api_resource = ApiResource::from_gvk(&GroupVersionKind::gvk(g, v, kind));
|
||||
let namespace = ns.unwrap_or_else(|| {
|
||||
obj.metadata
|
||||
.namespace
|
||||
.as_deref()
|
||||
.expect("YAML must have a namespace when ns is not provided")
|
||||
});
|
||||
|
||||
let api: Api<DynamicObject> =
|
||||
Api::namespaced_with(self.client.clone(), namespace, &api_resource);
|
||||
|
||||
println!("Applying '{name}' in namespace '{namespace}'...");
|
||||
let patch_params = PatchParams::apply(FIELD_MANAGER);
|
||||
let result = api.patch(name, &patch_params, &Patch::Apply(&obj)).await?;
|
||||
println!("Successfully applied '{}'.", result.name_any());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Equivalent to `kubectl apply -f <url>`.
|
||||
pub async fn apply_url(&self, url: Url, ns: Option<&str>) -> Result<(), Error> {
|
||||
let patch_params = PatchParams::apply(FIELD_MANAGER);
|
||||
let discovery = self.discovery().await?;
|
||||
|
||||
let yaml = reqwest::get(url)
|
||||
.await
|
||||
.expect("Could not fetch URL")
|
||||
.text()
|
||||
.await
|
||||
.expect("Could not read response body");
|
||||
|
||||
for doc in multidoc_deserialize(&yaml).expect("Failed to parse YAML from URL") {
|
||||
let obj: DynamicObject =
|
||||
serde_yaml::from_value(doc).expect("YAML document is not a valid object");
|
||||
let namespace = obj.metadata.namespace.as_deref().or(ns);
|
||||
let type_meta = obj.types.as_ref().expect("Object is missing TypeMeta");
|
||||
let gvk =
|
||||
GroupVersionKind::try_from(type_meta).expect("Object has invalid GroupVersionKind");
|
||||
let name = obj.name_any();
|
||||
|
||||
if let Some((ar, caps)) = discovery.resolve_gvk(&gvk) {
|
||||
let api = get_dynamic_api(ar, caps, self.client.clone(), namespace, false);
|
||||
trace!(
|
||||
"Applying {}:\n{}",
|
||||
gvk.kind,
|
||||
serde_yaml::to_string(&obj).unwrap_or_default()
|
||||
);
|
||||
let data: Value = serde_json::to_value(&obj).expect("serialisation failed");
|
||||
let _r = api.patch(&name, &patch_params, &Patch::Apply(data)).await?;
|
||||
debug!("Applied {} '{name}'", gvk.kind);
|
||||
} else {
|
||||
warn!("Skipping document with unknown GVK: {gvk:?}");
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Build a dynamic API client from a [`DynamicObject`]'s type metadata.
|
||||
pub(crate) fn get_api_for_dynamic_object(
|
||||
&self,
|
||||
object: &DynamicObject,
|
||||
ns: Option<&str>,
|
||||
) -> Result<Api<DynamicObject>, Error> {
|
||||
let ar = object
|
||||
.types
|
||||
.as_ref()
|
||||
.and_then(|t| {
|
||||
let parts: Vec<&str> = t.api_version.split('/').collect();
|
||||
match parts.as_slice() {
|
||||
[version] => Some(ApiResource::from_gvk(&GroupVersionKind::gvk(
|
||||
"", version, &t.kind,
|
||||
))),
|
||||
[group, version] => Some(ApiResource::from_gvk(&GroupVersionKind::gvk(
|
||||
group, version, &t.kind,
|
||||
))),
|
||||
_ => None,
|
||||
}
|
||||
})
|
||||
.ok_or_else(|| {
|
||||
Error::BuildRequest(kube::core::request::Error::Validation(format!(
|
||||
"Invalid apiVersion in DynamicObject: {object:#?}"
|
||||
)))
|
||||
})?;
|
||||
|
||||
Ok(match ns {
|
||||
Some(ns) => Api::namespaced_with(self.client.clone(), ns, &ar),
|
||||
None => Api::default_namespaced_with(self.client.clone(), &ar),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// ── Free functions ───────────────────────────────────────────────────────────
|
||||
|
||||
pub(crate) fn get_dynamic_api(
|
||||
resource: kube::api::ApiResource,
|
||||
capabilities: kube::discovery::ApiCapabilities,
|
||||
client: Client,
|
||||
ns: Option<&str>,
|
||||
all: bool,
|
||||
) -> Api<DynamicObject> {
|
||||
if capabilities.scope == Scope::Cluster || all {
|
||||
Api::all_with(client, &resource)
|
||||
} else if let Some(namespace) = ns {
|
||||
Api::namespaced_with(client, namespace, &resource)
|
||||
} else {
|
||||
Api::default_namespaced_with(client, &resource)
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn multidoc_deserialize(
|
||||
data: &str,
|
||||
) -> Result<Vec<serde_yaml::Value>, serde_yaml::Error> {
|
||||
use serde::Deserialize;
|
||||
let mut docs = vec![];
|
||||
for de in serde_yaml::Deserializer::from_str(data) {
|
||||
docs.push(serde_yaml::Value::deserialize(de)?);
|
||||
}
|
||||
Ok(docs)
|
||||
}
|
||||
|
||||
// ── Tests ────────────────────────────────────────────────────────────────────
|
||||
|
||||
#[cfg(test)]
mod apply_tests {
    use std::collections::BTreeMap;
    use std::time::{SystemTime, UNIX_EPOCH};

    use k8s_openapi::api::core::v1::ConfigMap;
    use k8s_openapi::apimachinery::pkg::apis::meta::v1::ObjectMeta;
    use kube::api::{DeleteParams, TypeMeta};

    use super::*;

    /// Namespace every test in this module writes into.
    const NS: &str = "default";

    /// Builds `<prefix>-<milliseconds since the Unix epoch>`.
    fn unique_name(prefix: &str) -> String {
        let millis = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .unwrap()
            .as_millis();
        format!("{prefix}-{millis}")
    }

    /// Object metadata carrying only a name and the test namespace.
    fn metadata(name: &str) -> ObjectMeta {
        ObjectMeta {
            name: Some(name.to_string()),
            namespace: Some(NS.to_string()),
            ..Default::default()
        }
    }

    /// A ConfigMap in the test namespace holding a single key/value pair.
    fn config_map(name: &str, key: &str, value: &str) -> ConfigMap {
        ConfigMap {
            metadata: metadata(name),
            data: Some(BTreeMap::from([(key.to_string(), value.to_string())])),
            ..Default::default()
        }
    }

    /// Best-effort cleanup; the outcome of the delete is deliberately ignored.
    async fn delete_config_map(client: &K8sClient, name: &str) {
        let api: Api<ConfigMap> = Api::namespaced(client.client.clone(), NS);
        let _ = api.delete(name, &DeleteParams::default()).await;
    }

    #[tokio::test]
    #[ignore = "requires kubernetes cluster"]
    async fn apply_creates_new_configmap() {
        let client = K8sClient::try_default().await.unwrap();
        let name = unique_name("test-cm");
        let cm = config_map(&name, "key1", "value1");

        assert!(client.apply(&cm, Some(NS)).await.is_ok());

        delete_config_map(&client, &name).await;
    }

    #[tokio::test]
    #[ignore = "requires kubernetes cluster"]
    async fn apply_is_idempotent() {
        let client = K8sClient::try_default().await.unwrap();
        let name = unique_name("test-idem");
        let cm = config_map(&name, "key", "value");

        assert!(
            client.apply(&cm, Some(NS)).await.is_ok(),
            "first apply failed"
        );
        assert!(
            client.apply(&cm, Some(NS)).await.is_ok(),
            "second apply failed (not idempotent)"
        );

        delete_config_map(&client, &name).await;
    }

    #[tokio::test]
    #[ignore = "requires kubernetes cluster"]
    async fn apply_dynamic_creates_new_resource() {
        let client = K8sClient::try_default().await.unwrap();
        let name = unique_name("test-dyn");

        let obj = DynamicObject {
            types: Some(TypeMeta {
                api_version: "v1".to_string(),
                kind: "ConfigMap".to_string(),
            }),
            metadata: metadata(&name),
            data: serde_json::json!({}),
        };

        let result = client.apply_dynamic(&obj, Some(NS), false).await;
        assert!(result.is_ok(), "apply_dynamic failed: {:?}", result.err());

        delete_config_map(&client, &name).await;
    }
}
|
||||
@@ -25,9 +25,9 @@
|
||||
//!
|
||||
//! ## Example
|
||||
//!
|
||||
//! ```rust,no_run
|
||||
//! use harmony::topology::k8s::{K8sClient, helper};
|
||||
//! use harmony::topology::KubernetesDistribution;
|
||||
//! ```
|
||||
//! use harmony_k8s::{K8sClient, helper};
|
||||
//! use harmony_k8s::KubernetesDistribution;
|
||||
//!
|
||||
//! async fn write_network_config(client: &K8sClient, node: &str) {
|
||||
//! // Create a bundle with platform-specific RBAC
|
||||
@@ -56,7 +56,7 @@ use kube::{Error, Resource, ResourceExt, api::DynamicObject};
|
||||
use serde::Serialize;
|
||||
use serde_json;
|
||||
|
||||
use crate::domain::topology::k8s::K8sClient;
|
||||
use crate::K8sClient;
|
||||
|
||||
/// A ResourceBundle represents a logical unit of work consisting of multiple
|
||||
/// Kubernetes resources that should be applied or deleted together.
|
||||
99
harmony-k8s/src/client.rs
Normal file
99
harmony-k8s/src/client.rs
Normal file
@@ -0,0 +1,99 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use kube::config::{KubeConfigOptions, Kubeconfig};
|
||||
use kube::{Client, Config, Discovery, Error};
|
||||
use log::error;
|
||||
use serde::Serialize;
|
||||
use tokio::sync::OnceCell;
|
||||
|
||||
use crate::types::KubernetesDistribution;
|
||||
|
||||
// TODO not cool, should use a proper configuration mechanism
// cli arg, env var, config file
/// Reports whether dry-run mode is requested through the `DRY_RUN`
/// environment variable.
///
/// Returns `true` when the variable is `1` or `true`. Surrounding whitespace
/// and ASCII letter case are ignored, so `TRUE` or ` True ` also enable
/// dry-run: silently falling back to real mutations on such spellings would
/// be the unsafe direction. An unset variable, a non-Unicode value, or any
/// other value yields `false`.
fn read_dry_run_from_env() -> bool {
    std::env::var("DRY_RUN")
        .map(|raw| {
            let value = raw.trim();
            value == "1" || value.eq_ignore_ascii_case("true")
        })
        .unwrap_or(false)
}
|
||||
|
||||
/// Wrapper around a `kube` [`Client`] carrying a dry-run flag and lazily
/// initialised caches.
///
/// The caches live behind `Arc`s, so clones of a `K8sClient` share them.
#[derive(Clone)]
pub struct K8sClient {
    /// Underlying `kube` client used to talk to the API server.
    pub(crate) client: Client,
    /// When `true` no mutation is sent to the API server; diffs are printed
    /// to stdout instead. Initialised from the `DRY_RUN` environment variable.
    pub(crate) dry_run: bool,
    /// Write-once cell holding the Kubernetes distribution once it is known.
    pub(crate) k8s_distribution: Arc<OnceCell<KubernetesDistribution>>,
    /// Write-once cell holding the API discovery result once it is known.
    pub(crate) discovery: Arc<OnceCell<Discovery>>,
}
|
||||
|
||||
impl Serialize for K8sClient {
|
||||
fn serialize<S>(&self, _serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
todo!("K8sClient serialization is not meaningful; remove this impl if unused")
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for K8sClient {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.write_fmt(format_args!(
|
||||
"K8sClient {{ namespace: {}, dry_run: {} }}",
|
||||
self.client.default_namespace(),
|
||||
self.dry_run,
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
impl K8sClient {
|
||||
/// Create a client, reading `DRY_RUN` from the environment.
|
||||
pub fn new(client: Client) -> Self {
|
||||
Self {
|
||||
dry_run: read_dry_run_from_env(),
|
||||
client,
|
||||
k8s_distribution: Arc::new(OnceCell::new()),
|
||||
discovery: Arc::new(OnceCell::new()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a client that always operates in dry-run mode, regardless of
|
||||
/// the environment variable.
|
||||
pub fn new_dry_run(client: Client) -> Self {
|
||||
Self {
|
||||
dry_run: true,
|
||||
..Self::new(client)
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns `true` if this client is operating in dry-run mode.
|
||||
pub fn is_dry_run(&self) -> bool {
|
||||
self.dry_run
|
||||
}
|
||||
|
||||
pub async fn try_default() -> Result<Self, Error> {
|
||||
Ok(Self::new(Client::try_default().await?))
|
||||
}
|
||||
|
||||
pub async fn from_kubeconfig(path: &str) -> Option<Self> {
|
||||
Self::from_kubeconfig_with_opts(path, &KubeConfigOptions::default()).await
|
||||
}
|
||||
|
||||
pub async fn from_kubeconfig_with_context(path: &str, context: Option<String>) -> Option<Self> {
|
||||
let mut opts = KubeConfigOptions::default();
|
||||
opts.context = context;
|
||||
Self::from_kubeconfig_with_opts(path, &opts).await
|
||||
}
|
||||
|
||||
pub async fn from_kubeconfig_with_opts(path: &str, opts: &KubeConfigOptions) -> Option<Self> {
|
||||
let k = match Kubeconfig::read_from(path) {
|
||||
Ok(k) => k,
|
||||
Err(e) => {
|
||||
error!("Failed to load kubeconfig from {path}: {e}");
|
||||
return None;
|
||||
}
|
||||
};
|
||||
Some(Self::new(
|
||||
Client::try_from(Config::from_custom_kubeconfig(k, opts).await.unwrap()).unwrap(),
|
||||
))
|
||||
}
|
||||
}
|
||||
83
harmony-k8s/src/discovery.rs
Normal file
83
harmony-k8s/src/discovery.rs
Normal file
@@ -0,0 +1,83 @@
|
||||
use std::time::Duration;
|
||||
|
||||
use kube::{Discovery, Error};
|
||||
use log::{debug, error, info, trace, warn};
|
||||
use tokio::sync::Mutex;
|
||||
use tokio_retry::{Retry, strategy::ExponentialBackoff};
|
||||
|
||||
use crate::client::K8sClient;
|
||||
use crate::types::KubernetesDistribution;
|
||||
|
||||
impl K8sClient {
|
||||
pub async fn get_apiserver_version(
|
||||
&self,
|
||||
) -> Result<k8s_openapi::apimachinery::pkg::version::Info, Error> {
|
||||
self.client.clone().apiserver_version().await
|
||||
}
|
||||
|
||||
/// Runs (and caches) Kubernetes API discovery with exponential-backoff retries.
|
||||
pub async fn discovery(&self) -> Result<&Discovery, Error> {
|
||||
let retry_strategy = ExponentialBackoff::from_millis(1000)
|
||||
.max_delay(Duration::from_secs(32))
|
||||
.take(6);
|
||||
|
||||
let attempt = Mutex::new(0u32);
|
||||
Retry::spawn(retry_strategy, || async {
|
||||
let mut n = attempt.lock().await;
|
||||
*n += 1;
|
||||
match self
|
||||
.discovery
|
||||
.get_or_try_init(async || {
|
||||
debug!("Running Kubernetes API discovery (attempt {})", *n);
|
||||
let d = Discovery::new(self.client.clone()).run().await?;
|
||||
debug!("Kubernetes API discovery completed");
|
||||
Ok(d)
|
||||
})
|
||||
.await
|
||||
{
|
||||
Ok(d) => Ok(d),
|
||||
Err(e) => {
|
||||
warn!("Kubernetes API discovery failed (attempt {}): {}", *n, e);
|
||||
Err(e)
|
||||
}
|
||||
}
|
||||
})
|
||||
.await
|
||||
.map_err(|e| {
|
||||
error!("Kubernetes API discovery failed after all retries: {}", e);
|
||||
e
|
||||
})
|
||||
}
|
||||
|
||||
/// Detect which Kubernetes distribution is running. Result is cached for
|
||||
/// the lifetime of the client.
|
||||
pub async fn get_k8s_distribution(&self) -> Result<KubernetesDistribution, Error> {
|
||||
self.k8s_distribution
|
||||
.get_or_try_init(async || {
|
||||
debug!("Detecting Kubernetes distribution");
|
||||
let api_groups = self.client.list_api_groups().await?;
|
||||
trace!("list_api_groups: {:?}", api_groups);
|
||||
|
||||
let version = self.get_apiserver_version().await?;
|
||||
|
||||
if api_groups
|
||||
.groups
|
||||
.iter()
|
||||
.any(|g| g.name == "project.openshift.io")
|
||||
{
|
||||
info!("Detected distribution: OpenshiftFamily");
|
||||
return Ok(KubernetesDistribution::OpenshiftFamily);
|
||||
}
|
||||
|
||||
if version.git_version.contains("k3s") {
|
||||
info!("Detected distribution: K3sFamily");
|
||||
return Ok(KubernetesDistribution::K3sFamily);
|
||||
}
|
||||
|
||||
info!("Distribution not identified, using Default");
|
||||
Ok(KubernetesDistribution::Default)
|
||||
})
|
||||
.await
|
||||
.cloned()
|
||||
}
|
||||
}
|
||||
@@ -1,7 +1,7 @@
|
||||
use std::collections::BTreeMap;
|
||||
use std::time::Duration;
|
||||
|
||||
use crate::topology::KubernetesDistribution;
|
||||
use crate::KubernetesDistribution;
|
||||
|
||||
use super::bundle::ResourceBundle;
|
||||
use super::config::PRIVILEGED_POD_IMAGE;
|
||||
@@ -133,9 +133,9 @@ pub fn host_root_volume() -> (Volume, VolumeMount) {
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```rust,no_run
|
||||
/// # use harmony::topology::k8s::helper::{build_privileged_bundle, PrivilegedPodConfig};
|
||||
/// # use harmony::topology::KubernetesDistribution;
|
||||
/// ```
|
||||
/// use harmony_k8s::helper::{build_privileged_bundle, PrivilegedPodConfig};
|
||||
/// use harmony_k8s::KubernetesDistribution;
|
||||
/// let bundle = build_privileged_bundle(
|
||||
/// PrivilegedPodConfig {
|
||||
/// name: "network-setup".to_string(),
|
||||
13
harmony-k8s/src/lib.rs
Normal file
13
harmony-k8s/src/lib.rs
Normal file
@@ -0,0 +1,13 @@
|
||||
pub mod apply;
|
||||
pub mod bundle;
|
||||
pub mod client;
|
||||
pub mod config;
|
||||
pub mod discovery;
|
||||
pub mod helper;
|
||||
pub mod node;
|
||||
pub mod pod;
|
||||
pub mod resources;
|
||||
pub mod types;
|
||||
|
||||
pub use client::K8sClient;
|
||||
pub use types::{DrainOptions, KubernetesDistribution, NodeFile, ScopeResolver, WriteMode};
|
||||
3
harmony-k8s/src/main.rs
Normal file
3
harmony-k8s/src/main.rs
Normal file
@@ -0,0 +1,3 @@
|
||||
/// Placeholder binary entry point: prints a greeting and exits.
fn main() {
    const GREETING: &str = "Hello, world!";
    println!("{}", GREETING);
}
|
||||
722
harmony-k8s/src/node.rs
Normal file
722
harmony-k8s/src/node.rs
Normal file
@@ -0,0 +1,722 @@
|
||||
use std::collections::BTreeMap;
|
||||
use std::time::{Duration, SystemTime, UNIX_EPOCH};
|
||||
|
||||
use k8s_openapi::api::core::v1::{
|
||||
ConfigMap, ConfigMapVolumeSource, Node, Pod, Volume, VolumeMount,
|
||||
};
|
||||
use k8s_openapi::apimachinery::pkg::apis::meta::v1::ObjectMeta;
|
||||
use kube::{
|
||||
Error,
|
||||
api::{Api, DeleteParams, EvictParams, ListParams, PostParams},
|
||||
core::ErrorResponse,
|
||||
error::DiscoveryError,
|
||||
};
|
||||
use log::{debug, error, info, warn};
|
||||
use tokio::time::sleep;
|
||||
|
||||
use crate::client::K8sClient;
|
||||
use crate::helper::{self, PrivilegedPodConfig};
|
||||
use crate::types::{DrainOptions, NodeFile};
|
||||
|
||||
impl K8sClient {
|
||||
pub async fn cordon_node(&self, node_name: &str) -> Result<(), Error> {
|
||||
Api::<Node>::all(self.client.clone())
|
||||
.cordon(node_name)
|
||||
.await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn uncordon_node(&self, node_name: &str) -> Result<(), Error> {
|
||||
Api::<Node>::all(self.client.clone())
|
||||
.uncordon(node_name)
|
||||
.await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn wait_for_node_ready(&self, node_name: &str) -> Result<(), Error> {
|
||||
self.wait_for_node_ready_with_timeout(node_name, Duration::from_secs(600))
|
||||
.await
|
||||
}
|
||||
|
||||
async fn wait_for_node_ready_with_timeout(
|
||||
&self,
|
||||
node_name: &str,
|
||||
timeout: Duration,
|
||||
) -> Result<(), Error> {
|
||||
let api: Api<Node> = Api::all(self.client.clone());
|
||||
let start = tokio::time::Instant::now();
|
||||
let poll = Duration::from_secs(5);
|
||||
loop {
|
||||
if start.elapsed() > timeout {
|
||||
return Err(Error::Discovery(DiscoveryError::MissingResource(format!(
|
||||
"Node '{node_name}' did not become Ready within {timeout:?}"
|
||||
))));
|
||||
}
|
||||
match api.get(node_name).await {
|
||||
Ok(node) => {
|
||||
if node
|
||||
.status
|
||||
.as_ref()
|
||||
.and_then(|s| s.conditions.as_ref())
|
||||
.map(|conds| {
|
||||
conds
|
||||
.iter()
|
||||
.any(|c| c.type_ == "Ready" && c.status == "True")
|
||||
})
|
||||
.unwrap_or(false)
|
||||
{
|
||||
debug!("Node '{node_name}' is Ready");
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
Err(e) => debug!("Error polling node '{node_name}': {e}"),
|
||||
}
|
||||
sleep(poll).await;
|
||||
}
|
||||
}
|
||||
|
||||
async fn wait_for_node_not_ready(
|
||||
&self,
|
||||
node_name: &str,
|
||||
timeout: Duration,
|
||||
) -> Result<(), Error> {
|
||||
let api: Api<Node> = Api::all(self.client.clone());
|
||||
let start = tokio::time::Instant::now();
|
||||
let poll = Duration::from_secs(5);
|
||||
loop {
|
||||
if start.elapsed() > timeout {
|
||||
return Err(Error::Discovery(DiscoveryError::MissingResource(format!(
|
||||
"Node '{node_name}' did not become NotReady within {timeout:?}"
|
||||
))));
|
||||
}
|
||||
match api.get(node_name).await {
|
||||
Ok(node) => {
|
||||
let is_ready = node
|
||||
.status
|
||||
.as_ref()
|
||||
.and_then(|s| s.conditions.as_ref())
|
||||
.map(|conds| {
|
||||
conds
|
||||
.iter()
|
||||
.any(|c| c.type_ == "Ready" && c.status == "True")
|
||||
})
|
||||
.unwrap_or(false);
|
||||
if !is_ready {
|
||||
debug!("Node '{node_name}' is NotReady");
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
Err(e) => debug!("Error polling node '{node_name}': {e}"),
|
||||
}
|
||||
sleep(poll).await;
|
||||
}
|
||||
}
|
||||
|
||||
async fn list_pods_on_node(&self, node_name: &str) -> Result<Vec<Pod>, Error> {
|
||||
let api: Api<Pod> = Api::all(self.client.clone());
|
||||
Ok(api
|
||||
.list(&ListParams::default().fields(&format!("spec.nodeName={node_name}")))
|
||||
.await?
|
||||
.items)
|
||||
}
|
||||
|
||||
fn is_mirror_pod(pod: &Pod) -> bool {
|
||||
pod.metadata
|
||||
.annotations
|
||||
.as_ref()
|
||||
.map(|a| a.contains_key("kubernetes.io/config.mirror"))
|
||||
.unwrap_or(false)
|
||||
}
|
||||
|
||||
fn is_daemonset_pod(pod: &Pod) -> bool {
|
||||
pod.metadata
|
||||
.owner_references
|
||||
.as_ref()
|
||||
.map(|refs| refs.iter().any(|r| r.kind == "DaemonSet"))
|
||||
.unwrap_or(false)
|
||||
}
|
||||
|
||||
fn has_emptydir_volume(pod: &Pod) -> bool {
|
||||
pod.spec
|
||||
.as_ref()
|
||||
.and_then(|s| s.volumes.as_ref())
|
||||
.map(|vols| vols.iter().any(|v| v.empty_dir.is_some()))
|
||||
.unwrap_or(false)
|
||||
}
|
||||
|
||||
fn is_completed_pod(pod: &Pod) -> bool {
|
||||
pod.status
|
||||
.as_ref()
|
||||
.and_then(|s| s.phase.as_deref())
|
||||
.map(|phase| phase == "Succeeded" || phase == "Failed")
|
||||
.unwrap_or(false)
|
||||
}
|
||||
|
||||
fn classify_pods_for_drain(
|
||||
pods: &[Pod],
|
||||
options: &DrainOptions,
|
||||
) -> Result<(Vec<Pod>, Vec<String>), String> {
|
||||
let mut evictable = Vec::new();
|
||||
let mut skipped = Vec::new();
|
||||
let mut blocking = Vec::new();
|
||||
|
||||
for pod in pods {
|
||||
let name = pod.metadata.name.as_deref().unwrap_or("<unknown>");
|
||||
let ns = pod.metadata.namespace.as_deref().unwrap_or("<unknown>");
|
||||
let qualified = format!("{ns}/{name}");
|
||||
|
||||
if Self::is_mirror_pod(pod) {
|
||||
skipped.push(format!("{qualified} (mirror pod)"));
|
||||
continue;
|
||||
}
|
||||
if Self::is_completed_pod(pod) {
|
||||
skipped.push(format!("{qualified} (completed)"));
|
||||
continue;
|
||||
}
|
||||
if Self::is_daemonset_pod(pod) {
|
||||
if options.ignore_daemonsets {
|
||||
skipped.push(format!("{qualified} (DaemonSet-managed)"));
|
||||
} else {
|
||||
blocking.push(format!(
|
||||
"{qualified} is managed by a DaemonSet (set ignore_daemonsets to skip)"
|
||||
));
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if Self::has_emptydir_volume(pod) && !options.delete_emptydir_data {
|
||||
blocking.push(format!(
|
||||
"{qualified} uses emptyDir volumes (set delete_emptydir_data to allow eviction)"
|
||||
));
|
||||
continue;
|
||||
}
|
||||
evictable.push(pod.clone());
|
||||
}
|
||||
|
||||
if !blocking.is_empty() {
|
||||
return Err(format!(
|
||||
"Cannot drain node — the following pods block eviction:\n - {}",
|
||||
blocking.join("\n - ")
|
||||
));
|
||||
}
|
||||
Ok((evictable, skipped))
|
||||
}
|
||||
|
||||
async fn evict_pod(&self, pod: &Pod) -> Result<(), Error> {
|
||||
let name = pod.metadata.name.as_deref().unwrap_or_default();
|
||||
let ns = pod.metadata.namespace.as_deref().unwrap_or_default();
|
||||
debug!("Evicting pod {ns}/{name}");
|
||||
Api::<Pod>::namespaced(self.client.clone(), ns)
|
||||
.evict(name, &EvictParams::default())
|
||||
.await
|
||||
.map(|_| ())
|
||||
}
|
||||
|
||||
/// Drains a node: cordon → classify → evict & wait.
|
||||
pub async fn drain_node(&self, node_name: &str, options: &DrainOptions) -> Result<(), Error> {
|
||||
debug!("Cordoning '{node_name}'");
|
||||
self.cordon_node(node_name).await?;
|
||||
|
||||
let pods = self.list_pods_on_node(node_name).await?;
|
||||
debug!("Found {} pod(s) on '{node_name}'", pods.len());
|
||||
|
||||
let (evictable, skipped) =
|
||||
Self::classify_pods_for_drain(&pods, options).map_err(|msg| {
|
||||
error!("{msg}");
|
||||
Error::Discovery(DiscoveryError::MissingResource(msg))
|
||||
})?;
|
||||
|
||||
for s in &skipped {
|
||||
info!("Skipping pod: {s}");
|
||||
}
|
||||
if evictable.is_empty() {
|
||||
info!("No pods to evict on '{node_name}'");
|
||||
return Ok(());
|
||||
}
|
||||
info!("Evicting {} pod(s) from '{node_name}'", evictable.len());
|
||||
|
||||
let mut start = tokio::time::Instant::now();
|
||||
let poll = Duration::from_secs(5);
|
||||
let mut pending = evictable;
|
||||
|
||||
loop {
|
||||
for pod in &pending {
|
||||
match self.evict_pod(pod).await {
|
||||
Ok(()) => {}
|
||||
Err(Error::Api(ErrorResponse { code: 404, .. })) => {}
|
||||
Err(Error::Api(ErrorResponse { code: 429, .. })) => {
|
||||
warn!(
|
||||
"PDB blocked eviction of {}/{}; will retry",
|
||||
pod.metadata.namespace.as_deref().unwrap_or(""),
|
||||
pod.metadata.name.as_deref().unwrap_or("")
|
||||
);
|
||||
}
|
||||
Err(e) => {
|
||||
error!(
|
||||
"Failed to evict {}/{}: {e}",
|
||||
pod.metadata.namespace.as_deref().unwrap_or(""),
|
||||
pod.metadata.name.as_deref().unwrap_or("")
|
||||
);
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
sleep(poll).await;
|
||||
|
||||
let mut still_present = Vec::new();
|
||||
for pod in pending {
|
||||
let ns = pod.metadata.namespace.as_deref().unwrap_or_default();
|
||||
let name = pod.metadata.name.as_deref().unwrap_or_default();
|
||||
match self.get_pod(name, Some(ns)).await? {
|
||||
Some(_) => still_present.push(pod),
|
||||
None => debug!("Pod {ns}/{name} evicted"),
|
||||
}
|
||||
}
|
||||
pending = still_present;
|
||||
|
||||
if pending.is_empty() {
|
||||
break;
|
||||
}
|
||||
|
||||
if start.elapsed() > options.timeout {
|
||||
match helper::prompt_drain_timeout_action(
|
||||
node_name,
|
||||
pending.len(),
|
||||
options.timeout,
|
||||
)? {
|
||||
helper::DrainTimeoutAction::Accept => break,
|
||||
helper::DrainTimeoutAction::Retry => {
|
||||
start = tokio::time::Instant::now();
|
||||
continue;
|
||||
}
|
||||
helper::DrainTimeoutAction::Abort => {
|
||||
return Err(Error::Discovery(DiscoveryError::MissingResource(format!(
|
||||
"Drain aborted. {} pod(s) remaining on '{node_name}'",
|
||||
pending.len()
|
||||
))));
|
||||
}
|
||||
}
|
||||
}
|
||||
debug!("Waiting for {} pod(s) on '{node_name}'", pending.len());
|
||||
}
|
||||
|
||||
debug!("'{node_name}' drained successfully");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Safely reboots a node: drain → reboot → wait for Ready → uncordon.
|
||||
pub async fn reboot_node(
|
||||
&self,
|
||||
node_name: &str,
|
||||
drain_options: &DrainOptions,
|
||||
timeout: Duration,
|
||||
) -> Result<(), Error> {
|
||||
info!("Starting reboot for '{node_name}'");
|
||||
let node_api: Api<Node> = Api::all(self.client.clone());
|
||||
|
||||
let boot_id_before = node_api
|
||||
.get(node_name)
|
||||
.await?
|
||||
.status
|
||||
.as_ref()
|
||||
.and_then(|s| s.node_info.as_ref())
|
||||
.map(|ni| ni.boot_id.clone())
|
||||
.ok_or_else(|| {
|
||||
Error::Discovery(DiscoveryError::MissingResource(format!(
|
||||
"Node '{node_name}' has no boot_id in status"
|
||||
)))
|
||||
})?;
|
||||
|
||||
info!("Draining '{node_name}'");
|
||||
self.drain_node(node_name, drain_options).await?;
|
||||
|
||||
let start = tokio::time::Instant::now();
|
||||
|
||||
info!("Scheduling reboot for '{node_name}'");
|
||||
let reboot_cmd =
|
||||
"echo rebooting ; nohup bash -c 'sleep 5 && nsenter -t 1 -m -- systemctl reboot'";
|
||||
match self
|
||||
.run_privileged_command_on_node(node_name, reboot_cmd)
|
||||
.await
|
||||
{
|
||||
Ok(_) => debug!("Reboot command dispatched"),
|
||||
Err(e) => debug!("Reboot command error (expected if node began shutdown): {e}"),
|
||||
}
|
||||
|
||||
info!("Waiting for '{node_name}' to begin shutdown");
|
||||
self.wait_for_node_not_ready(node_name, timeout.saturating_sub(start.elapsed()))
|
||||
.await?;
|
||||
|
||||
if start.elapsed() > timeout {
|
||||
return Err(Error::Discovery(DiscoveryError::MissingResource(format!(
|
||||
"Timeout during reboot of '{node_name}' (shutdown phase)"
|
||||
))));
|
||||
}
|
||||
|
||||
info!("Waiting for '{node_name}' to come back online");
|
||||
self.wait_for_node_ready_with_timeout(node_name, timeout.saturating_sub(start.elapsed()))
|
||||
.await?;
|
||||
|
||||
if start.elapsed() > timeout {
|
||||
return Err(Error::Discovery(DiscoveryError::MissingResource(format!(
|
||||
"Timeout during reboot of '{node_name}' (ready phase)"
|
||||
))));
|
||||
}
|
||||
|
||||
let boot_id_after = node_api
|
||||
.get(node_name)
|
||||
.await?
|
||||
.status
|
||||
.as_ref()
|
||||
.and_then(|s| s.node_info.as_ref())
|
||||
.map(|ni| ni.boot_id.clone())
|
||||
.ok_or_else(|| {
|
||||
Error::Discovery(DiscoveryError::MissingResource(format!(
|
||||
"Node '{node_name}' has no boot_id after reboot"
|
||||
)))
|
||||
})?;
|
||||
|
||||
if boot_id_before == boot_id_after {
|
||||
return Err(Error::Discovery(DiscoveryError::MissingResource(format!(
|
||||
"Node '{node_name}' did not actually reboot (boot_id unchanged: {boot_id_before})"
|
||||
))));
|
||||
}
|
||||
|
||||
info!("'{node_name}' rebooted ({boot_id_before} → {boot_id_after})");
|
||||
self.uncordon_node(node_name).await?;
|
||||
info!("'{node_name}' reboot complete ({:?})", start.elapsed());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Write a set of files to a node's filesystem via a privileged ephemeral pod.
|
||||
pub async fn write_files_to_node(
|
||||
&self,
|
||||
node_name: &str,
|
||||
files: &[NodeFile],
|
||||
) -> Result<String, Error> {
|
||||
let ns = self.client.default_namespace();
|
||||
let suffix = SystemTime::now()
|
||||
.duration_since(UNIX_EPOCH)
|
||||
.unwrap()
|
||||
.as_millis();
|
||||
let name = format!("harmony-k8s-writer-{suffix}");
|
||||
|
||||
debug!("Writing {} file(s) to '{node_name}'", files.len());
|
||||
|
||||
let mut data = BTreeMap::new();
|
||||
let mut script = String::from("set -e\n");
|
||||
for (i, file) in files.iter().enumerate() {
|
||||
let key = format!("f{i}");
|
||||
data.insert(key.clone(), file.content.clone());
|
||||
script.push_str(&format!("mkdir -p \"$(dirname \"/host{}\")\"\n", file.path));
|
||||
script.push_str(&format!("cp \"/payload/{key}\" \"/host{}\"\n", file.path));
|
||||
script.push_str(&format!("chmod {:o} \"/host{}\"\n", file.mode, file.path));
|
||||
}
|
||||
|
||||
let cm = ConfigMap {
|
||||
metadata: ObjectMeta {
|
||||
name: Some(name.clone()),
|
||||
namespace: Some(ns.to_string()),
|
||||
..Default::default()
|
||||
},
|
||||
data: Some(data),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let cm_api: Api<ConfigMap> = Api::namespaced(self.client.clone(), ns);
|
||||
cm_api.create(&PostParams::default(), &cm).await?;
|
||||
debug!("Created ConfigMap '{name}'");
|
||||
|
||||
let (host_vol, host_mount) = helper::host_root_volume();
|
||||
let payload_vol = Volume {
|
||||
name: "payload".to_string(),
|
||||
config_map: Some(ConfigMapVolumeSource {
|
||||
name: name.clone(),
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
let payload_mount = VolumeMount {
|
||||
name: "payload".to_string(),
|
||||
mount_path: "/payload".to_string(),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let bundle = helper::build_privileged_bundle(
|
||||
PrivilegedPodConfig {
|
||||
name: name.clone(),
|
||||
namespace: ns.to_string(),
|
||||
node_name: node_name.to_string(),
|
||||
container_name: "writer".to_string(),
|
||||
command: vec!["/bin/bash".to_string(), "-c".to_string(), script],
|
||||
volumes: vec![payload_vol, host_vol],
|
||||
volume_mounts: vec![payload_mount, host_mount],
|
||||
host_pid: false,
|
||||
host_network: false,
|
||||
},
|
||||
&self.get_k8s_distribution().await?,
|
||||
);
|
||||
|
||||
bundle.apply(self).await?;
|
||||
debug!("Created privileged pod bundle '{name}'");
|
||||
|
||||
let result = self.wait_for_pod_completion(&name, ns).await;
|
||||
|
||||
debug!("Cleaning up '{name}'");
|
||||
let _ = bundle.delete(self).await;
|
||||
let _ = cm_api.delete(&name, &DeleteParams::default()).await;
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
/// Run a privileged command on a node via an ephemeral pod.
|
||||
pub async fn run_privileged_command_on_node(
|
||||
&self,
|
||||
node_name: &str,
|
||||
command: &str,
|
||||
) -> Result<String, Error> {
|
||||
let namespace = self.client.default_namespace();
|
||||
let suffix = SystemTime::now()
|
||||
.duration_since(UNIX_EPOCH)
|
||||
.unwrap()
|
||||
.as_millis();
|
||||
let name = format!("harmony-k8s-cmd-{suffix}");
|
||||
|
||||
debug!("Running privileged command on '{node_name}': {command}");
|
||||
|
||||
let (host_vol, host_mount) = helper::host_root_volume();
|
||||
let bundle = helper::build_privileged_bundle(
|
||||
PrivilegedPodConfig {
|
||||
name: name.clone(),
|
||||
namespace: namespace.to_string(),
|
||||
node_name: node_name.to_string(),
|
||||
container_name: "runner".to_string(),
|
||||
command: vec![
|
||||
"/bin/bash".to_string(),
|
||||
"-c".to_string(),
|
||||
command.to_string(),
|
||||
],
|
||||
volumes: vec![host_vol],
|
||||
volume_mounts: vec![host_mount],
|
||||
host_pid: true,
|
||||
host_network: true,
|
||||
},
|
||||
&self.get_k8s_distribution().await?,
|
||||
);
|
||||
|
||||
bundle.apply(self).await?;
|
||||
debug!("Privileged pod '{name}' created");
|
||||
|
||||
let result = self.wait_for_pod_completion(&name, namespace).await;
|
||||
|
||||
debug!("Cleaning up '{name}'");
|
||||
let _ = bundle.delete(self).await;
|
||||
|
||||
result
|
||||
}
|
||||
}
|
||||
|
||||
// ── Tests ────────────────────────────────────────────────────────────────────
|
||||
|
||||
#[cfg(test)]
mod tests {
    use k8s_openapi::api::core::v1::{EmptyDirVolumeSource, PodSpec, PodStatus, Volume};
    use k8s_openapi::apimachinery::pkg::apis::meta::v1::{ObjectMeta, OwnerReference};

    use super::*;

    // ── Fixtures ─────────────────────────────────────────────────────────

    /// Pod status carrying nothing but the given phase.
    fn status_with_phase(phase: &str) -> PodStatus {
        PodStatus {
            phase: Some(phase.to_string()),
            ..Default::default()
        }
    }

    /// Plain running pod with an empty spec.
    fn base_pod(name: &str, ns: &str) -> Pod {
        let metadata = ObjectMeta {
            name: Some(name.to_string()),
            namespace: Some(ns.to_string()),
            ..Default::default()
        };
        Pod {
            metadata,
            spec: Some(PodSpec::default()),
            status: Some(status_with_phase("Running")),
        }
    }

    /// Running pod carrying the mirror-pod annotation.
    fn mirror_pod(name: &str, ns: &str) -> Pod {
        let annotations = std::collections::BTreeMap::from([(
            "kubernetes.io/config.mirror".to_string(),
            "abc123".to_string(),
        )]);
        let mut pod = base_pod(name, ns);
        pod.metadata.annotations = Some(annotations);
        pod
    }

    /// Running pod owned by a DaemonSet.
    fn daemonset_pod(name: &str, ns: &str) -> Pod {
        let owner = OwnerReference {
            api_version: "apps/v1".to_string(),
            kind: "DaemonSet".to_string(),
            name: "some-ds".to_string(),
            uid: "uid-ds".to_string(),
            ..Default::default()
        };
        let mut pod = base_pod(name, ns);
        pod.metadata.owner_references = Some(vec![owner]);
        pod
    }

    /// Running pod with a single `emptyDir` volume.
    fn emptydir_pod(name: &str, ns: &str) -> Pod {
        let scratch = Volume {
            name: "scratch".to_string(),
            empty_dir: Some(EmptyDirVolumeSource::default()),
            ..Default::default()
        };
        let mut pod = base_pod(name, ns);
        pod.spec = Some(PodSpec {
            volumes: Some(vec![scratch]),
            ..Default::default()
        });
        pod
    }

    /// Pod whose status reports the given phase.
    fn completed_pod(name: &str, ns: &str, phase: &str) -> Pod {
        let mut pod = base_pod(name, ns);
        pod.status = Some(status_with_phase(phase));
        pod
    }

    fn default_opts() -> DrainOptions {
        DrainOptions::default()
    }

    /// Shorthand for the function under test.
    fn classify(
        pods: &[Pod],
        opts: &DrainOptions,
    ) -> Result<(Vec<Pod>, Vec<String>), String> {
        K8sClient::classify_pods_for_drain(pods, opts)
    }

    // ── Classification ───────────────────────────────────────────────────

    #[test]
    fn empty_pod_list_returns_empty_vecs() {
        let (evictable, skipped) = classify(&[], &default_opts()).unwrap();
        assert!(evictable.is_empty());
        assert!(skipped.is_empty());
    }

    #[test]
    fn normal_pod_is_evictable() {
        let pods = [base_pod("web", "default")];
        let (evictable, skipped) = classify(&pods, &default_opts()).unwrap();
        assert_eq!(evictable.len(), 1);
        assert!(skipped.is_empty());
    }

    #[test]
    fn mirror_pod_is_skipped() {
        let pods = [mirror_pod("kube-apiserver", "kube-system")];
        let (evictable, skipped) = classify(&pods, &default_opts()).unwrap();
        assert!(evictable.is_empty());
        assert!(skipped[0].contains("mirror pod"));
    }

    #[test]
    fn completed_pods_are_skipped() {
        for phase in ["Succeeded", "Failed"] {
            let pods = [completed_pod("job", "batch", phase)];
            let (evictable, skipped) = classify(&pods, &default_opts()).unwrap();
            assert!(evictable.is_empty());
            assert!(skipped[0].contains("completed"));
        }
    }

    #[test]
    fn daemonset_skipped_when_ignored() {
        let opts = DrainOptions {
            ignore_daemonsets: true,
            ..default_opts()
        };
        let pods = [daemonset_pod("fluentd", "logging")];
        let (evictable, skipped) = classify(&pods, &opts).unwrap();
        assert!(evictable.is_empty());
        assert!(skipped[0].contains("DaemonSet-managed"));
    }

    #[test]
    fn daemonset_blocks_when_not_ignored() {
        let opts = DrainOptions {
            ignore_daemonsets: false,
            ..default_opts()
        };
        let pods = [daemonset_pod("fluentd", "logging")];
        let err = classify(&pods, &opts).unwrap_err();
        assert!(err.contains("DaemonSet") && err.contains("logging/fluentd"));
    }

    #[test]
    fn emptydir_blocks_without_flag() {
        let opts = DrainOptions {
            delete_emptydir_data: false,
            ..default_opts()
        };
        let pods = [emptydir_pod("cache", "default")];
        let err = classify(&pods, &opts).unwrap_err();
        assert!(err.contains("emptyDir") && err.contains("default/cache"));
    }

    #[test]
    fn emptydir_evictable_with_flag() {
        let opts = DrainOptions {
            delete_emptydir_data: true,
            ..default_opts()
        };
        let pods = [emptydir_pod("cache", "default")];
        let (evictable, skipped) = classify(&pods, &opts).unwrap();
        assert_eq!(evictable.len(), 1);
        assert!(skipped.is_empty());
    }

    #[test]
    fn multiple_blocking_all_reported() {
        let opts = DrainOptions {
            ignore_daemonsets: false,
            delete_emptydir_data: false,
            ..default_opts()
        };
        let pods = [daemonset_pod("ds", "ns1"), emptydir_pod("ed", "ns2")];
        let err = classify(&pods, &opts).unwrap_err();
        assert!(err.contains("ns1/ds") && err.contains("ns2/ed"));
    }

    #[test]
    fn mixed_pods_classified_correctly() {
        let pods = [
            base_pod("web", "default"),
            mirror_pod("kube-apiserver", "kube-system"),
            daemonset_pod("fluentd", "logging"),
            completed_pod("job", "batch", "Succeeded"),
            base_pod("api", "default"),
        ];
        let (evictable, skipped) = classify(&pods, &default_opts()).unwrap();
        let names: Vec<&str> = evictable
            .iter()
            .map(|p| p.metadata.name.as_deref().unwrap())
            .collect();
        assert_eq!(names, vec!["web", "api"]);
        assert_eq!(skipped.len(), 3);
    }

    // ── Precedence between checks ────────────────────────────────────────

    #[test]
    fn mirror_checked_before_completed() {
        let mut pod = mirror_pod("static-etcd", "kube-system");
        pod.status = Some(status_with_phase("Succeeded"));
        let (_, skipped) = classify(&[pod], &default_opts()).unwrap();
        assert!(skipped[0].contains("mirror pod"), "got: {}", skipped[0]);
    }

    #[test]
    fn completed_checked_before_daemonset() {
        let mut pod = daemonset_pod("collector", "monitoring");
        pod.status = Some(status_with_phase("Failed"));
        let (_, skipped) = classify(&[pod], &default_opts()).unwrap();
        assert!(skipped[0].contains("completed"), "got: {}", skipped[0]);
    }
}
|
||||
193
harmony-k8s/src/pod.rs
Normal file
193
harmony-k8s/src/pod.rs
Normal file
@@ -0,0 +1,193 @@
|
||||
use std::time::Duration;
|
||||
|
||||
use k8s_openapi::api::core::v1::Pod;
|
||||
use kube::{
|
||||
Error,
|
||||
api::{Api, AttachParams, ListParams},
|
||||
error::DiscoveryError,
|
||||
runtime::reflector::Lookup,
|
||||
};
|
||||
use log::debug;
|
||||
use tokio::io::AsyncReadExt;
|
||||
use tokio::time::sleep;
|
||||
|
||||
use crate::client::K8sClient;
|
||||
|
||||
impl K8sClient {
|
||||
pub async fn get_pod(&self, name: &str, namespace: Option<&str>) -> Result<Option<Pod>, Error> {
|
||||
let api: Api<Pod> = match namespace {
|
||||
Some(ns) => Api::namespaced(self.client.clone(), ns),
|
||||
None => Api::default_namespaced(self.client.clone()),
|
||||
};
|
||||
api.get_opt(name).await
|
||||
}
|
||||
|
||||
pub async fn wait_for_pod_ready(
|
||||
&self,
|
||||
pod_name: &str,
|
||||
namespace: Option<&str>,
|
||||
) -> Result<(), Error> {
|
||||
let mut elapsed = 0u64;
|
||||
let interval = 5u64;
|
||||
let timeout_secs = 120u64;
|
||||
loop {
|
||||
if let Some(p) = self.get_pod(pod_name, namespace).await? {
|
||||
if let Some(phase) = p.status.and_then(|s| s.phase) {
|
||||
if phase.to_lowercase() == "running" {
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
}
|
||||
if elapsed >= timeout_secs {
|
||||
return Err(Error::Discovery(DiscoveryError::MissingResource(format!(
|
||||
"Pod '{}' in '{}' did not become ready within {timeout_secs}s",
|
||||
pod_name,
|
||||
namespace.unwrap_or("<default>"),
|
||||
))));
|
||||
}
|
||||
sleep(Duration::from_secs(interval)).await;
|
||||
elapsed += interval;
|
||||
}
|
||||
}
|
||||
|
||||
/// Polls a pod until it reaches `Succeeded` or `Failed`, then returns its
|
||||
/// logs. Used internally by node operations.
|
||||
pub(crate) async fn wait_for_pod_completion(
|
||||
&self,
|
||||
name: &str,
|
||||
namespace: &str,
|
||||
) -> Result<String, Error> {
|
||||
let api: Api<Pod> = Api::namespaced(self.client.clone(), namespace);
|
||||
let poll_interval = Duration::from_secs(2);
|
||||
for _ in 0..60 {
|
||||
sleep(poll_interval).await;
|
||||
let p = api.get(name).await?;
|
||||
match p.status.and_then(|s| s.phase).as_deref() {
|
||||
Some("Succeeded") => {
|
||||
let logs = api
|
||||
.logs(name, &Default::default())
|
||||
.await
|
||||
.unwrap_or_default();
|
||||
debug!("Pod {namespace}/{name} succeeded. Logs: {logs}");
|
||||
return Ok(logs);
|
||||
}
|
||||
Some("Failed") => {
|
||||
let logs = api
|
||||
.logs(name, &Default::default())
|
||||
.await
|
||||
.unwrap_or_default();
|
||||
debug!("Pod {namespace}/{name} failed. Logs: {logs}");
|
||||
return Err(Error::Discovery(DiscoveryError::MissingResource(format!(
|
||||
"Pod '{name}' failed.\n{logs}"
|
||||
))));
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
Err(Error::Discovery(DiscoveryError::MissingResource(format!(
|
||||
"Timed out waiting for pod '{name}'"
|
||||
))))
|
||||
}
|
||||
|
||||
/// Execute a command in the first pod matching `{label}={name}`.
|
||||
pub async fn exec_app_capture_output(
|
||||
&self,
|
||||
name: String,
|
||||
label: String,
|
||||
namespace: Option<&str>,
|
||||
command: Vec<&str>,
|
||||
) -> Result<String, String> {
|
||||
let api: Api<Pod> = match namespace {
|
||||
Some(ns) => Api::namespaced(self.client.clone(), ns),
|
||||
None => Api::default_namespaced(self.client.clone()),
|
||||
};
|
||||
let pod_list = api
|
||||
.list(&ListParams::default().labels(&format!("{label}={name}")))
|
||||
.await
|
||||
.expect("Failed to list pods");
|
||||
|
||||
let pod_name = pod_list
|
||||
.items
|
||||
.first()
|
||||
.expect("No matching pod")
|
||||
.name()
|
||||
.expect("Pod has no name")
|
||||
.into_owned();
|
||||
|
||||
match api
|
||||
.exec(
|
||||
&pod_name,
|
||||
command,
|
||||
&AttachParams::default().stdout(true).stderr(true),
|
||||
)
|
||||
.await
|
||||
{
|
||||
Err(e) => Err(e.to_string()),
|
||||
Ok(mut process) => {
|
||||
let status = process
|
||||
.take_status()
|
||||
.expect("No status handle")
|
||||
.await
|
||||
.expect("Status channel closed");
|
||||
|
||||
if let Some(s) = status.status {
|
||||
let mut buf = String::new();
|
||||
if let Some(mut stdout) = process.stdout() {
|
||||
stdout
|
||||
.read_to_string(&mut buf)
|
||||
.await
|
||||
.map_err(|e| format!("Failed to read stdout: {e}"))?;
|
||||
}
|
||||
debug!("exec status: {} - {:?}", s, status.details);
|
||||
if s == "Success" { Ok(buf) } else { Err(s) }
|
||||
} else {
|
||||
Err("No inner status from pod exec".to_string())
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Execute a command in the first pod matching
|
||||
/// `app.kubernetes.io/name={name}`.
|
||||
pub async fn exec_app(
|
||||
&self,
|
||||
name: String,
|
||||
namespace: Option<&str>,
|
||||
command: Vec<&str>,
|
||||
) -> Result<(), String> {
|
||||
let api: Api<Pod> = match namespace {
|
||||
Some(ns) => Api::namespaced(self.client.clone(), ns),
|
||||
None => Api::default_namespaced(self.client.clone()),
|
||||
};
|
||||
let pod_list = api
|
||||
.list(&ListParams::default().labels(&format!("app.kubernetes.io/name={name}")))
|
||||
.await
|
||||
.expect("Failed to list pods");
|
||||
|
||||
let pod_name = pod_list
|
||||
.items
|
||||
.first()
|
||||
.expect("No matching pod")
|
||||
.name()
|
||||
.expect("Pod has no name")
|
||||
.into_owned();
|
||||
|
||||
match api.exec(&pod_name, command, &AttachParams::default()).await {
|
||||
Err(e) => Err(e.to_string()),
|
||||
Ok(mut process) => {
|
||||
let status = process
|
||||
.take_status()
|
||||
.expect("No status handle")
|
||||
.await
|
||||
.expect("Status channel closed");
|
||||
|
||||
if let Some(s) = status.status {
|
||||
debug!("exec status: {} - {:?}", s, status.details);
|
||||
if s == "Success" { Ok(()) } else { Err(s) }
|
||||
} else {
|
||||
Err("No inner status from pod exec".to_string())
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
316
harmony-k8s/src/resources.rs
Normal file
316
harmony-k8s/src/resources.rs
Normal file
@@ -0,0 +1,316 @@
|
||||
use std::collections::HashMap;
|
||||
|
||||
use k8s_openapi::api::{
|
||||
apps::v1::Deployment,
|
||||
core::v1::{Node, ServiceAccount},
|
||||
};
|
||||
use k8s_openapi::apiextensions_apiserver::pkg::apis::apiextensions::v1::CustomResourceDefinition;
|
||||
use kube::api::ApiResource;
|
||||
use kube::{
|
||||
Error, Resource,
|
||||
api::{Api, DynamicObject, GroupVersionKind, ListParams, ObjectList},
|
||||
runtime::conditions,
|
||||
runtime::wait::await_condition,
|
||||
};
|
||||
use log::debug;
|
||||
use serde::de::DeserializeOwned;
|
||||
use serde_json::Value;
|
||||
use std::time::Duration;
|
||||
|
||||
use crate::client::K8sClient;
|
||||
use crate::types::ScopeResolver;
|
||||
|
||||
impl K8sClient {
|
||||
pub async fn has_healthy_deployment_with_label(
|
||||
&self,
|
||||
namespace: &str,
|
||||
label_selector: &str,
|
||||
) -> Result<bool, Error> {
|
||||
let api: Api<Deployment> = Api::namespaced(self.client.clone(), namespace);
|
||||
let list = api
|
||||
.list(&ListParams::default().labels(label_selector))
|
||||
.await?;
|
||||
for d in list.items {
|
||||
let available = d
|
||||
.status
|
||||
.as_ref()
|
||||
.and_then(|s| s.available_replicas)
|
||||
.unwrap_or(0);
|
||||
if available > 0 {
|
||||
return Ok(true);
|
||||
}
|
||||
if let Some(conds) = d.status.as_ref().and_then(|s| s.conditions.as_ref()) {
|
||||
if conds
|
||||
.iter()
|
||||
.any(|c| c.type_ == "Available" && c.status == "True")
|
||||
{
|
||||
return Ok(true);
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(false)
|
||||
}
|
||||
|
||||
pub async fn list_namespaces_with_healthy_deployments(
|
||||
&self,
|
||||
label_selector: &str,
|
||||
) -> Result<Vec<String>, Error> {
|
||||
let api: Api<Deployment> = Api::all(self.client.clone());
|
||||
let list = api
|
||||
.list(&ListParams::default().labels(label_selector))
|
||||
.await?;
|
||||
|
||||
let mut healthy_ns: HashMap<String, bool> = HashMap::new();
|
||||
for d in list.items {
|
||||
let ns = match d.metadata.namespace.clone() {
|
||||
Some(n) => n,
|
||||
None => continue,
|
||||
};
|
||||
let available = d
|
||||
.status
|
||||
.as_ref()
|
||||
.and_then(|s| s.available_replicas)
|
||||
.unwrap_or(0);
|
||||
let is_healthy = if available > 0 {
|
||||
true
|
||||
} else {
|
||||
d.status
|
||||
.as_ref()
|
||||
.and_then(|s| s.conditions.as_ref())
|
||||
.map(|c| {
|
||||
c.iter()
|
||||
.any(|c| c.type_ == "Available" && c.status == "True")
|
||||
})
|
||||
.unwrap_or(false)
|
||||
};
|
||||
if is_healthy {
|
||||
healthy_ns.insert(ns, true);
|
||||
}
|
||||
}
|
||||
Ok(healthy_ns.into_keys().collect())
|
||||
}
|
||||
|
||||
pub async fn get_controller_service_account_name(
|
||||
&self,
|
||||
ns: &str,
|
||||
) -> Result<Option<String>, Error> {
|
||||
let api: Api<Deployment> = Api::namespaced(self.client.clone(), ns);
|
||||
let list = api
|
||||
.list(&ListParams::default().labels("app.kubernetes.io/component=controller"))
|
||||
.await?;
|
||||
if let Some(dep) = list.items.first() {
|
||||
if let Some(sa) = dep
|
||||
.spec
|
||||
.as_ref()
|
||||
.and_then(|s| s.template.spec.as_ref())
|
||||
.and_then(|s| s.service_account_name.clone())
|
||||
{
|
||||
return Ok(Some(sa));
|
||||
}
|
||||
}
|
||||
Ok(None)
|
||||
}
|
||||
|
||||
pub async fn list_clusterrolebindings_json(&self) -> Result<Vec<Value>, Error> {
|
||||
let gvk = GroupVersionKind::gvk("rbac.authorization.k8s.io", "v1", "ClusterRoleBinding");
|
||||
let ar = ApiResource::from_gvk(&gvk);
|
||||
let api: Api<DynamicObject> = Api::all_with(self.client.clone(), &ar);
|
||||
let list = api.list(&ListParams::default()).await?;
|
||||
Ok(list
|
||||
.items
|
||||
.into_iter()
|
||||
.map(|o| serde_json::to_value(&o).unwrap_or(Value::Null))
|
||||
.collect())
|
||||
}
|
||||
|
||||
pub async fn is_service_account_cluster_wide(&self, sa: &str, ns: &str) -> Result<bool, Error> {
|
||||
let sa_user = format!("system:serviceaccount:{ns}:{sa}");
|
||||
for crb in self.list_clusterrolebindings_json().await? {
|
||||
if let Some(subjects) = crb.get("subjects").and_then(|s| s.as_array()) {
|
||||
for subj in subjects {
|
||||
let kind = subj.get("kind").and_then(|v| v.as_str()).unwrap_or("");
|
||||
let name = subj.get("name").and_then(|v| v.as_str()).unwrap_or("");
|
||||
let subj_ns = subj.get("namespace").and_then(|v| v.as_str()).unwrap_or("");
|
||||
if (kind == "ServiceAccount" && name == sa && subj_ns == ns)
|
||||
|| (kind == "User" && name == sa_user)
|
||||
{
|
||||
return Ok(true);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(false)
|
||||
}
|
||||
|
||||
pub async fn has_crd(&self, name: &str) -> Result<bool, Error> {
|
||||
let api: Api<CustomResourceDefinition> = Api::all(self.client.clone());
|
||||
let crds = api
|
||||
.list(&ListParams::default().fields(&format!("metadata.name={name}")))
|
||||
.await?;
|
||||
Ok(!crds.items.is_empty())
|
||||
}
|
||||
|
||||
pub async fn service_account_api(&self, namespace: &str) -> Api<ServiceAccount> {
|
||||
Api::namespaced(self.client.clone(), namespace)
|
||||
}
|
||||
|
||||
pub async fn get_resource_json_value(
|
||||
&self,
|
||||
name: &str,
|
||||
namespace: Option<&str>,
|
||||
gvk: &GroupVersionKind,
|
||||
) -> Result<DynamicObject, Error> {
|
||||
let ar = ApiResource::from_gvk(gvk);
|
||||
let api: Api<DynamicObject> = match namespace {
|
||||
Some(ns) => Api::namespaced_with(self.client.clone(), ns, &ar),
|
||||
None => Api::default_namespaced_with(self.client.clone(), &ar),
|
||||
};
|
||||
api.get(name).await
|
||||
}
|
||||
|
||||
pub async fn get_secret_json_value(
|
||||
&self,
|
||||
name: &str,
|
||||
namespace: Option<&str>,
|
||||
) -> Result<DynamicObject, Error> {
|
||||
self.get_resource_json_value(
|
||||
name,
|
||||
namespace,
|
||||
&GroupVersionKind {
|
||||
group: String::new(),
|
||||
version: "v1".to_string(),
|
||||
kind: "Secret".to_string(),
|
||||
},
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
pub async fn get_deployment(
|
||||
&self,
|
||||
name: &str,
|
||||
namespace: Option<&str>,
|
||||
) -> Result<Option<Deployment>, Error> {
|
||||
let api: Api<Deployment> = match namespace {
|
||||
Some(ns) => {
|
||||
debug!("Getting namespaced deployment '{name}' in '{ns}'");
|
||||
Api::namespaced(self.client.clone(), ns)
|
||||
}
|
||||
None => {
|
||||
debug!("Getting deployment '{name}' in default namespace");
|
||||
Api::default_namespaced(self.client.clone())
|
||||
}
|
||||
};
|
||||
api.get_opt(name).await
|
||||
}
|
||||
|
||||
pub async fn scale_deployment(
|
||||
&self,
|
||||
name: &str,
|
||||
namespace: Option<&str>,
|
||||
replicas: u32,
|
||||
) -> Result<(), Error> {
|
||||
let api: Api<Deployment> = match namespace {
|
||||
Some(ns) => Api::namespaced(self.client.clone(), ns),
|
||||
None => Api::default_namespaced(self.client.clone()),
|
||||
};
|
||||
use kube::api::{Patch, PatchParams};
|
||||
use serde_json::json;
|
||||
let patch = json!({ "spec": { "replicas": replicas } });
|
||||
api.patch_scale(name, &PatchParams::default(), &Patch::Merge(&patch))
|
||||
.await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn delete_deployment(
|
||||
&self,
|
||||
name: &str,
|
||||
namespace: Option<&str>,
|
||||
) -> Result<(), Error> {
|
||||
let api: Api<Deployment> = match namespace {
|
||||
Some(ns) => Api::namespaced(self.client.clone(), ns),
|
||||
None => Api::default_namespaced(self.client.clone()),
|
||||
};
|
||||
api.delete(name, &kube::api::DeleteParams::default())
|
||||
.await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn wait_until_deployment_ready(
|
||||
&self,
|
||||
name: &str,
|
||||
namespace: Option<&str>,
|
||||
timeout: Option<Duration>,
|
||||
) -> Result<(), String> {
|
||||
let api: Api<Deployment> = match namespace {
|
||||
Some(ns) => Api::namespaced(self.client.clone(), ns),
|
||||
None => Api::default_namespaced(self.client.clone()),
|
||||
};
|
||||
let timeout = timeout.unwrap_or(Duration::from_secs(120));
|
||||
let establish = await_condition(api, name, conditions::is_deployment_completed());
|
||||
tokio::time::timeout(timeout, establish)
|
||||
.await
|
||||
.map(|_| ())
|
||||
.map_err(|_| "Timed out waiting for deployment".to_string())
|
||||
}
|
||||
|
||||
/// Gets a single named resource, using the correct API scope for `K`.
|
||||
pub async fn get_resource<K>(
|
||||
&self,
|
||||
name: &str,
|
||||
namespace: Option<&str>,
|
||||
) -> Result<Option<K>, Error>
|
||||
where
|
||||
K: Resource + Clone + std::fmt::Debug + DeserializeOwned,
|
||||
<K as Resource>::Scope: ScopeResolver<K>,
|
||||
<K as Resource>::DynamicType: Default,
|
||||
{
|
||||
let api: Api<K> =
|
||||
<<K as Resource>::Scope as ScopeResolver<K>>::get_api(&self.client, namespace);
|
||||
api.get_opt(name).await
|
||||
}
|
||||
|
||||
pub async fn list_resources<K>(
|
||||
&self,
|
||||
namespace: Option<&str>,
|
||||
list_params: Option<ListParams>,
|
||||
) -> Result<ObjectList<K>, Error>
|
||||
where
|
||||
K: Resource + Clone + std::fmt::Debug + DeserializeOwned,
|
||||
<K as Resource>::Scope: ScopeResolver<K>,
|
||||
<K as Resource>::DynamicType: Default,
|
||||
{
|
||||
let api: Api<K> =
|
||||
<<K as Resource>::Scope as ScopeResolver<K>>::get_api(&self.client, namespace);
|
||||
api.list(&list_params.unwrap_or_default()).await
|
||||
}
|
||||
|
||||
pub async fn list_all_resources_with_labels<K>(&self, labels: &str) -> Result<Vec<K>, Error>
|
||||
where
|
||||
K: Resource + Clone + std::fmt::Debug + DeserializeOwned,
|
||||
<K as Resource>::DynamicType: Default,
|
||||
{
|
||||
Api::<K>::all(self.client.clone())
|
||||
.list(&ListParams::default().labels(labels))
|
||||
.await
|
||||
.map(|l| l.items)
|
||||
}
|
||||
|
||||
pub async fn get_all_resource_in_all_namespace<K>(&self) -> Result<Vec<K>, Error>
|
||||
where
|
||||
K: Resource + Clone + std::fmt::Debug + DeserializeOwned,
|
||||
<K as Resource>::Scope: ScopeResolver<K>,
|
||||
<K as Resource>::DynamicType: Default,
|
||||
{
|
||||
Api::<K>::all(self.client.clone())
|
||||
.list(&Default::default())
|
||||
.await
|
||||
.map(|l| l.items)
|
||||
}
|
||||
|
||||
pub async fn get_nodes(
|
||||
&self,
|
||||
list_params: Option<ListParams>,
|
||||
) -> Result<ObjectList<Node>, Error> {
|
||||
self.list_resources(None, list_params).await
|
||||
}
|
||||
}
|
||||
100
harmony-k8s/src/types.rs
Normal file
100
harmony-k8s/src/types.rs
Normal file
@@ -0,0 +1,100 @@
|
||||
use std::time::Duration;
|
||||
|
||||
use k8s_openapi::{ClusterResourceScope, NamespaceResourceScope};
|
||||
use kube::{Api, Client, Resource};
|
||||
use serde::Serialize;
|
||||
|
||||
/// Which Kubernetes distribution is running. Detected once at runtime via
|
||||
/// [`crate::discovery::K8sClient::get_k8s_distribution`].
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
|
||||
pub enum KubernetesDistribution {
|
||||
Default,
|
||||
OpenshiftFamily,
|
||||
K3sFamily,
|
||||
}
|
||||
|
||||
/// Describes a file that should be written to a node's filesystem.
#[derive(Debug, Clone)]
pub struct NodeFile {
    /// Absolute host path the file is written to.
    pub path: String,
    /// Full text content of the file.
    pub content: String,
    /// UNIX permission bits, e.g. `0o600`.
    pub mode: u32,
}
|
||||
|
||||
/// Settings that tune a [`crate::K8sClient::drain_node`] operation.
#[derive(Debug, Clone)]
pub struct DrainOptions {
    /// Evict pods that use `emptyDir` volumes (their ephemeral data is lost).
    /// Equivalent to `kubectl drain --delete-emptydir-data`.
    pub delete_emptydir_data: bool,
    /// Silently skip DaemonSet-managed pods instead of blocking the drain.
    /// Equivalent to `kubectl drain --ignore-daemonsets`.
    pub ignore_daemonsets: bool,
    /// Maximum wall-clock time to wait for all evictions to complete.
    pub timeout: Duration,
}

impl Default for DrainOptions {
    /// Keeps `emptyDir` pods blocking, ignores DaemonSet pods, and waits one
    /// second.
    ///
    /// NOTE(review): a one-second timeout looks very short for a node drain —
    /// confirm this default is intentional.
    fn default() -> Self {
        DrainOptions {
            timeout: Duration::from_secs(1),
            ignore_daemonsets: true,
            delete_emptydir_data: false,
        }
    }
}

impl DrainOptions {
    /// Like [`DrainOptions::default`], but ignores DaemonSet pods and also
    /// allows evicting pods that use `emptyDir` volumes.
    pub fn default_ignore_daemonset_delete_emptydir_data() -> Self {
        DrainOptions {
            ignore_daemonsets: true,
            delete_emptydir_data: true,
            ..Default::default()
        }
    }
}
|
||||
|
||||
/// Controls how [`crate::K8sClient::apply_with_strategy`] behaves when the
/// resource already exists (or does not).
///
/// The enum carries no data, so it derives `Copy` and the comparison traits;
/// this lets callers pass it by value, compare modes, and print them in logs
/// and assertions.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum WriteMode {
    /// Server-side apply; create if absent, update if present (default).
    CreateOrUpdate,
    /// POST only; return an error if the resource already exists.
    Create,
    /// Server-side apply only; return an error if the resource does not exist.
    Update,
}
|
||||
|
||||
// ── Scope resolution trait ───────────────────────────────────────────────────
|
||||
|
||||
/// Resolves the correct [`kube::Api`] for a resource type based on its scope
|
||||
/// (cluster-wide vs. namespace-scoped).
|
||||
pub trait ScopeResolver<K: Resource> {
|
||||
fn get_api(client: &Client, ns: Option<&str>) -> Api<K>;
|
||||
}
|
||||
|
||||
impl<K> ScopeResolver<K> for ClusterResourceScope
|
||||
where
|
||||
K: Resource<Scope = ClusterResourceScope>,
|
||||
<K as Resource>::DynamicType: Default,
|
||||
{
|
||||
fn get_api(client: &Client, _ns: Option<&str>) -> Api<K> {
|
||||
Api::all(client.clone())
|
||||
}
|
||||
}
|
||||
|
||||
impl<K> ScopeResolver<K> for NamespaceResourceScope
|
||||
where
|
||||
K: Resource<Scope = NamespaceResourceScope>,
|
||||
<K as Resource>::DynamicType: Default,
|
||||
{
|
||||
fn get_api(client: &Client, ns: Option<&str>) -> Api<K> {
|
||||
match ns {
|
||||
Some(ns) => Api::namespaced(client.clone(), ns),
|
||||
None => Api::default_namespaced(client.clone()),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -21,6 +21,8 @@ semver = "1.0.23"
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
tokio.workspace = true
|
||||
tokio-retry.workspace = true
|
||||
tokio-util.workspace = true
|
||||
derive-new.workspace = true
|
||||
log.workspace = true
|
||||
env_logger.workspace = true
|
||||
@@ -31,6 +33,7 @@ opnsense-config-xml = { path = "../opnsense-config-xml" }
|
||||
harmony_macros = { path = "../harmony_macros" }
|
||||
harmony_types = { path = "../harmony_types" }
|
||||
harmony_execution = { path = "../harmony_execution" }
|
||||
harmony-k8s = { path = "../harmony-k8s" }
|
||||
uuid.workspace = true
|
||||
url.workspace = true
|
||||
kube = { workspace = true, features = ["derive"] }
|
||||
@@ -60,7 +63,6 @@ temp-dir = "0.1.14"
|
||||
dyn-clone = "1.0.19"
|
||||
similar.workspace = true
|
||||
futures-util = "0.3.31"
|
||||
tokio-util = "0.7.15"
|
||||
strum = { version = "0.27.1", features = ["derive"] }
|
||||
tempfile.workspace = true
|
||||
serde_with = "3.14.0"
|
||||
@@ -80,7 +82,7 @@ sqlx.workspace = true
|
||||
inquire.workspace = true
|
||||
brocade = { path = "../brocade" }
|
||||
option-ext = "0.2.0"
|
||||
tokio-retry = "0.3.0"
|
||||
rand.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
pretty_assertions.workspace = true
|
||||
|
||||
@@ -4,8 +4,6 @@ use std::error::Error;
|
||||
use async_trait::async_trait;
|
||||
use derive_new::new;
|
||||
|
||||
use crate::inventory::HostRole;
|
||||
|
||||
use super::{
|
||||
data::Version, executors::ExecutorError, inventory::Inventory, topology::PreparationError,
|
||||
};
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
use async_trait::async_trait;
|
||||
use harmony_k8s::K8sClient;
|
||||
use harmony_macros::ip;
|
||||
use harmony_types::{
|
||||
id::Id,
|
||||
@@ -16,7 +17,7 @@ use super::{
|
||||
DHCPStaticEntry, DhcpServer, DnsRecord, DnsRecordType, DnsServer, Firewall, HostNetworkConfig,
|
||||
HttpServer, IpAddress, K8sclient, LoadBalancer, LoadBalancerService, LogicalHost, NetworkError,
|
||||
NetworkManager, PreparationError, PreparationOutcome, Router, Switch, SwitchClient,
|
||||
SwitchError, TftpServer, Topology, k8s::K8sClient,
|
||||
SwitchError, TftpServer, Topology,
|
||||
};
|
||||
use std::{
|
||||
process::Command,
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -2,6 +2,7 @@ use std::{collections::BTreeMap, process::Command, sync::Arc, time::Duration};
|
||||
|
||||
use async_trait::async_trait;
|
||||
use base64::{Engine, engine::general_purpose};
|
||||
use harmony_k8s::{K8sClient, KubernetesDistribution};
|
||||
use harmony_types::rfc1123::Rfc1123Name;
|
||||
use k8s_openapi::api::{
|
||||
core::v1::{Pod, Secret},
|
||||
@@ -58,7 +59,6 @@ use crate::{
|
||||
use super::super::{
|
||||
DeploymentTarget, HelmCommand, K8sclient, MultiTargetTopology, PreparationError,
|
||||
PreparationOutcome, Topology,
|
||||
k8s::K8sClient,
|
||||
oberservability::monitoring::AlertReceiver,
|
||||
tenant::{
|
||||
TenantConfig, TenantManager,
|
||||
@@ -76,13 +76,6 @@ struct K8sState {
|
||||
message: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub enum KubernetesDistribution {
|
||||
OpenshiftFamily,
|
||||
K3sFamily,
|
||||
Default,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
enum K8sSource {
|
||||
LocalK3d,
|
||||
|
||||
@@ -106,6 +106,7 @@ pub enum SSL {
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Serialize)]
|
||||
pub enum HealthCheck {
|
||||
HTTP(String, HttpMethod, HttpStatusCode, SSL),
|
||||
/// HTTP(None, "/healthz/ready", HttpMethod::GET, HttpStatusCode::Success2xx, SSL::Disabled)
|
||||
HTTP(Option<u16>, String, HttpMethod, HttpStatusCode, SSL),
|
||||
TCP(Option<u16>),
|
||||
}
|
||||
|
||||
@@ -16,7 +16,6 @@ pub mod tenant;
|
||||
use derive_new::new;
|
||||
pub use k8s_anywhere::*;
|
||||
pub use localhost::*;
|
||||
pub mod k8s;
|
||||
mod load_balancer;
|
||||
pub mod router;
|
||||
mod tftp;
|
||||
|
||||
@@ -9,6 +9,7 @@ use std::{
|
||||
use async_trait::async_trait;
|
||||
use brocade::PortOperatingMode;
|
||||
use derive_new::new;
|
||||
use harmony_k8s::K8sClient;
|
||||
use harmony_types::{
|
||||
id::Id,
|
||||
net::{IpAddress, MacAddress},
|
||||
@@ -18,7 +19,7 @@ use serde::Serialize;
|
||||
|
||||
use crate::executors::ExecutorError;
|
||||
|
||||
use super::{LogicalHost, k8s::K8sClient};
|
||||
use super::LogicalHost;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct DHCPStaticEntry {
|
||||
|
||||
@@ -1,10 +1,8 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use crate::{
|
||||
executors::ExecutorError,
|
||||
topology::k8s::{ApplyStrategy, K8sClient},
|
||||
};
|
||||
use crate::executors::ExecutorError;
|
||||
use async_trait::async_trait;
|
||||
use harmony_k8s::K8sClient;
|
||||
use k8s_openapi::{
|
||||
api::{
|
||||
core::v1::{LimitRange, Namespace, ResourceQuota},
|
||||
@@ -14,7 +12,7 @@ use k8s_openapi::{
|
||||
},
|
||||
apimachinery::pkg::util::intstr::IntOrString,
|
||||
};
|
||||
use kube::{Resource, api::DynamicObject};
|
||||
use kube::Resource;
|
||||
use log::debug;
|
||||
use serde::de::DeserializeOwned;
|
||||
use serde_json::json;
|
||||
@@ -59,7 +57,6 @@ impl K8sTenantManager {
|
||||
) -> Result<K, ExecutorError>
|
||||
where
|
||||
<K as kube::Resource>::DynamicType: Default,
|
||||
<K as kube::Resource>::Scope: ApplyStrategy<K>,
|
||||
{
|
||||
self.apply_labels(&mut resource, config);
|
||||
self.k8s_client
|
||||
|
||||
@@ -5,6 +5,7 @@ use std::{
|
||||
|
||||
use askama::Template;
|
||||
use async_trait::async_trait;
|
||||
use harmony_k8s::{DrainOptions, K8sClient, NodeFile};
|
||||
use harmony_types::id::Id;
|
||||
use k8s_openapi::api::core::v1::Node;
|
||||
use kube::{
|
||||
@@ -15,10 +16,7 @@ use log::{debug, info, warn};
|
||||
|
||||
use crate::{
|
||||
modules::okd::crd::nmstate,
|
||||
topology::{
|
||||
HostNetworkConfig, NetworkError, NetworkManager,
|
||||
k8s::{DrainOptions, K8sClient, NodeFile},
|
||||
},
|
||||
topology::{HostNetworkConfig, NetworkError, NetworkManager},
|
||||
};
|
||||
|
||||
/// NetworkManager bond configuration template
|
||||
|
||||
@@ -216,7 +216,15 @@ pub(crate) fn get_health_check_for_backend(
|
||||
SSL::Other(other.to_string())
|
||||
}
|
||||
};
|
||||
Some(HealthCheck::HTTP(path, method, status_code, ssl))
|
||||
|
||||
let port = haproxy_health_check
|
||||
.checkport
|
||||
.content_string()
|
||||
.parse::<u16>()
|
||||
.ok();
|
||||
debug!("Found haproxy healthcheck port {port:?}");
|
||||
|
||||
Some(HealthCheck::HTTP(port, path, method, status_code, ssl))
|
||||
}
|
||||
_ => panic!("Received unsupported health check type {}", uppercase),
|
||||
}
|
||||
@@ -251,7 +259,7 @@ pub(crate) fn harmony_load_balancer_service_to_haproxy_xml(
|
||||
// frontend points to backend
|
||||
let healthcheck = if let Some(health_check) = &service.health_check {
|
||||
match health_check {
|
||||
HealthCheck::HTTP(path, http_method, _http_status_code, ssl) => {
|
||||
HealthCheck::HTTP(port, path, http_method, _http_status_code, ssl) => {
|
||||
let ssl: MaybeString = match ssl {
|
||||
SSL::SSL => "ssl".into(),
|
||||
SSL::SNI => "sslni".into(),
|
||||
@@ -267,6 +275,7 @@ pub(crate) fn harmony_load_balancer_service_to_haproxy_xml(
|
||||
http_uri: path.clone().into(),
|
||||
interval: "2s".to_string(),
|
||||
ssl,
|
||||
checkport: MaybeString::from(port.map(|p| p.to_string())),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use async_trait::async_trait;
|
||||
use log::{debug, info, trace};
|
||||
use log::{debug, info};
|
||||
use serde::Serialize;
|
||||
use std::path::PathBuf;
|
||||
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
use async_trait::async_trait;
|
||||
use harmony_k8s::K8sClient;
|
||||
use harmony_macros::hurl;
|
||||
use log::{debug, info, trace, warn};
|
||||
use non_blank_string_rs::NonBlankString;
|
||||
@@ -14,7 +15,7 @@ use crate::{
|
||||
helm::chart::{HelmChartScore, HelmRepository},
|
||||
},
|
||||
score::Score,
|
||||
topology::{HelmCommand, K8sclient, Topology, ingress::Ingress, k8s::K8sClient},
|
||||
topology::{HelmCommand, K8sclient, Topology, ingress::Ingress},
|
||||
};
|
||||
use harmony_types::id::Id;
|
||||
|
||||
|
||||
@@ -1,8 +1,9 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use harmony_k8s::K8sClient;
|
||||
use log::{debug, info};
|
||||
|
||||
use crate::{interpret::InterpretError, topology::k8s::K8sClient};
|
||||
use crate::interpret::InterpretError;
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Eq)]
|
||||
pub enum ArgoScope {
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
use harmony_k8s::K8sClient;
|
||||
use std::sync::Arc;
|
||||
|
||||
use async_trait::async_trait;
|
||||
@@ -11,7 +12,7 @@ use crate::{
|
||||
interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome},
|
||||
inventory::Inventory,
|
||||
score::Score,
|
||||
topology::{K8sclient, Topology, k8s::K8sClient},
|
||||
topology::{K8sclient, Topology},
|
||||
};
|
||||
|
||||
#[derive(Clone, Debug, Serialize)]
|
||||
|
||||
@@ -3,7 +3,8 @@ use std::sync::Arc;
|
||||
use async_trait::async_trait;
|
||||
use log::warn;
|
||||
|
||||
use crate::topology::{FailoverTopology, K8sclient, k8s::K8sClient};
|
||||
use crate::topology::{FailoverTopology, K8sclient};
|
||||
use harmony_k8s::K8sClient;
|
||||
|
||||
#[async_trait]
|
||||
impl<T: K8sclient> K8sclient for FailoverTopology<T> {
|
||||
|
||||
@@ -109,7 +109,7 @@ where
|
||||
topology
|
||||
.k8s_client()
|
||||
.await
|
||||
.map_err(|e| InterpretError::new(format!("Failed to get k8s client : {e}")))
|
||||
.map_err(|e| InterpretError::new(format!("Failed to get k8s client : {e}")))?
|
||||
.apply_many(&self.score.resource, self.score.namespace.as_deref())
|
||||
.await?;
|
||||
|
||||
|
||||
@@ -0,0 +1,6 @@
|
||||
apiVersion: v1
|
||||
kind: Namespace
|
||||
metadata:
|
||||
name: observability
|
||||
labels:
|
||||
openshift.io/cluster-monitoring: "true"
|
||||
@@ -0,0 +1,43 @@
|
||||
apiVersion: v1
|
||||
kind: ServiceAccount
|
||||
metadata:
|
||||
name: cluster-grafana-sa
|
||||
namespace: observability
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRole
|
||||
metadata:
|
||||
name: grafana-prometheus-api-access
|
||||
rules:
|
||||
- apiGroups:
|
||||
- monitoring.coreos.com
|
||||
resources:
|
||||
- prometheuses/api
|
||||
verbs:
|
||||
- get
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRoleBinding
|
||||
metadata:
|
||||
name: grafana-prometheus-api-access-binding
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: ClusterRole
|
||||
name: grafana-prometheus-api-access
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: cluster-grafana-sa
|
||||
namespace: observability
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRoleBinding
|
||||
metadata:
|
||||
name: grafana-cluster-monitoring-view
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: ClusterRole
|
||||
name: cluster-monitoring-view
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: cluster-grafana-sa
|
||||
namespace: observability
|
||||
@@ -0,0 +1,43 @@
|
||||
apiVersion: grafana.integreatly.org/v1beta1
|
||||
kind: Grafana
|
||||
metadata:
|
||||
name: cluster-grafana
|
||||
namespace: observability
|
||||
labels:
|
||||
dashboards: "grafana"
|
||||
spec:
|
||||
serviceAccountName: cluster-grafana-sa
|
||||
automountServiceAccountToken: true
|
||||
|
||||
config:
|
||||
log:
|
||||
mode: console
|
||||
|
||||
security:
|
||||
admin_user: admin
|
||||
admin_password: paul
|
||||
|
||||
users:
|
||||
viewers_can_edit: "false"
|
||||
|
||||
auth:
|
||||
disable_login_form: "false"
|
||||
|
||||
auth.anonymous:
|
||||
enabled: "true"
|
||||
org_role: Viewer
|
||||
|
||||
deployment:
|
||||
spec:
|
||||
replicas: 1
|
||||
template:
|
||||
spec:
|
||||
containers:
|
||||
- name: grafana
|
||||
resources:
|
||||
requests:
|
||||
cpu: 500m
|
||||
memory: 1Gi
|
||||
limits:
|
||||
cpu: 1
|
||||
memory: 2Gi
|
||||
@@ -0,0 +1,8 @@
|
||||
apiVersion: v1
|
||||
kind: Secret
|
||||
metadata:
|
||||
name: grafana-prometheus-token
|
||||
namespace: observability
|
||||
annotations:
|
||||
kubernetes.io/service-account.name: cluster-grafana-sa
|
||||
type: kubernetes.io/service-account-token
|
||||
@@ -0,0 +1,27 @@
|
||||
apiVersion: grafana.integreatly.org/v1beta1
|
||||
kind: GrafanaDatasource
|
||||
metadata:
|
||||
name: prometheus-cluster
|
||||
namespace: observability
|
||||
spec:
|
||||
instanceSelector:
|
||||
matchLabels:
|
||||
dashboards: "grafana"
|
||||
valuesFrom:
|
||||
- targetPath: "secureJsonData.httpHeaderValue1"
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: grafana-prometheus-token
|
||||
key: token
|
||||
datasource:
|
||||
name: Prometheus-Cluster
|
||||
type: prometheus
|
||||
access: proxy
|
||||
url: https://prometheus-k8s.openshift-monitoring.svc:9091
|
||||
isDefault: true
|
||||
jsonData:
|
||||
httpHeaderName1: "Authorization"
|
||||
tlsSkipVerify: true
|
||||
timeInterval: "30s"
|
||||
secureJsonData:
|
||||
httpHeaderValue1: "Bearer ${token}"
|
||||
@@ -0,0 +1,14 @@
|
||||
apiVersion: route.openshift.io/v1
|
||||
kind: Route
|
||||
metadata:
|
||||
name: grafana
|
||||
namespace: observability
|
||||
spec:
|
||||
to:
|
||||
kind: Service
|
||||
name: cluster-grafana-service
|
||||
port:
|
||||
targetPort: 3000
|
||||
tls:
|
||||
termination: edge
|
||||
insecureEdgeTerminationPolicy: Redirect
|
||||
@@ -0,0 +1,97 @@
|
||||
apiVersion: grafana.integreatly.org/v1beta1
|
||||
kind: GrafanaDashboard
|
||||
metadata:
|
||||
name: cluster-overview
|
||||
namespace: observability
|
||||
spec:
|
||||
instanceSelector:
|
||||
matchLabels:
|
||||
dashboards: "grafana"
|
||||
|
||||
json: |
|
||||
{
|
||||
"title": "Cluster Overview",
|
||||
"schemaVersion": 36,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"time": {
|
||||
"from": "now-1h",
|
||||
"to": "now"
|
||||
},
|
||||
"panels": [
|
||||
{
|
||||
"type": "stat",
|
||||
"title": "Ready Nodes",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "Prometheus-Cluster"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"true\"})",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"gridPos": { "h": 6, "w": 6, "x": 0, "y": 0 }
|
||||
},
|
||||
{
|
||||
"type": "stat",
|
||||
"title": "Running Pods",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "Prometheus-Cluster"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(kube_pod_status_phase{phase=\"Running\"})",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"gridPos": { "h": 6, "w": 6, "x": 6, "y": 0 }
|
||||
},
|
||||
{
|
||||
"type": "timeseries",
|
||||
"title": "Cluster CPU Usage (%)",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "Prometheus-Cluster"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 }
|
||||
},
|
||||
{
|
||||
"type": "timeseries",
|
||||
"title": "Cluster Memory Usage (%)",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "Prometheus-Cluster"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (sum(node_memory_MemAvailable_bytes) / sum(node_memory_MemTotal_bytes)))",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 }
|
||||
}
|
||||
]
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,769 @@
|
||||
apiVersion: grafana.integreatly.org/v1beta1
|
||||
kind: GrafanaDashboard
|
||||
metadata:
|
||||
name: okd-cluster-overview
|
||||
namespace: observability
|
||||
spec:
|
||||
instanceSelector:
|
||||
matchLabels:
|
||||
dashboards: "grafana"
|
||||
json: |
|
||||
{
|
||||
"title": "Cluster Overview",
|
||||
"uid": "okd-cluster-overview",
|
||||
"schemaVersion": 36,
|
||||
"version": 2,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"tags": ["okd", "cluster", "overview"],
|
||||
"panels": [
|
||||
|
||||
{
|
||||
"id": 1,
|
||||
"type": "stat",
|
||||
"title": "Ready Nodes",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"true\"} == 1)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "green", "value": 1 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 2,
|
||||
"type": "stat",
|
||||
"title": "Not Ready Nodes",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"false\"} == 1) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 3,
|
||||
"type": "stat",
|
||||
"title": "Running Pods",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(kube_pod_status_phase{phase=\"Running\"} == 1)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "green", "value": 1 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 4,
|
||||
"type": "stat",
|
||||
"title": "Pending Pods",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(kube_pod_status_phase{phase=\"Pending\"} == 1) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 5 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 5,
|
||||
"type": "stat",
|
||||
"title": "Failed Pods",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(kube_pod_status_phase{phase=\"Failed\"} == 1) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 6,
|
||||
"type": "stat",
|
||||
"title": "CrashLoopBackOff",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\"} == 1) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 7,
|
||||
"type": "stat",
|
||||
"title": "Critical Alerts",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(ALERTS{alertstate=\"firing\",severity=\"critical\"}) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 8,
|
||||
"type": "stat",
|
||||
"title": "Warning Alerts",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(ALERTS{alertstate=\"firing\",severity=\"warning\"}) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 10 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 9,
|
||||
"type": "gauge",
|
||||
"title": "CPU Usage",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))",
|
||||
"refId": "A",
|
||||
"legendFormat": "CPU"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 70 },
|
||||
{ "color": "red", "value": 85 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"showThresholdLabels": false,
|
||||
"showThresholdMarkers": true,
|
||||
"orientation": "auto"
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 5, "x": 0, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 10,
|
||||
"type": "gauge",
|
||||
"title": "Memory Usage",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (sum(node_memory_MemAvailable_bytes) / sum(node_memory_MemTotal_bytes)))",
|
||||
"refId": "A",
|
||||
"legendFormat": "Memory"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 75 },
|
||||
{ "color": "red", "value": 90 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"showThresholdLabels": false,
|
||||
"showThresholdMarkers": true,
|
||||
"orientation": "auto"
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 5, "x": 5, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 11,
|
||||
"type": "gauge",
|
||||
"title": "Root Disk Usage",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (sum(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"})))",
|
||||
"refId": "A",
|
||||
"legendFormat": "Disk"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 70 },
|
||||
{ "color": "red", "value": 85 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"showThresholdLabels": false,
|
||||
"showThresholdMarkers": true,
|
||||
"orientation": "auto"
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 4, "x": 10, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 12,
|
||||
"type": "stat",
|
||||
"title": "etcd Has Leader",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "min(etcd_server_has_leader)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "green", "value": 1 }
|
||||
]
|
||||
},
|
||||
"mappings": [
|
||||
{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"0": { "text": "NO LEADER", "color": "red" },
|
||||
"1": { "text": "LEADER OK", "color": "green" }
|
||||
}
|
||||
}
|
||||
],
|
||||
"unit": "short",
|
||||
"noValue": "?"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 3, "w": 5, "x": 14, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 13,
|
||||
"type": "stat",
|
||||
"title": "API Servers Up",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(up{job=\"apiserver\"})",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "green", "value": 2 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 3, "w": 5, "x": 19, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 14,
|
||||
"type": "stat",
|
||||
"title": "etcd Members Up",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(up{job=\"etcd\"})",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 2 },
|
||||
{ "color": "green", "value": 3 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 3, "w": 5, "x": 14, "y": 7 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 15,
|
||||
"type": "stat",
|
||||
"title": "Operators Degraded",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(cluster_operator_conditions{condition=\"Degraded\",status=\"True\"} == 1) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 3, "w": 5, "x": 19, "y": 7 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 16,
|
||||
"type": "timeseries",
|
||||
"title": "CPU Usage per Node (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 10,
|
||||
"spanNulls": false,
|
||||
"showPoints": "never"
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": {
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"calcs": ["mean", "max"]
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 10 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 17,
|
||||
"type": "timeseries",
|
||||
"title": "Memory Usage per Node (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 10,
|
||||
"spanNulls": false,
|
||||
"showPoints": "never"
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": {
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"calcs": ["mean", "max"]
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 10 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 18,
|
||||
"type": "timeseries",
|
||||
"title": "Network Traffic — Cluster Total",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo|veth.*|tun.*|ovn.*|br-int|br-ex\"}[5m]))",
|
||||
"refId": "A",
|
||||
"legendFormat": "Receive"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo|veth.*|tun.*|ovn.*|br-int|br-ex\"}[5m]))",
|
||||
"refId": "B",
|
||||
"legendFormat": "Transmit"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps",
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 10,
|
||||
"spanNulls": false,
|
||||
"showPoints": "never"
|
||||
}
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Receive" },
|
||||
"properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Transmit" },
|
||||
"properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "none" },
|
||||
"legend": {
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"calcs": ["mean", "max"]
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 18 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 19,
|
||||
"type": "timeseries",
|
||||
"title": "Pod Phases Over Time",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(kube_pod_status_phase{phase=\"Running\"} == 1)",
|
||||
"refId": "A",
|
||||
"legendFormat": "Running"
|
||||
},
|
||||
{
|
||||
"expr": "count(kube_pod_status_phase{phase=\"Pending\"} == 1) or vector(0)",
|
||||
"refId": "B",
|
||||
"legendFormat": "Pending"
|
||||
},
|
||||
{
|
||||
"expr": "count(kube_pod_status_phase{phase=\"Failed\"} == 1) or vector(0)",
|
||||
"refId": "C",
|
||||
"legendFormat": "Failed"
|
||||
},
|
||||
{
|
||||
"expr": "count(kube_pod_status_phase{phase=\"Unknown\"} == 1) or vector(0)",
|
||||
"refId": "D",
|
||||
"legendFormat": "Unknown"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short",
|
||||
"custom": {
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 15,
|
||||
"spanNulls": false,
|
||||
"showPoints": "never"
|
||||
}
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Running" },
|
||||
"properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Pending" },
|
||||
"properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Failed" },
|
||||
"properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Unknown" },
|
||||
"properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "none" },
|
||||
"legend": {
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"calcs": ["lastNotNull"]
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 18 }
|
||||
}
|
||||
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,637 @@
|
||||
apiVersion: grafana.integreatly.org/v1beta1
|
||||
kind: GrafanaDashboard
|
||||
metadata:
|
||||
name: okd-node-health
|
||||
namespace: observability
|
||||
spec:
|
||||
instanceSelector:
|
||||
matchLabels:
|
||||
dashboards: "grafana"
|
||||
json: |
|
||||
{
|
||||
"title": "Node Health",
|
||||
"uid": "okd-node-health",
|
||||
"schemaVersion": 36,
|
||||
"version": 2,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"tags": ["okd", "node", "health"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "node",
|
||||
"type": "query",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"query": { "query": "label_values(kube_node_info, node)", "refId": "A" },
|
||||
"refresh": 2,
|
||||
"includeAll": true,
|
||||
"multi": true,
|
||||
"allValue": ".*",
|
||||
"label": "Node",
|
||||
"sort": 1,
|
||||
"current": {},
|
||||
"options": []
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
|
||||
{
|
||||
"id": 1,
|
||||
"type": "stat",
|
||||
"title": "Total Nodes",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_node_info{node=~\"$node\"})", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 2,
|
||||
"type": "stat",
|
||||
"title": "Ready Nodes",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"$node\"} == 1)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 3,
|
||||
"type": "stat",
|
||||
"title": "Not Ready Nodes",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"false\",node=~\"$node\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 4,
|
||||
"type": "stat",
|
||||
"title": "Memory Pressure",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"MemoryPressure\",status=\"true\",node=~\"$node\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 5,
|
||||
"type": "stat",
|
||||
"title": "Disk Pressure",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"DiskPressure\",status=\"true\",node=~\"$node\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 6,
|
||||
"type": "stat",
|
||||
"title": "PID Pressure",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"PIDPressure\",status=\"true\",node=~\"$node\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "orange", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 7,
|
||||
"type": "stat",
|
||||
"title": "Unschedulable",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_node_spec_unschedulable{node=~\"$node\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 8,
|
||||
"type": "stat",
|
||||
"title": "Kubelet Up",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(up{job=\"kubelet\",metrics_path=\"/metrics\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 9,
|
||||
"type": "table",
|
||||
"title": "Node Conditions",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(node) (kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"$node\"})",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{node}}",
|
||||
"instant": true
|
||||
},
|
||||
{
|
||||
"expr": "sum by(node) (kube_node_status_condition{condition=\"MemoryPressure\",status=\"true\",node=~\"$node\"})",
|
||||
"refId": "B",
|
||||
"legendFormat": "{{node}}",
|
||||
"instant": true
|
||||
},
|
||||
{
|
||||
"expr": "sum by(node) (kube_node_status_condition{condition=\"DiskPressure\",status=\"true\",node=~\"$node\"})",
|
||||
"refId": "C",
|
||||
"legendFormat": "{{node}}",
|
||||
"instant": true
|
||||
},
|
||||
{
|
||||
"expr": "sum by(node) (kube_node_status_condition{condition=\"PIDPressure\",status=\"true\",node=~\"$node\"})",
|
||||
"refId": "D",
|
||||
"legendFormat": "{{node}}",
|
||||
"instant": true
|
||||
},
|
||||
{
|
||||
"expr": "sum by(node) (kube_node_spec_unschedulable{node=~\"$node\"})",
|
||||
"refId": "E",
|
||||
"legendFormat": "{{node}}",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "labelsToFields",
|
||||
"options": { "mode": "columns" }
|
||||
},
|
||||
{
|
||||
"id": "joinByField",
|
||||
"options": { "byField": "node", "mode": "outer" }
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"Time": true,
|
||||
"Time 1": true,
|
||||
"Time 2": true,
|
||||
"Time 3": true,
|
||||
"Time 4": true,
|
||||
"Time 5": true
|
||||
},
|
||||
"renameByName": {
|
||||
"node": "Node",
|
||||
"Value #A": "Ready",
|
||||
"Value #B": "Mem Pressure",
|
||||
"Value #C": "Disk Pressure",
|
||||
"Value #D": "PID Pressure",
|
||||
"Value #E": "Unschedulable"
|
||||
},
|
||||
"indexByName": {
|
||||
"node": 0,
|
||||
"Value #A": 1,
|
||||
"Value #B": 2,
|
||||
"Value #C": 3,
|
||||
"Value #D": 4,
|
||||
"Value #E": 5
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": { "displayMode": "color-background", "align": "center" }
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Node" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "auto" },
|
||||
{ "id": "custom.align", "value": "left" },
|
||||
{ "id": "custom.width", "value": 200 }
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Ready" },
|
||||
"properties": [
|
||||
{
|
||||
"id": "thresholds",
|
||||
"value": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] }
|
||||
},
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{
|
||||
"id": "mappings",
|
||||
"value": [
|
||||
{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"0": { "text": "✗ Not Ready", "color": "red", "index": 0 },
|
||||
"1": { "text": "✓ Ready", "color": "green", "index": 1 }
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byRegexp", "options": ".*Pressure" },
|
||||
"properties": [
|
||||
{
|
||||
"id": "thresholds",
|
||||
"value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] }
|
||||
},
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{
|
||||
"id": "mappings",
|
||||
"value": [
|
||||
{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"0": { "text": "✓ OK", "color": "green", "index": 0 },
|
||||
"1": { "text": "⚠ Active", "color": "red", "index": 1 }
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Unschedulable" },
|
||||
"properties": [
|
||||
{
|
||||
"id": "thresholds",
|
||||
"value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 1 }] }
|
||||
},
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{
|
||||
"id": "mappings",
|
||||
"value": [
|
||||
{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"0": { "text": "✓ Schedulable", "color": "green", "index": 0 },
|
||||
"1": { "text": "⚠ Cordoned", "color": "yellow", "index": 1 }
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": { "sortBy": [{ "displayName": "Node", "desc": false }] },
|
||||
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 10,
|
||||
"type": "timeseries",
|
||||
"title": "CPU Usage per Node (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 100,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 85 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 16, "x": 0, "y": 12 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 11,
|
||||
"type": "bargauge",
|
||||
"title": "CPU Usage \u2014 Current",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 85 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true,
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 12 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 12,
|
||||
"type": "timeseries",
|
||||
"title": "Memory Usage per Node (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 100,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 75 }, { "color": "red", "value": 90 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 16, "x": 0, "y": 20 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 13,
|
||||
"type": "bargauge",
|
||||
"title": "Memory Usage \u2014 Current",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 75 }, { "color": "red", "value": 90 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true,
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 20 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 14,
|
||||
"type": "timeseries",
|
||||
"title": "Root Disk Usage per Node (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"}))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 100,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 85 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 16, "x": 0, "y": 28 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 15,
|
||||
"type": "bargauge",
|
||||
"title": "Root Disk Usage \u2014 Current",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"}))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 85 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true,
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 28 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 16,
|
||||
"type": "timeseries",
|
||||
"title": "Network Traffic per Node",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(instance) (rate(node_network_receive_bytes_total{device!~\"lo|veth.*|tun.*|ovn.*|br.*\"}[5m]))",
|
||||
"refId": "A",
|
||||
"legendFormat": "rx {{instance}}"
|
||||
},
|
||||
{
|
||||
"expr": "sum by(instance) (rate(node_network_transmit_bytes_total{device!~\"lo|veth.*|tun.*|ovn.*|br.*\"}[5m]))",
|
||||
"refId": "B",
|
||||
"legendFormat": "tx {{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps",
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 36 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 17,
|
||||
"type": "bargauge",
|
||||
"title": "Pods per Node",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count by(node) (kube_pod_info{node=~\"$node\"})",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{node}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short",
|
||||
"min": 0,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 100 },
|
||||
{ "color": "red", "value": 200 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true,
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 36 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 18,
|
||||
"type": "timeseries",
|
||||
"title": "System Load Average (1m) per Node",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "node_load1",
|
||||
"refId": "A",
|
||||
"legendFormat": "1m \u2014 {{instance}}"
|
||||
},
|
||||
{
|
||||
"expr": "node_load5",
|
||||
"refId": "B",
|
||||
"legendFormat": "5m \u2014 {{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short",
|
||||
"min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 44 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 19,
|
||||
"type": "bargauge",
|
||||
"title": "Node Uptime",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "time() - node_boot_time_seconds",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s",
|
||||
"min": 0,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 300 },
|
||||
{ "color": "green", "value": 3600 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": false,
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 44 }
|
||||
}
|
||||
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,783 @@
|
||||
apiVersion: grafana.integreatly.org/v1beta1
|
||||
kind: GrafanaDashboard
|
||||
metadata:
|
||||
name: okd-workload-health
|
||||
namespace: observability
|
||||
spec:
|
||||
instanceSelector:
|
||||
matchLabels:
|
||||
dashboards: "grafana"
|
||||
json: |
|
||||
{
|
||||
"title": "Workload Health",
|
||||
"uid": "okd-workload-health",
|
||||
"schemaVersion": 36,
|
||||
"version": 3,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"tags": ["okd", "workload", "health"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "namespace",
|
||||
"type": "query",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"query": { "query": "label_values(kube_pod_info, namespace)", "refId": "A" },
|
||||
"refresh": 2,
|
||||
"includeAll": true,
|
||||
"multi": true,
|
||||
"allValue": ".*",
|
||||
"label": "Namespace",
|
||||
"sort": 1,
|
||||
"current": {},
|
||||
"options": []
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
|
||||
{
|
||||
"id": 1, "type": "stat", "title": "Total Pods",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_pod_info{namespace=~\"$namespace\"})", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 2, "type": "stat", "title": "Running Pods",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_pod_status_phase{phase=\"Running\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 3, "type": "stat", "title": "Pending Pods",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_pod_status_phase{phase=\"Pending\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 4, "type": "stat", "title": "Failed Pods",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_pod_status_phase{phase=\"Failed\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 5, "type": "stat", "title": "CrashLoopBackOff",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 6, "type": "stat", "title": "OOMKilled",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_pod_container_status_last_terminated_reason{reason=\"OOMKilled\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "orange", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 7, "type": "stat", "title": "Deployments Available",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_deployment_status_condition{condition=\"Available\",status=\"true\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 8, "type": "stat", "title": "Deployments Degraded",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_deployment_status_replicas_unavailable{namespace=~\"$namespace\"} > 0) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 9, "type": "row", "title": "Deployments", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 10,
|
||||
"type": "table",
|
||||
"title": "Deployment Status",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace,deployment)(kube_deployment_spec_replicas{namespace=~\"$namespace\"})",
|
||||
"refId": "A",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,deployment)(kube_deployment_status_replicas_ready{namespace=~\"$namespace\"})",
|
||||
"refId": "B",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,deployment)(kube_deployment_status_replicas_available{namespace=~\"$namespace\"})",
|
||||
"refId": "C",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,deployment)(kube_deployment_status_replicas_unavailable{namespace=~\"$namespace\"})",
|
||||
"refId": "D",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,deployment)(kube_deployment_status_replicas_updated{namespace=~\"$namespace\"})",
|
||||
"refId": "E",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "filterFieldsByName",
|
||||
"options": {
|
||||
"include": {
|
||||
"names": ["namespace", "deployment", "Value"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "joinByField",
|
||||
"options": {
|
||||
"byField": "deployment",
|
||||
"mode": "outer"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"namespace 1": true,
|
||||
"namespace 2": true,
|
||||
"namespace 3": true,
|
||||
"namespace 4": true
|
||||
},
|
||||
"renameByName": {
|
||||
"namespace": "Namespace",
|
||||
"deployment": "Deployment",
|
||||
"Value": "Desired",
|
||||
"Value 1": "Ready",
|
||||
"Value 2": "Available",
|
||||
"Value 3": "Unavailable",
|
||||
"Value 4": "Up-to-date"
|
||||
},
|
||||
"indexByName": {
|
||||
"namespace": 0,
|
||||
"deployment": 1,
|
||||
"Value": 2,
|
||||
"Value 1": 3,
|
||||
"Value 2": 4,
|
||||
"Value 3": 5,
|
||||
"Value 4": 6
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "sortBy",
|
||||
"options": {
|
||||
"fields": [{ "displayName": "Namespace", "desc": false }]
|
||||
}
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Namespace" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 200 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Deployment" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 220 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Unavailable" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{
|
||||
"id": "thresholds",
|
||||
"value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] }
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Ready" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{
|
||||
"id": "thresholds",
|
||||
"value": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] }
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": { "sortBy": [{ "displayName": "Namespace", "desc": false }] },
|
||||
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 11, "type": "row", "title": "StatefulSets & DaemonSets", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 12,
|
||||
"type": "table",
|
||||
"title": "StatefulSet Status",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace,statefulset)(kube_statefulset_replicas{namespace=~\"$namespace\"})",
|
||||
"refId": "A",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,statefulset)(kube_statefulset_status_replicas_ready{namespace=~\"$namespace\"})",
|
||||
"refId": "B",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,statefulset)(kube_statefulset_status_replicas_current{namespace=~\"$namespace\"})",
|
||||
"refId": "C",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,statefulset)(kube_statefulset_status_replicas_updated{namespace=~\"$namespace\"})",
|
||||
"refId": "D",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "filterFieldsByName",
|
||||
"options": {
|
||||
"include": {
|
||||
"names": ["namespace", "statefulset", "Value"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "joinByField",
|
||||
"options": {
|
||||
"byField": "statefulset",
|
||||
"mode": "outer"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"namespace 1": true,
|
||||
"namespace 2": true,
|
||||
"namespace 3": true
|
||||
},
|
||||
"renameByName": {
|
||||
"namespace": "Namespace",
|
||||
"statefulset": "StatefulSet",
|
||||
"Value": "Desired",
|
||||
"Value 1": "Ready",
|
||||
"Value 2": "Current",
|
||||
"Value 3": "Up-to-date"
|
||||
},
|
||||
"indexByName": {
|
||||
"namespace": 0,
|
||||
"statefulset": 1,
|
||||
"Value": 2,
|
||||
"Value 1": 3,
|
||||
"Value 2": 4,
|
||||
"Value 3": 5
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "sortBy",
|
||||
"options": { "fields": [{ "displayName": "Namespace", "desc": false }] }
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Namespace" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 180 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "StatefulSet" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 200 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Ready" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] } }
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 14 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 13,
|
||||
"type": "table",
|
||||
"title": "DaemonSet Status",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace,daemonset)(kube_daemonset_status_desired_number_scheduled{namespace=~\"$namespace\"})",
|
||||
"refId": "A",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,daemonset)(kube_daemonset_status_number_ready{namespace=~\"$namespace\"})",
|
||||
"refId": "B",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,daemonset)(kube_daemonset_status_number_unavailable{namespace=~\"$namespace\"})",
|
||||
"refId": "C",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,daemonset)(kube_daemonset_status_number_misscheduled{namespace=~\"$namespace\"})",
|
||||
"refId": "D",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "filterFieldsByName",
|
||||
"options": {
|
||||
"include": {
|
||||
"names": ["namespace", "daemonset", "Value"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "joinByField",
|
||||
"options": {
|
||||
"byField": "daemonset",
|
||||
"mode": "outer"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"namespace 1": true,
|
||||
"namespace 2": true,
|
||||
"namespace 3": true
|
||||
},
|
||||
"renameByName": {
|
||||
"namespace": "Namespace",
|
||||
"daemonset": "DaemonSet",
|
||||
"Value": "Desired",
|
||||
"Value 1": "Ready",
|
||||
"Value 2": "Unavailable",
|
||||
"Value 3": "Misscheduled"
|
||||
},
|
||||
"indexByName": {
|
||||
"namespace": 0,
|
||||
"daemonset": 1,
|
||||
"Value": 2,
|
||||
"Value 1": 3,
|
||||
"Value 2": 4,
|
||||
"Value 3": 5
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "sortBy",
|
||||
"options": { "fields": [{ "displayName": "Namespace", "desc": false }] }
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Namespace" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 180 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "DaemonSet" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 200 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Ready" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] } }
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Unavailable" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] } }
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Misscheduled" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "orange", "value": 1 }] } }
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 14 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 14, "type": "row", "title": "Pods", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 22 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 15,
|
||||
"type": "timeseries",
|
||||
"title": "Pod Phase over Time",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(phase)(kube_pod_status_phase{namespace=~\"$namespace\"})",
|
||||
"refId": "A", "legendFormat": "{{phase}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "Running" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Pending" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Failed" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Succeeded" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Unknown" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 16, "x": 0, "y": 23 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 16,
|
||||
"type": "piechart",
|
||||
"title": "Pod Phase — Now",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(phase)(kube_pod_status_phase{namespace=~\"$namespace\"})",
|
||||
"refId": "A", "instant": true, "legendFormat": "{{phase}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "unit": "short", "color": { "mode": "palette-classic" } },
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "Running" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Pending" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Failed" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Succeeded" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Unknown" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"pieType": "donut",
|
||||
"tooltip": { "mode": "single" },
|
||||
"legend": { "displayMode": "table", "placement": "right", "values": ["value", "percent"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 23 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 17,
|
||||
"type": "timeseries",
|
||||
"title": "Container Restarts over Time (total counter, top 10)",
|
||||
"description": "Absolute restart counter — each vertical step = a restart event. Flat line = healthy.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "topk(10,\n sum by(namespace, pod) (\n kube_pod_container_status_restarts_total{namespace=~\"$namespace\"}\n ) > 0\n)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}} / {{pod}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "auto", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 31 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 18,
|
||||
"type": "table",
|
||||
"title": "Container Total Restarts (non-zero)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace, pod, container) (kube_pod_container_status_restarts_total{namespace=~\"$namespace\"}) > 0",
|
||||
"refId": "A",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "filterFieldsByName",
|
||||
"options": {
|
||||
"include": { "names": ["namespace", "pod", "container", "Value"] }
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {},
|
||||
"renameByName": {
|
||||
"namespace": "Namespace",
|
||||
"pod": "Pod",
|
||||
"container": "Container",
|
||||
"Value": "Total Restarts"
|
||||
},
|
||||
"indexByName": { "namespace": 0, "pod": 1, "container": 2, "Value": 3 }
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "sortBy",
|
||||
"options": { "fields": [{ "displayName": "Total Restarts", "desc": true }] }
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "Namespace" }, "properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 160 }] },
|
||||
{ "matcher": { "id": "byName", "options": "Pod" }, "properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 260 }] },
|
||||
{ "matcher": { "id": "byName", "options": "Container" }, "properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 160 }] },
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Total Restarts" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "yellow", "value": null }, { "color": "orange", "value": 5 }, { "color": "red", "value": 20 }] } }
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 31 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 19, "type": "row", "title": "Resource Usage", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 39 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 20,
|
||||
"type": "timeseries",
|
||||
"title": "CPU Usage by Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace)(rate(container_cpu_usage_seconds_total{namespace=~\"$namespace\",container!=\"\",container!=\"POD\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "cores", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 40 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 21,
|
||||
"type": "timeseries",
|
||||
"title": "Memory Usage by Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace)(container_memory_working_set_bytes{namespace=~\"$namespace\",container!=\"\",container!=\"POD\"})",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 40 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 22,
|
||||
"type": "bargauge",
|
||||
"title": "CPU — Actual vs Requested (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace)(rate(container_cpu_usage_seconds_total{namespace=~\"$namespace\",container!=\"\",container!=\"POD\"}[5m]))\n/\nsum by(namespace)(kube_pod_container_resource_requests{resource=\"cpu\",namespace=~\"$namespace\",container!=\"\"})\n* 100",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 150,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 80 }, { "color": "red", "value": 100 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal", "displayMode": "gradient", "showUnfilled": true,
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 48 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 23,
|
||||
"type": "bargauge",
|
||||
"title": "Memory — Actual vs Requested (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace)(container_memory_working_set_bytes{namespace=~\"$namespace\",container!=\"\",container!=\"POD\"})\n/\nsum by(namespace)(kube_pod_container_resource_requests{resource=\"memory\",namespace=~\"$namespace\",container!=\"\"})\n* 100",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 150,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 80 }, { "color": "red", "value": 100 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal", "displayMode": "gradient", "showUnfilled": true,
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 48 }
|
||||
}
|
||||
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,955 @@
|
||||
apiVersion: grafana.integreatly.org/v1beta1
|
||||
kind: GrafanaDashboard
|
||||
metadata:
|
||||
name: okd-networking
|
||||
namespace: observability
|
||||
spec:
|
||||
instanceSelector:
|
||||
matchLabels:
|
||||
dashboards: "grafana"
|
||||
json: |
|
||||
{
|
||||
"title": "Networking",
|
||||
"uid": "okd-networking",
|
||||
"schemaVersion": 36,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"tags": ["okd", "networking"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "namespace",
|
||||
"type": "query",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"query": { "query": "label_values(kube_pod_info, namespace)", "refId": "A" },
|
||||
"refresh": 2,
|
||||
"includeAll": true,
|
||||
"multi": true,
|
||||
"allValue": ".*",
|
||||
"label": "Namespace",
|
||||
"sort": 1,
|
||||
"current": {},
|
||||
"options": []
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
|
||||
{
|
||||
"id": 1, "type": "stat", "title": "Network RX Rate",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(container_network_receive_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] },
|
||||
"unit": "Bps", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 2, "type": "stat", "title": "Network TX Rate",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(container_network_transmit_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] },
|
||||
"unit": "Bps", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 3, "type": "stat", "title": "RX Errors/s",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(container_network_receive_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "pps", "noValue": "0", "decimals": 2
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 4, "type": "stat", "title": "TX Errors/s",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(container_network_transmit_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "pps", "noValue": "0", "decimals": 2
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 5, "type": "stat", "title": "RX Drops/s",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(container_network_receive_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "orange", "value": 1 }] },
|
||||
"unit": "pps", "noValue": "0", "decimals": 2
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 6, "type": "stat", "title": "TX Drops/s",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(container_network_transmit_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "orange", "value": 1 }] },
|
||||
"unit": "pps", "noValue": "0", "decimals": 2
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 7, "type": "stat", "title": "DNS Queries/s",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(coredns_dns_requests_total[5m]))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] },
|
||||
"unit": "reqps", "noValue": "0", "decimals": 1
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 8, "type": "stat", "title": "DNS Error %",
|
||||
"description": "Percentage of DNS responses with non-NOERROR rcode over the last 5 minutes.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(coredns_dns_responses_total{rcode!=\"NOERROR\"}[5m])) / sum(rate(coredns_dns_responses_total[5m])) * 100",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 5 }
|
||||
]},
|
||||
"unit": "percent", "noValue": "0", "decimals": 2
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 9, "type": "row", "title": "Network I/O", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 10, "type": "timeseries", "title": "Receive Rate by Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(namespace)(rate(container_network_receive_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 11, "type": "timeseries", "title": "Transmit Rate by Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(namespace)(rate(container_network_transmit_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 12, "type": "row", "title": "Top Pod Consumers", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 13, "type": "timeseries", "title": "Top 10 Pods — RX Rate",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "topk(10, sum by(namespace,pod)(rate(container_network_receive_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m])))",
|
||||
"refId": "A", "legendFormat": "{{namespace}} / {{pod}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "auto", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 14 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 14, "type": "timeseries", "title": "Top 10 Pods — TX Rate",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "topk(10, sum by(namespace,pod)(rate(container_network_transmit_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m])))",
|
||||
"refId": "A", "legendFormat": "{{namespace}} / {{pod}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "auto", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 14 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 15,
|
||||
"type": "table",
|
||||
"title": "Pod Network I/O Summary",
|
||||
"description": "Current RX/TX rates, errors and drops per pod. Sorted by RX rate descending.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace,pod)(rate(container_network_receive_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "instant": true, "format": "table", "legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,pod)(rate(container_network_transmit_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "B", "instant": true, "format": "table", "legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,pod)(rate(container_network_receive_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "C", "instant": true, "format": "table", "legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,pod)(rate(container_network_transmit_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "D", "instant": true, "format": "table", "legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,pod)(rate(container_network_receive_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "E", "instant": true, "format": "table", "legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,pod)(rate(container_network_transmit_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "F", "instant": true, "format": "table", "legendFormat": ""
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "filterFieldsByName",
|
||||
"options": { "include": { "names": ["namespace", "pod", "Value"] } }
|
||||
},
|
||||
{
|
||||
"id": "joinByField",
|
||||
"options": { "byField": "pod", "mode": "outer" }
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"namespace 1": true,
|
||||
"namespace 2": true,
|
||||
"namespace 3": true,
|
||||
"namespace 4": true,
|
||||
"namespace 5": true
|
||||
},
|
||||
"renameByName": {
|
||||
"namespace": "Namespace",
|
||||
"pod": "Pod",
|
||||
"Value": "RX Rate",
|
||||
"Value 1": "TX Rate",
|
||||
"Value 2": "RX Errors/s",
|
||||
"Value 3": "TX Errors/s",
|
||||
"Value 4": "RX Drops/s",
|
||||
"Value 5": "TX Drops/s"
|
||||
},
|
||||
"indexByName": {
|
||||
"namespace": 0,
|
||||
"pod": 1,
|
||||
"Value": 2,
|
||||
"Value 1": 3,
|
||||
"Value 2": 4,
|
||||
"Value 3": 5,
|
||||
"Value 4": 6,
|
||||
"Value 5": 7
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "sortBy",
|
||||
"options": { "fields": [{ "displayName": "RX Rate", "desc": true }] }
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Namespace" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 160 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Pod" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 260 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byRegexp", "options": "^RX Rate$|^TX Rate$" },
|
||||
"properties": [
|
||||
{ "id": "unit", "value": "Bps" },
|
||||
{ "id": "custom.displayMode", "value": "color-background-solid" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 10000000 },
|
||||
{ "color": "orange", "value": 100000000 },
|
||||
{ "color": "red", "value": 500000000 }
|
||||
]}}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byRegexp", "options": "^RX Errors/s$|^TX Errors/s$" },
|
||||
"properties": [
|
||||
{ "id": "unit", "value": "pps" },
|
||||
{ "id": "decimals", "value": 3 },
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 0.001 }
|
||||
]}}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byRegexp", "options": "^RX Drops/s$|^TX Drops/s$" },
|
||||
"properties": [
|
||||
{ "id": "unit", "value": "pps" },
|
||||
{ "id": "decimals", "value": 3 },
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "orange", "value": 0.001 }
|
||||
]}}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {},
|
||||
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 22 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 16, "type": "row", "title": "Errors & Packet Loss", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 30 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 17, "type": "timeseries", "title": "RX Errors by Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(namespace)(rate(container_network_receive_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "pps", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 12, "x": 0, "y": 31 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 18, "type": "timeseries", "title": "TX Errors by Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(namespace)(rate(container_network_transmit_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "pps", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 12, "x": 12, "y": 31 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 19, "type": "timeseries", "title": "RX Packet Drops by Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(namespace)(rate(container_network_receive_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "pps", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 12, "x": 0, "y": 38 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 20, "type": "timeseries", "title": "TX Packet Drops by Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(namespace)(rate(container_network_transmit_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "pps", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 12, "x": 12, "y": 38 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 21, "type": "row", "title": "DNS (CoreDNS)", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 45 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 22, "type": "timeseries", "title": "DNS Request Rate by Query Type",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(type)(rate(coredns_dns_requests_total[5m]))",
|
||||
"refId": "A", "legendFormat": "{{type}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 46 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 23, "type": "timeseries", "title": "DNS Response Rate by Rcode",
|
||||
"description": "NOERROR = healthy. NXDOMAIN = name not found. SERVFAIL = upstream error.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(rcode)(rate(coredns_dns_responses_total[5m]))",
|
||||
"refId": "A", "legendFormat": "{{rcode}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "NOERROR" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "NXDOMAIN" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "SERVFAIL" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "REFUSED" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 46 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 24, "type": "timeseries", "title": "DNS Request Latency (p50 / p95 / p99)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.50, sum(rate(coredns_dns_request_duration_seconds_bucket[5m])) by (le))",
|
||||
"refId": "A", "legendFormat": "p50"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, sum(rate(coredns_dns_request_duration_seconds_bucket[5m])) by (le))",
|
||||
"refId": "B", "legendFormat": "p95"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(coredns_dns_request_duration_seconds_bucket[5m])) by (le))",
|
||||
"refId": "C", "legendFormat": "p99"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 46 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 25, "type": "timeseries", "title": "DNS Cache Hit Ratio (%)",
|
||||
"description": "High hit ratio = CoreDNS is serving responses from cache, reducing upstream load.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(coredns_cache_hits_total[5m])) / (sum(rate(coredns_cache_hits_total[5m])) + sum(rate(coredns_cache_misses_total[5m]))) * 100",
|
||||
"refId": "A", "legendFormat": "Cache Hit %"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 50 },
|
||||
{ "color": "green", "value": 80 }
|
||||
]},
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 20, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "single" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "lastNotNull"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 12, "x": 0, "y": 54 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 26, "type": "timeseries", "title": "DNS Forward Request Rate",
|
||||
"description": "Queries CoreDNS is forwarding upstream. Spike here with cache miss spike = upstream DNS pressure.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(coredns_forward_requests_total[5m]))",
|
||||
"refId": "A", "legendFormat": "Forward Requests/s"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(coredns_forward_responses_duration_seconds_count[5m]))",
|
||||
"refId": "B", "legendFormat": "Forward Responses/s"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 12, "x": 12, "y": 54 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 27, "type": "row", "title": "Services & Endpoints", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 61 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 28, "type": "stat", "title": "Total Services",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "count(kube_service_info{namespace=~\"$namespace\"})",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 8, "x": 0, "y": 62 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 29, "type": "stat", "title": "Endpoint Addresses Available",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(kube_endpoint_address_available{namespace=~\"$namespace\"})",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 8, "x": 8, "y": 62 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 30, "type": "stat", "title": "Endpoint Addresses Not Ready",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(kube_endpoint_address_not_ready{namespace=~\"$namespace\"}) or vector(0)",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 8, "x": 16, "y": 62 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 31,
|
||||
"type": "table",
|
||||
"title": "Endpoint Availability",
|
||||
"description": "Per-endpoint available vs not-ready address counts. Red Not Ready = pods backing this service are unhealthy.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace,endpoint)(kube_endpoint_address_available{namespace=~\"$namespace\"})",
|
||||
"refId": "A", "instant": true, "format": "table", "legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,endpoint)(kube_endpoint_address_not_ready{namespace=~\"$namespace\"})",
|
||||
"refId": "B", "instant": true, "format": "table", "legendFormat": ""
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "filterFieldsByName",
|
||||
"options": { "include": { "names": ["namespace", "endpoint", "Value"] } }
|
||||
},
|
||||
{
|
||||
"id": "joinByField",
|
||||
"options": { "byField": "endpoint", "mode": "outer" }
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": { "namespace 1": true },
|
||||
"renameByName": {
|
||||
"namespace": "Namespace",
|
||||
"endpoint": "Endpoint",
|
||||
"Value": "Available",
|
||||
"Value 1": "Not Ready"
|
||||
},
|
||||
"indexByName": {
|
||||
"namespace": 0,
|
||||
"endpoint": 1,
|
||||
"Value": 2,
|
||||
"Value 1": 3
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "sortBy",
|
||||
"options": { "fields": [{ "displayName": "Not Ready", "desc": true }] }
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Namespace" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 180 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Endpoint" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 220 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Available" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] } }
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Not Ready" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] } }
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {},
|
||||
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 66 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 32, "type": "row", "title": "OKD Router / Ingress (HAProxy)", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 74 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 33, "type": "timeseries", "title": "Router HTTP Request Rate by Code",
|
||||
"description": "Requires HAProxy router metrics to be scraped (port 1936). OKD exposes these via the openshift-ingress ServiceMonitor.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(code)(rate(haproxy_backend_http_responses_total[5m]))",
|
||||
"refId": "A", "legendFormat": "HTTP {{code}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "HTTP 2xx" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "HTTP 4xx" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "HTTP 5xx" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 75 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 34, "type": "timeseries", "title": "Router 4xx + 5xx Error Rate (%)",
|
||||
"description": "Client error (4xx) and server error (5xx) rates as a percentage of all requests.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(haproxy_backend_http_responses_total{code=\"4xx\"}[5m])) / sum(rate(haproxy_backend_http_responses_total[5m])) * 100",
|
||||
"refId": "A", "legendFormat": "4xx %"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(haproxy_backend_http_responses_total{code=\"5xx\"}[5m])) / sum(rate(haproxy_backend_http_responses_total[5m])) * 100",
|
||||
"refId": "B", "legendFormat": "5xx %"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 5 }
|
||||
]}
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "4xx %" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "5xx %" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 75 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 35, "type": "timeseries", "title": "Router Bytes In / Out",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(haproxy_frontend_bytes_in_total[5m]))",
|
||||
"refId": "A", "legendFormat": "Bytes In"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(haproxy_frontend_bytes_out_total[5m]))",
|
||||
"refId": "B", "legendFormat": "Bytes Out"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "Bytes In" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Bytes Out" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 83 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 36,
|
||||
"type": "table",
|
||||
"title": "Router Backend Server Status",
|
||||
"description": "HAProxy backend servers (routes). Value 0 = DOWN, 1 = UP.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "haproxy_server_up",
|
||||
"refId": "A", "instant": true, "format": "table", "legendFormat": ""
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "filterFieldsByName",
|
||||
"options": { "include": { "names": ["proxy", "server", "Value"] } }
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {},
|
||||
"renameByName": {
|
||||
"proxy": "Backend",
|
||||
"server": "Server",
|
||||
"Value": "Status"
|
||||
},
|
||||
"indexByName": { "proxy": 0, "server": 1, "Value": 2 }
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "sortBy",
|
||||
"options": { "fields": [{ "displayName": "Status", "desc": false }] }
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Backend" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 260 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Server" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 180 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Status" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "mappings", "value": [
|
||||
{ "type": "value", "options": { "0": { "text": "DOWN", "color": "red" } } },
|
||||
{ "type": "value", "options": { "1": { "text": "UP", "color": "green" } } }
|
||||
]},
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "green", "value": 1 }
|
||||
]}}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 83 }
|
||||
}
|
||||
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,607 @@
|
||||
apiVersion: grafana.integreatly.org/v1beta1
|
||||
kind: GrafanaDashboard
|
||||
metadata:
|
||||
name: storage-health
|
||||
namespace: observability
|
||||
spec:
|
||||
instanceSelector:
|
||||
matchLabels:
|
||||
dashboards: "grafana"
|
||||
|
||||
json: |
|
||||
{
|
||||
"title": "Storage Health",
|
||||
"uid": "storage-health",
|
||||
"schemaVersion": 36,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"panels": [
|
||||
|
||||
{
|
||||
"type": "row",
|
||||
"id": 1,
|
||||
"title": "PVC / PV Status",
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 2,
|
||||
"title": "Bound PVCs",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Bound\"}) or vector(0)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 4, "x": 0, "y": 1 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 3,
|
||||
"title": "Pending PVCs",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Pending\"}) or vector(0)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 4, "x": 4, "y": 1 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 4,
|
||||
"title": "Lost PVCs",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Lost\"}) or vector(0)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 4, "x": 8, "y": 1 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 5,
|
||||
"title": "Bound PVs / Available PVs",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(kube_persistentvolume_status_phase{phase=\"Bound\"}) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": "Bound"
|
||||
},
|
||||
{
|
||||
"expr": "sum(kube_persistentvolume_status_phase{phase=\"Available\"}) or vector(0)",
|
||||
"refId": "B",
|
||||
"legendFormat": "Available"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "blue", "value": null }]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 4, "x": 12, "y": 1 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 6,
|
||||
"title": "Ceph Cluster Health",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "ceph_health_status",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 2 }
|
||||
]
|
||||
},
|
||||
"mappings": [
|
||||
{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"0": { "text": "HEALTH_OK", "index": 0 },
|
||||
"1": { "text": "HEALTH_WARN", "index": 1 },
|
||||
"2": { "text": "HEALTH_ERR", "index": 2 }
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"textMode": "value"
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 4, "x": 16, "y": 1 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 7,
|
||||
"title": "OSDs Up / Total",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(ceph_osd_up) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": "Up"
|
||||
},
|
||||
{
|
||||
"expr": "count(ceph_osd_metadata) or vector(0)",
|
||||
"refId": "B",
|
||||
"legendFormat": "Total"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 4, "x": 20, "y": 1 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "row",
|
||||
"id": 8,
|
||||
"title": "Cluster Capacity",
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 6 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "gauge",
|
||||
"id": 9,
|
||||
"title": "Ceph Cluster Used (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes) / ceph_cluster_total_bytes",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 70 },
|
||||
{ "color": "red", "value": 85 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"showThresholdLabels": true,
|
||||
"showThresholdMarkers": true
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 5, "x": 0, "y": 7 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 10,
|
||||
"title": "Ceph Capacity — Total / Available",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "ceph_cluster_total_bytes",
|
||||
"refId": "A",
|
||||
"legendFormat": "Total"
|
||||
},
|
||||
{
|
||||
"expr": "ceph_cluster_total_bytes - (ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes)",
|
||||
"refId": "B",
|
||||
"legendFormat": "Available"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes",
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "blue", "value": null }]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "value",
|
||||
"graphMode": "none",
|
||||
"textMode": "auto",
|
||||
"orientation": "vertical"
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 4, "x": 5, "y": 7 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "bargauge",
|
||||
"id": 11,
|
||||
"title": "PV Allocated Capacity by Storage Class (Bound)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by (storageclass) (\n kube_persistentvolume_capacity_bytes\n * on(persistentvolume) group_left(storageclass)\n kube_persistentvolume_status_phase{phase=\"Bound\"}\n)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{storageclass}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes",
|
||||
"color": { "mode": "palette-classic" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "blue", "value": null }]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 7, "x": 9, "y": 7 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "piechart",
|
||||
"id": 12,
|
||||
"title": "PVC Phase Distribution",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Bound\"}) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": "Bound"
|
||||
},
|
||||
{
|
||||
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Pending\"}) or vector(0)",
|
||||
"refId": "B",
|
||||
"legendFormat": "Pending"
|
||||
},
|
||||
{
|
||||
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Lost\"}) or vector(0)",
|
||||
"refId": "C",
|
||||
"legendFormat": "Lost"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "color": { "mode": "palette-classic" } }
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"pieType": "pie",
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"values": ["value", "percent"]
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 7 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "row",
|
||||
"id": 13,
|
||||
"title": "Ceph Performance",
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 15 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "timeseries",
|
||||
"id": 14,
|
||||
"title": "Ceph Pool IOPS (Read / Write)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(ceph_pool_rd[5m])",
|
||||
"refId": "A",
|
||||
"legendFormat": "Read — pool {{pool_id}}"
|
||||
},
|
||||
{
|
||||
"expr": "rate(ceph_pool_wr[5m])",
|
||||
"refId": "B",
|
||||
"legendFormat": "Write — pool {{pool_id}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ops",
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 8 }
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "timeseries",
|
||||
"id": 15,
|
||||
"title": "Ceph Pool Throughput (Read / Write)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(ceph_pool_rd_bytes[5m])",
|
||||
"refId": "A",
|
||||
"legendFormat": "Read — pool {{pool_id}}"
|
||||
},
|
||||
{
|
||||
"expr": "rate(ceph_pool_wr_bytes[5m])",
|
||||
"refId": "B",
|
||||
"legendFormat": "Write — pool {{pool_id}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps",
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 8 }
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "row",
|
||||
"id": 16,
|
||||
"title": "Ceph OSD & Pool Details",
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 24 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "timeseries",
|
||||
"id": 17,
|
||||
"title": "Ceph Pool Space Used (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * ceph_pool_bytes_used / (ceph_pool_bytes_used + ceph_pool_max_avail)",
|
||||
"refId": "A",
|
||||
"legendFormat": "Pool {{pool_id}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 70 },
|
||||
{ "color": "red", "value": 85 }
|
||||
]
|
||||
},
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10 }
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 25 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "bargauge",
|
||||
"id": 18,
|
||||
"title": "OSD Status per Daemon (green = Up, red = Down)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "ceph_osd_up",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{ceph_daemon}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"min": 0,
|
||||
"max": 1,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "green", "value": 1 }
|
||||
]
|
||||
},
|
||||
"mappings": [
|
||||
{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"0": { "text": "DOWN", "index": 0 },
|
||||
"1": { "text": "UP", "index": 1 }
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"displayMode": "basic",
|
||||
"showUnfilled": true
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 25 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "row",
|
||||
"id": 19,
|
||||
"title": "Node Disk Usage",
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 33 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "timeseries",
|
||||
"id": 20,
|
||||
"title": "Node Root Disk Usage Over Time (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} * 100)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 70 },
|
||||
{ "color": "red", "value": 85 }
|
||||
]
|
||||
},
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10 }
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 34 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "bargauge",
|
||||
"id": 21,
|
||||
"title": "Current Disk Usage — All Nodes & Mountpoints",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 - (node_filesystem_avail_bytes{fstype!~\"tmpfs|overlay|squashfs\"} / node_filesystem_size_bytes{fstype!~\"tmpfs|overlay|squashfs\"} * 100)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}} — {{mountpoint}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 70 },
|
||||
{ "color": "red", "value": 85 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 34 }
|
||||
}
|
||||
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,744 @@
|
||||
apiVersion: grafana.integreatly.org/v1beta1
|
||||
kind: GrafanaDashboard
|
||||
metadata:
|
||||
name: okd-etcd
|
||||
namespace: observability
|
||||
spec:
|
||||
instanceSelector:
|
||||
matchLabels:
|
||||
dashboards: "grafana"
|
||||
json: |
|
||||
{
|
||||
"title": "etcd",
|
||||
"uid": "okd-etcd",
|
||||
"schemaVersion": 36,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"tags": ["okd", "etcd"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "instance",
|
||||
"type": "query",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"query": { "query": "label_values(etcd_server_has_leader, instance)", "refId": "A" },
|
||||
"refresh": 2,
|
||||
"includeAll": true,
|
||||
"multi": true,
|
||||
"allValue": ".*",
|
||||
"label": "Instance",
|
||||
"sort": 1,
|
||||
"current": {},
|
||||
"options": []
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
|
||||
{
|
||||
"id": 1, "type": "stat", "title": "Cluster Members",
|
||||
"description": "Total number of etcd members currently reporting metrics.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(etcd_server_has_leader)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "green", "value": 3 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 2, "type": "stat", "title": "Has Leader",
|
||||
"description": "min() across all members. 0 = at least one member currently has no leader — cluster is degraded.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "min(etcd_server_has_leader)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "green", "value": 1 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0",
|
||||
"mappings": [
|
||||
{ "type": "value", "options": {
|
||||
"0": { "text": "NO LEADER", "color": "red" },
|
||||
"1": { "text": "OK", "color": "green" }
|
||||
}}
|
||||
]
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 3, "type": "stat", "title": "Leader Changes (1h)",
|
||||
"description": "Leader changes seen in the last hour, summed across all members (a single election observed by every member is counted once per member). ≥3 indicates cluster instability.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "sum(changes(etcd_server_leader_changes_seen_total[1h]))", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 3 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 4, "type": "stat", "title": "DB Size (Max)",
|
||||
"description": "Largest boltdb file size across all members. OKD sets the etcd backend quota to 8 GiB by default (upstream etcd defaults to 2 GiB).",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "max(etcd_mvcc_db_total_size_in_bytes)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 2147483648 },
|
||||
{ "color": "orange", "value": 5368709120 },
|
||||
{ "color": "red", "value": 7516192768 }
|
||||
]},
|
||||
"unit": "bytes", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 5, "type": "stat", "title": "DB Fragmentation (Max)",
|
||||
"description": "% of DB space that is allocated but unused. >50% → run etcdctl defrag.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "max((etcd_mvcc_db_total_size_in_bytes - etcd_mvcc_db_total_size_in_use_in_bytes) / etcd_mvcc_db_total_size_in_bytes * 100)",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 25 },
|
||||
{ "color": "orange", "value": 50 },
|
||||
{ "color": "red", "value": 75 }
|
||||
]},
|
||||
"unit": "percent", "noValue": "0", "decimals": 1
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 6, "type": "stat", "title": "Failed Proposals/s",
|
||||
"description": "Rate of rejected Raft proposals. Any sustained non-zero value = cluster health problem.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "sum(rate(etcd_server_proposals_failed_total[5m]))", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 0.001 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0", "decimals": 3
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 7, "type": "stat", "title": "WAL Fsync p99",
|
||||
"description": "99th percentile WAL flush-to-disk time. >10ms is concerning; >100ms = serious I/O bottleneck.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) by (le))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 0.01 },
|
||||
{ "color": "orange", "value": 0.1 },
|
||||
{ "color": "red", "value": 0.5 }
|
||||
]},
|
||||
"unit": "s", "noValue": "0", "decimals": 4
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 8, "type": "stat", "title": "Backend Commit p99",
|
||||
"description": "99th percentile boltdb commit time. >25ms = warning; >100ms = critical backend I/O pressure.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) by (le))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 0.025 },
|
||||
{ "color": "orange", "value": 0.1 },
|
||||
{ "color": "red", "value": 0.25 }
|
||||
]},
|
||||
"unit": "s", "noValue": "0", "decimals": 4
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 9, "type": "row", "title": "Cluster Health", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 10, "type": "timeseries", "title": "Has Leader per Instance",
|
||||
"description": "1 = member has a leader; 0 = member currently has no leader. A dip to 0 marks the exact moment of a leader election.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "etcd_server_has_leader{instance=~\"$instance\"}",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0, "max": 1.1,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 20, "showPoints": "never", "spanNulls": false },
|
||||
"mappings": [
|
||||
{ "type": "value", "options": {
|
||||
"0": { "text": "0 — no leader" },
|
||||
"1": { "text": "1 — ok" }
|
||||
}}
|
||||
]
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "none" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": [] }
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 8, "x": 0, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 11, "type": "timeseries", "title": "Leader Changes (cumulative)",
|
||||
"description": "Monotonically increasing counter per member. A step jump = one leader election. Correlated jumps across members = cluster-wide event.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "etcd_server_leader_changes_seen_total{instance=~\"$instance\"}",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "auto", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "none" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull"] }
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 8, "x": 8, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 12, "type": "timeseries", "title": "Slow Operations",
|
||||
"description": "slow_apply: proposals applied slower than expected. slow_read_index: linearizable reads timing out. heartbeat_failures: Raft heartbeat send errors (network partition indicator).",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "rate(etcd_server_slow_apply_total{instance=~\"$instance\"}[5m])", "refId": "A", "legendFormat": "Slow Apply — {{instance}}" },
|
||||
{ "expr": "rate(etcd_server_slow_read_indexes_total{instance=~\"$instance\"}[5m])", "refId": "B", "legendFormat": "Slow Read Index — {{instance}}" },
|
||||
{ "expr": "rate(etcd_server_heartbeat_send_failures_total{instance=~\"$instance\"}[5m])", "refId": "C", "legendFormat": "Heartbeat Failures — {{instance}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 8, "x": 16, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 13, "type": "row", "title": "gRPC Traffic", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 11 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 14, "type": "timeseries", "title": "gRPC Request Rate by Method",
|
||||
"description": "Unary calls/s per RPC method. High Put/Txn = heavy write load. High Range = heavy read load. Streaming RPCs such as Watch are excluded by the unary filter and do not appear here.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(grpc_method)(rate(grpc_server_started_total{job=~\".*etcd.*\",grpc_type=\"unary\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{grpc_method}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 12 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 15, "type": "timeseries", "title": "gRPC Error Rate by Status Code",
|
||||
"description": "Non-OK responses by gRPC status code. RESOURCE_EXHAUSTED = overloaded. UNAVAILABLE = leader election. DEADLINE_EXCEEDED = latency spike.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(grpc_code)(rate(grpc_server_handled_total{job=~\".*etcd.*\",grpc_code!=\"OK\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{grpc_code}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 12 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 16, "type": "timeseries", "title": "gRPC Request Latency (p50 / p95 / p99)",
|
||||
"description": "Unary call handling duration. p99 > 100ms for Put/Txn indicates disk or CPU pressure. p99 > 500ms will cause kube-apiserver timeouts.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum(rate(grpc_server_handling_seconds_bucket{job=~\".*etcd.*\",grpc_type=\"unary\"}[5m])) by (le))", "refId": "A", "legendFormat": "p50" },
|
||||
{ "expr": "histogram_quantile(0.95, sum(rate(grpc_server_handling_seconds_bucket{job=~\".*etcd.*\",grpc_type=\"unary\"}[5m])) by (le))", "refId": "B", "legendFormat": "p95" },
|
||||
{ "expr": "histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~\".*etcd.*\",grpc_type=\"unary\"}[5m])) by (le))", "refId": "C", "legendFormat": "p99" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 12 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 17, "type": "row", "title": "Raft Proposals", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 20 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 18, "type": "timeseries", "title": "Proposals Committed vs Applied",
|
||||
"description": "Committed = agreed by Raft quorum. Applied = persisted to boltdb. A widening gap between the two = backend apply backlog (disk too slow to keep up).",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "rate(etcd_server_proposals_committed_total{instance=~\"$instance\"}[5m])", "refId": "A", "legendFormat": "Committed — {{instance}}" },
|
||||
{ "expr": "rate(etcd_server_proposals_applied_total{instance=~\"$instance\"}[5m])", "refId": "B", "legendFormat": "Applied — {{instance}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 0, "y": 21 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 19, "type": "timeseries", "title": "Proposals Pending",
|
||||
"description": "In-flight Raft proposals not yet committed. Consistently high (>5) = cluster cannot keep up with write throughput.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "etcd_server_proposals_pending{instance=~\"$instance\"}",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false,
|
||||
"thresholdsStyle": { "mode": "line+area" }
|
||||
},
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 5 },
|
||||
{ "color": "red", "value": 10 }
|
||||
]}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 8, "y": 21 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 20, "type": "timeseries", "title": "Failed Proposals Rate",
|
||||
"description": "Raft proposals that were rejected. Root causes: quorum loss, leader timeout, network partition between members.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "rate(etcd_server_proposals_failed_total{instance=~\"$instance\"}[5m])",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2, "fillOpacity": 20, "showPoints": "never", "spanNulls": false,
|
||||
"thresholdsStyle": { "mode": "line" }
|
||||
},
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 0.001 }
|
||||
]}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 16, "y": 21 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 21, "type": "row", "title": "Disk I/O", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 28 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 22, "type": "timeseries", "title": "WAL Fsync Duration (p50 / p95 / p99) per Instance",
|
||||
"description": "Time to flush the write-ahead log to disk. etcd is extremely sensitive to WAL latency. >10ms p99 = storage is the bottleneck. Correlates directly with Raft commit latency.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum by(le,instance)(rate(etcd_disk_wal_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "A", "legendFormat": "p50 — {{instance}}" },
|
||||
{ "expr": "histogram_quantile(0.95, sum by(le,instance)(rate(etcd_disk_wal_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "B", "legendFormat": "p95 — {{instance}}" },
|
||||
{ "expr": "histogram_quantile(0.99, sum by(le,instance)(rate(etcd_disk_wal_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "C", "legendFormat": "p99 — {{instance}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 29 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 23, "type": "timeseries", "title": "Backend Commit Duration (p50 / p95 / p99) per Instance",
|
||||
"description": "Time for boltdb to commit a batch transaction. A spike here while WAL is healthy = backend I/O saturation or boltdb lock contention. Triggers apply backlog.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum by(le,instance)(rate(etcd_disk_backend_commit_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "A", "legendFormat": "p50 — {{instance}}" },
|
||||
{ "expr": "histogram_quantile(0.95, sum by(le,instance)(rate(etcd_disk_backend_commit_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "B", "legendFormat": "p95 — {{instance}}" },
|
||||
{ "expr": "histogram_quantile(0.99, sum by(le,instance)(rate(etcd_disk_backend_commit_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "C", "legendFormat": "p99 — {{instance}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 29 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 24, "type": "row", "title": "Network (Peer & Client)", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 37 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 25, "type": "timeseries", "title": "Peer RX Rate",
|
||||
"description": "Bytes received from Raft peers (log replication + heartbeats). A burst during a quiet period = large snapshot being streamed to a recovering member.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "rate(etcd_network_peer_received_bytes_total{instance=~\"$instance\"}[5m])",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 6, "x": 0, "y": 38 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 26, "type": "timeseries", "title": "Peer TX Rate",
|
||||
"description": "Bytes sent to Raft peers. Leader will have higher TX than followers (it replicates entries to all members).",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "rate(etcd_network_peer_sent_bytes_total{instance=~\"$instance\"}[5m])",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 6, "x": 6, "y": 38 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 27, "type": "timeseries", "title": "Client gRPC Received",
|
||||
"description": "Bytes received from API clients (kube-apiserver, operators). Spike = large write burst from controllers or kubectl apply.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "rate(etcd_network_client_grpc_received_bytes_total{instance=~\"$instance\"}[5m])",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 6, "x": 12, "y": 38 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 28, "type": "timeseries", "title": "Client gRPC Sent",
|
||||
"description": "Bytes sent to API clients (responses + watch events). Persistently high = many active Watch streams or large objects being served.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "rate(etcd_network_client_grpc_sent_bytes_total{instance=~\"$instance\"}[5m])",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 6, "x": 18, "y": 38 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 29, "type": "row", "title": "DB Size & Process Resources", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 45 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 30, "type": "timeseries", "title": "DB Total vs In-Use Size per Instance",
|
||||
"description": "Total = allocated boltdb file size. In Use = live key data. The gap between them = fragmentation. Steady growth of Total = compaction not keeping up with key churn.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "etcd_mvcc_db_total_size_in_bytes{instance=~\"$instance\"}", "refId": "A", "legendFormat": "Total — {{instance}}" },
|
||||
{ "expr": "etcd_mvcc_db_total_size_in_use_in_bytes{instance=~\"$instance\"}", "refId": "B", "legendFormat": "In Use — {{instance}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 46 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 31, "type": "timeseries", "title": "Process Resident Memory (RSS)",
|
||||
"description": "Physical RAM consumed by the etcd process. Monotonically growing RSS = memory leak or oversized watch cache. Typical healthy range: 500 MiB–2 GiB depending on cluster size.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "etcd_process_resident_memory_bytes{instance=~\"$instance\"}",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 46 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 32, "type": "timeseries", "title": "Open File Descriptors vs Limit",
|
||||
"description": "Open FD count (solid) and process FD limit (dashed). Approaching the limit will cause WAL file creation and new client connections to fail.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "etcd_process_open_fds{instance=~\"$instance\"}", "refId": "A", "legendFormat": "Open — {{instance}}" },
|
||||
{ "expr": "etcd_process_max_fds{instance=~\"$instance\"}", "refId": "B", "legendFormat": "Limit — {{instance}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byRegexp", "options": "^Limit.*" },
|
||||
"properties": [
|
||||
{ "id": "custom.lineWidth", "value": 1 },
|
||||
{ "id": "custom.lineStyle", "value": { "fill": "dash", "dash": [6, 4] } },
|
||||
{ "id": "custom.fillOpacity","value": 0 }
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 46 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 33, "type": "row", "title": "Snapshots", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 54 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 34, "type": "timeseries", "title": "Snapshot Save Duration (p50 / p95 / p99)",
|
||||
"description": "Time to write a full snapshot of the boltdb to disk. Slow saves delay Raft log compaction, causing the WAL to grow unboundedly and members to fall further behind.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum by(le)(rate(etcd_debugging_snap_save_total_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "A", "legendFormat": "p50" },
|
||||
{ "expr": "histogram_quantile(0.95, sum by(le)(rate(etcd_debugging_snap_save_total_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "B", "legendFormat": "p95" },
|
||||
{ "expr": "histogram_quantile(0.99, sum by(le)(rate(etcd_debugging_snap_save_total_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "C", "legendFormat": "p99" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 12, "x": 0, "y": 55 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 35, "type": "timeseries", "title": "Snapshot DB Fsync Duration (p50 / p95 / p99)",
|
||||
"description": "Time to fsync the snapshot file itself. Distinct from WAL fsync: this is flushing the entire boltdb copy to disk after a snapshot is taken.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum by(le)(rate(etcd_snap_db_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "A", "legendFormat": "p50" },
|
||||
{ "expr": "histogram_quantile(0.95, sum by(le)(rate(etcd_snap_db_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "B", "legendFormat": "p95" },
|
||||
{ "expr": "histogram_quantile(0.99, sum by(le)(rate(etcd_snap_db_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "C", "legendFormat": "p99" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 12, "x": 12, "y": 55 }
|
||||
}
|
||||
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,752 @@
|
||||
apiVersion: grafana.integreatly.org/v1beta1
|
||||
kind: GrafanaDashboard
|
||||
metadata:
|
||||
name: okd-control-plane-health
|
||||
namespace: observability
|
||||
spec:
|
||||
instanceSelector:
|
||||
matchLabels:
|
||||
dashboards: "grafana"
|
||||
json: |
|
||||
{
|
||||
"title": "Control Plane Health",
|
||||
"uid": "okd-control-plane",
|
||||
"schemaVersion": 36,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"tags": ["okd", "control-plane"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "instance",
|
||||
"type": "query",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"query": { "query": "label_values(apiserver_request_total, instance)", "refId": "A" },
|
||||
"refresh": 2,
|
||||
"includeAll": true,
|
||||
"multi": true,
|
||||
"allValue": ".*",
|
||||
"label": "API Server Instance",
|
||||
"sort": 1,
|
||||
"current": {},
|
||||
"options": []
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
|
||||
{
|
||||
"id": 1, "type": "stat", "title": "API Servers Up",
|
||||
"description": "Number of kube-apiserver instances currently scraped and up. Healthy HA cluster = 3.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(up{job=~\".*apiserver.*\"} == 1)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "green", "value": 3 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 2, "type": "stat", "title": "Controller Managers Up",
|
||||
"description": "kube-controller-manager instances up. In OKD only one holds the leader lease at a time; others are hot standbys.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(up{job=~\".*controller-manager.*\"} == 1)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "green", "value": 3 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 3, "type": "stat", "title": "Schedulers Up",
|
||||
"description": "kube-scheduler instances up. One holds the leader lease; rest are standbys. 0 = no scheduling of new pods.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(up{job=~\".*scheduler.*\"} == 1)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "green", "value": 3 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 4, "type": "stat", "title": "API 5xx Rate",
|
||||
"description": "Server-side errors (5xx) across all apiserver instances per second. Any sustained non-zero value = apiserver internal fault.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "sum(rate(apiserver_request_total{code=~\"5..\"}[5m]))", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 0.01 },
|
||||
{ "color": "red", "value": 1 }
|
||||
]},
|
||||
"unit": "reqps", "noValue": "0", "decimals": 3
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 5, "type": "stat", "title": "Inflight — Mutating",
|
||||
"description": "Current in-flight mutating requests (POST/PUT/PATCH/DELETE). Default OKD limit is ~1000. Hitting the limit = 429 errors for writes.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "sum(apiserver_current_inflight_requests{request_kind=\"mutating\"})", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 500 },
|
||||
{ "color": "orange", "value": 750 },
|
||||
{ "color": "red", "value": 900 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 6, "type": "stat", "title": "Inflight — Read-Only",
|
||||
"description": "Current in-flight non-mutating requests (GET/LIST/WATCH). Default OKD limit is ~3000. Hitting it = 429 for reads, impacting controllers and kubectl.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "sum(apiserver_current_inflight_requests{request_kind=\"readOnly\"})", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1500 },
|
||||
{ "color": "orange", "value": 2200 },
|
||||
{ "color": "red", "value": 2700 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 7, "type": "stat", "title": "API Request p99 (non-WATCH)",
|
||||
"description": "Overall p99 latency for all non-streaming verbs. >1s = noticeable kubectl sluggishness. >10s = controllers timing out on LIST/GET.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{verb!~\"WATCH|CONNECT\"}[5m])) by (le))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 0.5 },
|
||||
{ "color": "orange", "value": 1 },
|
||||
{ "color": "red", "value": 5 }
|
||||
]},
|
||||
"unit": "s", "noValue": "0", "decimals": 3
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 8, "type": "stat", "title": "APIServer → etcd p99",
|
||||
"description": "p99 time apiserver spends waiting on etcd calls. Spike here while WAL fsync is healthy = serialization or large object overhead.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(apiserver_storage_request_duration_seconds_bucket[5m])) by (le))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 0.05 },
|
||||
{ "color": "orange", "value": 0.2 },
|
||||
{ "color": "red", "value": 0.5 }
|
||||
]},
|
||||
"unit": "s", "noValue": "0", "decimals": 4
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 9, "type": "row", "title": "API Server — Request Rates & Errors", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 10, "type": "timeseries", "title": "Request Rate by Verb",
|
||||
"description": "Non-streaming calls per second broken down by verb. GET/LIST = read load from controllers. POST/PUT/PATCH/DELETE = write throughput. A sudden LIST spike = controller cache resync storm.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(verb)(rate(apiserver_request_total{instance=~\"$instance\",verb!~\"WATCH|CONNECT\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{verb}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 11, "type": "timeseries", "title": "Error Rate by HTTP Status Code",
|
||||
"description": "4xx/5xx responses per second by code. 429 = inflight limit hit (throttling). 422 = admission rejection or invalid object. 500/503 = internal apiserver fault or etcd unavailability.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(code)(rate(apiserver_request_total{instance=~\"$instance\",code=~\"[45]..\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "HTTP {{code}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 12, "type": "timeseries", "title": "In-Flight Requests — Mutating vs Read-Only",
|
||||
"description": "Instantaneous count of requests being actively handled. The two series correspond to the two inflight limit buckets enforced by the apiserver's Priority and Fairness (APF) or legacy inflight settings.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "sum by(request_kind)(apiserver_current_inflight_requests{instance=~\"$instance\"})", "refId": "A", "legendFormat": "{{request_kind}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 20, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 13, "type": "row", "title": "API Server — Latency", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 14, "type": "timeseries", "title": "Request Latency — p50 / p95 / p99 (non-WATCH)",
|
||||
"description": "Aggregated end-to-end request duration across all verbs except WATCH/CONNECT (which are unbounded streaming). A rising p99 without a matching rise in etcd latency = CPU saturation, admission webhook slowness, or serialization overhead.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum(rate(apiserver_request_duration_seconds_bucket{instance=~\"$instance\",verb!~\"WATCH|CONNECT\"}[5m])) by (le))", "refId": "A", "legendFormat": "p50" },
|
||||
{ "expr": "histogram_quantile(0.95, sum(rate(apiserver_request_duration_seconds_bucket{instance=~\"$instance\",verb!~\"WATCH|CONNECT\"}[5m])) by (le))", "refId": "B", "legendFormat": "p95" },
|
||||
{ "expr": "histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{instance=~\"$instance\",verb!~\"WATCH|CONNECT\"}[5m])) by (le))", "refId": "C", "legendFormat": "p99" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 14 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 15, "type": "timeseries", "title": "Request p99 Latency by Verb",
|
||||
"description": "p99 latency broken out per verb. LIST is inherently slower than GET due to serializing full collections. A POST/PUT spike = heavy admission webhook chain or large object writes. DELETE spikes are usually caused by cascading GC finalizer storms.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "histogram_quantile(0.99, sum by(verb,le)(rate(apiserver_request_duration_seconds_bucket{instance=~\"$instance\",verb!~\"WATCH|CONNECT\"}[5m])))",
|
||||
"refId": "A", "legendFormat": "{{verb}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 14 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 16, "type": "timeseries", "title": "APIServer → etcd Latency by Operation",
|
||||
"description": "Time apiserver spends waiting on etcd, split by operation type (get, list, create, update, delete, watch). Elevated get/list = etcd read pressure. Elevated create/update = write bottleneck, likely correlated with WAL fsync latency.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum by(operation,le)(rate(apiserver_storage_request_duration_seconds_bucket[5m])))", "refId": "A", "legendFormat": "p50 — {{operation}}" },
|
||||
{ "expr": "histogram_quantile(0.99, sum by(operation,le)(rate(apiserver_storage_request_duration_seconds_bucket[5m])))", "refId": "B", "legendFormat": "p99 — {{operation}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 14 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 17, "type": "row", "title": "API Server — Watches & Long-Running Requests", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 22 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 18, "type": "timeseries", "title": "Active Long-Running Requests (Watches) by Resource",
|
||||
"description": "Instantaneous count of open WATCH streams grouped by resource. Each controller typically holds one WATCH per resource type per apiserver instance. A sudden drop = controller restart; a runaway climb = operator creating watches without cleanup.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(resource)(apiserver_longrunning_requests{instance=~\"$instance\",verb=\"WATCH\"})",
|
||||
"refId": "A", "legendFormat": "{{resource}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 0, "y": 23 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 19, "type": "timeseries", "title": "Watch Events Dispatched Rate by Kind",
|
||||
"description": "Watch events sent to all active watchers per second, by object kind. Persistent high rate for a specific kind = that resource type is churning heavily, increasing etcd load and controller reconcile frequency.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(kind)(rate(apiserver_watch_events_total{instance=~\"$instance\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{kind}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 8, "y": 23 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 20, "type": "timeseries", "title": "Watch Event Size — p50 / p99 by Kind",
|
||||
"description": "Size of individual watch events dispatched to clients. Large events (MiB-scale) for Secrets or ConfigMaps = objects being stored with oversized data. Contributes to apiserver memory pressure and network saturation.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum by(kind,le)(rate(apiserver_watch_events_sizes_bucket{instance=~\"$instance\"}[5m])))", "refId": "A", "legendFormat": "p50 — {{kind}}" },
|
||||
{ "expr": "histogram_quantile(0.99, sum by(kind,le)(rate(apiserver_watch_events_sizes_bucket{instance=~\"$instance\"}[5m])))", "refId": "B", "legendFormat": "p99 — {{kind}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 16, "y": 23 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 21, "type": "row", "title": "Admission Webhooks", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 30 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 22, "type": "timeseries", "title": "Webhook Call Rate by Name",
|
||||
"description": "Mutating and validating admission webhook invocations per second by webhook name. A webhook invoked on every write (e.g., a mutating webhook with no object selector) can be a major source of write latency amplification.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(name,type)(rate(apiserver_admission_webhook_request_total{instance=~\"$instance\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{type}} — {{name}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 0, "y": 31 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 23, "type": "timeseries", "title": "Webhook Latency p99 by Name",
|
||||
"description": "p99 round-trip time per webhook call (network + webhook server processing). Default apiserver timeout is 10s; a webhook consistently near that limit causes cascading write latency for all resources it intercepts.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "histogram_quantile(0.99, sum by(name,le)(rate(apiserver_admission_webhook_admission_duration_seconds_bucket{instance=~\"$instance\"}[5m])))",
|
||||
"refId": "A", "legendFormat": "{{name}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 0.5 },
|
||||
{ "color": "red", "value": 2.0 }
|
||||
]}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 8, "y": 31 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 24, "type": "timeseries", "title": "Webhook Rejection Rate by Name",
|
||||
"description": "Rate of admission denials per webhook. A validating webhook rejecting requests is expected behaviour; a sudden surge indicates either a newly enforced policy or a misbehaving webhook rejecting valid objects.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(name,error_type)(rate(apiserver_admission_webhook_rejection_count{instance=~\"$instance\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{name}} ({{error_type}})"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 16, "y": 31 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 25, "type": "row", "title": "kube-controller-manager", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 38 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 26, "type": "timeseries", "title": "Work Queue Depth by Controller",
|
||||
"description": "Items waiting to be reconciled in each controller's work queue. Persistent non-zero depth = controller cannot keep up with the event rate. Identifies which specific controller is the bottleneck during overload incidents.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "topk(15, sum by(name)(workqueue_depth{job=~\".*controller-manager.*\"}))",
|
||||
"refId": "A", "legendFormat": "{{name}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false,
|
||||
"thresholdsStyle": { "mode": "line" }
|
||||
},
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 10 },
|
||||
{ "color": "red", "value": 50 }
|
||||
]}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 39 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 27, "type": "timeseries", "title": "Work Queue Item Processing Duration p99 by Controller",
|
||||
"description": "p99 time a work item spends being actively reconciled (inside the reconcile loop, excludes queue wait time). A slow reconcile = either the controller is doing expensive API calls or the etcd write path is slow.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "histogram_quantile(0.99, sum by(name,le)(rate(workqueue_work_duration_seconds_bucket{job=~\".*controller-manager.*\"}[5m])))",
|
||||
"refId": "A", "legendFormat": "{{name}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 39 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 28, "type": "timeseries", "title": "Work Queue Retry Rate by Controller",
|
||||
"description": "Rate of items being re-queued after a failed reconciliation. A persistently high retry rate for a controller = it is encountering recurring errors on the same objects (e.g., API permission errors, webhook rejections, or resource conflicts).",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "topk(15, sum by(name)(rate(workqueue_retries_total{job=~\".*controller-manager.*\"}[5m])))",
|
||||
"refId": "A", "legendFormat": "{{name}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 39 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 29, "type": "row", "title": "kube-scheduler", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 47 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 30, "type": "timeseries", "title": "Scheduling Attempt Rate by Result",
|
||||
"description": "Outcomes of scheduling cycles per second. scheduled = pod successfully bound to a node. unschedulable = no node met the pod's constraints. error = scheduler internal failure (API error, timeout). Persistent unschedulable = cluster capacity or taints/affinity misconfiguration.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(result)(rate(scheduler_schedule_attempts_total[5m]))",
|
||||
"refId": "A", "legendFormat": "{{result}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "scheduled" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "unschedulable" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "error" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 0, "y": 48 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 31, "type": "timeseries", "title": "Scheduling Latency — p50 / p95 / p99",
|
||||
"description": "Duration of a single scheduling attempt, measured from when the scheduler takes the pod off the active queue until the attempt completes (scheduling algorithm plus binding for successfully scheduled pods). Includes filter, score, and reserve plugin execution time. Spike = expensive affinity rules, large number of nodes, slow extender webhooks, or slow bind calls to the apiserver.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket[5m])) by (le))", "refId": "A", "legendFormat": "p50" },
|
||||
{ "expr": "histogram_quantile(0.95, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket[5m])) by (le))", "refId": "B", "legendFormat": "p95" },
|
||||
{ "expr": "histogram_quantile(0.99, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket[5m])) by (le))", "refId": "C", "legendFormat": "p99" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 8, "y": 48 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 32, "type": "timeseries", "title": "Pending Pods by Queue",
|
||||
"description": "Pods waiting to be scheduled, split by internal queue. active = ready to be attempted now. backoff = recently failed, in exponential back-off. unschedulable = parked until cluster state changes. A growing unschedulable queue = systemic capacity or constraint problem.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(queue)(scheduler_pending_pods)",
|
||||
"refId": "A", "legendFormat": "{{queue}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false,
|
||||
"thresholdsStyle": { "mode": "line" }
|
||||
},
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 10 },
|
||||
{ "color": "red", "value": 50 }
|
||||
]}
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "unschedulable" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "backoff" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "active" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 16, "y": 48 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 33, "type": "row", "title": "Process Resources — All Control Plane Components", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 55 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 34, "type": "timeseries", "title": "CPU Usage by Component",
|
||||
"description": "Rate of CPU seconds consumed by each control plane process. apiserver CPU spike = surge in request volume or list serialization. controller-manager CPU spike = reconcile storm. scheduler CPU spike = large node count with complex affinity.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "sum by(job)(rate(process_cpu_seconds_total{job=~\".*apiserver.*\"}[5m]))", "refId": "A", "legendFormat": "apiserver — {{job}}" },
|
||||
{ "expr": "sum by(job)(rate(process_cpu_seconds_total{job=~\".*controller-manager.*\"}[5m]))", "refId": "B", "legendFormat": "controller-manager — {{job}}" },
|
||||
{ "expr": "sum by(job)(rate(process_cpu_seconds_total{job=~\".*scheduler.*\"}[5m]))", "refId": "C", "legendFormat": "scheduler — {{job}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percentunit", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 0, "y": 56 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 35, "type": "timeseries", "title": "RSS Memory by Component",
|
||||
"description": "Resident set size of each control plane process. apiserver memory is dominated by the watch cache size and serialisation buffers. controller-manager memory = informer caches. Monotonically growing RSS without restarts = memory leak or unbounded cache growth.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "sum by(job)(process_resident_memory_bytes{job=~\".*apiserver.*\"})", "refId": "A", "legendFormat": "apiserver — {{job}}" },
|
||||
{ "expr": "sum by(job)(process_resident_memory_bytes{job=~\".*controller-manager.*\"})", "refId": "B", "legendFormat": "controller-manager — {{job}}" },
|
||||
{ "expr": "sum by(job)(process_resident_memory_bytes{job=~\".*scheduler.*\"})", "refId": "C", "legendFormat": "scheduler — {{job}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 8, "y": 56 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 36, "type": "timeseries", "title": "Goroutines by Component",
|
||||
"description": "Number of live goroutines in each control plane process. Gradual upward drift = goroutine leak (often tied to unclosed watch streams or context leaks). A step-down = process restart. apiserver typically runs 200–600 goroutines; spikes above 1000 warrant investigation.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "sum by(job)(go_goroutines{job=~\".*apiserver.*\"})", "refId": "A", "legendFormat": "apiserver — {{job}}" },
|
||||
{ "expr": "sum by(job)(go_goroutines{job=~\".*controller-manager.*\"})", "refId": "B", "legendFormat": "controller-manager — {{job}}" },
|
||||
{ "expr": "sum by(job)(go_goroutines{job=~\".*scheduler.*\"})", "refId": "C", "legendFormat": "scheduler — {{job}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 16, "y": 56 }
|
||||
}
|
||||
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,741 @@
|
||||
apiVersion: grafana.integreatly.org/v1beta1
|
||||
kind: GrafanaDashboard
|
||||
metadata:
|
||||
name: okd-alerts-events
|
||||
namespace: observability
|
||||
spec:
|
||||
instanceSelector:
|
||||
matchLabels:
|
||||
dashboards: "grafana"
|
||||
json: |
|
||||
{
|
||||
"title": "Alerts & Events — Active Problems",
|
||||
"uid": "okd-alerts-events",
|
||||
"schemaVersion": 36,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-3h", "to": "now" },
|
||||
"tags": ["okd", "alerts", "events"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "severity",
|
||||
"type": "custom",
|
||||
"label": "Severity Filter",
|
||||
"query": "critical,warning,info",
|
||||
"current": { "selected": true, "text": "All", "value": "$__all" },
|
||||
"includeAll": true,
|
||||
"allValue": "critical|warning|info",
|
||||
"multi": false,
|
||||
"options": [
|
||||
{ "selected": true, "text": "All", "value": "$__all" },
|
||||
{ "selected": false, "text": "Critical", "value": "critical" },
|
||||
{ "selected": false, "text": "Warning", "value": "warning" },
|
||||
{ "selected": false, "text": "Info", "value": "info" }
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "namespace",
|
||||
"type": "query",
|
||||
"label": "Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"query": { "query": "label_values(ALERTS{alertstate=\"firing\"}, namespace)", "refId": "A" },
|
||||
"refresh": 2,
|
||||
"includeAll": true,
|
||||
"allValue": ".*",
|
||||
"multi": true,
|
||||
"sort": 1,
|
||||
"current": {},
|
||||
"options": []
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
|
||||
{
|
||||
"id": 1, "type": "stat", "title": "Critical Alerts Firing",
|
||||
"description": "Alerting rule instances currently in the firing state with severity=\"critical\". Any non-zero value represents a breached SLO or infrastructure condition requiring immediate on-call response. The ALERTS metric is generated by Prometheus directly from your alerting rules — it reflects what Prometheus knows, before Alertmanager routing or silencing.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(ALERTS{alertstate=\"firing\",severity=\"critical\"}) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 2, "type": "stat", "title": "Warning Alerts Firing",
|
||||
"description": "Firing alerts at severity=\"warning\". Warnings indicate a degraded or elevated-risk condition that has not yet crossed the critical threshold. A sustained or growing warning count often precedes a critical fire — treat them as early-warning signals, not background noise.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(ALERTS{alertstate=\"firing\",severity=\"warning\"}) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "orange", "value": 5 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 3, "type": "stat", "title": "Info / Unclassified Alerts Firing",
|
||||
"description": "Firing alerts with severity=\"info\" or no severity label. These are informational and do not normally require immediate action. A sudden large jump may reveal noisy alerting rules generating alert fatigue — rules worth reviewing for threshold tuning or adding inhibition rules.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(ALERTS{alertstate=\"firing\",severity!~\"critical|warning\"}) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "blue", "value": 1 },
|
||||
{ "color": "blue", "value": 25 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 4, "type": "stat", "title": "Alerts Silenced (Suppressed)",
|
||||
"description": "Alerts currently in Alertmanager's suppressed state — matched by an active silence or muted by an inhibition rule — and therefore not routed to receivers. Silences are intentional during maintenance windows, but a large suppressed count outside of planned maintenance = an overly broad silence (or inhibition rule) masking real problems. Zero suppressed alerts when a maintenance window is active = the silence has expired or was misconfigured.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "sum(alertmanager_alerts{state=\"suppressed\"}) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 20 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 5, "type": "stat", "title": "CrashLoopBackOff Pods",
|
||||
"description": "Container instances currently waiting in the CrashLoopBackOff state — the container crashed and Kubernetes is retrying with exponential back-off. Each instance is a pod that cannot stay running. Common root causes: OOM kill, bad entrypoint, missing Secret or ConfigMap, an unavailable init dependency, or a broken image layer.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 3 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 6, "type": "stat", "title": "OOMKilled Containers",
|
||||
"description": "Containers whose most recent termination reason was OOMKilled. This is a current-state snapshot: a container that was OOMKilled, restarted, and is now Running will still appear here until its next termination occurs for a different reason. Non-zero and stable = recurring OOM, likely a workload memory leak or under-provisioned memory limit.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_pod_container_status_last_terminated_reason{reason=\"OOMKilled\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "orange", "value": 1 },
|
||||
{ "color": "red", "value": 5 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 7, "type": "stat", "title": "NotReady Nodes",
|
||||
"description": "Nodes where the Ready condition is currently not True (False or Unknown). A NotReady node stops receiving new pod scheduling and, after the node eviction timeout (~5 min default), pods on it will be evicted. Control plane nodes going NotReady simultaneously = potential quorum loss. Any non-zero value is a tier-1 incident signal.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"true\"} == 0) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 8, "type": "stat", "title": "Degraded Cluster Operators (OKD)",
|
||||
"description": "OKD ClusterOperators currently reporting Degraded=True. Each ClusterOperator owns a core platform component — authentication, networking, image-registry, monitoring, ingress, storage, etc. A degraded operator means its managed component is impaired or unavailable. Zero is the only acceptable steady-state value outside of an active upgrade.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(cluster_operator_conditions{condition=\"Degraded\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 9, "type": "row", "title": "Alert Overview", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 10, "type": "timeseries", "title": "Firing Alert Count by Severity Over Time",
|
||||
"description": "Instantaneous count of firing ALERTS series grouped by severity over the selected window. A vertical rise = new alerting condition emerged. A horizontal plateau = a persistent, unresolved problem. A step-down = alert resolved or Prometheus rule evaluation stopped matching. Use the Severity Filter variable to narrow scope during triage.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "count by(severity)(ALERTS{alertstate=\"firing\",severity=~\"$severity\",namespace=~\"$namespace\"})",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{severity}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "critical" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "warning" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "info" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["max", "lastNotNull"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 11, "type": "timeseries", "title": "Alertmanager Notification Rate by Integration",
|
||||
"description": "Rate of notification delivery attempts from Alertmanager per second, split by integration type (slack, pagerduty, email, webhook, etc.). Solid lines = all delivery attempts (successful and failed); dashed red lines = failed deliveries. A drop to zero on all integrations = Alertmanager is not processing or the cluster is completely quiet. Persistent failures on one integration = check that receiver's credentials or endpoint availability.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "sum by(integration)(rate(alertmanager_notifications_total[5m]))", "refId": "A", "legendFormat": "✓ {{integration}}" },
|
||||
{ "expr": "sum by(integration)(rate(alertmanager_notifications_failed_total[5m]))", "refId": "B", "legendFormat": "✗ {{integration}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byFrameRefID", "options": "B" },
|
||||
"properties": [
|
||||
{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } },
|
||||
{ "id": "custom.lineStyle", "value": { "dash": [6, 4], "fill": "dash" } },
|
||||
{ "id": "custom.lineWidth", "value": 1 }
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 12, "type": "bargauge", "title": "Longest-Firing Active Alerts",
|
||||
"description": "Duration (now - ALERTS_FOR_STATE timestamp) for each currently firing alert, sorted descending. Alerts at the top have been firing longest and are the most likely candidates for known-but-unresolved issues, stale firing conditions, or alerts that should have a silence applied. Red bars (> 2 hours) strongly suggest a problem that has been acknowledged but not resolved.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sort_desc(time() - ALERTS_FOR_STATE{alertstate=\"firing\",severity=~\"$severity\",namespace=~\"$namespace\"})",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{alertname}} · {{severity}} · {{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 300 },
|
||||
{ "color": "orange", "value": 1800 },
|
||||
{ "color": "red", "value": 7200 }
|
||||
]}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true,
|
||||
"valueMode": "color"
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 13, "type": "row", "title": "Active Firing Alerts — Full Detail", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 14, "type": "table", "title": "All Firing Alerts",
|
||||
"description": "Instant-query table of every currently firing alert visible to Prometheus, filtered by the Namespace and Severity variables above. Each row is one alert instance (unique label combination). The value column is omitted — by definition every row here is firing. Use the built-in column filter (funnel icon) to further narrow to a specific alertname, pod, or node. Columns are sparse: labels not defined in a given alert rule will show '—'.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "ALERTS{alertstate=\"firing\",severity=~\"$severity\",namespace=~\"$namespace\"}",
|
||||
"refId": "A",
|
||||
"instant": true,
|
||||
"legendFormat": ""
|
||||
}],
|
||||
"transformations": [
|
||||
{ "id": "labelsToFields", "options": { "mode": "columns" } },
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"alertstate": true,
|
||||
"__name__": true,
|
||||
"Value": true,
|
||||
"Time": true
|
||||
},
|
||||
"renameByName": {
|
||||
"alertname": "Alert Name",
|
||||
"severity": "Severity",
|
||||
"namespace": "Namespace",
|
||||
"pod": "Pod",
|
||||
"node": "Node",
|
||||
"container": "Container",
|
||||
"job": "Job",
|
||||
"service": "Service",
|
||||
"reason": "Reason",
|
||||
"instance": "Instance"
|
||||
},
|
||||
"indexByName": {
|
||||
"severity": 0,
|
||||
"alertname": 1,
|
||||
"namespace": 2,
|
||||
"pod": 3,
|
||||
"node": 4,
|
||||
"container": 5,
|
||||
"job": 6,
|
||||
"service": 7,
|
||||
"reason": 8,
|
||||
"instance": 9
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": { "align": "left", "filterable": true },
|
||||
"noValue": "—"
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Severity" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "custom.width", "value": 110 },
|
||||
{
|
||||
"id": "mappings",
|
||||
"value": [{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"critical": { "text": "CRITICAL", "color": "dark-red", "index": 0 },
|
||||
"warning": { "text": "WARNING", "color": "dark-yellow", "index": 1 },
|
||||
"info": { "text": "INFO", "color": "dark-blue", "index": 2 }
|
||||
}
|
||||
}]
|
||||
}
|
||||
]
|
||||
},
|
||||
{ "matcher": { "id": "byName", "options": "Alert Name" }, "properties": [{ "id": "custom.width", "value": 300 }] },
|
||||
{ "matcher": { "id": "byName", "options": "Namespace" }, "properties": [{ "id": "custom.width", "value": 180 }] },
|
||||
{ "matcher": { "id": "byName", "options": "Pod" }, "properties": [{ "id": "custom.width", "value": 200 }] },
|
||||
{ "matcher": { "id": "byName", "options": "Node" }, "properties": [{ "id": "custom.width", "value": 200 }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"sortBy": [{ "desc": false, "displayName": "Severity" }],
|
||||
"footer": { "show": false }
|
||||
},
|
||||
"gridPos": { "h": 12, "w": 24, "x": 0, "y": 14 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 15, "type": "row", "title": "Kubernetes Warning Events", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 26 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 16, "type": "timeseries", "title": "Warning Event Rate by Reason",
|
||||
"description": "Rate of Kubernetes Warning-type events per second grouped by reason code. BackOff = container is CrashLooping. FailedScheduling = no node satisfies pod constraints. FailedMount = volume attachment or CSI failure. Evicted = kubelet evicted a pod due to memory or disk pressure. NodeNotReady = node lost contact. A spike in a single reason narrows the incident root-cause immediately without needing to read raw event logs. Requires kube-state-metrics with --resources=events.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "topk(10, sum by(reason)(rate(kube_event_count{type=\"Warning\",namespace=~\"$namespace\"}[5m])))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{reason}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 27 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 17, "type": "bargauge", "title": "Warning Events — Top Namespaces (Accumulated Count)",
|
||||
"description": "Total accumulated Warning event count (the count field on the Kubernetes Event object) per namespace, showing the top 15 most active. A namespace dominating this chart is generating significantly more abnormal conditions than its peers, useful for identifying noisy tenants, misconfigured deployments, or namespaces experiencing a persistent infrastructure problem. Note this is the raw Event.count field — it resets if the event object is deleted and recreated.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "topk(15, sum by(namespace)(kube_event_count{type=\"Warning\"}))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 10 },
|
||||
{ "color": "orange", "value": 50 },
|
||||
{ "color": "red", "value": 200 }
|
||||
]}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 27 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 18, "type": "timeseries", "title": "Warning Events — Accumulated Count by Reason Over Time",
|
||||
"description": "Raw accumulated event count gauge over time, split by reason. Unlike the rate panel this shows total volume and slope simultaneously. A line that climbs steeply = events are occurring frequently right now. A line that plateaus = the condition causing that reason has stopped. A line that drops to zero = the event object was deleted and recreated or the condition fully resolved.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "topk(10, sum by(reason)(kube_event_count{type=\"Warning\",namespace=~\"$namespace\"}))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{reason}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 8, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 27 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 19, "type": "row", "title": "Pod Problems", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 35 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 20, "type": "timeseries", "title": "CrashLoopBackOff Pods by Namespace",
|
||||
"description": "Count of container instances in CrashLoopBackOff waiting state over time, broken down by namespace. A sudden rise in one namespace = a workload deployment is failing. A persistent baseline across many namespaces = a shared dependency (Secret, ConfigMap, network policy, or an upstream service) has become unavailable. Unlike restart rate, this panel shows the steady-state count of pods currently stuck — not flapping.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(namespace)(kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\",namespace=~\"$namespace\"} == 1)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false,
|
||||
"thresholdsStyle": { "mode": "line" }
|
||||
},
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 5 }
|
||||
]}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 0, "y": 36 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 21, "type": "timeseries", "title": "Container Restart Rate by Namespace",
|
||||
"description": "Rate of container restarts per second across all reasons (OOMKill, liveness probe failure, process exit) grouped by namespace. A namespace with a rising restart rate that has not yet entered CrashLoopBackOff is in the early failure window before the exponential back-off penalty kicks in. Cross-reference with the OOMKilled stat tile and the last-terminated-reason to separate crash types.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "topk(10, sum by(namespace)(rate(kube_pod_container_status_restarts_total{namespace=~\"$namespace\"}[5m])))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 8, "y": 36 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 22, "type": "timeseries", "title": "Pods by Problem Phase (Failed / Pending / Unknown)",
|
||||
"description": "Count of pods in Failed, Pending, or Unknown phase over time. Failed = container terminated with a non-zero exit code or was evicted and not rescheduled. Pending for more than a few minutes = scheduler unable to bind the pod (check FailedScheduling events, node capacity, and taint/toleration mismatches). Unknown = kubelet is not reporting to the apiserver, typically indicating a node network partition or kubelet crash.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "sum by(phase)(kube_pod_status_phase{phase=~\"Failed|Unknown\",namespace=~\"$namespace\"} == 1)", "refId": "A", "legendFormat": "{{phase}}" },
|
||||
{ "expr": "sum(kube_pod_status_phase{phase=\"Pending\",namespace=~\"$namespace\"} == 1)", "refId": "B", "legendFormat": "Pending" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false,
|
||||
"thresholdsStyle": { "mode": "line" }
|
||||
},
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]}
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "Failed" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Pending" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Unknown" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 16, "y": 36 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 23, "type": "row", "title": "Node & Cluster Operator Conditions", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 43 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 24, "type": "table", "title": "Node Condition Status Matrix",
|
||||
"description": "Instant snapshot of every active node condition across all nodes. Each row is one (node, condition, status) triple where value=1, meaning that combination is currently true. Ready=true is the normal healthy state; MemoryPressure=true, DiskPressure=true, PIDPressure=true, and NetworkUnavailable=true all indicate problem states that will affect pod scheduling on that node. Use the column filter to show only conditions where status=\"true\" and condition != \"Ready\" to isolate problems quickly.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "kube_node_status_condition == 1",
|
||||
"refId": "A",
|
||||
"instant": true,
|
||||
"legendFormat": ""
|
||||
}],
|
||||
"transformations": [
|
||||
{ "id": "labelsToFields", "options": { "mode": "columns" } },
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"Time": true,
|
||||
"Value": true,
|
||||
"__name__": true,
|
||||
"endpoint": true,
|
||||
"job": true,
|
||||
"service": true,
|
||||
"instance": true
|
||||
},
|
||||
"renameByName": {
|
||||
"node": "Node",
|
||||
"condition": "Condition",
|
||||
"status": "Status"
|
||||
},
|
||||
"indexByName": { "node": 0, "condition": 1, "status": 2 }
|
||||
}
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": { "align": "left", "filterable": true },
|
||||
"noValue": "—"
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Status" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "custom.width", "value": 90 },
|
||||
{
|
||||
"id": "mappings",
|
||||
"value": [{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"true": { "text": "true", "color": "green", "index": 0 },
|
||||
"false": { "text": "false", "color": "dark-red", "index": 1 },
|
||||
"unknown": { "text": "unknown", "color": "dark-orange", "index": 2 }
|
||||
}
|
||||
}]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Condition" },
|
||||
"properties": [
|
||||
{ "id": "custom.width", "value": 190 },
|
||||
{ "id": "custom.displayMode", "value": "color-text" },
|
||||
{
|
||||
"id": "mappings",
|
||||
"value": [{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"Ready": { "color": "green", "index": 0 },
|
||||
"MemoryPressure": { "color": "red", "index": 1 },
|
||||
"DiskPressure": { "color": "red", "index": 2 },
|
||||
"PIDPressure": { "color": "red", "index": 3 },
|
||||
"NetworkUnavailable": { "color": "red", "index": 4 }
|
||||
}
|
||||
}]
|
||||
}
|
||||
]
|
||||
},
|
||||
{ "matcher": { "id": "byName", "options": "Node" }, "properties": [{ "id": "custom.width", "value": 230 }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"sortBy": [{ "desc": false, "displayName": "Node" }],
|
||||
"footer": { "show": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 44 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 25, "type": "table", "title": "Cluster Operator Conditions — Degraded & Progressing (OKD)",
|
||||
"description": "Shows only ClusterOperator conditions that indicate a problem state: Degraded=True (operator has failed to achieve its desired state) or Progressing=True (operator is actively reconciling — normal during upgrades but alarming in steady state). Operators not appearing in this table are healthy. The reason column gives the operator's own explanation for the condition, which maps directly to the relevant operator log stream and OpenShift runbook.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "cluster_operator_conditions{condition=\"Degraded\"} == 1",
|
||||
"refId": "A",
|
||||
"instant": true,
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "cluster_operator_conditions{condition=\"Progressing\"} == 1",
|
||||
"refId": "B",
|
||||
"instant": true,
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{ "id": "labelsToFields", "options": { "mode": "columns" } },
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"Time": true,
|
||||
"Value": true,
|
||||
"__name__": true,
|
||||
"endpoint": true,
|
||||
"job": true,
|
||||
"service": true,
|
||||
"instance": true,
|
||||
"namespace": true
|
||||
},
|
||||
"renameByName": {
|
||||
"name": "Operator",
|
||||
"condition": "Condition",
|
||||
"reason": "Reason"
|
||||
},
|
||||
"indexByName": { "name": 0, "condition": 1, "reason": 2 }
|
||||
}
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": { "align": "left", "filterable": true },
|
||||
"noValue": "—"
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Condition" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "custom.width", "value": 140 },
|
||||
{
|
||||
"id": "mappings",
|
||||
"value": [{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"Degraded": { "text": "Degraded", "color": "dark-red", "index": 0 },
|
||||
"Progressing": { "text": "Progressing", "color": "dark-yellow", "index": 1 }
|
||||
}
|
||||
}]
|
||||
}
|
||||
]
|
||||
},
|
||||
{ "matcher": { "id": "byName", "options": "Operator" }, "properties": [{ "id": "custom.width", "value": 240 }] },
|
||||
{ "matcher": { "id": "byName", "options": "Reason" }, "properties": [{ "id": "custom.width", "value": 220 }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"sortBy": [{ "desc": false, "displayName": "Condition" }],
|
||||
"footer": { "show": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 44 }
|
||||
}
|
||||
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,49 @@
|
||||
# These are probably already created by the rook-ceph operator; this still needs to be validated.
|
||||
# In fact, the second one (rook-ceph-exporter) is definitely already created by the operator.
|
||||
# I overwrote the first one (rook-ceph-mgr) with what is here; it was probably already working.
|
||||
# All that was missing was a label on the rook-ceph namespace to tell Prometheus to look for monitors in this namespace.
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: ServiceMonitor
|
||||
metadata:
|
||||
name: rook-ceph-mgr
|
||||
namespace: rook-ceph
|
||||
labels:
|
||||
# This specific label is what tells OKD's Prometheus to pick this up
|
||||
openshift.io/cluster-monitoring: "true"
|
||||
spec:
|
||||
namespaceSelector:
|
||||
matchNames:
|
||||
- rook-ceph
|
||||
selector:
|
||||
matchLabels:
|
||||
# This matches your 'rook-ceph-mgr' service
|
||||
app: rook-ceph-mgr
|
||||
endpoints:
|
||||
- port: ""
|
||||
# The port in your service has no name (it is only a number), so we use targetPort
|
||||
targetPort: 9283
|
||||
path: /metrics
|
||||
interval: 30s
|
||||
---
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: ServiceMonitor
|
||||
metadata:
|
||||
name: rook-ceph-exporter
|
||||
namespace: rook-ceph
|
||||
labels:
|
||||
# This label is required for OKD cluster-wide monitoring to pick it up
|
||||
openshift.io/cluster-monitoring: "true"
|
||||
team: rook
|
||||
spec:
|
||||
endpoints:
|
||||
- honorLabels: true
|
||||
interval: 10s
|
||||
path: /metrics
|
||||
port: ceph-exporter-http-metrics
|
||||
namespaceSelector:
|
||||
matchNames:
|
||||
- rook-ceph
|
||||
selector:
|
||||
matchLabels:
|
||||
app: rook-ceph-exporter
|
||||
rook_cluster: rook-ceph
|
||||
@@ -0,0 +1,23 @@
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: Role
|
||||
metadata:
|
||||
name: rook-ceph-metrics-viewer
|
||||
namespace: rook-ceph
|
||||
rules:
|
||||
- apiGroups: [""]
|
||||
resources: ["services", "endpoints", "pods"]
|
||||
verbs: ["get", "list", "watch"]
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: RoleBinding
|
||||
metadata:
|
||||
name: rook-ceph-metrics-viewer
|
||||
namespace: rook-ceph
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: Role
|
||||
name: rook-ceph-metrics-viewer
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: prometheus-k8s
|
||||
namespace: openshift-monitoring
|
||||
@@ -0,0 +1,7 @@
|
||||
apiVersion: v1
|
||||
kind: Namespace
|
||||
metadata:
|
||||
name: rook-ceph
|
||||
labels:
|
||||
# This is the critical label that allows OKD Prometheus to see the namespace
|
||||
openshift.io/cluster-monitoring: "true"
|
||||
@@ -0,0 +1,731 @@
|
||||
{
|
||||
"title": "Alerts & Events — Active Problems",
|
||||
"uid": "okd-alerts-events",
|
||||
"schemaVersion": 36,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-3h", "to": "now" },
|
||||
"tags": ["okd", "alerts", "events"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "severity",
|
||||
"type": "custom",
|
||||
"label": "Severity Filter",
|
||||
"query": "critical,warning,info",
|
||||
"current": { "selected": true, "text": "All", "value": "$__all" },
|
||||
"includeAll": true,
|
||||
"allValue": "critical|warning|info",
|
||||
"multi": false,
|
||||
"options": [
|
||||
{ "selected": true, "text": "All", "value": "$__all" },
|
||||
{ "selected": false, "text": "Critical", "value": "critical" },
|
||||
{ "selected": false, "text": "Warning", "value": "warning" },
|
||||
{ "selected": false, "text": "Info", "value": "info" }
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "namespace",
|
||||
"type": "query",
|
||||
"label": "Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"query": { "query": "label_values(ALERTS{alertstate=\"firing\"}, namespace)", "refId": "A" },
|
||||
"refresh": 2,
|
||||
"includeAll": true,
|
||||
"allValue": ".*",
|
||||
"multi": true,
|
||||
"sort": 1,
|
||||
"current": {},
|
||||
"options": []
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
|
||||
{
|
||||
"id": 1, "type": "stat", "title": "Critical Alerts Firing",
|
||||
"description": "Alerting rule instances currently in the firing state with severity=\"critical\". Any non-zero value represents a breached SLO or infrastructure condition requiring immediate on-call response. The ALERTS metric is generated by Prometheus directly from your alerting rules — it reflects what Prometheus knows, before Alertmanager routing or silencing.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(ALERTS{alertstate=\"firing\",severity=\"critical\"}) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 2, "type": "stat", "title": "Warning Alerts Firing",
|
||||
"description": "Firing alerts at severity=\"warning\". Warnings indicate a degraded or elevated-risk condition that has not yet crossed the critical threshold. A sustained or growing warning count often precedes a critical fire — treat them as early-warning signals, not background noise.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(ALERTS{alertstate=\"firing\",severity=\"warning\"}) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "orange", "value": 5 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 3, "type": "stat", "title": "Info / Unclassified Alerts Firing",
|
||||
"description": "Firing alerts whose severity is anything other than critical or warning: severity=\"info\", any non-standard severity value, or no severity label at all. These are informational and do not normally require immediate action. A sudden large jump may reveal noisy alerting rules generating alert fatigue — rules worth reviewing for threshold tuning or adding inhibition rules.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(ALERTS{alertstate=\"firing\",severity!~\"critical|warning\"}) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "blue", "value": 1 },
|
||||
{ "color": "blue", "value": 25 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 4, "type": "stat", "title": "Alerts Silenced (Suppressed)",
|
||||
"description": "Alerts currently in Alertmanager's suppressed state — matched by an active silence or muted by an inhibition rule — and therefore not routed to receivers. Silences are intentional during maintenance windows, but a large suppressed count outside of planned maintenance = an overly broad silence masking real problems. Zero suppressed alerts during an active maintenance window may mean the silence has expired or was misconfigured.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "sum(alertmanager_alerts{state=\"suppressed\"}) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 20 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 5, "type": "stat", "title": "CrashLoopBackOff Pods",
|
||||
"description": "Container instances currently waiting in the CrashLoopBackOff state — the container crashed and Kubernetes is retrying with exponential back-off. Each instance is a pod that cannot stay running. Common root causes: OOM kill, bad entrypoint, missing Secret or ConfigMap, an unavailable init dependency, or a broken image layer.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 3 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 6, "type": "stat", "title": "OOMKilled Containers",
|
||||
"description": "Containers whose most recent termination reason was OOMKilled. This is a current-state snapshot: a container that was OOMKilled, restarted, and is now Running will still appear here until its next termination occurs for a different reason. Non-zero and stable = recurring OOM, likely a workload memory leak or under-provisioned memory limit.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_pod_container_status_last_terminated_reason{reason=\"OOMKilled\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "orange", "value": 1 },
|
||||
{ "color": "red", "value": 5 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 7, "type": "stat", "title": "NotReady Nodes",
|
||||
"description": "Nodes where the Ready condition is currently not True (False or Unknown). A NotReady node stops receiving new pod scheduling and, after the node eviction timeout (~5 min default), pods on it will be evicted. Control plane nodes going NotReady simultaneously = potential quorum loss. Any non-zero value is a tier-1 incident signal.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"true\"} == 0) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 8, "type": "stat", "title": "Degraded Cluster Operators (OKD)",
|
||||
"description": "OKD ClusterOperators currently reporting Degraded=True. Each ClusterOperator owns a core platform component — authentication, networking, image-registry, monitoring, ingress, storage, etc. A degraded operator means its managed component is impaired or unavailable. Zero is the only acceptable steady-state value outside of an active upgrade.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(cluster_operator_conditions{condition=\"Degraded\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 9, "type": "row", "title": "Alert Overview", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 10, "type": "timeseries", "title": "Firing Alert Count by Severity Over Time",
|
||||
"description": "Instantaneous count of firing ALERTS series grouped by severity over the selected window. A vertical rise = new alerting condition emerged. A horizontal plateau = a persistent, unresolved problem. A step-down = alert resolved or Prometheus rule evaluation stopped matching. Use the Severity Filter variable to narrow scope during triage.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "count by(severity)(ALERTS{alertstate=\"firing\",severity=~\"$severity\",namespace=~\"$namespace\"})",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{severity}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "critical" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "warning" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "info" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["max", "lastNotNull"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 11, "type": "timeseries", "title": "Alertmanager Notification Rate by Integration",
|
||||
"description": "Rate of notification delivery attempts from Alertmanager per second, split by integration type (slack, pagerduty, email, webhook, etc.). Solid lines = all delivery attempts (alertmanager_notifications_total, which includes failed attempts); dashed red lines = failed deliveries (alertmanager_notifications_failed_total). A drop to zero on all integrations = Alertmanager is not processing or the cluster is completely quiet. Persistent failures on one integration = check that receiver's credentials or endpoint availability.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "sum by(integration)(rate(alertmanager_notifications_total[5m]))", "refId": "A", "legendFormat": "✓ {{integration}}" },
|
||||
{ "expr": "sum by(integration)(rate(alertmanager_notifications_failed_total[5m]))", "refId": "B", "legendFormat": "✗ {{integration}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byFrameRefID", "options": "B" },
|
||||
"properties": [
|
||||
{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } },
|
||||
{ "id": "custom.lineStyle", "value": { "dash": [6, 4], "fill": "dash" } },
|
||||
{ "id": "custom.lineWidth", "value": 1 }
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 12, "type": "bargauge", "title": "Longest-Firing Active Alerts",
|
||||
"description": "Duration (now - ALERTS_FOR_STATE timestamp) for each currently firing alert, sorted descending. Alerts at the top have been firing longest and are the most likely candidates for known-but-unresolved issues, stale firing conditions, or alerts that should have a silence applied. Red bars (> 2 hours) strongly suggest a problem that has been acknowledged but not resolved.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sort_desc(time() - ALERTS_FOR_STATE{alertstate=\"firing\",severity=~\"$severity\",namespace=~\"$namespace\"})",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{alertname}} · {{severity}} · {{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 300 },
|
||||
{ "color": "orange", "value": 1800 },
|
||||
{ "color": "red", "value": 7200 }
|
||||
]}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true,
|
||||
"valueMode": "color"
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 13, "type": "row", "title": "Active Firing Alerts — Full Detail", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 14, "type": "table", "title": "All Firing Alerts",
|
||||
"description": "Instant-query table of every currently firing alert visible to Prometheus, filtered by the Namespace and Severity variables above. Each row is one alert instance (unique label combination). The value column is omitted — by definition every row here is firing. Use the built-in column filter (funnel icon) to further narrow to a specific alertname, pod, or node. Columns are sparse: labels not defined in a given alert rule will show '—'.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "ALERTS{alertstate=\"firing\",severity=~\"$severity\",namespace=~\"$namespace\"}",
|
||||
"refId": "A",
|
||||
"instant": true,
|
||||
"legendFormat": ""
|
||||
}],
|
||||
"transformations": [
|
||||
{ "id": "labelsToFields", "options": { "mode": "columns" } },
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"alertstate": true,
|
||||
"__name__": true,
|
||||
"Value": true,
|
||||
"Time": true
|
||||
},
|
||||
"renameByName": {
|
||||
"alertname": "Alert Name",
|
||||
"severity": "Severity",
|
||||
"namespace": "Namespace",
|
||||
"pod": "Pod",
|
||||
"node": "Node",
|
||||
"container": "Container",
|
||||
"job": "Job",
|
||||
"service": "Service",
|
||||
"reason": "Reason",
|
||||
"instance": "Instance"
|
||||
},
|
||||
"indexByName": {
|
||||
"severity": 0,
|
||||
"alertname": 1,
|
||||
"namespace": 2,
|
||||
"pod": 3,
|
||||
"node": 4,
|
||||
"container": 5,
|
||||
"job": 6,
|
||||
"service": 7,
|
||||
"reason": 8,
|
||||
"instance": 9
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": { "align": "left", "filterable": true },
|
||||
"noValue": "—"
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Severity" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "custom.width", "value": 110 },
|
||||
{
|
||||
"id": "mappings",
|
||||
"value": [{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"critical": { "text": "CRITICAL", "color": "dark-red", "index": 0 },
|
||||
"warning": { "text": "WARNING", "color": "dark-yellow", "index": 1 },
|
||||
"info": { "text": "INFO", "color": "dark-blue", "index": 2 }
|
||||
}
|
||||
}]
|
||||
}
|
||||
]
|
||||
},
|
||||
{ "matcher": { "id": "byName", "options": "Alert Name" }, "properties": [{ "id": "custom.width", "value": 300 }] },
|
||||
{ "matcher": { "id": "byName", "options": "Namespace" }, "properties": [{ "id": "custom.width", "value": 180 }] },
|
||||
{ "matcher": { "id": "byName", "options": "Pod" }, "properties": [{ "id": "custom.width", "value": 200 }] },
|
||||
{ "matcher": { "id": "byName", "options": "Node" }, "properties": [{ "id": "custom.width", "value": 200 }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"sortBy": [{ "desc": false, "displayName": "Severity" }],
|
||||
"footer": { "show": false }
|
||||
},
|
||||
"gridPos": { "h": 12, "w": 24, "x": 0, "y": 14 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 15, "type": "row", "title": "Kubernetes Warning Events", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 26 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 16, "type": "timeseries", "title": "Warning Event Rate by Reason",
|
||||
"description": "Rate of Kubernetes Warning-type events per second grouped by reason code. BackOff = container is CrashLooping. FailedScheduling = no node satisfies pod constraints. FailedMount = volume attachment or CSI failure. Evicted = kubelet evicted a pod due to memory or disk pressure. NodeNotReady = node lost contact. A spike in a single reason narrows the incident root-cause immediately without needing to read raw event logs. Requires kube-state-metrics with --resources=events.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "topk(10, sum by(reason)(rate(kube_event_count{type=\"Warning\",namespace=~\"$namespace\"}[5m])))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{reason}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 27 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 17, "type": "bargauge", "title": "Warning Events — Top Namespaces (Accumulated Count)",
|
||||
"description": "Total accumulated Warning event count (the count field on the Kubernetes Event object) per namespace, showing the top 15 most active. A namespace dominating this chart is generating significantly more abnormal conditions than its peers, useful for identifying noisy tenants, misconfigured deployments, or namespaces experiencing a persistent infrastructure problem. Note this is the raw Event.count field — it resets if the event object is deleted and recreated.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "topk(15, sum by(namespace)(kube_event_count{type=\"Warning\"}))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 10 },
|
||||
{ "color": "orange", "value": 50 },
|
||||
{ "color": "red", "value": 200 }
|
||||
]}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 27 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 18, "type": "timeseries", "title": "Warning Events — Accumulated Count by Reason Over Time",
|
||||
"description": "Raw accumulated event count gauge over time, split by reason. Unlike the rate panel this shows total volume and slope simultaneously. A line that climbs steeply = events are occurring frequently right now. A line that plateaus = the condition causing that reason has stopped. A line that drops to zero = the event object was deleted and recreated or the condition fully resolved.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "topk(10, sum by(reason)(kube_event_count{type=\"Warning\",namespace=~\"$namespace\"}))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{reason}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 8, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 27 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 19, "type": "row", "title": "Pod Problems", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 35 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 20, "type": "timeseries", "title": "CrashLoopBackOff Pods by Namespace",
|
||||
"description": "Count of container instances in CrashLoopBackOff waiting state over time, broken down by namespace. A sudden rise in one namespace = a workload deployment is failing. A persistent baseline across many namespaces = a shared dependency (Secret, ConfigMap, network policy, or an upstream service) has become unavailable. Unlike restart rate, this panel shows the steady-state count of pods currently stuck — not flapping.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(namespace)(kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\",namespace=~\"$namespace\"} == 1)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false,
|
||||
"thresholdsStyle": { "mode": "line" }
|
||||
},
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 5 }
|
||||
]}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 0, "y": 36 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 21, "type": "timeseries", "title": "Container Restart Rate by Namespace",
|
||||
"description": "Rate of container restarts per second across all reasons (OOMKill, liveness probe failure, process exit) grouped by namespace. A namespace with a rising restart rate that has not yet entered CrashLoopBackOff is in the early failure window before the exponential back-off penalty kicks in. Cross-reference with the OOMKilled stat tile and the last-terminated-reason to separate crash types.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "topk(10, sum by(namespace)(rate(kube_pod_container_status_restarts_total{namespace=~\"$namespace\"}[5m])))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 8, "y": 36 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 22, "type": "timeseries", "title": "Pods by Problem Phase (Failed / Pending / Unknown)",
|
||||
"description": "Count of pods in Failed, Pending, or Unknown phase over time. Failed = container terminated with a non-zero exit code or was evicted and not rescheduled. Pending for more than a few minutes = scheduler unable to bind the pod (check FailedScheduling events, node capacity, and taint/toleration mismatches). Unknown = kubelet is not reporting to the apiserver, typically indicating a node network partition or kubelet crash.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "sum by(phase)(kube_pod_status_phase{phase=~\"Failed|Unknown\",namespace=~\"$namespace\"} == 1)", "refId": "A", "legendFormat": "{{phase}}" },
|
||||
{ "expr": "sum(kube_pod_status_phase{phase=\"Pending\",namespace=~\"$namespace\"} == 1)", "refId": "B", "legendFormat": "Pending" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false,
|
||||
"thresholdsStyle": { "mode": "line" }
|
||||
},
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]}
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "Failed" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Pending" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Unknown" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 16, "y": 36 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 23, "type": "row", "title": "Node & Cluster Operator Conditions", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 43 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 24, "type": "table", "title": "Node Condition Status Matrix",
|
||||
"description": "Instant snapshot of every active node condition across all nodes. Each row is one (node, condition, status) triple where value=1, meaning that combination is currently true. Ready=true is the normal healthy state; MemoryPressure=true, DiskPressure=true, PIDPressure=true, and NetworkUnavailable=true all indicate problem states that will affect pod scheduling on that node. Use the column filter to show only conditions where status=\"true\" and condition != \"Ready\" to isolate problems quickly.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "kube_node_status_condition == 1",
|
||||
"refId": "A",
|
||||
"instant": true,
|
||||
"legendFormat": ""
|
||||
}],
|
||||
"transformations": [
|
||||
{ "id": "labelsToFields", "options": { "mode": "columns" } },
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"Time": true,
|
||||
"Value": true,
|
||||
"__name__": true,
|
||||
"endpoint": true,
|
||||
"job": true,
|
||||
"service": true,
|
||||
"instance": true
|
||||
},
|
||||
"renameByName": {
|
||||
"node": "Node",
|
||||
"condition": "Condition",
|
||||
"status": "Status"
|
||||
},
|
||||
"indexByName": { "node": 0, "condition": 1, "status": 2 }
|
||||
}
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": { "align": "left", "filterable": true },
|
||||
"noValue": "—"
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Status" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "custom.width", "value": 90 },
|
||||
{
|
||||
"id": "mappings",
|
||||
"value": [{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"true": { "text": "true", "color": "green", "index": 0 },
|
||||
"false": { "text": "false", "color": "dark-red", "index": 1 },
|
||||
"unknown": { "text": "unknown", "color": "dark-orange", "index": 2 }
|
||||
}
|
||||
}]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Condition" },
|
||||
"properties": [
|
||||
{ "id": "custom.width", "value": 190 },
|
||||
{ "id": "custom.displayMode", "value": "color-text" },
|
||||
{
|
||||
"id": "mappings",
|
||||
"value": [{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"Ready": { "color": "green", "index": 0 },
|
||||
"MemoryPressure": { "color": "red", "index": 1 },
|
||||
"DiskPressure": { "color": "red", "index": 2 },
|
||||
"PIDPressure": { "color": "red", "index": 3 },
|
||||
"NetworkUnavailable": { "color": "red", "index": 4 }
|
||||
}
|
||||
}]
|
||||
}
|
||||
]
|
||||
},
|
||||
{ "matcher": { "id": "byName", "options": "Node" }, "properties": [{ "id": "custom.width", "value": 230 }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"sortBy": [{ "desc": false, "displayName": "Node" }],
|
||||
"footer": { "show": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 44 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 25, "type": "table", "title": "Cluster Operator Conditions — Degraded & Progressing (OKD)",
|
||||
"description": "Shows only ClusterOperator conditions that indicate a problem state: Degraded=True (operator has failed to achieve its desired state) or Progressing=True (operator is actively reconciling — normal during upgrades but alarming in steady state). Operators not appearing in this table are healthy. The reason column gives the operator's own explanation for the condition, which maps directly to the relevant operator log stream and OpenShift runbook.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "cluster_operator_conditions{condition=\"Degraded\"} == 1",
|
||||
"refId": "A",
|
||||
"instant": true,
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "cluster_operator_conditions{condition=\"Progressing\"} == 1",
|
||||
"refId": "B",
|
||||
"instant": true,
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{ "id": "labelsToFields", "options": { "mode": "columns" } },
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"Time": true,
|
||||
"Value": true,
|
||||
"__name__": true,
|
||||
"endpoint": true,
|
||||
"job": true,
|
||||
"service": true,
|
||||
"instance": true,
|
||||
"namespace": true
|
||||
},
|
||||
"renameByName": {
|
||||
"name": "Operator",
|
||||
"condition": "Condition",
|
||||
"reason": "Reason"
|
||||
},
|
||||
"indexByName": { "name": 0, "condition": 1, "reason": 2 }
|
||||
}
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": { "align": "left", "filterable": true },
|
||||
"noValue": "—"
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Condition" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "custom.width", "value": 140 },
|
||||
{
|
||||
"id": "mappings",
|
||||
"value": [{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"Degraded": { "text": "Degraded", "color": "dark-red", "index": 0 },
|
||||
"Progressing": { "text": "Progressing", "color": "dark-yellow", "index": 1 }
|
||||
}
|
||||
}]
|
||||
}
|
||||
]
|
||||
},
|
||||
{ "matcher": { "id": "byName", "options": "Operator" }, "properties": [{ "id": "custom.width", "value": 240 }] },
|
||||
{ "matcher": { "id": "byName", "options": "Reason" }, "properties": [{ "id": "custom.width", "value": 220 }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"sortBy": [{ "desc": false, "displayName": "Condition" }],
|
||||
"footer": { "show": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 44 }
|
||||
}
|
||||
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,739 @@
|
||||
{
|
||||
"title": "Cluster Overview",
|
||||
"uid": "okd-cluster-overview",
|
||||
"schemaVersion": 36,
|
||||
"version": 2,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"tags": ["okd", "cluster", "overview"],
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"type": "stat",
|
||||
"title": "Ready Nodes",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"true\"} == 1)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "green", "value": 1 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"type": "stat",
|
||||
"title": "Not Ready Nodes",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"false\"} == 1) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"type": "stat",
|
||||
"title": "Running Pods",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(kube_pod_status_phase{phase=\"Running\"} == 1)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "green", "value": 1 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"type": "stat",
|
||||
"title": "Pending Pods",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(kube_pod_status_phase{phase=\"Pending\"} == 1) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 5 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"type": "stat",
|
||||
"title": "Failed Pods",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(kube_pod_status_phase{phase=\"Failed\"} == 1) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"type": "stat",
|
||||
"title": "CrashLoopBackOff",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\"} == 1) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"type": "stat",
|
||||
"title": "Critical Alerts",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(ALERTS{alertstate=\"firing\",severity=\"critical\"}) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"type": "stat",
|
||||
"title": "Warning Alerts",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(ALERTS{alertstate=\"firing\",severity=\"warning\"}) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 10 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||
},
|
||||
{
|
||||
"id": 9,
|
||||
"type": "gauge",
|
||||
"title": "CPU Usage",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))",
|
||||
"refId": "A",
|
||||
"legendFormat": "CPU"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 70 },
|
||||
{ "color": "red", "value": 85 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"showThresholdLabels": false,
|
||||
"showThresholdMarkers": true,
|
||||
"orientation": "auto"
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 5, "x": 0, "y": 4 }
|
||||
},
|
||||
{
|
||||
"id": 10,
|
||||
"type": "gauge",
|
||||
"title": "Memory Usage",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (sum(node_memory_MemAvailable_bytes) / sum(node_memory_MemTotal_bytes)))",
|
||||
"refId": "A",
|
||||
"legendFormat": "Memory"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 75 },
|
||||
{ "color": "red", "value": 90 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"showThresholdLabels": false,
|
||||
"showThresholdMarkers": true,
|
||||
"orientation": "auto"
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 5, "x": 5, "y": 4 }
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"type": "gauge",
|
||||
"title": "Root Disk Usage",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (sum(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"})))",
|
||||
"refId": "A",
|
||||
"legendFormat": "Disk"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 70 },
|
||||
{ "color": "red", "value": 85 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"showThresholdLabels": false,
|
||||
"showThresholdMarkers": true,
|
||||
"orientation": "auto"
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 4, "x": 10, "y": 4 }
|
||||
},
|
||||
{
|
||||
"id": 12,
|
||||
"type": "stat",
|
||||
"title": "etcd Has Leader",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "min(etcd_server_has_leader)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "green", "value": 1 }
|
||||
]
|
||||
},
|
||||
"mappings": [
|
||||
{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"0": { "text": "NO LEADER", "color": "red" },
|
||||
"1": { "text": "LEADER OK", "color": "green" }
|
||||
}
|
||||
}
|
||||
],
|
||||
"unit": "short",
|
||||
"noValue": "?"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 3, "w": 5, "x": 14, "y": 4 }
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"type": "stat",
|
||||
"title": "API Servers Up",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(up{job=\"apiserver\"})",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "green", "value": 2 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 3, "w": 5, "x": 19, "y": 4 }
|
||||
},
|
||||
{
|
||||
"id": 14,
|
||||
"type": "stat",
|
||||
"title": "etcd Members Up",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(up{job=\"etcd\"})",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 2 },
|
||||
{ "color": "green", "value": 3 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 3, "w": 5, "x": 14, "y": 7 }
|
||||
},
|
||||
{
|
||||
"id": 15,
|
||||
"type": "stat",
|
||||
"title": "Operators Degraded",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(cluster_operator_conditions{condition=\"Degraded\"} == 1) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 3, "w": 5, "x": 19, "y": 7 }
|
||||
},
|
||||
{
|
||||
"id": 16,
|
||||
"type": "timeseries",
|
||||
"title": "CPU Usage per Node (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 10,
|
||||
"spanNulls": false,
|
||||
"showPoints": "never"
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": {
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"calcs": ["mean", "max"]
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 10 }
|
||||
},
|
||||
{
|
||||
"id": 17,
|
||||
"type": "timeseries",
|
||||
"title": "Memory Usage per Node (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 10,
|
||||
"spanNulls": false,
|
||||
"showPoints": "never"
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": {
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"calcs": ["mean", "max"]
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 10 }
|
||||
},
|
||||
{
|
||||
"id": 18,
|
||||
"type": "timeseries",
|
||||
"title": "Network Traffic — Cluster Total",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo|veth.*|tun.*|ovn.*|br-int|br-ex\"}[5m]))",
|
||||
"refId": "A",
|
||||
"legendFormat": "Receive"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo|veth.*|tun.*|ovn.*|br-int|br-ex\"}[5m]))",
|
||||
"refId": "B",
|
||||
"legendFormat": "Transmit"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps",
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 10,
|
||||
"spanNulls": false,
|
||||
"showPoints": "never"
|
||||
}
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Receive" },
|
||||
"properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Transmit" },
|
||||
"properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "none" },
|
||||
"legend": {
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"calcs": ["mean", "max"]
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 18 }
|
||||
},
|
||||
{
|
||||
"id": 19,
|
||||
"type": "timeseries",
|
||||
"title": "Pod Phases Over Time",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(kube_pod_status_phase{phase=\"Running\"} == 1)",
|
||||
"refId": "A",
|
||||
"legendFormat": "Running"
|
||||
},
|
||||
{
|
||||
"expr": "count(kube_pod_status_phase{phase=\"Pending\"} == 1) or vector(0)",
|
||||
"refId": "B",
|
||||
"legendFormat": "Pending"
|
||||
},
|
||||
{
|
||||
"expr": "count(kube_pod_status_phase{phase=\"Failed\"} == 1) or vector(0)",
|
||||
"refId": "C",
|
||||
"legendFormat": "Failed"
|
||||
},
|
||||
{
|
||||
"expr": "count(kube_pod_status_phase{phase=\"Unknown\"} == 1) or vector(0)",
|
||||
"refId": "D",
|
||||
"legendFormat": "Unknown"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short",
|
||||
"custom": {
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 15,
|
||||
"spanNulls": false,
|
||||
"showPoints": "never"
|
||||
}
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Running" },
|
||||
"properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Pending" },
|
||||
"properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Failed" },
|
||||
"properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Unknown" },
|
||||
"properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "none" },
|
||||
"legend": {
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"calcs": ["lastNotNull"]
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 18 }
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,742 @@
|
||||
{
|
||||
"title": "Control Plane Health",
|
||||
"uid": "okd-control-plane",
|
||||
"schemaVersion": 36,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"tags": ["okd", "control-plane"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "instance",
|
||||
"type": "query",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"query": { "query": "label_values(apiserver_request_total, instance)", "refId": "A" },
|
||||
"refresh": 2,
|
||||
"includeAll": true,
|
||||
"multi": true,
|
||||
"allValue": ".*",
|
||||
"label": "API Server Instance",
|
||||
"sort": 1,
|
||||
"current": {},
|
||||
"options": []
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
|
||||
{
|
||||
"id": 1, "type": "stat", "title": "API Servers Up",
|
||||
"description": "Number of kube-apiserver instances currently scraped and up. Healthy HA cluster = 3.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(up{job=~\".*apiserver.*\"} == 1)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "green", "value": 3 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 2, "type": "stat", "title": "Controller Managers Up",
|
||||
"description": "kube-controller-manager instances up. In OKD only one holds the leader lease at a time; others are hot standbys.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(up{job=~\".*controller-manager.*\"} == 1)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "green", "value": 3 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 3, "type": "stat", "title": "Schedulers Up",
|
||||
"description": "kube-scheduler instances up. One holds the leader lease; rest are standbys. 0 = no scheduling of new pods.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(up{job=~\".*scheduler.*\"} == 1)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "green", "value": 3 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 4, "type": "stat", "title": "API 5xx Rate",
|
||||
"description": "Server-side errors (5xx) across all apiserver instances per second. Any sustained non-zero value = apiserver internal fault.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "sum(rate(apiserver_request_total{code=~\"5..\"}[5m]))", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 0.01 },
|
||||
{ "color": "red", "value": 1 }
|
||||
]},
|
||||
"unit": "reqps", "noValue": "0", "decimals": 3
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 5, "type": "stat", "title": "Inflight — Mutating",
|
||||
"description": "Current in-flight mutating requests (POST/PUT/PATCH/DELETE), summed across all apiserver instances. Default OKD limit is ~1000. Hitting the limit = 429 errors for writes.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "sum(apiserver_current_inflight_requests{request_kind=\"mutating\"})", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 500 },
|
||||
{ "color": "orange", "value": 750 },
|
||||
{ "color": "red", "value": 900 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 6, "type": "stat", "title": "Inflight — Read-Only",
|
||||
"description": "Current in-flight non-mutating requests (GET/LIST/WATCH), summed across all apiserver instances. Default OKD limit is ~3000. Hitting it = 429 for reads, impacting controllers and kubectl.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "sum(apiserver_current_inflight_requests{request_kind=\"readOnly\"})", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1500 },
|
||||
{ "color": "orange", "value": 2200 },
|
||||
{ "color": "red", "value": 2700 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 7, "type": "stat", "title": "API Request p99 (non-WATCH)",
|
||||
"description": "Overall p99 latency for all non-streaming verbs. >1s = noticeable kubectl sluggishness. >10s = controllers timing out on LIST/GET.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{verb!~\"WATCH|CONNECT\"}[5m])) by (le))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 0.5 },
|
||||
{ "color": "orange", "value": 1 },
|
||||
{ "color": "red", "value": 5 }
|
||||
]},
|
||||
"unit": "s", "noValue": "0", "decimals": 3
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 8, "type": "stat", "title": "APIServer → etcd p99",
|
||||
"description": "p99 time apiserver spends waiting on etcd calls. Spike here while WAL fsync is healthy = serialization or large object overhead.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(apiserver_storage_request_duration_seconds_bucket[5m])) by (le))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 0.05 },
|
||||
{ "color": "orange", "value": 0.2 },
|
||||
{ "color": "red", "value": 0.5 }
|
||||
]},
|
||||
"unit": "s", "noValue": "0", "decimals": 4
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 9, "type": "row", "title": "API Server — Request Rates & Errors", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 10, "type": "timeseries", "title": "Request Rate by Verb",
|
||||
"description": "Non-streaming calls per second broken down by verb. GET/LIST = read load from controllers. POST/PUT/PATCH/DELETE = write throughput. A sudden LIST spike = controller cache resync storm.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(verb)(rate(apiserver_request_total{instance=~\"$instance\",verb!~\"WATCH|CONNECT\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{verb}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 11, "type": "timeseries", "title": "Error Rate by HTTP Status Code",
|
||||
"description": "4xx/5xx responses per second by code. 429 = inflight limit hit (throttling). 422 = admission rejection or invalid object. 500/503 = internal apiserver fault or etcd unavailability.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(code)(rate(apiserver_request_total{instance=~\"$instance\",code=~\"[45]..\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "HTTP {{code}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 12, "type": "timeseries", "title": "In-Flight Requests — Mutating vs Read-Only",
|
||||
"description": "Instantaneous count of requests being actively handled. The two series correspond to the two inflight limit buckets enforced by the apiserver's Priority and Fairness (APF) or legacy inflight settings.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "sum by(request_kind)(apiserver_current_inflight_requests{instance=~\"$instance\"})", "refId": "A", "legendFormat": "{{request_kind}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 20, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 13, "type": "row", "title": "API Server — Latency", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 14, "type": "timeseries", "title": "Request Latency — p50 / p95 / p99 (non-WATCH)",
|
||||
"description": "Aggregated end-to-end request duration across all verbs except WATCH/CONNECT (which are unbounded streaming). A rising p99 without a matching rise in etcd latency = CPU saturation, admission webhook slowness, or serialization overhead.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum(rate(apiserver_request_duration_seconds_bucket{instance=~\"$instance\",verb!~\"WATCH|CONNECT\"}[5m])) by (le))", "refId": "A", "legendFormat": "p50" },
|
||||
{ "expr": "histogram_quantile(0.95, sum(rate(apiserver_request_duration_seconds_bucket{instance=~\"$instance\",verb!~\"WATCH|CONNECT\"}[5m])) by (le))", "refId": "B", "legendFormat": "p95" },
|
||||
{ "expr": "histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{instance=~\"$instance\",verb!~\"WATCH|CONNECT\"}[5m])) by (le))", "refId": "C", "legendFormat": "p99" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 14 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 15, "type": "timeseries", "title": "Request p99 Latency by Verb",
|
||||
"description": "p99 latency broken out per verb. LIST is inherently slower than GET due to serializing full collections. A POST/PUT spike = heavy admission webhook chain or large object writes. DELETE spikes are usually caused by cascading GC finalizer storms.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "histogram_quantile(0.99, sum by(verb,le)(rate(apiserver_request_duration_seconds_bucket{instance=~\"$instance\",verb!~\"WATCH|CONNECT\"}[5m])))",
|
||||
"refId": "A", "legendFormat": "{{verb}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 14 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 16, "type": "timeseries", "title": "APIServer → etcd Latency by Operation",
|
||||
"description": "Time apiserver spends waiting on etcd, split by operation type (get, list, create, update, delete, watch). Elevated get/list = etcd read pressure. Elevated create/update = write bottleneck, likely correlated with WAL fsync latency.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum by(operation,le)(rate(apiserver_storage_request_duration_seconds_bucket[5m])))", "refId": "A", "legendFormat": "p50 — {{operation}}" },
|
||||
{ "expr": "histogram_quantile(0.99, sum by(operation,le)(rate(apiserver_storage_request_duration_seconds_bucket[5m])))", "refId": "B", "legendFormat": "p99 — {{operation}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 14 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 17, "type": "row", "title": "API Server — Watches & Long-Running Requests", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 22 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 18, "type": "timeseries", "title": "Active Long-Running Requests (Watches) by Resource",
|
||||
"description": "Instantaneous count of open WATCH streams grouped by resource. Each controller typically holds one WATCH per resource type per apiserver instance. A sudden drop = controller restart; a runaway climb = operator creating watches without cleanup.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(resource)(apiserver_longrunning_requests{instance=~\"$instance\",verb=\"WATCH\"})",
|
||||
"refId": "A", "legendFormat": "{{resource}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 0, "y": 23 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 19, "type": "timeseries", "title": "Watch Events Dispatched Rate by Kind",
|
||||
"description": "Watch events sent to all active watchers per second, by object kind. Persistent high rate for a specific kind = that resource type is churning heavily, increasing etcd load and controller reconcile frequency.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(kind)(rate(apiserver_watch_events_total{instance=~\"$instance\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{kind}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 8, "y": 23 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 20, "type": "timeseries", "title": "Watch Event Size — p50 / p95 / p99 by Kind",
|
||||
"description": "Size of individual watch events dispatched to clients. Large events (MiB-scale) for Secrets or ConfigMaps = objects being stored with oversized data. Contributes to apiserver memory pressure and network saturation.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum by(kind,le)(rate(apiserver_watch_events_sizes_bucket{instance=~\"$instance\"}[5m])))", "refId": "A", "legendFormat": "p50 — {{kind}}" },
|
||||
{ "expr": "histogram_quantile(0.99, sum by(kind,le)(rate(apiserver_watch_events_sizes_bucket{instance=~\"$instance\"}[5m])))", "refId": "B", "legendFormat": "p99 — {{kind}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 16, "y": 23 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 21, "type": "row", "title": "Admission Webhooks", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 30 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 22, "type": "timeseries", "title": "Webhook Call Rate by Name",
|
||||
"description": "Mutating and validating admission webhook invocations per second by webhook name. A webhook invoked on every write (e.g., a mutating webhook with no object selector) can be a major source of write latency amplification.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(name,type)(rate(apiserver_admission_webhook_request_total{instance=~\"$instance\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{type}} — {{name}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 0, "y": 31 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 23, "type": "timeseries", "title": "Webhook Latency p99 by Name",
|
||||
"description": "p99 round-trip time per webhook call (network + webhook server processing). Default apiserver timeout is 10s; a webhook consistently near that limit causes cascading write latency for all resources it intercepts.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "histogram_quantile(0.99, sum by(name,le)(rate(apiserver_admission_webhook_admission_duration_seconds_bucket{instance=~\"$instance\"}[5m])))",
|
||||
"refId": "A", "legendFormat": "{{name}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 0.5 },
|
||||
{ "color": "red", "value": 2.0 }
|
||||
]}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 8, "y": 31 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 24, "type": "timeseries", "title": "Webhook Rejection Rate by Name",
|
||||
"description": "Rate of admission denials per webhook. A validating webhook rejecting requests is expected behaviour; a sudden surge indicates either a newly enforced policy or a misbehaving webhook rejecting valid objects.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(name,error_type)(rate(apiserver_admission_webhook_rejection_count{instance=~\"$instance\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{name}} ({{error_type}})"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 16, "y": 31 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 25, "type": "row", "title": "kube-controller-manager", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 38 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 26, "type": "timeseries", "title": "Work Queue Depth by Controller",
|
||||
"description": "Items waiting to be reconciled in each controller's work queue. Persistent non-zero depth = controller cannot keep up with the event rate. Identifies which specific controller is the bottleneck during overload incidents.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "topk(15, sum by(name)(workqueue_depth{job=~\".*controller-manager.*\"}))",
|
||||
"refId": "A", "legendFormat": "{{name}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false,
|
||||
"thresholdsStyle": { "mode": "line" }
|
||||
},
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 10 },
|
||||
{ "color": "red", "value": 50 }
|
||||
]}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 39 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 27, "type": "timeseries", "title": "Work Queue Item Processing Duration p99 by Controller",
|
||||
"description": "p99 time a work item spends being actively reconciled (inside the reconcile loop, excludes queue wait time). A slow reconcile = either the controller is doing expensive API calls or the etcd write path is slow.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "histogram_quantile(0.99, sum by(name,le)(rate(workqueue_work_duration_seconds_bucket{job=~\".*controller-manager.*\"}[5m])))",
|
||||
"refId": "A", "legendFormat": "{{name}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 39 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 28, "type": "timeseries", "title": "Work Queue Retry Rate by Controller",
|
||||
"description": "Rate of items being re-queued after a failed reconciliation. A persistently high retry rate for a controller = it is encountering recurring errors on the same objects (e.g., API permission errors, webhook rejections, or resource conflicts).",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "topk(15, sum by(name)(rate(workqueue_retries_total{job=~\".*controller-manager.*\"}[5m])))",
|
||||
"refId": "A", "legendFormat": "{{name}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 39 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 29, "type": "row", "title": "kube-scheduler", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 47 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 30, "type": "timeseries", "title": "Scheduling Attempt Rate by Result",
|
||||
"description": "Outcomes of scheduling cycles per second. scheduled = pod successfully bound to a node. unschedulable = no node met the pod's constraints. error = scheduler internal failure (API error, timeout). Persistent unschedulable = cluster capacity or taints/affinity misconfiguration.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(result)(rate(scheduler_schedule_attempts_total[5m]))",
|
||||
"refId": "A", "legendFormat": "{{result}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "scheduled" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "unschedulable" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "error" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 0, "y": 48 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 31, "type": "timeseries", "title": "Scheduling Latency — p50 / p95 / p99",
|
||||
"description": "Time from when a pod enters the active queue to when a binding decision is made (does not include bind API call time). Includes filter, score, and reserve plugin execution time. Spike = expensive affinity rules, large number of nodes, or slow extender webhooks.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket[5m])) by (le))", "refId": "A", "legendFormat": "p50" },
|
||||
{ "expr": "histogram_quantile(0.95, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket[5m])) by (le))", "refId": "B", "legendFormat": "p95" },
|
||||
{ "expr": "histogram_quantile(0.99, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket[5m])) by (le))", "refId": "C", "legendFormat": "p99" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 8, "y": 48 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 32, "type": "timeseries", "title": "Pending Pods by Queue",
|
||||
"description": "Pods waiting to be scheduled, split by internal queue. active = ready to be attempted now. backoff = recently failed, in exponential back-off. unschedulable = parked until cluster state changes. A growing unschedulable queue = systemic capacity or constraint problem.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(queue)(scheduler_pending_pods)",
|
||||
"refId": "A", "legendFormat": "{{queue}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false,
|
||||
"thresholdsStyle": { "mode": "line" }
|
||||
},
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 10 },
|
||||
{ "color": "red", "value": 50 }
|
||||
]}
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "unschedulable" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "backoff" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "active" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 16, "y": 48 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 33, "type": "row", "title": "Process Resources — All Control Plane Components", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 55 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 34, "type": "timeseries", "title": "CPU Usage by Component",
|
||||
"description": "Rate of CPU seconds consumed by each control plane process. apiserver CPU spike = surge in request volume or list serialization. controller-manager CPU spike = reconcile storm. scheduler CPU spike = large node count with complex affinity.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "sum by(job)(rate(process_cpu_seconds_total{job=~\".*apiserver.*\"}[5m]))", "refId": "A", "legendFormat": "apiserver — {{job}}" },
|
||||
{ "expr": "sum by(job)(rate(process_cpu_seconds_total{job=~\".*controller-manager.*\"}[5m]))", "refId": "B", "legendFormat": "controller-manager — {{job}}" },
|
||||
{ "expr": "sum by(job)(rate(process_cpu_seconds_total{job=~\".*scheduler.*\"}[5m]))", "refId": "C", "legendFormat": "scheduler — {{job}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percentunit", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 0, "y": 56 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 35, "type": "timeseries", "title": "RSS Memory by Component",
|
||||
"description": "Resident set size of each control plane process. apiserver memory is dominated by the watch cache size and serialisation buffers. controller-manager memory = informer caches. Monotonically growing RSS without restarts = memory leak or unbounded cache growth.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "sum by(job)(process_resident_memory_bytes{job=~\".*apiserver.*\"})", "refId": "A", "legendFormat": "apiserver — {{job}}" },
|
||||
{ "expr": "sum by(job)(process_resident_memory_bytes{job=~\".*controller-manager.*\"})", "refId": "B", "legendFormat": "controller-manager — {{job}}" },
|
||||
{ "expr": "sum by(job)(process_resident_memory_bytes{job=~\".*scheduler.*\"})", "refId": "C", "legendFormat": "scheduler — {{job}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 8, "y": 56 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 36, "type": "timeseries", "title": "Goroutines by Component",
|
||||
"description": "Number of live goroutines in each control plane process. Gradual upward drift = goroutine leak (often tied to unclosed watch streams or context leaks). A step-down = process restart. apiserver typically runs 200–600 goroutines; spikes above 1000 warrant investigation.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "sum by(job)(go_goroutines{job=~\".*apiserver.*\"})", "refId": "A", "legendFormat": "apiserver — {{job}}" },
|
||||
{ "expr": "sum by(job)(go_goroutines{job=~\".*controller-manager.*\"})", "refId": "B", "legendFormat": "controller-manager — {{job}}" },
|
||||
{ "expr": "sum by(job)(go_goroutines{job=~\".*scheduler.*\"})", "refId": "C", "legendFormat": "scheduler — {{job}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 16, "y": 56 }
|
||||
}
|
||||
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,734 @@
|
||||
{
|
||||
"title": "etcd",
|
||||
"uid": "okd-etcd",
|
||||
"schemaVersion": 36,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"tags": ["okd", "etcd"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "instance",
|
||||
"type": "query",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"query": { "query": "label_values(etcd_server_has_leader, instance)", "refId": "A" },
|
||||
"refresh": 2,
|
||||
"includeAll": true,
|
||||
"multi": true,
|
||||
"allValue": ".*",
|
||||
"label": "Instance",
|
||||
"sort": 1,
|
||||
"current": {},
|
||||
"options": []
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
|
||||
{
|
||||
"id": 1, "type": "stat", "title": "Cluster Members",
|
||||
"description": "Total number of etcd members currently reporting metrics.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(etcd_server_has_leader)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "green", "value": 3 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 2, "type": "stat", "title": "Has Leader",
|
||||
"description": "min() across all members. 0 = at least one member currently reports no leader — cluster is degraded.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "min(etcd_server_has_leader)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "green", "value": 1 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0",
|
||||
"mappings": [
|
||||
{ "type": "value", "options": {
|
||||
"0": { "text": "NO LEADER", "color": "red" },
|
||||
"1": { "text": "OK", "color": "green" }
|
||||
}}
|
||||
]
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 3, "type": "stat", "title": "Leader Changes (1h)",
|
||||
"description": "Leader changes seen in the last hour, summed across all members (a single election is counted once by every member that observes it). ≥3 indicates cluster instability.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "sum(changes(etcd_server_leader_changes_seen_total[1h]))", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 3 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 4, "type": "stat", "title": "DB Size (Max)",
|
||||
"description": "Largest boltdb file size across all members. Default etcd quota is 8 GiB.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "max(etcd_mvcc_db_total_size_in_bytes)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 2147483648 },
|
||||
{ "color": "orange", "value": 5368709120 },
|
||||
{ "color": "red", "value": 7516192768 }
|
||||
]},
|
||||
"unit": "bytes", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 5, "type": "stat", "title": "DB Fragmentation (Max)",
|
||||
"description": "% of DB space that is allocated but unused. >50% → run etcdctl defrag.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "max((etcd_mvcc_db_total_size_in_bytes - etcd_mvcc_db_total_size_in_use_in_bytes) / etcd_mvcc_db_total_size_in_bytes * 100)",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 25 },
|
||||
{ "color": "orange", "value": 50 },
|
||||
{ "color": "red", "value": 75 }
|
||||
]},
|
||||
"unit": "percent", "noValue": "0", "decimals": 1
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 6, "type": "stat", "title": "Failed Proposals/s",
|
||||
"description": "Rate of rejected Raft proposals. Any sustained non-zero value = cluster health problem.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "sum(rate(etcd_server_proposals_failed_total[5m]))", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 0.001 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0", "decimals": 3
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 7, "type": "stat", "title": "WAL Fsync p99",
|
||||
"description": "99th percentile WAL flush-to-disk time. >10ms is concerning; >100ms = serious I/O bottleneck.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) by (le))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 0.01 },
|
||||
{ "color": "orange", "value": 0.1 },
|
||||
{ "color": "red", "value": 0.5 }
|
||||
]},
|
||||
"unit": "s", "noValue": "0", "decimals": 4
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 8, "type": "stat", "title": "Backend Commit p99",
|
||||
"description": "99th percentile boltdb commit time. >25ms = warning; >100ms = critical backend I/O pressure.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) by (le))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 0.025 },
|
||||
{ "color": "orange", "value": 0.1 },
|
||||
{ "color": "red", "value": 0.25 }
|
||||
]},
|
||||
"unit": "s", "noValue": "0", "decimals": 4
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 9, "type": "row", "title": "Cluster Health", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 10, "type": "timeseries", "title": "Has Leader per Instance",
|
||||
"description": "1 = member has a leader; 0 = member lost quorum. A dip to 0 marks the exact moment of a leader election.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "etcd_server_has_leader{instance=~\"$instance\"}",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0, "max": 1.1,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 20, "showPoints": "never", "spanNulls": false },
|
||||
"mappings": [
|
||||
{ "type": "value", "options": {
|
||||
"0": { "text": "0 — no leader" },
|
||||
"1": { "text": "1 — ok" }
|
||||
}}
|
||||
]
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "none" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": [] }
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 8, "x": 0, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 11, "type": "timeseries", "title": "Leader Changes (cumulative)",
|
||||
"description": "Monotonically increasing counter per member. A step jump = one leader election. Correlated jumps across members = cluster-wide event.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "etcd_server_leader_changes_seen_total{instance=~\"$instance\"}",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "auto", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "none" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull"] }
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 8, "x": 8, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 12, "type": "timeseries", "title": "Slow Operations",
|
||||
"description": "slow_apply: proposals applied slower than expected. slow_read_index: linearizable reads timing out. heartbeat_failures: Raft heartbeat send errors (network partition indicator).",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "rate(etcd_server_slow_apply_total{instance=~\"$instance\"}[5m])", "refId": "A", "legendFormat": "Slow Apply — {{instance}}" },
|
||||
{ "expr": "rate(etcd_server_slow_read_indexes_total{instance=~\"$instance\"}[5m])", "refId": "B", "legendFormat": "Slow Read Index — {{instance}}" },
|
||||
{ "expr": "rate(etcd_server_heartbeat_send_failures_total{instance=~\"$instance\"}[5m])", "refId": "C", "legendFormat": "Heartbeat Failures — {{instance}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 8, "x": 16, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 13, "type": "row", "title": "gRPC Traffic", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 11 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 14, "type": "timeseries", "title": "gRPC Request Rate by Method",
|
||||
"description": "Unary calls/s per RPC method. High Put/Txn = heavy write load. High Range = heavy read load. Streaming RPCs such as Watch are excluded by the unary filter and do not appear here.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(grpc_method)(rate(grpc_server_started_total{job=~\".*etcd.*\",grpc_type=\"unary\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{grpc_method}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 12 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 15, "type": "timeseries", "title": "gRPC Error Rate by Status Code",
|
||||
"description": "Non-OK responses by gRPC status code. RESOURCE_EXHAUSTED = overloaded. UNAVAILABLE = leader election. DEADLINE_EXCEEDED = latency spike.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(grpc_code)(rate(grpc_server_handled_total{job=~\".*etcd.*\",grpc_code!=\"OK\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{grpc_code}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 12 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 16, "type": "timeseries", "title": "gRPC Request Latency (p50 / p95 / p99)",
|
||||
"description": "Unary call handling duration. p99 > 100ms for Put/Txn indicates disk or CPU pressure. p99 > 500ms will cause kube-apiserver timeouts.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum(rate(grpc_server_handling_seconds_bucket{job=~\".*etcd.*\",grpc_type=\"unary\"}[5m])) by (le))", "refId": "A", "legendFormat": "p50" },
|
||||
{ "expr": "histogram_quantile(0.95, sum(rate(grpc_server_handling_seconds_bucket{job=~\".*etcd.*\",grpc_type=\"unary\"}[5m])) by (le))", "refId": "B", "legendFormat": "p95" },
|
||||
{ "expr": "histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~\".*etcd.*\",grpc_type=\"unary\"}[5m])) by (le))", "refId": "C", "legendFormat": "p99" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 12 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 17, "type": "row", "title": "Raft Proposals", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 20 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 18, "type": "timeseries", "title": "Proposals Committed vs Applied",
|
||||
"description": "Committed = agreed by Raft quorum. Applied = persisted to boltdb. A widening gap between the two = backend apply backlog (disk too slow to keep up).",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "rate(etcd_server_proposals_committed_total{instance=~\"$instance\"}[5m])", "refId": "A", "legendFormat": "Committed — {{instance}}" },
|
||||
{ "expr": "rate(etcd_server_proposals_applied_total{instance=~\"$instance\"}[5m])", "refId": "B", "legendFormat": "Applied — {{instance}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 0, "y": 21 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 19, "type": "timeseries", "title": "Proposals Pending",
|
||||
"description": "In-flight Raft proposals not yet committed. Consistently high (>5) = cluster cannot keep up with write throughput.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "etcd_server_proposals_pending{instance=~\"$instance\"}",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false,
|
||||
"thresholdsStyle": { "mode": "line+area" }
|
||||
},
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 5 },
|
||||
{ "color": "red", "value": 10 }
|
||||
]}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 8, "y": 21 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 20, "type": "timeseries", "title": "Failed Proposals Rate",
|
||||
"description": "Raft proposals that were rejected. Root causes: quorum loss, leader timeout, network partition between members.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "rate(etcd_server_proposals_failed_total{instance=~\"$instance\"}[5m])",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2, "fillOpacity": 20, "showPoints": "never", "spanNulls": false,
|
||||
"thresholdsStyle": { "mode": "line" }
|
||||
},
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 0.001 }
|
||||
]}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 16, "y": 21 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 21, "type": "row", "title": "Disk I/O", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 28 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 22, "type": "timeseries", "title": "WAL Fsync Duration (p50 / p95 / p99) per Instance",
|
||||
"description": "Time to flush the write-ahead log to disk. etcd is extremely sensitive to WAL latency. >10ms p99 = storage is the bottleneck. Correlates directly with Raft commit latency.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum by(le,instance)(rate(etcd_disk_wal_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "A", "legendFormat": "p50 — {{instance}}" },
|
||||
{ "expr": "histogram_quantile(0.95, sum by(le,instance)(rate(etcd_disk_wal_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "B", "legendFormat": "p95 — {{instance}}" },
|
||||
{ "expr": "histogram_quantile(0.99, sum by(le,instance)(rate(etcd_disk_wal_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "C", "legendFormat": "p99 — {{instance}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 29 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 23, "type": "timeseries", "title": "Backend Commit Duration (p50 / p95 / p99) per Instance",
|
||||
"description": "Time for boltdb to commit a batch transaction. A spike here while WAL is healthy = backend I/O saturation or boltdb lock contention. Triggers apply backlog.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum by(le,instance)(rate(etcd_disk_backend_commit_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "A", "legendFormat": "p50 — {{instance}}" },
|
||||
{ "expr": "histogram_quantile(0.95, sum by(le,instance)(rate(etcd_disk_backend_commit_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "B", "legendFormat": "p95 — {{instance}}" },
|
||||
{ "expr": "histogram_quantile(0.99, sum by(le,instance)(rate(etcd_disk_backend_commit_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "C", "legendFormat": "p99 — {{instance}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 29 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 24, "type": "row", "title": "Network (Peer & Client)", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 37 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 25, "type": "timeseries", "title": "Peer RX Rate",
|
||||
"description": "Bytes received from Raft peers (log replication + heartbeats). A burst during a quiet period = large snapshot being streamed to a recovering member.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "rate(etcd_network_peer_received_bytes_total{instance=~\"$instance\"}[5m])",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 6, "x": 0, "y": 38 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 26, "type": "timeseries", "title": "Peer TX Rate",
|
||||
"description": "Bytes sent to Raft peers. Leader will have higher TX than followers (it replicates entries to all members).",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "rate(etcd_network_peer_sent_bytes_total{instance=~\"$instance\"}[5m])",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 6, "x": 6, "y": 38 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 27, "type": "timeseries", "title": "Client gRPC Received",
|
||||
"description": "Bytes received from API clients (kube-apiserver, operators). Spike = large write burst from controllers or kubectl apply.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "rate(etcd_network_client_grpc_received_bytes_total{instance=~\"$instance\"}[5m])",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 6, "x": 12, "y": 38 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 28, "type": "timeseries", "title": "Client gRPC Sent",
|
||||
"description": "Bytes sent to API clients (responses + watch events). Persistently high = many active Watch streams or large objects being served.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "rate(etcd_network_client_grpc_sent_bytes_total{instance=~\"$instance\"}[5m])",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 6, "x": 18, "y": 38 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 29, "type": "row", "title": "DB Size & Process Resources", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 45 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 30, "type": "timeseries", "title": "DB Total vs In-Use Size per Instance",
|
||||
"description": "Total = allocated boltdb file size. In Use = live key data. The gap between them = fragmentation. Steady growth of Total = compaction not keeping up with key churn.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "etcd_mvcc_db_total_size_in_bytes{instance=~\"$instance\"}", "refId": "A", "legendFormat": "Total — {{instance}}" },
|
||||
{ "expr": "etcd_mvcc_db_total_size_in_use_in_bytes{instance=~\"$instance\"}", "refId": "B", "legendFormat": "In Use — {{instance}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 46 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 31, "type": "timeseries", "title": "Process Resident Memory (RSS)",
|
||||
"description": "Physical RAM consumed by the etcd process. Monotonically growing RSS = memory leak or oversized watch cache. Typical healthy range: 500 MiB–2 GiB depending on cluster size.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "etcd_process_resident_memory_bytes{instance=~\"$instance\"}",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 46 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 32, "type": "timeseries", "title": "Open File Descriptors vs Limit",
|
||||
"description": "Open FD count (solid) and process FD limit (dashed). Approaching the limit will cause WAL file creation and new client connections to fail.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "etcd_process_open_fds{instance=~\"$instance\"}", "refId": "A", "legendFormat": "Open — {{instance}}" },
|
||||
{ "expr": "etcd_process_max_fds{instance=~\"$instance\"}", "refId": "B", "legendFormat": "Limit — {{instance}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byRegexp", "options": "^Limit.*" },
|
||||
"properties": [
|
||||
{ "id": "custom.lineWidth", "value": 1 },
|
||||
{ "id": "custom.lineStyle", "value": { "fill": "dash", "dash": [6, 4] } },
|
||||
{ "id": "custom.fillOpacity","value": 0 }
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 46 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 33, "type": "row", "title": "Snapshots", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 54 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 34, "type": "timeseries", "title": "Snapshot Save Duration (p50 / p95 / p99)",
|
||||
"description": "Time to write a full snapshot of the boltdb to disk. Slow saves delay Raft log compaction, causing the WAL to grow unboundedly and members to fall further behind.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum by(le)(rate(etcd_debugging_snap_save_total_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "A", "legendFormat": "p50" },
|
||||
{ "expr": "histogram_quantile(0.95, sum by(le)(rate(etcd_debugging_snap_save_total_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "B", "legendFormat": "p95" },
|
||||
{ "expr": "histogram_quantile(0.99, sum by(le)(rate(etcd_debugging_snap_save_total_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "C", "legendFormat": "p99" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 12, "x": 0, "y": 55 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 35, "type": "timeseries", "title": "Snapshot DB Fsync Duration (p50 / p95 / p99)",
|
||||
"description": "Time to fsync the snapshot file itself. Distinct from WAL fsync: this is flushing the entire boltdb copy to disk after a snapshot is taken.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum by(le)(rate(etcd_snap_db_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "A", "legendFormat": "p50" },
|
||||
{ "expr": "histogram_quantile(0.95, sum by(le)(rate(etcd_snap_db_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "B", "legendFormat": "p95" },
|
||||
{ "expr": "histogram_quantile(0.99, sum by(le)(rate(etcd_snap_db_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "C", "legendFormat": "p99" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 12, "x": 12, "y": 55 }
|
||||
}
|
||||
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,945 @@
|
||||
{
|
||||
"title": "Networking",
|
||||
"uid": "okd-networking",
|
||||
"schemaVersion": 36,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"tags": ["okd", "networking"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "namespace",
|
||||
"type": "query",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"query": { "query": "label_values(kube_pod_info, namespace)", "refId": "A" },
|
||||
"refresh": 2,
|
||||
"includeAll": true,
|
||||
"multi": true,
|
||||
"allValue": ".*",
|
||||
"label": "Namespace",
|
||||
"sort": 1,
|
||||
"current": {},
|
||||
"options": []
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
|
||||
{
|
||||
"id": 1, "type": "stat", "title": "Network RX Rate",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(container_network_receive_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] },
|
||||
"unit": "Bps", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 2, "type": "stat", "title": "Network TX Rate",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(container_network_transmit_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] },
|
||||
"unit": "Bps", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 3, "type": "stat", "title": "RX Errors/s",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(container_network_receive_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "pps", "noValue": "0", "decimals": 2
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 4, "type": "stat", "title": "TX Errors/s",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(container_network_transmit_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "pps", "noValue": "0", "decimals": 2
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 5, "type": "stat", "title": "RX Drops/s",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(container_network_receive_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "orange", "value": 1 }] },
|
||||
"unit": "pps", "noValue": "0", "decimals": 2
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 6, "type": "stat", "title": "TX Drops/s",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(container_network_transmit_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "orange", "value": 1 }] },
|
||||
"unit": "pps", "noValue": "0", "decimals": 2
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 7, "type": "stat", "title": "DNS Queries/s",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(coredns_dns_requests_total[5m]))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] },
|
||||
"unit": "reqps", "noValue": "0", "decimals": 1
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 8, "type": "stat", "title": "DNS Error %",
|
||||
"description": "Percentage of DNS responses with non-NOERROR rcode over the last 5 minutes.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(coredns_dns_responses_total{rcode!=\"NOERROR\"}[5m])) / sum(rate(coredns_dns_responses_total[5m])) * 100",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 5 }
|
||||
]},
|
||||
"unit": "percent", "noValue": "0", "decimals": 2
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 9, "type": "row", "title": "Network I/O", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 10, "type": "timeseries", "title": "Receive Rate by Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(namespace)(rate(container_network_receive_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 11, "type": "timeseries", "title": "Transmit Rate by Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(namespace)(rate(container_network_transmit_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 12, "type": "row", "title": "Top Pod Consumers", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 13, "type": "timeseries", "title": "Top 10 Pods — RX Rate",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "topk(10, sum by(namespace,pod)(rate(container_network_receive_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m])))",
|
||||
"refId": "A", "legendFormat": "{{namespace}} / {{pod}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "auto", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 14 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 14, "type": "timeseries", "title": "Top 10 Pods — TX Rate",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "topk(10, sum by(namespace,pod)(rate(container_network_transmit_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m])))",
|
||||
"refId": "A", "legendFormat": "{{namespace}} / {{pod}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "auto", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 14 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 15,
|
||||
"type": "table",
|
||||
"title": "Pod Network I/O Summary",
|
||||
"description": "Current RX/TX rates, errors and drops per pod. Sorted by RX rate descending.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace,pod)(rate(container_network_receive_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "instant": true, "format": "table", "legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,pod)(rate(container_network_transmit_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "B", "instant": true, "format": "table", "legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,pod)(rate(container_network_receive_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "C", "instant": true, "format": "table", "legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,pod)(rate(container_network_transmit_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "D", "instant": true, "format": "table", "legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,pod)(rate(container_network_receive_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "E", "instant": true, "format": "table", "legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,pod)(rate(container_network_transmit_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "F", "instant": true, "format": "table", "legendFormat": ""
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "filterFieldsByName",
|
||||
"options": { "include": { "names": ["namespace", "pod", "Value"] } }
|
||||
},
|
||||
{
|
||||
"id": "joinByField",
|
||||
"options": { "byField": "pod", "mode": "outer" }
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"namespace 1": true,
|
||||
"namespace 2": true,
|
||||
"namespace 3": true,
|
||||
"namespace 4": true,
|
||||
"namespace 5": true
|
||||
},
|
||||
"renameByName": {
|
||||
"namespace": "Namespace",
|
||||
"pod": "Pod",
|
||||
"Value": "RX Rate",
|
||||
"Value 1": "TX Rate",
|
||||
"Value 2": "RX Errors/s",
|
||||
"Value 3": "TX Errors/s",
|
||||
"Value 4": "RX Drops/s",
|
||||
"Value 5": "TX Drops/s"
|
||||
},
|
||||
"indexByName": {
|
||||
"namespace": 0,
|
||||
"pod": 1,
|
||||
"Value": 2,
|
||||
"Value 1": 3,
|
||||
"Value 2": 4,
|
||||
"Value 3": 5,
|
||||
"Value 4": 6,
|
||||
"Value 5": 7
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "sortBy",
|
||||
"options": { "fields": [{ "displayName": "RX Rate", "desc": true }] }
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Namespace" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 160 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Pod" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 260 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byRegexp", "options": "^RX Rate$|^TX Rate$" },
|
||||
"properties": [
|
||||
{ "id": "unit", "value": "Bps" },
|
||||
{ "id": "custom.displayMode", "value": "color-background-solid" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 10000000 },
|
||||
{ "color": "orange", "value": 100000000 },
|
||||
{ "color": "red", "value": 500000000 }
|
||||
]}}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byRegexp", "options": "^RX Errors/s$|^TX Errors/s$" },
|
||||
"properties": [
|
||||
{ "id": "unit", "value": "pps" },
|
||||
{ "id": "decimals", "value": 3 },
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 0.001 }
|
||||
]}}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byRegexp", "options": "^RX Drops/s$|^TX Drops/s$" },
|
||||
"properties": [
|
||||
{ "id": "unit", "value": "pps" },
|
||||
{ "id": "decimals", "value": 3 },
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "orange", "value": 0.001 }
|
||||
]}}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {},
|
||||
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 22 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 16, "type": "row", "title": "Errors & Packet Loss", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 30 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 17, "type": "timeseries", "title": "RX Errors by Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(namespace)(rate(container_network_receive_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "pps", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 12, "x": 0, "y": 31 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 18, "type": "timeseries", "title": "TX Errors by Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(namespace)(rate(container_network_transmit_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "pps", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 12, "x": 12, "y": 31 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 19, "type": "timeseries", "title": "RX Packet Drops by Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(namespace)(rate(container_network_receive_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "pps", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 12, "x": 0, "y": 38 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 20, "type": "timeseries", "title": "TX Packet Drops by Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(namespace)(rate(container_network_transmit_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "pps", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 12, "x": 12, "y": 38 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 21, "type": "row", "title": "DNS (CoreDNS)", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 45 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 22, "type": "timeseries", "title": "DNS Request Rate by Query Type",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(type)(rate(coredns_dns_requests_total[5m]))",
|
||||
"refId": "A", "legendFormat": "{{type}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 46 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 23, "type": "timeseries", "title": "DNS Response Rate by Rcode",
|
||||
"description": "NOERROR = healthy. NXDOMAIN = name not found. SERVFAIL = upstream error.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(rcode)(rate(coredns_dns_responses_total[5m]))",
|
||||
"refId": "A", "legendFormat": "{{rcode}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "NOERROR" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "NXDOMAIN" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "SERVFAIL" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "REFUSED" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 46 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 24, "type": "timeseries", "title": "DNS Request Latency (p50 / p95 / p99)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.50, sum(rate(coredns_dns_request_duration_seconds_bucket[5m])) by (le))",
|
||||
"refId": "A", "legendFormat": "p50"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, sum(rate(coredns_dns_request_duration_seconds_bucket[5m])) by (le))",
|
||||
"refId": "B", "legendFormat": "p95"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(coredns_dns_request_duration_seconds_bucket[5m])) by (le))",
|
||||
"refId": "C", "legendFormat": "p99"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 46 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 25, "type": "timeseries", "title": "DNS Cache Hit Ratio (%)",
|
||||
"description": "High hit ratio = CoreDNS is serving responses from cache, reducing upstream load.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(coredns_cache_hits_total[5m])) / (sum(rate(coredns_cache_hits_total[5m])) + sum(rate(coredns_cache_misses_total[5m]))) * 100",
|
||||
"refId": "A", "legendFormat": "Cache Hit %"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 50 },
|
||||
{ "color": "green", "value": 80 }
|
||||
]},
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 20, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "single" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "lastNotNull"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 12, "x": 0, "y": 54 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 26, "type": "timeseries", "title": "DNS Forward Request Rate",
|
||||
"description": "Queries CoreDNS is forwarding upstream. Spike here with cache miss spike = upstream DNS pressure.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(coredns_forward_requests_total[5m]))",
|
||||
"refId": "A", "legendFormat": "Forward Requests/s"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(coredns_forward_responses_duration_seconds_count[5m]))",
|
||||
"refId": "B", "legendFormat": "Forward Responses/s"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 12, "x": 12, "y": 54 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 27, "type": "row", "title": "Services & Endpoints", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 61 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 28, "type": "stat", "title": "Total Services",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "count(kube_service_info{namespace=~\"$namespace\"})",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 8, "x": 0, "y": 62 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 29, "type": "stat", "title": "Endpoint Addresses Available",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(kube_endpoint_address_available{namespace=~\"$namespace\"})",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 8, "x": 8, "y": 62 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 30, "type": "stat", "title": "Endpoint Addresses Not Ready",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(kube_endpoint_address_not_ready{namespace=~\"$namespace\"}) or vector(0)",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 8, "x": 16, "y": 62 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 31,
|
||||
"type": "table",
|
||||
"title": "Endpoint Availability",
|
||||
"description": "Per-endpoint available vs not-ready address counts. Red Not Ready = pods backing this service are unhealthy.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace,endpoint)(kube_endpoint_address_available{namespace=~\"$namespace\"})",
|
||||
"refId": "A", "instant": true, "format": "table", "legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,endpoint)(kube_endpoint_address_not_ready{namespace=~\"$namespace\"})",
|
||||
"refId": "B", "instant": true, "format": "table", "legendFormat": ""
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "filterFieldsByName",
|
||||
"options": { "include": { "names": ["namespace", "endpoint", "Value"] } }
|
||||
},
|
||||
{
|
||||
"id": "joinByField",
|
||||
"options": { "byField": "endpoint", "mode": "outer" }
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": { "namespace 1": true },
|
||||
"renameByName": {
|
||||
"namespace": "Namespace",
|
||||
"endpoint": "Endpoint",
|
||||
"Value": "Available",
|
||||
"Value 1": "Not Ready"
|
||||
},
|
||||
"indexByName": {
|
||||
"namespace": 0,
|
||||
"endpoint": 1,
|
||||
"Value": 2,
|
||||
"Value 1": 3
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "sortBy",
|
||||
"options": { "fields": [{ "displayName": "Not Ready", "desc": true }] }
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Namespace" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 180 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Endpoint" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 220 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Available" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] } }
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Not Ready" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] } }
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {},
|
||||
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 66 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 32, "type": "row", "title": "OKD Router / Ingress (HAProxy)", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 74 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 33, "type": "timeseries", "title": "Router HTTP Request Rate by Code",
|
||||
"description": "Requires HAProxy router metrics to be scraped (port 1936). OKD exposes these via the openshift-ingress ServiceMonitor.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(code)(rate(haproxy_backend_http_responses_total[5m]))",
|
||||
"refId": "A", "legendFormat": "HTTP {{code}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "HTTP 2xx" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "HTTP 4xx" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "HTTP 5xx" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 75 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 34, "type": "timeseries", "title": "Router 4xx + 5xx Error Rate (%)",
|
||||
"description": "Client error (4xx) and server error (5xx) rates as a percentage of all requests.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(haproxy_backend_http_responses_total{code=\"4xx\"}[5m])) / sum(rate(haproxy_backend_http_responses_total[5m])) * 100",
|
||||
"refId": "A", "legendFormat": "4xx %"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(haproxy_backend_http_responses_total{code=\"5xx\"}[5m])) / sum(rate(haproxy_backend_http_responses_total[5m])) * 100",
|
||||
"refId": "B", "legendFormat": "5xx %"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 5 }
|
||||
]}
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "4xx %" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "5xx %" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 75 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 35, "type": "timeseries", "title": "Router Bytes In / Out",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(haproxy_frontend_bytes_in_total[5m]))",
|
||||
"refId": "A", "legendFormat": "Bytes In"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(haproxy_frontend_bytes_out_total[5m]))",
|
||||
"refId": "B", "legendFormat": "Bytes Out"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "Bytes In" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Bytes Out" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 83 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 36,
|
||||
"type": "table",
|
||||
"title": "Router Backend Server Status",
|
||||
"description": "HAProxy backend servers (routes). Value 0 = DOWN, 1 = UP.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "haproxy_server_up",
|
||||
"refId": "A", "instant": true, "format": "table", "legendFormat": ""
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "filterFieldsByName",
|
||||
"options": { "include": { "names": ["proxy", "server", "Value"] } }
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {},
|
||||
"renameByName": {
|
||||
"proxy": "Backend",
|
||||
"server": "Server",
|
||||
"Value": "Status"
|
||||
},
|
||||
"indexByName": { "proxy": 0, "server": 1, "Value": 2 }
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "sortBy",
|
||||
"options": { "fields": [{ "displayName": "Status", "desc": false }] }
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Backend" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 260 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Server" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 180 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Status" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "mappings", "value": [
|
||||
{ "type": "value", "options": { "0": { "text": "DOWN", "color": "red" } } },
|
||||
{ "type": "value", "options": { "1": { "text": "UP", "color": "green" } } }
|
||||
]},
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "green", "value": 1 }
|
||||
]}}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 83 }
|
||||
}
|
||||
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,627 @@
|
||||
{
|
||||
"title": "Node Health",
|
||||
"uid": "okd-node-health",
|
||||
"schemaVersion": 36,
|
||||
"version": 2,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"tags": ["okd", "node", "health"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "node",
|
||||
"type": "query",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"query": { "query": "label_values(kube_node_info, node)", "refId": "A" },
|
||||
"refresh": 2,
|
||||
"includeAll": true,
|
||||
"multi": true,
|
||||
"allValue": ".*",
|
||||
"label": "Node",
|
||||
"sort": 1,
|
||||
"current": {},
|
||||
"options": []
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
|
||||
{
|
||||
"id": 1,
|
||||
"type": "stat",
|
||||
"title": "Total Nodes",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_node_info{node=~\"$node\"})", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 2,
|
||||
"type": "stat",
|
||||
"title": "Ready Nodes",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"$node\"} == 1)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 3,
|
||||
"type": "stat",
|
||||
"title": "Not Ready Nodes",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"false\",node=~\"$node\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 4,
|
||||
"type": "stat",
|
||||
"title": "Memory Pressure",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"MemoryPressure\",status=\"true\",node=~\"$node\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 5,
|
||||
"type": "stat",
|
||||
"title": "Disk Pressure",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"DiskPressure\",status=\"true\",node=~\"$node\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 6,
|
||||
"type": "stat",
|
||||
"title": "PID Pressure",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"PIDPressure\",status=\"true\",node=~\"$node\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "orange", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 7,
|
||||
"type": "stat",
|
||||
"title": "Unschedulable",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_node_spec_unschedulable{node=~\"$node\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 8,
|
||||
"type": "stat",
|
||||
"title": "Kubelet Up",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(up{job=\"kubelet\",metrics_path=\"/metrics\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 9,
|
||||
"type": "table",
|
||||
"title": "Node Conditions",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(node) (kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"$node\"})",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{node}}",
|
||||
"instant": true
|
||||
},
|
||||
{
|
||||
"expr": "sum by(node) (kube_node_status_condition{condition=\"MemoryPressure\",status=\"true\",node=~\"$node\"})",
|
||||
"refId": "B",
|
||||
"legendFormat": "{{node}}",
|
||||
"instant": true
|
||||
},
|
||||
{
|
||||
"expr": "sum by(node) (kube_node_status_condition{condition=\"DiskPressure\",status=\"true\",node=~\"$node\"})",
|
||||
"refId": "C",
|
||||
"legendFormat": "{{node}}",
|
||||
"instant": true
|
||||
},
|
||||
{
|
||||
"expr": "sum by(node) (kube_node_status_condition{condition=\"PIDPressure\",status=\"true\",node=~\"$node\"})",
|
||||
"refId": "D",
|
||||
"legendFormat": "{{node}}",
|
||||
"instant": true
|
||||
},
|
||||
{
|
||||
"expr": "sum by(node) (kube_node_spec_unschedulable{node=~\"$node\"})",
|
||||
"refId": "E",
|
||||
"legendFormat": "{{node}}",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "labelsToFields",
|
||||
"options": { "mode": "columns" }
|
||||
},
|
||||
{
|
||||
"id": "joinByField",
|
||||
"options": { "byField": "node", "mode": "outer" }
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"Time": true,
|
||||
"Time 1": true,
|
||||
"Time 2": true,
|
||||
"Time 3": true,
|
||||
"Time 4": true,
|
||||
"Time 5": true
|
||||
},
|
||||
"renameByName": {
|
||||
"node": "Node",
|
||||
"Value #A": "Ready",
|
||||
"Value #B": "Mem Pressure",
|
||||
"Value #C": "Disk Pressure",
|
||||
"Value #D": "PID Pressure",
|
||||
"Value #E": "Unschedulable"
|
||||
},
|
||||
"indexByName": {
|
||||
"node": 0,
|
||||
"Value #A": 1,
|
||||
"Value #B": 2,
|
||||
"Value #C": 3,
|
||||
"Value #D": 4,
|
||||
"Value #E": 5
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": { "displayMode": "color-background", "align": "center" }
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Node" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "auto" },
|
||||
{ "id": "custom.align", "value": "left" },
|
||||
{ "id": "custom.width", "value": 200 }
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Ready" },
|
||||
"properties": [
|
||||
{
|
||||
"id": "thresholds",
|
||||
"value": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] }
|
||||
},
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{
|
||||
"id": "mappings",
|
||||
"value": [
|
||||
{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"0": { "text": "✗ Not Ready", "color": "red", "index": 0 },
|
||||
"1": { "text": "✓ Ready", "color": "green", "index": 1 }
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byRegexp", "options": ".*Pressure" },
|
||||
"properties": [
|
||||
{
|
||||
"id": "thresholds",
|
||||
"value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] }
|
||||
},
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{
|
||||
"id": "mappings",
|
||||
"value": [
|
||||
{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"0": { "text": "✓ OK", "color": "green", "index": 0 },
|
||||
"1": { "text": "⚠ Active", "color": "red", "index": 1 }
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Unschedulable" },
|
||||
"properties": [
|
||||
{
|
||||
"id": "thresholds",
|
||||
"value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 1 }] }
|
||||
},
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{
|
||||
"id": "mappings",
|
||||
"value": [
|
||||
{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"0": { "text": "✓ Schedulable", "color": "green", "index": 0 },
|
||||
"1": { "text": "⚠ Cordoned", "color": "yellow", "index": 1 }
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": { "sortBy": [{ "displayName": "Node", "desc": false }] },
|
||||
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 10,
|
||||
"type": "timeseries",
|
||||
"title": "CPU Usage per Node (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 100,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 85 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 16, "x": 0, "y": 12 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 11,
|
||||
"type": "bargauge",
|
||||
"title": "CPU Usage \u2014 Current",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 85 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true,
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 12 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 12,
|
||||
"type": "timeseries",
|
||||
"title": "Memory Usage per Node (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 100,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 75 }, { "color": "red", "value": 90 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 16, "x": 0, "y": 20 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 13,
|
||||
"type": "bargauge",
|
||||
"title": "Memory Usage \u2014 Current",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 75 }, { "color": "red", "value": 90 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true,
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 20 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 14,
|
||||
"type": "timeseries",
|
||||
"title": "Root Disk Usage per Node (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"}))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 100,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 85 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 16, "x": 0, "y": 28 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 15,
|
||||
"type": "bargauge",
|
||||
"title": "Root Disk Usage \u2014 Current",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"}))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 85 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true,
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 28 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 16,
|
||||
"type": "timeseries",
|
||||
"title": "Network Traffic per Node",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(instance) (rate(node_network_receive_bytes_total{device!~\"lo|veth.*|tun.*|ovn.*|br.*\"}[5m]))",
|
||||
"refId": "A",
|
||||
"legendFormat": "rx {{instance}}"
|
||||
},
|
||||
{
|
||||
"expr": "sum by(instance) (rate(node_network_transmit_bytes_total{device!~\"lo|veth.*|tun.*|ovn.*|br.*\"}[5m]))",
|
||||
"refId": "B",
|
||||
"legendFormat": "tx {{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps",
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 36 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 17,
|
||||
"type": "bargauge",
|
||||
"title": "Pods per Node",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count by(node) (kube_pod_info{node=~\"$node\"})",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{node}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short",
|
||||
"min": 0,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 100 },
|
||||
{ "color": "red", "value": 200 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true,
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 36 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 18,
|
||||
"type": "timeseries",
|
||||
"title": "System Load Average (1m / 5m) per Node",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "node_load1",
|
||||
"refId": "A",
|
||||
"legendFormat": "1m \u2014 {{instance}}"
|
||||
},
|
||||
{
|
||||
"expr": "node_load5",
|
||||
"refId": "B",
|
||||
"legendFormat": "5m \u2014 {{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short",
|
||||
"min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 44 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 19,
|
||||
"type": "bargauge",
|
||||
"title": "Node Uptime",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "time() - node_boot_time_seconds",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s",
|
||||
"min": 0,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 300 },
|
||||
{ "color": "green", "value": 3600 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": false,
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 44 }
|
||||
}
|
||||
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,596 @@
|
||||
{
|
||||
"title": "Storage Health",
|
||||
"uid": "storage-health",
|
||||
"schemaVersion": 36,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"panels": [
|
||||
|
||||
{
|
||||
"type": "row",
|
||||
"id": 1,
|
||||
"title": "PVC / PV Status",
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 2,
|
||||
"title": "Bound PVCs",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Bound\"}) or vector(0)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 4, "x": 0, "y": 1 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 3,
|
||||
"title": "Pending PVCs",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Pending\"}) or vector(0)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 4, "x": 4, "y": 1 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 4,
|
||||
"title": "Lost PVCs",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Lost\"}) or vector(0)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 4, "x": 8, "y": 1 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 5,
|
||||
"title": "Bound PVs / Available PVs",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(kube_persistentvolume_status_phase{phase=\"Bound\"}) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": "Bound"
|
||||
},
|
||||
{
|
||||
"expr": "sum(kube_persistentvolume_status_phase{phase=\"Available\"}) or vector(0)",
|
||||
"refId": "B",
|
||||
"legendFormat": "Available"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "blue", "value": null }]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 4, "x": 12, "y": 1 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 6,
|
||||
"title": "Ceph Cluster Health",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "ceph_health_status",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 2 }
|
||||
]
|
||||
},
|
||||
"mappings": [
|
||||
{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"0": { "text": "HEALTH_OK", "index": 0 },
|
||||
"1": { "text": "HEALTH_WARN", "index": 1 },
|
||||
"2": { "text": "HEALTH_ERR", "index": 2 }
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"textMode": "value"
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 4, "x": 16, "y": 1 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 7,
|
||||
"title": "OSDs Up / Total",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(ceph_osd_up) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": "Up"
|
||||
},
|
||||
{
|
||||
"expr": "count(ceph_osd_metadata) or vector(0)",
|
||||
"refId": "B",
|
||||
"legendFormat": "Total"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 4, "x": 20, "y": 1 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "row",
|
||||
"id": 8,
|
||||
"title": "Cluster Capacity",
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 6 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "gauge",
|
||||
"id": 9,
|
||||
"title": "Ceph Cluster Used (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes) / ceph_cluster_total_bytes",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 70 },
|
||||
{ "color": "red", "value": 85 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"showThresholdLabels": true,
|
||||
"showThresholdMarkers": true
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 5, "x": 0, "y": 7 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 10,
|
||||
"title": "Ceph Capacity — Total / Available",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "ceph_cluster_total_bytes",
|
||||
"refId": "A",
|
||||
"legendFormat": "Total"
|
||||
},
|
||||
{
|
||||
"expr": "ceph_cluster_total_bytes - (ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes)",
|
||||
"refId": "B",
|
||||
"legendFormat": "Available"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes",
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "blue", "value": null }]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "value",
|
||||
"graphMode": "none",
|
||||
"textMode": "auto",
|
||||
"orientation": "vertical"
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 4, "x": 5, "y": 7 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "bargauge",
|
||||
"id": 11,
|
||||
"title": "PV Allocated Capacity by Storage Class (Bound)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by (storageclass) (\n kube_persistentvolume_capacity_bytes\n * on(persistentvolume) group_left(storageclass)\n kube_persistentvolume_status_phase{phase=\"Bound\"}\n)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{storageclass}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes",
|
||||
"color": { "mode": "palette-classic" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "blue", "value": null }]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 7, "x": 9, "y": 7 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "piechart",
|
||||
"id": 12,
|
||||
"title": "PVC Phase Distribution",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Bound\"}) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": "Bound"
|
||||
},
|
||||
{
|
||||
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Pending\"}) or vector(0)",
|
||||
"refId": "B",
|
||||
"legendFormat": "Pending"
|
||||
},
|
||||
{
|
||||
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Lost\"}) or vector(0)",
|
||||
"refId": "C",
|
||||
"legendFormat": "Lost"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "color": { "mode": "palette-classic" } }
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"pieType": "pie",
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"values": ["value", "percent"]
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 7 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "row",
|
||||
"id": 13,
|
||||
"title": "Ceph Performance",
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 15 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "timeseries",
|
||||
"id": 14,
|
||||
"title": "Ceph Pool IOPS (Read / Write)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(ceph_pool_rd[5m])",
|
||||
"refId": "A",
|
||||
"legendFormat": "Read — pool {{pool_id}}"
|
||||
},
|
||||
{
|
||||
"expr": "rate(ceph_pool_wr[5m])",
|
||||
"refId": "B",
|
||||
"legendFormat": "Write — pool {{pool_id}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ops",
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 8 }
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "timeseries",
|
||||
"id": 15,
|
||||
"title": "Ceph Pool Throughput (Read / Write)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(ceph_pool_rd_bytes[5m])",
|
||||
"refId": "A",
|
||||
"legendFormat": "Read — pool {{pool_id}}"
|
||||
},
|
||||
{
|
||||
"expr": "rate(ceph_pool_wr_bytes[5m])",
|
||||
"refId": "B",
|
||||
"legendFormat": "Write — pool {{pool_id}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps",
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 8 }
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "row",
|
||||
"id": 16,
|
||||
"title": "Ceph OSD & Pool Details",
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 24 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "timeseries",
|
||||
"id": 17,
|
||||
"title": "Ceph Pool Space Used (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * ceph_pool_bytes_used / (ceph_pool_bytes_used + ceph_pool_max_avail)",
|
||||
"refId": "A",
|
||||
"legendFormat": "Pool {{pool_id}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 70 },
|
||||
{ "color": "red", "value": 85 }
|
||||
]
|
||||
},
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10 }
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 25 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "bargauge",
|
||||
"id": 18,
|
||||
"title": "OSD Status per Daemon (green = Up, red = Down)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "ceph_osd_up",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{ceph_daemon}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"min": 0,
|
||||
"max": 1,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "green", "value": 1 }
|
||||
]
|
||||
},
|
||||
"mappings": [
|
||||
{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"0": { "text": "DOWN", "index": 0 },
|
||||
"1": { "text": "UP", "index": 1 }
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"displayMode": "basic",
|
||||
"showUnfilled": true
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 25 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "row",
|
||||
"id": 19,
|
||||
"title": "Node Disk Usage",
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 33 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "timeseries",
|
||||
"id": 20,
|
||||
"title": "Node Root Disk Usage Over Time (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} * 100)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 70 },
|
||||
{ "color": "red", "value": 85 }
|
||||
]
|
||||
},
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10 }
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 34 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "bargauge",
|
||||
"id": 21,
|
||||
"title": "Current Disk Usage — All Nodes & Mountpoints",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 - (node_filesystem_avail_bytes{fstype!~\"tmpfs|overlay|squashfs\"} / node_filesystem_size_bytes{fstype!~\"tmpfs|overlay|squashfs\"} * 100)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}} — {{mountpoint}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 70 },
|
||||
{ "color": "red", "value": 85 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 34 }
|
||||
}
|
||||
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,773 @@
|
||||
{
|
||||
"title": "Workload Health",
|
||||
"uid": "okd-workload-health",
|
||||
"schemaVersion": 36,
|
||||
"version": 3,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"tags": ["okd", "workload", "health"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "namespace",
|
||||
"type": "query",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"query": { "query": "label_values(kube_pod_info, namespace)", "refId": "A" },
|
||||
"refresh": 2,
|
||||
"includeAll": true,
|
||||
"multi": true,
|
||||
"allValue": ".*",
|
||||
"label": "Namespace",
|
||||
"sort": 1,
|
||||
"current": {},
|
||||
"options": []
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
|
||||
{
|
||||
"id": 1, "type": "stat", "title": "Total Pods",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_pod_info{namespace=~\"$namespace\"})", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 2, "type": "stat", "title": "Running Pods",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_pod_status_phase{phase=\"Running\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 3, "type": "stat", "title": "Pending Pods",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_pod_status_phase{phase=\"Pending\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 4, "type": "stat", "title": "Failed Pods",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_pod_status_phase{phase=\"Failed\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 5, "type": "stat", "title": "CrashLoopBackOff",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 6, "type": "stat", "title": "OOMKilled",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_pod_container_status_last_terminated_reason{reason=\"OOMKilled\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "orange", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 7, "type": "stat", "title": "Deployments Available",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_deployment_status_condition{condition=\"Available\",status=\"true\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 8, "type": "stat", "title": "Deployments Degraded",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_deployment_status_replicas_unavailable{namespace=~\"$namespace\"} > 0) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 9, "type": "row", "title": "Deployments", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 10,
|
||||
"type": "table",
|
||||
"title": "Deployment Status",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace,deployment)(kube_deployment_spec_replicas{namespace=~\"$namespace\"})",
|
||||
"refId": "A",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,deployment)(kube_deployment_status_replicas_ready{namespace=~\"$namespace\"})",
|
||||
"refId": "B",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,deployment)(kube_deployment_status_replicas_available{namespace=~\"$namespace\"})",
|
||||
"refId": "C",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,deployment)(kube_deployment_status_replicas_unavailable{namespace=~\"$namespace\"})",
|
||||
"refId": "D",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,deployment)(kube_deployment_status_replicas_updated{namespace=~\"$namespace\"})",
|
||||
"refId": "E",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "filterFieldsByName",
|
||||
"options": {
|
||||
"include": {
|
||||
"names": ["namespace", "deployment", "Value"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "joinByField",
|
||||
"options": {
|
||||
"byField": "deployment",
|
||||
"mode": "outer"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"namespace 1": true,
|
||||
"namespace 2": true,
|
||||
"namespace 3": true,
|
||||
"namespace 4": true
|
||||
},
|
||||
"renameByName": {
|
||||
"namespace": "Namespace",
|
||||
"deployment": "Deployment",
|
||||
"Value": "Desired",
|
||||
"Value 1": "Ready",
|
||||
"Value 2": "Available",
|
||||
"Value 3": "Unavailable",
|
||||
"Value 4": "Up-to-date"
|
||||
},
|
||||
"indexByName": {
|
||||
"namespace": 0,
|
||||
"deployment": 1,
|
||||
"Value": 2,
|
||||
"Value 1": 3,
|
||||
"Value 2": 4,
|
||||
"Value 3": 5,
|
||||
"Value 4": 6
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "sortBy",
|
||||
"options": {
|
||||
"fields": [{ "displayName": "Namespace", "desc": false }]
|
||||
}
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Namespace" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 200 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Deployment" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 220 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Unavailable" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{
|
||||
"id": "thresholds",
|
||||
"value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] }
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Ready" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{
|
||||
"id": "thresholds",
|
||||
"value": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] }
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": { "sortBy": [{ "displayName": "Namespace", "desc": false }] },
|
||||
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 11, "type": "row", "title": "StatefulSets & DaemonSets", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 12,
|
||||
"type": "table",
|
||||
"title": "StatefulSet Status",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace,statefulset)(kube_statefulset_replicas{namespace=~\"$namespace\"})",
|
||||
"refId": "A",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,statefulset)(kube_statefulset_status_replicas_ready{namespace=~\"$namespace\"})",
|
||||
"refId": "B",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,statefulset)(kube_statefulset_status_replicas_current{namespace=~\"$namespace\"})",
|
||||
"refId": "C",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,statefulset)(kube_statefulset_status_replicas_updated{namespace=~\"$namespace\"})",
|
||||
"refId": "D",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "filterFieldsByName",
|
||||
"options": {
|
||||
"include": {
|
||||
"names": ["namespace", "statefulset", "Value"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "joinByField",
|
||||
"options": {
|
||||
"byField": "statefulset",
|
||||
"mode": "outer"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"namespace 1": true,
|
||||
"namespace 2": true,
|
||||
"namespace 3": true
|
||||
},
|
||||
"renameByName": {
|
||||
"namespace": "Namespace",
|
||||
"statefulset": "StatefulSet",
|
||||
"Value": "Desired",
|
||||
"Value 1": "Ready",
|
||||
"Value 2": "Current",
|
||||
"Value 3": "Up-to-date"
|
||||
},
|
||||
"indexByName": {
|
||||
"namespace": 0,
|
||||
"statefulset": 1,
|
||||
"Value": 2,
|
||||
"Value 1": 3,
|
||||
"Value 2": 4,
|
||||
"Value 3": 5
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "sortBy",
|
||||
"options": { "fields": [{ "displayName": "Namespace", "desc": false }] }
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Namespace" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 180 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "StatefulSet" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 200 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Ready" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] } }
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 14 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 13,
|
||||
"type": "table",
|
||||
"title": "DaemonSet Status",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace,daemonset)(kube_daemonset_status_desired_number_scheduled{namespace=~\"$namespace\"})",
|
||||
"refId": "A",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,daemonset)(kube_daemonset_status_number_ready{namespace=~\"$namespace\"})",
|
||||
"refId": "B",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,daemonset)(kube_daemonset_status_number_unavailable{namespace=~\"$namespace\"})",
|
||||
"refId": "C",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,daemonset)(kube_daemonset_status_number_misscheduled{namespace=~\"$namespace\"})",
|
||||
"refId": "D",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "filterFieldsByName",
|
||||
"options": {
|
||||
"include": {
|
||||
"names": ["namespace", "daemonset", "Value"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "joinByField",
|
||||
"options": {
|
||||
"byField": "daemonset",
|
||||
"mode": "outer"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"namespace 1": true,
|
||||
"namespace 2": true,
|
||||
"namespace 3": true
|
||||
},
|
||||
"renameByName": {
|
||||
"namespace": "Namespace",
|
||||
"daemonset": "DaemonSet",
|
||||
"Value": "Desired",
|
||||
"Value 1": "Ready",
|
||||
"Value 2": "Unavailable",
|
||||
"Value 3": "Misscheduled"
|
||||
},
|
||||
"indexByName": {
|
||||
"namespace": 0,
|
||||
"daemonset": 1,
|
||||
"Value": 2,
|
||||
"Value 1": 3,
|
||||
"Value 2": 4,
|
||||
"Value 3": 5
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "sortBy",
|
||||
"options": { "fields": [{ "displayName": "Namespace", "desc": false }] }
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Namespace" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 180 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "DaemonSet" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 200 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Ready" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] } }
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Unavailable" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] } }
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Misscheduled" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "orange", "value": 1 }] } }
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 14 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 14, "type": "row", "title": "Pods", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 22 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 15,
|
||||
"type": "timeseries",
|
||||
"title": "Pod Phase over Time",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(phase)(kube_pod_status_phase{namespace=~\"$namespace\"})",
|
||||
"refId": "A", "legendFormat": "{{phase}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "Running" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Pending" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Failed" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Succeeded" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Unknown" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 16, "x": 0, "y": 23 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 16,
|
||||
"type": "piechart",
|
||||
"title": "Pod Phase — Now",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(phase)(kube_pod_status_phase{namespace=~\"$namespace\"})",
|
||||
"refId": "A", "instant": true, "legendFormat": "{{phase}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "unit": "short", "color": { "mode": "palette-classic" } },
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "Running" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Pending" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Failed" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Succeeded" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Unknown" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"pieType": "donut",
|
||||
"tooltip": { "mode": "single" },
|
||||
"legend": { "displayMode": "table", "placement": "right", "values": ["value", "percent"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 23 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 17,
|
||||
"type": "timeseries",
|
||||
"title": "Container Restarts over Time (total counter, top 10)",
|
||||
"description": "Absolute restart counter — each vertical step = a restart event. Flat line = healthy.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "topk(10,\n sum by(namespace, pod) (\n kube_pod_container_status_restarts_total{namespace=~\"$namespace\"}\n ) > 0\n)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}} / {{pod}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "auto", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 31 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 18,
|
||||
"type": "table",
|
||||
"title": "Container Total Restarts (non-zero)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace, pod, container) (kube_pod_container_status_restarts_total{namespace=~\"$namespace\"}) > 0",
|
||||
"refId": "A",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "filterFieldsByName",
|
||||
"options": {
|
||||
"include": { "names": ["namespace", "pod", "container", "Value"] }
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {},
|
||||
"renameByName": {
|
||||
"namespace": "Namespace",
|
||||
"pod": "Pod",
|
||||
"container": "Container",
|
||||
"Value": "Total Restarts"
|
||||
},
|
||||
"indexByName": { "namespace": 0, "pod": 1, "container": 2, "Value": 3 }
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "sortBy",
|
||||
"options": { "fields": [{ "displayName": "Total Restarts", "desc": true }] }
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "Namespace" }, "properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 160 }] },
|
||||
{ "matcher": { "id": "byName", "options": "Pod" }, "properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 260 }] },
|
||||
{ "matcher": { "id": "byName", "options": "Container" }, "properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 160 }] },
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Total Restarts" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "yellow", "value": null }, { "color": "orange", "value": 5 }, { "color": "red", "value": 20 }] } }
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 31 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 19, "type": "row", "title": "Resource Usage", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 39 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 20,
|
||||
"type": "timeseries",
|
||||
"title": "CPU Usage by Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace)(rate(container_cpu_usage_seconds_total{namespace=~\"$namespace\",container!=\"\",container!=\"POD\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "cores", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 40 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 21,
|
||||
"type": "timeseries",
|
||||
"title": "Memory Usage by Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace)(container_memory_working_set_bytes{namespace=~\"$namespace\",container!=\"\",container!=\"POD\"})",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 40 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 22,
|
||||
"type": "bargauge",
|
||||
"title": "CPU — Actual vs Requested (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace)(rate(container_cpu_usage_seconds_total{namespace=~\"$namespace\",container!=\"\",container!=\"POD\"}[5m]))\n/\nsum by(namespace)(kube_pod_container_resource_requests{resource=\"cpu\",namespace=~\"$namespace\",container!=\"\"})\n* 100",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 150,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 80 }, { "color": "red", "value": 100 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal", "displayMode": "gradient", "showUnfilled": true,
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 48 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 23,
|
||||
"type": "bargauge",
|
||||
"title": "Memory — Actual vs Requested (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace)(container_memory_working_set_bytes{namespace=~\"$namespace\",container!=\"\",container!=\"POD\"})\n/\nsum by(namespace)(kube_pod_container_resource_requests{resource=\"memory\",namespace=~\"$namespace\",container!=\"\"})\n* 100",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 150,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 80 }, { "color": "red", "value": 100 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal", "displayMode": "gradient", "showUnfilled": true,
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 48 }
|
||||
}
|
||||
|
||||
]
|
||||
}
|
||||
2
harmony/src/modules/monitoring/cluster_dashboards/mod.rs
Normal file
2
harmony/src/modules/monitoring/cluster_dashboards/mod.rs
Normal file
@@ -0,0 +1,2 @@
|
||||
mod score;
|
||||
pub use score::ClusterDashboardsScore;
|
||||
507
harmony/src/modules/monitoring/cluster_dashboards/score.rs
Normal file
507
harmony/src/modules/monitoring/cluster_dashboards/score.rs
Normal file
@@ -0,0 +1,507 @@
|
||||
use async_trait::async_trait;
|
||||
use harmony_types::id::Id;
|
||||
use k8s_openapi::api::core::v1::{Namespace, Secret};
|
||||
use kube::{api::ObjectMeta, api::DynamicObject};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_yaml;
|
||||
use std::collections::BTreeMap;
|
||||
|
||||
use crate::{
|
||||
data::Version,
|
||||
interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome},
|
||||
inventory::Inventory,
|
||||
modules::k8s::resource::K8sResourceScore,
|
||||
modules::okd::crd::route::Route,
|
||||
score::Score,
|
||||
topology::{K8sclient, Topology},
|
||||
};
|
||||
|
||||
#[derive(Clone, Debug, Serialize)]
|
||||
pub struct ClusterDashboardsScore {
|
||||
pub namespace: String,
|
||||
pub grafana_admin_user: String,
|
||||
pub grafana_admin_password: String,
|
||||
}
|
||||
|
||||
impl Default for ClusterDashboardsScore {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
namespace: "harmony-observability".to_string(),
|
||||
grafana_admin_user: "admin".to_string(),
|
||||
grafana_admin_password: "password".to_string(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ClusterDashboardsScore {
|
||||
pub fn new(namespace: &str) -> Self {
|
||||
Self {
|
||||
namespace: namespace.to_string(),
|
||||
grafana_admin_user: "admin".to_string(),
|
||||
grafana_admin_password: "password".to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn with_credentials(namespace: &str, admin_user: &str, admin_password: &str) -> Self {
|
||||
Self {
|
||||
namespace: namespace.to_string(),
|
||||
grafana_admin_user: admin_user.to_string(),
|
||||
grafana_admin_password: admin_password.to_string(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Topology + K8sclient> Score<T> for ClusterDashboardsScore {
|
||||
fn name(&self) -> String {
|
||||
format!("ClusterDashboardsScore({})", self.namespace)
|
||||
}
|
||||
|
||||
#[doc(hidden)]
|
||||
fn create_interpret(&self) -> Box<dyn Interpret<T>> {
|
||||
Box::new(ClusterDashboardsInterpret {
|
||||
namespace: self.namespace.clone(),
|
||||
grafana_admin_user: self.grafana_admin_user.clone(),
|
||||
grafana_admin_password: self.grafana_admin_password.clone(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Interpret that carries out the work described by a
/// [`ClusterDashboardsScore`]; its fields are copies of the score's fields.
#[derive(Clone, Debug)]
pub struct ClusterDashboardsInterpret {
    /// Namespace that receives every namespaced resource.
    namespace: String,
    /// Grafana administrator login written into the `Grafana` resource.
    grafana_admin_user: String,
    /// Grafana administrator password written into the `Grafana` resource.
    grafana_admin_password: String,
}
|
||||
|
||||
#[async_trait]
|
||||
impl<T: Topology + K8sclient> Interpret<T> for ClusterDashboardsInterpret {
|
||||
async fn execute(
|
||||
&self,
|
||||
inventory: &Inventory,
|
||||
topology: &T,
|
||||
) -> Result<Outcome, InterpretError> {
|
||||
self.create_namespace(inventory, topology).await?;
|
||||
self.create_rbac_resources(inventory, topology).await?;
|
||||
self.create_secret(inventory, topology).await?;
|
||||
self.create_grafana(inventory, topology).await?;
|
||||
self.create_datasource(inventory, topology).await?;
|
||||
self.create_dashboards(inventory, topology).await?;
|
||||
self.create_route(inventory, topology).await?;
|
||||
|
||||
Ok(Outcome::success(format!(
|
||||
"Cluster dashboards resources in namespace '{}' with {} dashboards successfully created",
|
||||
self.namespace,
|
||||
8
|
||||
)))
|
||||
}
|
||||
|
||||
fn get_name(&self) -> InterpretName {
|
||||
InterpretName::Custom("ClusterDashboards")
|
||||
}
|
||||
|
||||
fn get_version(&self) -> Version {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn get_status(&self) -> InterpretStatus {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn get_children(&self) -> Vec<Id> {
|
||||
todo!()
|
||||
}
|
||||
}
|
||||
|
||||
impl ClusterDashboardsInterpret {
|
||||
async fn create_namespace(
|
||||
&self,
|
||||
inventory: &Inventory,
|
||||
topology: &(impl Topology + K8sclient),
|
||||
) -> Result<(), InterpretError> {
|
||||
let mut labels = BTreeMap::new();
|
||||
labels.insert(
|
||||
"openshift.io/cluster-monitoring".to_string(),
|
||||
"true".to_string(),
|
||||
);
|
||||
|
||||
let namespace = Namespace {
|
||||
metadata: ObjectMeta {
|
||||
name: Some(self.namespace.clone()),
|
||||
labels: Some(labels),
|
||||
..ObjectMeta::default()
|
||||
},
|
||||
..Namespace::default()
|
||||
};
|
||||
|
||||
K8sResourceScore::single(namespace, None)
|
||||
.interpret(inventory, topology)
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn create_rbac_resources(
|
||||
&self,
|
||||
inventory: &Inventory,
|
||||
topology: &(impl Topology + K8sclient),
|
||||
) -> Result<(), InterpretError> {
|
||||
let service_account_name = "cluster-grafana-sa".to_string();
|
||||
let rbac_namespace = self.namespace.clone();
|
||||
|
||||
let service_account = {
|
||||
use k8s_openapi::api::core::v1::ServiceAccount;
|
||||
ServiceAccount {
|
||||
metadata: ObjectMeta {
|
||||
name: Some(service_account_name.clone()),
|
||||
namespace: Some(rbac_namespace.clone()),
|
||||
..ObjectMeta::default()
|
||||
},
|
||||
..ServiceAccount::default()
|
||||
}
|
||||
};
|
||||
|
||||
let cluster_role = {
|
||||
use k8s_openapi::api::rbac::v1::{ClusterRole, PolicyRule};
|
||||
ClusterRole {
|
||||
metadata: ObjectMeta {
|
||||
name: Some("grafana-prometheus-api-access".to_string()),
|
||||
..ObjectMeta::default()
|
||||
},
|
||||
rules: Some(vec![PolicyRule {
|
||||
api_groups: Some(vec!["monitoring.coreos.com".to_string()]),
|
||||
resources: Some(vec!["prometheuses/api".to_string()]),
|
||||
verbs: vec!["get".to_string()],
|
||||
..PolicyRule::default()
|
||||
}]),
|
||||
..ClusterRole::default()
|
||||
}
|
||||
};
|
||||
|
||||
let cluster_role_binding = {
|
||||
use k8s_openapi::api::rbac::v1::{ClusterRoleBinding, RoleRef, Subject};
|
||||
ClusterRoleBinding {
|
||||
metadata: ObjectMeta {
|
||||
name: Some("grafana-prometheus-api-access-binding".to_string()),
|
||||
..ObjectMeta::default()
|
||||
},
|
||||
subjects: Some(vec![Subject {
|
||||
kind: "ServiceAccount".to_string(),
|
||||
name: service_account_name.clone(),
|
||||
namespace: Some(rbac_namespace.clone()),
|
||||
..Subject::default()
|
||||
}]),
|
||||
role_ref: RoleRef {
|
||||
api_group: "rbac.authorization.k8s.io".to_string(),
|
||||
kind: "ClusterRole".to_string(),
|
||||
name: "grafana-prometheus-api-access".to_string(),
|
||||
},
|
||||
}
|
||||
};
|
||||
|
||||
let cluster_role_binding_cluster_monitoring = {
|
||||
use k8s_openapi::api::rbac::v1::{ClusterRoleBinding, RoleRef, Subject};
|
||||
ClusterRoleBinding {
|
||||
metadata: ObjectMeta {
|
||||
name: Some("grafana-cluster-monitoring-view".to_string()),
|
||||
..ObjectMeta::default()
|
||||
},
|
||||
subjects: Some(vec![Subject {
|
||||
kind: "ServiceAccount".to_string(),
|
||||
name: service_account_name.clone(),
|
||||
namespace: Some(rbac_namespace.clone()),
|
||||
..Subject::default()
|
||||
}]),
|
||||
role_ref: RoleRef {
|
||||
api_group: "rbac.authorization.k8s.io".to_string(),
|
||||
kind: "ClusterRole".to_string(),
|
||||
name: "cluster-monitoring-view".to_string(),
|
||||
},
|
||||
}
|
||||
};
|
||||
|
||||
K8sResourceScore::single(service_account, Some(rbac_namespace.clone()))
|
||||
.interpret(inventory, topology)
|
||||
.await?;
|
||||
K8sResourceScore::single(cluster_role, None)
|
||||
.interpret(inventory, topology)
|
||||
.await?;
|
||||
K8sResourceScore::single(cluster_role_binding, None)
|
||||
.interpret(inventory, topology)
|
||||
.await?;
|
||||
K8sResourceScore::single(cluster_role_binding_cluster_monitoring, None)
|
||||
.interpret(inventory, topology)
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn create_secret(
|
||||
&self,
|
||||
inventory: &Inventory,
|
||||
topology: &(impl Topology + K8sclient),
|
||||
) -> Result<(), InterpretError> {
|
||||
let service_account_name = "cluster-grafana-sa".to_string();
|
||||
let secret_name = "grafana-prometheus-token".to_string();
|
||||
let secret_namespace = self.namespace.clone();
|
||||
|
||||
let secret = Secret {
|
||||
metadata: ObjectMeta {
|
||||
name: Some(secret_name),
|
||||
namespace: Some(secret_namespace),
|
||||
annotations: Some({
|
||||
let mut ann = BTreeMap::new();
|
||||
ann.insert(
|
||||
"kubernetes.io/service-account.name".to_string(),
|
||||
service_account_name,
|
||||
);
|
||||
ann
|
||||
}),
|
||||
..ObjectMeta::default()
|
||||
},
|
||||
type_: Some("kubernetes.io/service-account-token".to_string()),
|
||||
..Secret::default()
|
||||
};
|
||||
|
||||
K8sResourceScore::single(secret, Some(self.namespace.clone()))
|
||||
.interpret(inventory, topology)
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn create_grafana(
|
||||
&self,
|
||||
inventory: &Inventory,
|
||||
topology: &(impl Topology + K8sclient),
|
||||
) -> Result<(), InterpretError> {
|
||||
let labels: BTreeMap<String, String> = vec![
|
||||
("dashboards".to_string(), "grafana".to_string()),
|
||||
]
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let client = topology
|
||||
.k8s_client()
|
||||
.await
|
||||
.map_err(|e| InterpretError::new(format!("Failed to get k8s client: {e}")))?;
|
||||
|
||||
let mut annotations = BTreeMap::new();
|
||||
annotations.insert(
|
||||
"kubectl.kubernetes.io/last-applied-configuration".to_string(),
|
||||
"".to_string(),
|
||||
);
|
||||
|
||||
let grafana_yaml = format!(r#"
|
||||
apiVersion: grafana.integreatly.org/v1beta1
|
||||
kind: Grafana
|
||||
metadata:
|
||||
name: cluster-grafana
|
||||
namespace: {}
|
||||
labels:
|
||||
dashboards: "grafana"
|
||||
spec:
|
||||
config:
|
||||
log:
|
||||
mode: console
|
||||
security:
|
||||
admin_user: {}
|
||||
admin_password: {}
|
||||
users:
|
||||
viewers_can_edit: "false"
|
||||
auth:
|
||||
disable_login_form: "false"
|
||||
"auth.anonymous":
|
||||
enabled: "true"
|
||||
org_role: "Viewer"
|
||||
deployment:
|
||||
spec:
|
||||
replicas: 1
|
||||
template:
|
||||
spec:
|
||||
containers:
|
||||
- name: grafana
|
||||
resources:
|
||||
requests:
|
||||
cpu: 500m
|
||||
memory: 1Gi
|
||||
limits:
|
||||
cpu: "1"
|
||||
memory: 2Gi
|
||||
"#, self.namespace, self.grafana_admin_user, self.grafana_admin_password);
|
||||
|
||||
let grafana_value: serde_json::Value = serde_yaml::from_str(grafana_yaml.as_str())
|
||||
.map_err(|e| InterpretError::new(format!("Failed to parse Grafana YAML: {e}")))?;
|
||||
|
||||
let grafana: DynamicObject = serde_json::from_value(grafana_value)
|
||||
.map_err(|e| InterpretError::new(format!("Failed to create DynamicObject: {e}")))?;
|
||||
|
||||
client.apply_dynamic(&grafana, Some(&self.namespace), false).await
|
||||
.map_err(|e| InterpretError::new(format!("Failed to apply Grafana: {e}")))?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn create_datasource(
|
||||
&self,
|
||||
inventory: &Inventory,
|
||||
topology: &(impl Topology + K8sclient),
|
||||
) -> Result<(), InterpretError> {
|
||||
let labels: BTreeMap<String, String> = vec![
|
||||
("datasource".to_string(), "prometheus".to_string()),
|
||||
]
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let client = topology
|
||||
.k8s_client()
|
||||
.await
|
||||
.map_err(|e| InterpretError::new(format!("Failed to get k8s client: {e}")))?;
|
||||
|
||||
let secure_json_data_value = "Bearer ${token}";
|
||||
|
||||
let datasource_yaml = format!(r#"
|
||||
apiVersion: grafana.integreatly.org/v1beta1
|
||||
kind: GrafanaDatasource
|
||||
metadata:
|
||||
name: prometheus-cluster
|
||||
namespace: {}
|
||||
labels:
|
||||
datasource: "prometheus"
|
||||
spec:
|
||||
instanceSelector:
|
||||
matchLabels:
|
||||
dashboards: "grafana"
|
||||
valuesFrom:
|
||||
- targetPath: "secureJsonData.httpHeaderValue1"
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: grafana-prometheus-token
|
||||
key: token
|
||||
datasource:
|
||||
name: Prometheus-Cluster
|
||||
type: prometheus
|
||||
access: proxy
|
||||
url: https://prometheus-k8s.openshift-monitoring.svc:9091
|
||||
isDefault: true
|
||||
jsonData:
|
||||
httpHeaderName1: "Authorization"
|
||||
tlsSkipVerify: true
|
||||
timeInterval: "30s"
|
||||
secureJsonData:
|
||||
httpHeaderValue1: "{}"
|
||||
"#, self.namespace, secure_json_data_value);
|
||||
|
||||
let datasource_value: serde_json::Value = serde_yaml::from_str(datasource_yaml.as_str())
|
||||
.map_err(|e| InterpretError::new(format!("Failed to parse Datasource YAML: {e}")))?;
|
||||
|
||||
let datasource: DynamicObject = serde_json::from_value(datasource_value)
|
||||
.map_err(|e| InterpretError::new(format!("Failed to create DynamicObject: {e}")))?;
|
||||
|
||||
client.apply_dynamic(&datasource, Some(&self.namespace), false).await
|
||||
.map_err(|e| InterpretError::new(format!("Failed to apply Datasource: {e}")))?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn create_dashboards(
|
||||
&self,
|
||||
inventory: &Inventory,
|
||||
topology: &(impl Topology + K8sclient),
|
||||
) -> Result<(), InterpretError> {
|
||||
let client = topology
|
||||
.k8s_client()
|
||||
.await
|
||||
.map_err(|e| InterpretError::new(format!("Failed to get k8s client: {e}")))?;
|
||||
|
||||
let dashboards: &[(&str, &str)] = &[
|
||||
("okd-cluster-overview", include_str!("dashboards/cluster-overview.json")),
|
||||
("okd-node-health", include_str!("dashboards/nodes-health.json")),
|
||||
("okd-workload-health", include_str!("dashboards/workloads-health.json")),
|
||||
("okd-networking", include_str!("dashboards/networking.json")),
|
||||
("storage-health", include_str!("dashboards/storage.json")),
|
||||
("okd-etcd", include_str!("dashboards/etcd.json")),
|
||||
("okd-control-plane", include_str!("dashboards/control-plane.json")),
|
||||
("okd-alerts-events", include_str!("dashboards/alerts-events-problems.json")),
|
||||
];
|
||||
|
||||
for (dashboard_name, json_content) in dashboards {
|
||||
let dashboard: DynamicObject = serde_json::from_value(serde_json::json!({
|
||||
"apiVersion": "grafana.integreatly.org/v1beta1",
|
||||
"kind": "GrafanaDashboard",
|
||||
"metadata": {
|
||||
"name": dashboard_name,
|
||||
"namespace": self.namespace,
|
||||
"labels": {
|
||||
"dashboard": dashboard_name
|
||||
}
|
||||
},
|
||||
"spec": {
|
||||
"instanceSelector": {
|
||||
"matchLabels": {
|
||||
"dashboards": "grafana"
|
||||
}
|
||||
},
|
||||
"json": json_content
|
||||
}
|
||||
})).map_err(|e| InterpretError::new(format!("Failed to create Dashboard {} DynamicObject: {e}", dashboard_name)))?;
|
||||
|
||||
client.apply_dynamic(&dashboard, Some(&self.namespace), false).await
|
||||
.map_err(|e| InterpretError::new(format!("Failed to apply Dashboard {}: {e}", dashboard_name)))?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn create_route(
|
||||
&self,
|
||||
inventory: &Inventory,
|
||||
topology: &(impl Topology + K8sclient),
|
||||
) -> Result<(), InterpretError> {
|
||||
let route = Route {
|
||||
metadata: ObjectMeta {
|
||||
name: Some("grafana".to_string()),
|
||||
namespace: Some(self.namespace.clone()),
|
||||
..ObjectMeta::default()
|
||||
},
|
||||
spec: crate::modules::okd::crd::route::RouteSpec {
|
||||
to: crate::modules::okd::crd::route::RouteTargetReference {
|
||||
kind: "Service".to_string(),
|
||||
name: "cluster-grafana-service".to_string(),
|
||||
weight: None,
|
||||
},
|
||||
port: Some(crate::modules::okd::crd::route::RoutePort {
|
||||
target_port: 3000,
|
||||
}),
|
||||
tls: Some(crate::modules::okd::crd::route::TLSConfig {
|
||||
termination: "edge".to_string(),
|
||||
insecure_edge_termination_policy: Some("Redirect".to_string()),
|
||||
..crate::modules::okd::crd::route::TLSConfig::default()
|
||||
}),
|
||||
..crate::modules::okd::crd::route::RouteSpec::default()
|
||||
},
|
||||
..crate::modules::okd::crd::route::Route::default()
|
||||
};
|
||||
|
||||
K8sResourceScore::single(route, Some(self.namespace.clone()))
|
||||
.interpret(inventory, topology)
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Returns the custom interpret name `ClusterDashboards`.
// NOTE(review): this inherent method mirrors the `get_name` already provided
// by the `Interpret` trait impl for this type earlier in the file; confirm
// whether the inherent copy is still needed.
fn get_name(&self) -> InterpretName {
|
||||
InterpretName::Custom("ClusterDashboards")
|
||||
}
|
||||
|
||||
/// Not implemented yet; calling it panics via `todo!()`.
// NOTE(review): this inherent method mirrors the `get_version` stub in the
// `Interpret` trait impl for this type earlier in the file; confirm whether
// the inherent copy is still needed.
fn get_version(&self) -> Version {
|
||||
todo!()
|
||||
}
|
||||
|
||||
/// Not implemented yet; calling it panics via `todo!()`.
// NOTE(review): this inherent method mirrors the `get_status` stub in the
// `Interpret` trait impl for this type earlier in the file; confirm whether
// the inherent copy is still needed.
fn get_status(&self) -> InterpretStatus {
|
||||
todo!()
|
||||
}
|
||||
|
||||
/// Not implemented yet; calling it panics via `todo!()`.
// NOTE(review): this inherent method mirrors the `get_children` stub in the
// `Interpret` trait impl for this type earlier in the file; confirm whether
// the inherent copy is still needed.
fn get_children(&self) -> Vec<Id> {
|
||||
todo!()
|
||||
}
|
||||
}
|
||||
@@ -6,7 +6,7 @@ use schemars::JsonSchema;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::{
|
||||
interpret::{InterpretError, Outcome},
|
||||
interpret::InterpretError,
|
||||
inventory::Inventory,
|
||||
modules::{
|
||||
monitoring::{
|
||||
@@ -17,10 +17,10 @@ use crate::{
|
||||
topology::{
|
||||
K8sclient, Topology,
|
||||
installable::Installable,
|
||||
k8s::K8sClient,
|
||||
oberservability::monitoring::{AlertReceiver, AlertSender, ScrapeTarget},
|
||||
},
|
||||
};
|
||||
use harmony_k8s::K8sClient;
|
||||
|
||||
#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)]
|
||||
#[kube(
|
||||
|
||||
@@ -4,10 +4,8 @@ use kube::CustomResource;
|
||||
use schemars::JsonSchema;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::topology::{
|
||||
k8s::K8sClient,
|
||||
oberservability::monitoring::{AlertReceiver, AlertSender},
|
||||
};
|
||||
use crate::topology::oberservability::monitoring::{AlertReceiver, AlertSender};
|
||||
use harmony_k8s::K8sClient;
|
||||
|
||||
#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)]
|
||||
#[kube(
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
pub mod alert_channel;
|
||||
pub mod alert_rule;
|
||||
pub mod application_monitoring;
|
||||
pub mod cluster_dashboards;
|
||||
pub mod grafana;
|
||||
pub mod kube_prometheus;
|
||||
pub mod ntfy;
|
||||
|
||||
@@ -11,8 +11,9 @@ use crate::{
|
||||
inventory::Inventory,
|
||||
modules::monitoring::ntfy::helm::ntfy_helm_chart::ntfy_helm_chart_score,
|
||||
score::Score,
|
||||
topology::{HelmCommand, K8sclient, MultiTargetTopology, Topology, k8s::K8sClient},
|
||||
topology::{HelmCommand, K8sclient, MultiTargetTopology, Topology},
|
||||
};
|
||||
use harmony_k8s::K8sClient;
|
||||
use harmony_types::id::Id;
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
|
||||
@@ -1,9 +1,7 @@
|
||||
use std::{collections::BTreeMap, sync::Arc};
|
||||
|
||||
use crate::{
|
||||
interpret::{InterpretError, Outcome},
|
||||
topology::k8s::K8sClient,
|
||||
};
|
||||
use crate::interpret::{InterpretError, Outcome};
|
||||
use harmony_k8s::K8sClient;
|
||||
use k8s_openapi::api::core::v1::ConfigMap;
|
||||
use kube::api::ObjectMeta;
|
||||
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
use std::{collections::BTreeMap, str::FromStr};
|
||||
|
||||
use async_trait::async_trait;
|
||||
use harmony_k8s::KubernetesDistribution;
|
||||
use harmony_macros::hurl;
|
||||
use harmony_secret::{Secret, SecretManager};
|
||||
use harmony_types::id::Id;
|
||||
@@ -25,7 +26,7 @@ use crate::{
|
||||
},
|
||||
},
|
||||
score::Score,
|
||||
topology::{HelmCommand, K8sclient, KubernetesDistribution, TlsRouter, Topology},
|
||||
topology::{HelmCommand, K8sclient, TlsRouter, Topology},
|
||||
};
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
|
||||
@@ -41,6 +41,7 @@ impl OKDBootstrapLoadBalancerScore {
|
||||
backend_servers: Self::topology_to_backend_server(topology, 6443),
|
||||
listening_port: SocketAddr::new(private_ip, 6443),
|
||||
health_check: Some(HealthCheck::HTTP(
|
||||
None,
|
||||
"/readyz".to_string(),
|
||||
HttpMethod::GET,
|
||||
HttpStatusCode::Success2xx,
|
||||
|
||||
@@ -8,7 +8,7 @@ use crate::{
|
||||
score::Score,
|
||||
topology::{
|
||||
BackendServer, HAClusterTopology, HealthCheck, HttpMethod, HttpStatusCode, LoadBalancer,
|
||||
LoadBalancerService, SSL, Topology,
|
||||
LoadBalancerService, LogicalHost, Router, SSL, Topology,
|
||||
},
|
||||
};
|
||||
|
||||
@@ -23,32 +23,72 @@ pub struct OKDLoadBalancerScore {
|
||||
load_balancer_score: LoadBalancerScore,
|
||||
}
|
||||
|
||||
/// OKD Load Balancer Score configuration
|
||||
///
|
||||
/// This module configures the load balancer for OKD (OpenShift Kubernetes Distribution)
|
||||
/// bare metal installations.
|
||||
///
|
||||
/// # Backend Server Configuration
|
||||
///
|
||||
/// For ports 80 and 443 (ingress traffic), the load balancer includes both control plane
|
||||
/// and worker nodes in the backend pool. This is consistent with OKD's requirement that
|
||||
/// ingress traffic should be load balanced across all nodes that may run ingress router pods.
|
||||
///
|
||||
/// For ports 22623 (Ignition API) and 6443 (Kubernetes API), only control plane nodes
|
||||
/// are included as backends, as these services are control plane specific.
|
||||
///
|
||||
/// # References
|
||||
///
|
||||
/// - [OKD Bare Metal Installation - External Load Balancer Configuration]
|
||||
/// (<https://docs.okd.io/latest/installing/installing_bare_metal/ipi/ipi-install-installation-workflow.html#nw-osp-configuring-external-load-balancer_ipi-install-installation-workflow>)
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```ignore
|
||||
/// use harmony::topology::HAClusterTopology;
|
||||
/// use harmony::modules::okd::OKDLoadBalancerScore;
|
||||
///
|
||||
/// let topology: HAClusterTopology = /* get topology from your infrastructure */;
|
||||
/// let score = OKDLoadBalancerScore::new(&topology);
|
||||
/// ```
|
||||
impl OKDLoadBalancerScore {
|
||||
pub fn new(topology: &HAClusterTopology) -> Self {
|
||||
let public_ip = topology.router.get_gateway();
|
||||
let public_services = vec![
|
||||
LoadBalancerService {
|
||||
backend_servers: Self::control_plane_to_backend_server(topology, 80),
|
||||
backend_servers: Self::nodes_to_backend_server(topology, 80),
|
||||
listening_port: SocketAddr::new(public_ip, 80),
|
||||
health_check: Some(HealthCheck::TCP(None)),
|
||||
health_check: None,
|
||||
},
|
||||
LoadBalancerService {
|
||||
backend_servers: Self::control_plane_to_backend_server(topology, 443),
|
||||
backend_servers: Self::nodes_to_backend_server(topology, 443),
|
||||
listening_port: SocketAddr::new(public_ip, 443),
|
||||
health_check: Some(HealthCheck::TCP(None)),
|
||||
health_check: None,
|
||||
},
|
||||
];
|
||||
|
||||
let private_services = vec![
|
||||
LoadBalancerService {
|
||||
backend_servers: Self::control_plane_to_backend_server(topology, 80),
|
||||
backend_servers: Self::nodes_to_backend_server(topology, 80),
|
||||
listening_port: SocketAddr::new(public_ip, 80),
|
||||
health_check: Some(HealthCheck::TCP(None)),
|
||||
health_check: Some(HealthCheck::HTTP(
|
||||
Some(25001),
|
||||
"/health?check=okd_router_1936,node_ready".to_string(),
|
||||
HttpMethod::GET,
|
||||
HttpStatusCode::Success2xx,
|
||||
SSL::Default,
|
||||
)),
|
||||
},
|
||||
LoadBalancerService {
|
||||
backend_servers: Self::control_plane_to_backend_server(topology, 443),
|
||||
backend_servers: Self::nodes_to_backend_server(topology, 443),
|
||||
listening_port: SocketAddr::new(public_ip, 443),
|
||||
health_check: Some(HealthCheck::TCP(None)),
|
||||
health_check: Some(HealthCheck::HTTP(
|
||||
Some(25001),
|
||||
"/health?check=okd_router_1936,node_ready".to_string(),
|
||||
HttpMethod::GET,
|
||||
HttpStatusCode::Success2xx,
|
||||
SSL::Default,
|
||||
)),
|
||||
},
|
||||
LoadBalancerService {
|
||||
backend_servers: Self::control_plane_to_backend_server(topology, 22623),
|
||||
@@ -59,6 +99,7 @@ impl OKDLoadBalancerScore {
|
||||
backend_servers: Self::control_plane_to_backend_server(topology, 6443),
|
||||
listening_port: SocketAddr::new(public_ip, 6443),
|
||||
health_check: Some(HealthCheck::HTTP(
|
||||
None,
|
||||
"/readyz".to_string(),
|
||||
HttpMethod::GET,
|
||||
HttpStatusCode::Success2xx,
|
||||
@@ -74,6 +115,11 @@ impl OKDLoadBalancerScore {
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates backend servers list for control plane nodes only
|
||||
///
|
||||
/// Use this for control plane-specific services like:
|
||||
/// - Port 22623: Ignition API (machine configuration during bootstrap)
|
||||
/// - Port 6443: Kubernetes API server
|
||||
fn control_plane_to_backend_server(
|
||||
topology: &HAClusterTopology,
|
||||
port: u16,
|
||||
@@ -87,6 +133,194 @@ impl OKDLoadBalancerScore {
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Creates backend servers list for all nodes (control plane + workers)
|
||||
///
|
||||
/// Use this for ingress traffic that should be distributed across all nodes:
|
||||
/// - Port 80: HTTP ingress traffic
|
||||
/// - Port 443: HTTPS ingress traffic
|
||||
///
|
||||
/// In OKD, ingress router pods can run on any node, so both control plane
|
||||
/// and worker nodes should be included in the load balancer backend pool.
|
||||
fn nodes_to_backend_server(topology: &HAClusterTopology, port: u16) -> Vec<BackendServer> {
|
||||
let mut nodes = Vec::new();
|
||||
for cp in &topology.control_plane {
|
||||
nodes.push(BackendServer {
|
||||
address: cp.ip.to_string(),
|
||||
port,
|
||||
});
|
||||
}
|
||||
for worker in &topology.workers {
|
||||
nodes.push(BackendServer {
|
||||
address: worker.ip.to_string(),
|
||||
port,
|
||||
});
|
||||
}
|
||||
nodes
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use std::sync::{Arc, OnceLock};

    use super::*;
    use crate::topology::DummyInfra;
    use harmony_macros::ip;
    use harmony_types::net::IpAddress;

    /// Router stub whose only state is the gateway address it reports.
    struct DummyRouter {
        gateway: IpAddress,
    }

    impl Router for DummyRouter {
        fn get_gateway(&self) -> IpAddress {
            self.gateway
        }

        fn get_cidr(&self) -> cidr::Ipv4Cidr {
            // The stub only knows how to derive a /24 from an IPv4 gateway.
            // NOTE(review): `Ipv4Cidr::new` may reject an address with host bits
            // set, in which case the `unwrap` panics — confirm before relying on it.
            let IpAddress::V4(ipv4) = self.gateway else {
                panic!("IPv6 not supported")
            };
            cidr::Ipv4Cidr::new(ipv4, 24).unwrap()
        }

        fn get_host(&self) -> LogicalHost {
            LogicalHost {
                name: "router".to_string(),
                ip: self.gateway,
            }
        }
    }

    /// Shorthand for building a [`LogicalHost`] from an address and a name.
    fn logical_host(ip: IpAddress, name: &str) -> LogicalHost {
        LogicalHost {
            ip,
            name: name.to_string(),
        }
    }

    /// Collects the address of every backend server as a string slice.
    fn addresses_of(servers: &[BackendServer]) -> Vec<&str> {
        servers.iter().map(|s| s.address.as_str()).collect()
    }

    /// Builds a topology with three control plane nodes (`.10`-`.12`), two
    /// workers (`.20`, `.21`) and dummy infrastructure everywhere else.
    fn create_test_topology() -> HAClusterTopology {
        let control_plane = vec![
            logical_host(ip!("192.168.1.10"), "control-plane-0"),
            logical_host(ip!("192.168.1.11"), "control-plane-1"),
            logical_host(ip!("192.168.1.12"), "control-plane-2"),
        ];
        let workers = vec![
            logical_host(ip!("192.168.1.20"), "worker-0"),
            logical_host(ip!("192.168.1.21"), "worker-1"),
        ];

        HAClusterTopology {
            domain_name: "test.example.com".to_string(),
            router: Arc::new(DummyRouter {
                gateway: ip!("192.168.1.1"),
            }),
            load_balancer: Arc::new(DummyInfra),
            firewall: Arc::new(DummyInfra),
            dhcp_server: Arc::new(DummyInfra),
            tftp_server: Arc::new(DummyInfra),
            http_server: Arc::new(DummyInfra),
            dns_server: Arc::new(DummyInfra),
            node_exporter: Arc::new(DummyInfra),
            switch_client: Arc::new(DummyInfra),
            bootstrap_host: logical_host(ip!("192.168.1.100"), "bootstrap"),
            control_plane,
            workers,
            kubeconfig: None,
            network_manager: OnceLock::new(),
        }
    }

    #[test]
    fn test_nodes_to_backend_server_includes_control_plane_and_workers() {
        let topology = create_test_topology();

        let backend_servers = OKDLoadBalancerScore::nodes_to_backend_server(&topology, 80);
        assert_eq!(backend_servers.len(), 5);

        let addresses = addresses_of(&backend_servers);
        assert!(addresses.contains(&"192.168.1.10"));
        assert!(addresses.contains(&"192.168.1.11"));
        assert!(addresses.contains(&"192.168.1.12"));
        assert!(addresses.contains(&"192.168.1.20"));
        assert!(addresses.contains(&"192.168.1.21"));
    }

    #[test]
    fn test_control_plane_to_backend_server_only_includes_control_plane() {
        let topology = create_test_topology();

        let backend_servers = OKDLoadBalancerScore::control_plane_to_backend_server(&topology, 80);
        assert_eq!(backend_servers.len(), 3);

        let addresses = addresses_of(&backend_servers);
        assert!(addresses.contains(&"192.168.1.10"));
        assert!(addresses.contains(&"192.168.1.11"));
        assert!(addresses.contains(&"192.168.1.12"));
        assert!(!addresses.contains(&"192.168.1.20"));
        assert!(!addresses.contains(&"192.168.1.21"));
    }

    #[test]
    fn test_public_services_include_all_nodes_on_port_80_and_443() {
        let topology = create_test_topology();
        let score = OKDLoadBalancerScore::new(&topology);
        let public_services = &score.load_balancer_score.public_services;

        // Look both services up first, then check their backend pools.
        let [public_service_80, public_service_443] = [80u16, 443].map(|port| {
            public_services
                .iter()
                .find(|svc| svc.listening_port.port() == port)
                .unwrap_or_else(|| panic!("Public service on port {port} not found"))
        });

        assert_eq!(public_service_80.backend_servers.len(), 5);
        assert_eq!(public_service_443.backend_servers.len(), 5);
    }

    #[test]
    fn test_private_service_port_22623_only_control_plane() {
        let topology = create_test_topology();
        let score = OKDLoadBalancerScore::new(&topology);

        let ignition_service = score
            .load_balancer_score
            .private_services
            .iter()
            .find(|svc| svc.listening_port.port() == 22623)
            .expect("Private service on port 22623 not found");

        assert_eq!(ignition_service.backend_servers.len(), 3);
    }

    #[test]
    fn test_all_backend_servers_have_correct_port() {
        let topology = create_test_topology();

        OKDLoadBalancerScore::nodes_to_backend_server(&topology, 443)
            .iter()
            .for_each(|server| assert_eq!(server.port, 443));
    }
}
|
||||
|
||||
impl<T: Topology + LoadBalancer> Score<T> for OKDLoadBalancerScore {
|
||||
|
||||
@@ -15,6 +15,7 @@ use serde::{Deserialize, Serialize};
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct ClusterSpec {
|
||||
pub instances: u32,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub image_name: Option<String>,
|
||||
pub storage: Storage,
|
||||
pub bootstrap: Bootstrap,
|
||||
|
||||
@@ -20,7 +20,7 @@ use crate::topology::{K8sclient, Topology};
|
||||
/// # Usage
|
||||
/// ```
|
||||
/// use harmony::modules::postgresql::CloudNativePgOperatorScore;
|
||||
/// let score = CloudNativePgOperatorScore::default();
|
||||
/// let score = CloudNativePgOperatorScore::default_openshift();
|
||||
/// ```
|
||||
///
|
||||
/// Or, you can take control of the most relevant fields this way:
|
||||
@@ -53,7 +53,7 @@ pub struct CloudNativePgOperatorScore {
|
||||
}
|
||||
|
||||
impl CloudNativePgOperatorScore {
|
||||
fn default_openshift() -> Self {
|
||||
pub fn default_openshift() -> Self {
|
||||
Self {
|
||||
namespace: "openshift-operators".to_string(),
|
||||
channel: "stable-v1".to_string(),
|
||||
|
||||
@@ -12,8 +12,7 @@ use crate::modules::monitoring::kube_prometheus::crd::crd_alertmanager_config::C
|
||||
use crate::modules::monitoring::kube_prometheus::crd::crd_default_rules::build_default_application_rules;
|
||||
use crate::modules::monitoring::kube_prometheus::crd::crd_grafana::{
|
||||
Grafana, GrafanaDashboard, GrafanaDashboardSpec, GrafanaDatasource, GrafanaDatasourceConfig,
|
||||
GrafanaDatasourceJsonData, GrafanaDatasourceSpec, GrafanaSecretKeyRef, GrafanaSpec,
|
||||
GrafanaValueFrom, GrafanaValueSource,
|
||||
GrafanaDatasourceJsonData, GrafanaDatasourceSpec, GrafanaSpec,
|
||||
};
|
||||
use crate::modules::monitoring::kube_prometheus::crd::crd_prometheus_rules::{
|
||||
PrometheusRule, PrometheusRuleSpec, RuleGroup,
|
||||
@@ -23,7 +22,7 @@ use crate::modules::monitoring::kube_prometheus::crd::service_monitor::{
|
||||
ServiceMonitor, ServiceMonitorSpec,
|
||||
};
|
||||
use crate::topology::oberservability::monitoring::AlertReceiver;
|
||||
use crate::topology::{K8sclient, Topology, k8s::K8sClient};
|
||||
use crate::topology::{K8sclient, Topology};
|
||||
use crate::{
|
||||
data::Version,
|
||||
interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome},
|
||||
@@ -38,6 +37,7 @@ use crate::{
|
||||
},
|
||||
score::Score,
|
||||
};
|
||||
use harmony_k8s::K8sClient;
|
||||
use harmony_types::id::Id;
|
||||
|
||||
use super::prometheus::PrometheusMonitoring;
|
||||
|
||||
@@ -30,12 +30,13 @@ use crate::modules::monitoring::kube_prometheus::crd::rhob_service_monitor::{
|
||||
use crate::score::Score;
|
||||
use crate::topology::ingress::Ingress;
|
||||
use crate::topology::oberservability::monitoring::AlertReceiver;
|
||||
use crate::topology::{K8sclient, Topology, k8s::K8sClient};
|
||||
use crate::topology::{K8sclient, Topology};
|
||||
use crate::{
|
||||
data::Version,
|
||||
interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome},
|
||||
inventory::Inventory,
|
||||
};
|
||||
use harmony_k8s::K8sClient;
|
||||
use harmony_types::id::Id;
|
||||
|
||||
use super::prometheus::PrometheusMonitoring;
|
||||
|
||||
@@ -4,6 +4,7 @@ use std::{
|
||||
};
|
||||
|
||||
use async_trait::async_trait;
|
||||
use harmony_k8s::K8sClient;
|
||||
use log::{debug, warn};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::time::sleep;
|
||||
@@ -13,7 +14,7 @@ use crate::{
|
||||
interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome},
|
||||
inventory::Inventory,
|
||||
score::Score,
|
||||
topology::{K8sclient, Topology, k8s::K8sClient},
|
||||
topology::{K8sclient, Topology},
|
||||
};
|
||||
use harmony_types::id::Id;
|
||||
|
||||
|
||||
@@ -9,8 +9,9 @@ use crate::{
|
||||
interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome},
|
||||
inventory::Inventory,
|
||||
score::Score,
|
||||
topology::{K8sclient, Topology, k8s::K8sClient},
|
||||
topology::{K8sclient, Topology},
|
||||
};
|
||||
use harmony_k8s::K8sClient;
|
||||
use harmony_types::id::Id;
|
||||
|
||||
#[derive(Clone, Debug, Serialize)]
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
use base64::{Engine, prelude::BASE64_STANDARD};
|
||||
use rand::{thread_rng, Rng};
|
||||
use rand::distributions::Alphanumeric;
|
||||
use k8s_openapi::api::core::v1::Namespace;
|
||||
use k8s_openapi::apimachinery::pkg::apis::meta::v1::ObjectMeta;
|
||||
use k8s_openapi::{ByteString, api::core::v1::Secret};
|
||||
use kube::{Error as KubeError, core::ErrorResponse};
|
||||
use rand::distr::Distribution;
|
||||
use rand::{Rng, rng, seq::SliceRandom};
|
||||
use std::collections::BTreeMap;
|
||||
use std::str::FromStr;
|
||||
|
||||
@@ -38,12 +38,17 @@ const MASTERKEY_SECRET_NAME: &str = "zitadel-masterkey";
|
||||
/// Photos, and others.
|
||||
///
|
||||
/// # Ingress annotations
|
||||
/// No controller-specific ingress annotations are set. The Zitadel service
|
||||
/// already carries the Traefik h2c annotation for k3s/k3d by default.
|
||||
/// Add annotations via `values_overrides` depending on your distribution:
|
||||
/// No controller-specific ingress annotations are set by default. On
|
||||
/// OKD/OpenShift, the ingress should request TLS so the generated Route is
|
||||
/// edge-terminated instead of HTTP-only. Optional cert-manager annotations are
|
||||
/// included for clusters that have cert-manager installed; clusters without
|
||||
/// cert-manager will ignore them.
|
||||
/// Add or adjust annotations via `values_overrides` depending on your
|
||||
/// distribution:
|
||||
/// - NGINX: `nginx.ingress.kubernetes.io/backend-protocol: GRPC`
|
||||
/// - OpenShift HAProxy: `haproxy.router.openshift.io/*` or use OpenShift Routes
|
||||
/// - OpenShift HAProxy: `route.openshift.io/termination: edge`
|
||||
/// - AWS ALB: set `ingress.controller: aws`
|
||||
|
||||
///
|
||||
/// # Database credentials
|
||||
/// CNPG creates a `<cluster>-superuser` secret with key `password`. Because
|
||||
@@ -57,6 +62,7 @@ const MASTERKEY_SECRET_NAME: &str = "zitadel-masterkey";
|
||||
pub struct ZitadelScore {
|
||||
/// External domain (e.g. `"auth.example.com"`).
|
||||
pub host: String,
|
||||
/// Zitadel version; passed to the Helm values as the `image.tag` to deploy.
pub zitadel_version: String,
|
||||
}
|
||||
|
||||
impl<T: Topology + K8sclient + HelmCommand + PostgreSQL> Score<T> for ZitadelScore {
|
||||
@@ -68,6 +74,7 @@ impl<T: Topology + K8sclient + HelmCommand + PostgreSQL> Score<T> for ZitadelSco
|
||||
fn create_interpret(&self) -> Box<dyn Interpret<T>> {
|
||||
Box::new(ZitadelInterpret {
|
||||
host: self.host.clone(),
|
||||
zitadel_version: self.zitadel_version.clone(),
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -77,6 +84,7 @@ impl<T: Topology + K8sclient + HelmCommand + PostgreSQL> Score<T> for ZitadelSco
|
||||
/// Interpret created by `ZitadelScore::create_interpret`; carries copies of
/// the score's settings.
#[derive(Debug, Clone)]
|
||||
struct ZitadelInterpret {
|
||||
/// External domain; used as `ExternalDomain` and as the ingress host in
/// the generated Helm values.
host: String,
|
||||
/// Zitadel version; rendered into the Helm values as `image.tag`.
zitadel_version: String,
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
@@ -165,13 +173,55 @@ impl<T: Topology + K8sclient + HelmCommand + PostgreSQL> Interpret<T> for Zitade
|
||||
let db_port = endpoint.port;
|
||||
let host = &self.host;
|
||||
|
||||
debug!(
|
||||
"[Zitadel] DB credentials source — secret: '{pg_user_secret}', key: 'password'"
|
||||
);
|
||||
debug!("[Zitadel] DB credentials source — secret: '{pg_user_secret}', key: 'password'");
|
||||
debug!(
|
||||
"[Zitadel] DB credentials source — superuser secret: '{pg_superuser_secret}', key: 'password'"
|
||||
);
|
||||
|
||||
// Zitadel requires one symbol, one number and more. So let's force it.
|
||||
fn generate_secure_password(length: usize) -> String {
|
||||
const ALPHA_UPPER: &[u8] = b"ABCDEFGHIJKLMNOPQRSTUVWXYZ";
|
||||
const ALPHA_LOWER: &[u8] = b"abcdefghijklmnopqrstuvwxyz";
|
||||
const DIGITS: &[u8] = b"0123456789";
|
||||
const SYMBOLS: &[u8] = b"!@#$%^&*()_+-=[]{}|;:',.<>?/";
|
||||
|
||||
let mut rng = rand::rng();
|
||||
let uniform_alpha_upper = rand::distr::Uniform::new(0, ALPHA_UPPER.len())
|
||||
.expect("Failed to create distribution");
|
||||
let uniform_alpha_lower = rand::distr::Uniform::new(0, ALPHA_LOWER.len())
|
||||
.expect("Failed to create distribution");
|
||||
let uniform_digits =
|
||||
rand::distr::Uniform::new(0, DIGITS.len()).expect("Failed to create distribution");
|
||||
let uniform_symbols =
|
||||
rand::distr::Uniform::new(0, SYMBOLS.len()).expect("Failed to create distribution");
|
||||
|
||||
let mut chars: Vec<char> = Vec::with_capacity(length);
|
||||
|
||||
// Ensure at least one of each: upper, lower, digit, symbol
|
||||
chars.push(ALPHA_UPPER[uniform_alpha_upper.sample(&mut rng)] as char);
|
||||
chars.push(ALPHA_LOWER[uniform_alpha_lower.sample(&mut rng)] as char);
|
||||
chars.push(DIGITS[uniform_digits.sample(&mut rng)] as char);
|
||||
chars.push(SYMBOLS[uniform_symbols.sample(&mut rng)] as char);
|
||||
|
||||
// Fill remaining with random from all categories
|
||||
let all_chars: Vec<u8> = [ALPHA_UPPER, ALPHA_LOWER, DIGITS, SYMBOLS].concat();
|
||||
|
||||
let uniform_all = rand::distr::Uniform::new(0, all_chars.len())
|
||||
.expect("Failed to create distribution");
|
||||
|
||||
for _ in 0..(length - 4) {
|
||||
chars.push(all_chars[uniform_all.sample(&mut rng)] as char);
|
||||
}
|
||||
|
||||
// Shuffle
|
||||
let mut shuffled = chars;
|
||||
shuffled.shuffle(&mut rng);
|
||||
|
||||
return shuffled.iter().collect();
|
||||
}
|
||||
|
||||
let admin_password = generate_secure_password(16);
|
||||
|
||||
// --- Step 3: Create masterkey secret ------------------------------------
|
||||
|
||||
debug!(
|
||||
@@ -179,17 +229,20 @@ impl<T: Topology + K8sclient + HelmCommand + PostgreSQL> Interpret<T> for Zitade
|
||||
MASTERKEY_SECRET_NAME, NAMESPACE
|
||||
);
|
||||
|
||||
|
||||
// Masterkey for symmetric encryption — must be exactly 32 ASCII bytes.
|
||||
let masterkey: String = thread_rng()
|
||||
.sample_iter(&Alphanumeric)
|
||||
// Masterkey for symmetric encryption — must be exactly 32 ASCII bytes (alphanumeric only).
|
||||
let masterkey = rng()
|
||||
.sample_iter(&rand::distr::Alphanumeric)
|
||||
.take(32)
|
||||
.map(char::from)
|
||||
.collect();
|
||||
let masterkey_bytes = BASE64_STANDARD.encode(&masterkey);
|
||||
.collect::<String>();
|
||||
|
||||
debug!(
|
||||
"[Zitadel] Created masterkey secret '{}' in namespace '{}'",
|
||||
MASTERKEY_SECRET_NAME, NAMESPACE
|
||||
);
|
||||
|
||||
let mut masterkey_data: BTreeMap<String, ByteString> = BTreeMap::new();
|
||||
masterkey_data.insert("masterkey".to_string(), ByteString(masterkey_bytes.into()));
|
||||
masterkey_data.insert("masterkey".to_string(), ByteString(masterkey.into()));
|
||||
|
||||
let masterkey_secret = Secret {
|
||||
metadata: ObjectMeta {
|
||||
@@ -201,43 +254,65 @@ impl<T: Topology + K8sclient + HelmCommand + PostgreSQL> Interpret<T> for Zitade
|
||||
..Secret::default()
|
||||
};
|
||||
|
||||
topology
|
||||
match topology
|
||||
.k8s_client()
|
||||
.await
|
||||
.map_err(|e| InterpretError::new(format!("Failed to get k8s client : {e}")))
|
||||
.create(masterkey_secret)
|
||||
.await?;
|
||||
|
||||
K8sResourceScore::single(masterkey_secret, Some(NAMESPACE.to_string()))
|
||||
.interpret(inventory, topology)
|
||||
.map_err(|e| InterpretError::new(format!("Failed to get k8s client : {e}")))?
|
||||
.create(&masterkey_secret, Some(NAMESPACE))
|
||||
.await
|
||||
.map_err(|e| {
|
||||
let msg = format!("[Zitadel] Failed to create masterkey secret: {e}");
|
||||
error!("{msg}");
|
||||
InterpretError::new(msg)
|
||||
})?;
|
||||
|
||||
{
|
||||
Ok(_) => {
|
||||
info!(
|
||||
"[Zitadel] Masterkey secret '{}' created",
|
||||
MASTERKEY_SECRET_NAME
|
||||
);
|
||||
}
|
||||
Err(KubeError::Api(ErrorResponse { code: 409, .. })) => {
|
||||
info!(
|
||||
"[Zitadel] Masterkey secret '{}' already exists, leaving it untouched",
|
||||
MASTERKEY_SECRET_NAME
|
||||
);
|
||||
}
|
||||
Err(other) => {
|
||||
let msg = format!(
|
||||
"[Zitadel] Failed to create masterkey secret '{}': {other}",
|
||||
MASTERKEY_SECRET_NAME
|
||||
);
|
||||
error!("{msg}");
|
||||
return Err(InterpretError::new(msg));
|
||||
}
|
||||
};
|
||||
|
||||
debug!(
|
||||
"[Zitadel] Masterkey secret '{}' created successfully",
|
||||
MASTERKEY_SECRET_NAME
|
||||
);
|
||||
|
||||
// --- Step 4: Build Helm values ------------------------------------
|
||||
|
||||
warn!(
|
||||
"[Zitadel] No ingress controller annotations are set. \
|
||||
Add controller-specific annotations for your distribution: \
|
||||
NGINX → 'nginx.ingress.kubernetes.io/backend-protocol: GRPC'; \
|
||||
OpenShift HAProxy → 'haproxy.router.openshift.io/*' or use Routes; \
|
||||
AWS ALB → set ingress.controller=aws."
|
||||
"[Zitadel] Applying TLS-enabled ingress defaults for OKD/OpenShift. \
|
||||
cert-manager annotations are included as optional hints and are \
|
||||
ignored on clusters without cert-manager."
|
||||
);
|
||||
|
||||
let values_yaml = format!(
|
||||
r#"zitadel:
|
||||
r#"image:
|
||||
tag: {zitadel_version}
|
||||
zitadel:
|
||||
masterkeySecretName: "{MASTERKEY_SECRET_NAME}"
|
||||
configmapConfig:
|
||||
ExternalDomain: "{host}"
|
||||
ExternalSecure: true
|
||||
FirstInstance:
|
||||
Org:
|
||||
Human:
|
||||
UserName: "admin"
|
||||
Password: "{admin_password}"
|
||||
FirstName: "Zitadel"
|
||||
LastName: "Admin"
|
||||
Email: "admin@zitadel.example.com"
|
||||
PasswordChangeRequired: true
|
||||
TLS:
|
||||
Enabled: false
|
||||
Database:
|
||||
@@ -333,12 +408,19 @@ setupJob:
|
||||
type: RuntimeDefault
|
||||
ingress:
|
||||
enabled: true
|
||||
annotations: {{}}
|
||||
annotations:
|
||||
cert-manager.io/cluster-issuer: letsencrypt-prod
|
||||
route.openshift.io/termination: edge
|
||||
hosts:
|
||||
- host: "{host}"
|
||||
paths:
|
||||
- path: /
|
||||
pathType: Prefix
|
||||
tls:
|
||||
- hosts:
|
||||
- "{host}"
|
||||
secretName: "{host}-tls"
|
||||
|
||||
login:
|
||||
enabled: true
|
||||
podSecurityContext:
|
||||
@@ -359,12 +441,19 @@ login:
|
||||
type: RuntimeDefault
|
||||
ingress:
|
||||
enabled: true
|
||||
annotations: {{}}
|
||||
annotations:
|
||||
cert-manager.io/cluster-issuer: letsencrypt-prod
|
||||
route.openshift.io/termination: edge
|
||||
hosts:
|
||||
- host: "{host}"
|
||||
paths:
|
||||
- path: /ui/v2/login
|
||||
pathType: Prefix"#
|
||||
pathType: Prefix
|
||||
tls:
|
||||
- hosts:
|
||||
- "{host}"
|
||||
secretName: "{host}-tls""#,
|
||||
zitadel_version = self.zitadel_version
|
||||
);
|
||||
|
||||
trace!("[Zitadel] Helm values YAML:\n{values_yaml}");
|
||||
@@ -394,7 +483,17 @@ login:
|
||||
.await;
|
||||
|
||||
match &result {
|
||||
Ok(_) => info!("[Zitadel] Helm chart deployed successfully"),
|
||||
Ok(_) => info!(
|
||||
"[Zitadel] Helm chart deployed successfully\n\n\
|
||||
===== ZITADEL DEPLOYMENT COMPLETE =====\n\
|
||||
Login URL: https://{host}\n\
|
||||
Username: admin@zitadel.{host}\n\
|
||||
Password: {admin_password}\n\n\
|
||||
IMPORTANT: The password is saved in ConfigMap 'zitadel-config-yaml'\n\
|
||||
and must be changed on first login. Save the credentials in a\n\
|
||||
secure location after changing them.\n\
|
||||
========================================="
|
||||
),
|
||||
Err(e) => error!("[Zitadel] Helm chart deployment failed: {e}"),
|
||||
}
|
||||
|
||||
|
||||
@@ -13,3 +13,5 @@ env_logger.workspace = true
|
||||
log.workspace = true
|
||||
tokio.workspace = true
|
||||
reqwest.workspace = true
|
||||
chrono.workspace = true
|
||||
tower = "0.5.3"
|
||||
|
||||
@@ -4,10 +4,11 @@
|
||||
|
||||
Designed for **bare-metal Kubernetes clusters** with external load balancers (HAProxy, OPNsense, F5, etc.).
|
||||
|
||||
It exposes a simple, reliable HTTP endpoint (`/health`) on each node that returns:
|
||||
Exposes a simple HTTP endpoint (`/health`) on each node:
|
||||
|
||||
- **200 OK** — node is healthy and ready to receive traffic
|
||||
- **503 Service Unavailable** — node should be removed from the load balancer pool
|
||||
- **500 Internal Server Error** — misconfiguration (e.g. `NODE_NAME` not set)
|
||||
|
||||
This project is **not dependent on Harmony**, but is commonly used as part of Harmony bare-metal Kubernetes deployments.
|
||||
|
||||
@@ -16,199 +17,181 @@ This project is **not dependent on Harmony**, but is commonly used as part of Ha
|
||||
In bare-metal environments, external load balancers often rely on pod-level or router-level checks that can lag behind the authoritative Kubernetes `Node.status.conditions[Ready]`.
|
||||
This service provides the true source-of-truth with fast reaction time.
|
||||
|
||||
## Features & Roadmap
|
||||
## Available checks
|
||||
|
||||
| Check | Description | Status | Check Name |
|
||||
|------------------------------------|--------------------------------------------------|---------------------|--------------------|
|
||||
| **Node readiness (API)** | Queries `Node.status.conditions[Ready]` via Kubernetes API | **Implemented** | `node_ready` |
|
||||
| **OKD Router health** | Probes OpenShift router healthz on port 1936 | **Implemented** | `okd_router_1936` |
|
||||
| Filesystem readonly | Detects read-only mounts via `/proc/mounts` | To be implemented | `filesystem_ro` |
|
||||
| Kubelet running | Local probe to kubelet `/healthz` (port 10248) | To be implemented | `kubelet` |
|
||||
| CRI-O / container runtime health | Socket check + runtime status | To be implemented | `container_runtime`|
|
||||
| Disk / inode pressure | Threshold checks on key filesystems | To be implemented | `disk_pressure` |
|
||||
| Network reachability | DNS resolution + gateway connectivity | To be implemented | `network` |
|
||||
| Custom NodeConditions | Reacts to extra conditions (NPD, etc.) | To be implemented | `custom_conditions`|
|
||||
| Check name | Description | Status |
|
||||
|--------------------|-------------------------------------------------------------|-------------------|
|
||||
| `node_ready` | Queries `Node.status.conditions[Ready]` via Kubernetes API | Implemented |
|
||||
| `okd_router_1936` | Probes OpenShift router `/healthz/ready` on port 1936 | Implemented |
|
||||
| `filesystem_ro` | Detects read-only mounts via `/proc/mounts` | To be implemented |
|
||||
| `kubelet` | Local probe to kubelet `/healthz` (port 10248) | To be implemented |
|
||||
| `container_runtime`| Socket check + runtime status | To be implemented |
|
||||
| `disk_pressure` | Threshold checks on key filesystems | To be implemented |
|
||||
| `network` | DNS resolution + gateway connectivity | To be implemented |
|
||||
| `custom_conditions`| Reacts to extra conditions (NPD, etc.) | To be implemented |
|
||||
|
||||
All checks are combined with logical **AND** — any failure results in 503.
|
||||
All checks are combined with logical **AND** — any single failure results in 503.
|
||||
|
||||
## Behavior
|
||||
|
||||
### `node_ready` check — fail-open design
|
||||
|
||||
The `node_ready` check queries the Kubernetes API server to read `Node.status.conditions[Ready]`.
|
||||
Because this service runs on the node it is checking, there are scenarios where the API server is temporarily
|
||||
unreachable (e.g. during a control-plane restart). To avoid incorrectly draining a healthy node in such cases,
|
||||
the check is **fail-open**: it passes (reports ready) whenever the Kubernetes API is unavailable.
|
||||
|
||||
| Situation | Result | HTTP status |
|
||||
|------------------------------------------------------|-------------------|-------------|
|
||||
| `Node.conditions[Ready] == True` | Pass | 200 |
|
||||
| `Node.conditions[Ready] == False` | Fail | 503 |
|
||||
| `Ready` condition absent | Fail | 503 |
|
||||
| API server unreachable or timed out (1 s timeout) | Pass (assumes ready) | 200 |
|
||||
| Kubernetes client initialization failed | Pass (assumes ready) | 200 |
|
||||
| `NODE_NAME` env var not set | Hard error | 500 |
|
||||
|
||||
A warning is logged whenever the API is unavailable and the check falls back to assuming ready.
|
||||
|
||||
### `okd_router_1936` check
|
||||
|
||||
Sends `GET http://127.0.0.1:1936/healthz/ready` with a 5-second timeout.
|
||||
Returns pass on any 2xx response, fail otherwise.
|
||||
|
||||
### Unknown check names
|
||||
|
||||
Requesting an unknown check name (e.g. `check=bogus`) results in that check returning `passed: false`
|
||||
with reason `"Unknown check: bogus"`, and the overall response is 503.
|
||||
|
||||
## How it works
|
||||
|
||||
### Node Name Discovery
|
||||
The service automatically discovers its own node name using the **Kubernetes Downward API**:
|
||||
### Node name discovery
|
||||
|
||||
The service reads the `NODE_NAME` environment variable, which must be injected via the Kubernetes Downward API:
|
||||
|
||||
```yaml
|
||||
env:
|
||||
- name: NODE_NAME
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: metadata.name
|
||||
fieldPath: spec.nodeName
|
||||
```
|
||||
|
||||
### Kubernetes API Authentication
|
||||
### Kubernetes API authentication
|
||||
|
||||
- Uses standard **in-cluster configuration** (no external credentials needed).
|
||||
- The ServiceAccount token and CA certificate are automatically mounted by Kubernetes at `/var/run/secrets/kubernetes.io/serviceaccount/`.
|
||||
- The application (via `kube-rs` or your Harmony higher-level client) calls the equivalent of `Config::incluster_config()`.
|
||||
- Requires only minimal RBAC: `get` permission on the `nodes` resource (see `deploy/rbac.yaml`).
|
||||
- Uses standard **in-cluster configuration** — no external credentials needed.
|
||||
- The ServiceAccount token and CA certificate are automatically mounted at `/var/run/secrets/kubernetes.io/serviceaccount/`.
|
||||
- Requires only minimal RBAC: `get` and `list` on the `nodes` resource (see `deploy/resources.yaml`).
|
||||
- Connect and write timeouts are set to **1 second** to keep checks fast.
|
||||
|
||||
## Quick Start
|
||||
## Deploy
|
||||
|
||||
All supporting Kubernetes resources (Namespace, ServiceAccount, ClusterRole, ClusterRoleBinding, and an OpenShift SCC RoleBinding for `hostnetwork`) are defined in a single file, `deploy/resources.yaml`.
|
||||
|
||||
### 1. Build and push
|
||||
```bash
|
||||
cargo build --release --bin harmony-node-readiness-endpoint
|
||||
|
||||
docker build -t your-registry/harmony-node-readiness-endpoint:v1.0.0 .
|
||||
docker push your-registry/harmony-node-readiness-endpoint:v1.0.0
|
||||
```
|
||||
|
||||
### 2. Deploy
|
||||
```bash
|
||||
kubectl apply -f deploy/namespace.yaml
|
||||
kubectl apply -f deploy/rbac.yaml
|
||||
kubectl apply -f deploy/resources.yaml
|
||||
kubectl apply -f deploy/daemonset.yaml
|
||||
```
|
||||
|
||||
(The DaemonSet uses `hostPort: 25001` by default so the endpoint is reachable directly on the node's IP.)
|
||||
The DaemonSet uses `hostNetwork: true` and `hostPort: 25001`, so the endpoint is reachable directly on the node's IP at port 25001.
|
||||
It tolerates all taints, ensuring it runs even on nodes marked unschedulable.
|
||||
|
||||
### 3. Configure your external load balancer
|
||||
### Configure your external load balancer
|
||||
|
||||
**Example for HAProxy / OPNsense:**
|
||||
- Check type: **HTTP**
|
||||
- URI: `/health`
|
||||
- Port: `25001` (configurable via `LISTEN_PORT`)
|
||||
- Port: `25001` (configurable via `LISTEN_PORT` env var)
|
||||
- Interval: 5–10 s
|
||||
- Rise: 2
|
||||
- Fall: 3
|
||||
- Expect: `2xx`
|
||||
|
||||
## Health Endpoint Examples
|
||||
## Endpoint usage
|
||||
|
||||
### Query Parameter
|
||||
### Query parameter
|
||||
|
||||
Use the `check` query parameter to specify which checks to run. Multiple checks can be comma-separated.
|
||||
Use the `check` query parameter to select which checks to run (comma-separated).
|
||||
When omitted, only `node_ready` runs.
|
||||
|
||||
| Request | Behavior |
|
||||
|--------------------------------------|---------------------------------------------|
|
||||
| `GET /health` | Runs `node_ready` (default) |
|
||||
| `GET /health?check=okd_router_1936` | Runs only OKD router check |
|
||||
| `GET /health?check=node_ready,okd_router_1936` | Runs both checks |
|
||||
| Request | Checks run |
|
||||
|------------------------------------------------|-----------------------------------|
|
||||
| `GET /health` | `node_ready` |
|
||||
| `GET /health?check=okd_router_1936` | `okd_router_1936` only |
|
||||
| `GET /health?check=node_ready,okd_router_1936` | `node_ready` and `okd_router_1936`|
|
||||
|
||||
**Note:** When the `check` parameter is provided, only the specified checks run. You must explicitly include `node_ready` if you want it along with other checks.
|
||||
> **Note:** Specifying `check=` replaces the default check set rather than adding to it. Include `node_ready` explicitly if you need it alongside other checks.
|
||||
|
||||
### Response Format
|
||||
|
||||
Each check result includes:
|
||||
- `name`: The check identifier
|
||||
- `passed`: Boolean indicating success or failure
|
||||
- `reason`: (Optional) Failure reason if the check failed
|
||||
- `duration_ms`: Time taken to execute the check in milliseconds
|
||||
|
||||
**Healthy node (default check)**
|
||||
```http
|
||||
HTTP/1.1 200 OK
|
||||
Content-Type: application/json
|
||||
### Response format
|
||||
|
||||
```json
|
||||
{
|
||||
|
||||
"status": "ready" | "not-ready",
|
||||
"checks": [
|
||||
{
|
||||
GET /health?check=node_ready,okd_router_1936
|
||||
|
||||
"name": "<check-name>",
|
||||
"passed": true | false,
|
||||
"reason": "<failure reason, omitted on success>",
|
||||
"duration_ms": 42
|
||||
}
|
||||
|
||||
],
|
||||
"total_duration_ms": 42
|
||||
}
|
||||
```
|
||||
|
||||
```http
|
||||
**Healthy node (default)**
|
||||
```http
|
||||
|
||||
HTTP/1.1 503 Service Unavailable
|
||||
HTTP/1.1 200 OK
|
||||
|
||||
|
||||
{
|
||||
"status": "ready",
|
||||
```http
|
||||
HTTP/1.1 503 Service Unavailable
|
||||
Content-Type: application/json
|
||||
|
||||
```
|
||||
|
||||
## Configuration (via DaemonSet env vars)
|
||||
|
||||
```yaml
|
||||
env:
|
||||
- name: NODE_NAME
|
||||
valueFrom:
|
||||
"checks": [{ "name": "node_ready", "passed": true, "duration_ms": 42 }],
|
||||
"total_duration_ms": 42
|
||||
}
|
||||
```
|
||||
|
||||
value: "25001"
|
||||
**Unhealthy node**
|
||||
```http
|
||||
|
||||
Checks are selected via the `check` query parameter on the `/health` endpoint. See the usage examples above.
|
||||
HTTP/1.1 503 Service Unavailable
|
||||
## Development
|
||||
|
||||
{
|
||||
"status": "not-ready",
|
||||
"checks": [
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
*Minimal, auditable, and built for production bare-metal Kubernetes environments.*
|
||||
|
||||
"name": "okd_router_1936",
|
||||
"passed": false,
|
||||
"reason": "Failed to connect to OKD router: connection refused",
|
||||
"duration_ms": 5
|
||||
}
|
||||
]
|
||||
{ "name": "node_ready", "passed": false, "reason": "KubeletNotReady", "duration_ms": 35 }
|
||||
],
|
||||
"total_duration_ms": 35
|
||||
}
|
||||
```
|
||||
|
||||
**Unhealthy node (default check)**
|
||||
**API server unreachable (fail-open)**
|
||||
```http
|
||||
HTTP/1.1 503 Service Unavailable
|
||||
Content-Type: application/json
|
||||
HTTP/1.1 200 OK
|
||||
|
||||
{
|
||||
"status": "not-ready",
|
||||
"checks": [
|
||||
{
|
||||
"name": "node_ready",
|
||||
"passed": false,
|
||||
"reason": "KubeletNotReady",
|
||||
"duration_ms": 35
|
||||
}
|
||||
]
|
||||
"status": "ready",
|
||||
"checks": [{ "name": "node_ready", "passed": true, "duration_ms": 1001 }],
|
||||
"total_duration_ms": 1001
|
||||
}
|
||||
```
|
||||
*(A warning is logged: `Kubernetes API appears to be down … Assuming node is ready.`)*
|
||||
|
||||
## Configuration (via DaemonSet env vars)
|
||||
## Configuration
|
||||
|
||||
```yaml
|
||||
env:
|
||||
- name: NODE_NAME
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: metadata.name
|
||||
- name: LISTEN_PORT
|
||||
value: "25001"
|
||||
```
|
||||
|
||||
Checks are selected via the `check` query parameter on the `/health` endpoint. See the usage examples above.
|
||||
| Env var | Default | Description |
|
||||
|---------------|----------|--------------------------------------|
|
||||
| `NODE_NAME` | required | Node name, injected via Downward API |
|
||||
| `LISTEN_PORT` | `25001` | TCP port the HTTP server binds to |
|
||||
| `RUST_LOG` | — | Log level (e.g. `info`, `debug`) |
|
||||
|
||||
## Development
|
||||
|
||||
```bash
|
||||
# Run locally (set NODE_NAME env var)
|
||||
# Run locally
|
||||
NODE_NAME=my-test-node cargo run
|
||||
|
||||
# Run tests
|
||||
cargo test
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
*Minimal, auditable, and built for production bare-metal Kubernetes environments.*
|
||||
|
||||
|
||||
0
harmony_node_readiness/build-docker.sh
Normal file → Executable file
0
harmony_node_readiness/build-docker.sh
Normal file → Executable file
@@ -27,8 +27,8 @@ spec:
|
||||
fieldRef:
|
||||
fieldPath: spec.nodeName
|
||||
ports:
|
||||
- containerPort: 8080
|
||||
hostPort: 8080
|
||||
- containerPort: 25001
|
||||
hostPort: 25001
|
||||
name: health-port
|
||||
resources:
|
||||
requests:
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user