Compare commits
32 Commits
feat/clust
...
worktree-b
| Author | SHA1 | Date | |
|---|---|---|---|
| 82d1f87ff8 | |||
| 9a67bcc96f | |||
| a377fc1404 | |||
| c9977fee12 | |||
| 64bf585e07 | |||
| 44e2c45435 | |||
| cdccbc8939 | |||
| 9830971d05 | |||
| e1183ef6de | |||
| 444fea81b8 | |||
| 907ae04195 | |||
| 64582caa64 | |||
| f5736fcc37 | |||
| 7a1e84fb68 | |||
| 8499f4d1b7 | |||
| 231d9b878e | |||
| ee2dade0be | |||
| aa07f4c8ad | |||
| 77bb138497 | |||
| a16879b1b6 | |||
| f57e6f5957 | |||
| 7605d05de3 | |||
| b244127843 | |||
| 67c3265286 | |||
| de49e9ebcc | |||
| d8ab9d52a4 | |||
| 2cb7aeefc0 | |||
| 16016febcf | |||
| e709de531d | |||
| 6ab0f3a6ab | |||
| 724ab0b888 | |||
| 8b6ce8d069 |
@@ -15,4 +15,4 @@ jobs:
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Run check script
|
||||
run: bash check.sh
|
||||
run: bash build/check.sh
|
||||
|
||||
3
.gitignore
vendored
3
.gitignore
vendored
@@ -29,3 +29,6 @@ Cargo.lock
|
||||
|
||||
# Useful to create ignore folders for temp files and notes
|
||||
ignore
|
||||
|
||||
# Generated book
|
||||
book
|
||||
|
||||
731
Cargo.lock
generated
731
Cargo.lock
generated
@@ -249,15 +249,6 @@ dependencies = [
|
||||
"zerocopy",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "aho-corasick"
|
||||
version = "0.6.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "81ce3d38065e618af2d7b77e10c5ad9a069859b4be3c2250f674af3840d9c8a5"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "aho-corasick"
|
||||
version = "1.1.4"
|
||||
@@ -297,6 +288,12 @@ dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ansi_term"
|
||||
version = "0.10.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6b3568b48b7cefa6b8ce125f9bb4989e52fbcc29ebea88df04cc7c5f12f70455"
|
||||
|
||||
[[package]]
|
||||
name = "anstream"
|
||||
version = "0.6.21"
|
||||
@@ -686,7 +683,7 @@ dependencies = [
|
||||
"tokio-util",
|
||||
"tower-service",
|
||||
"url",
|
||||
"winapi 0.3.9",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -718,6 +715,41 @@ dependencies = [
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "brocade-snmp-server"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"base64 0.22.1",
|
||||
"brocade",
|
||||
"env_logger",
|
||||
"harmony",
|
||||
"harmony_cli",
|
||||
"harmony_macros",
|
||||
"harmony_secret",
|
||||
"harmony_types",
|
||||
"log",
|
||||
"serde",
|
||||
"tokio",
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "brocade-switch"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"brocade",
|
||||
"env_logger",
|
||||
"harmony",
|
||||
"harmony_cli",
|
||||
"harmony_macros",
|
||||
"harmony_types",
|
||||
"log",
|
||||
"serde",
|
||||
"tokio",
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "brotli"
|
||||
version = "8.0.2"
|
||||
@@ -780,12 +812,6 @@ dependencies = [
|
||||
"bytes",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "c_linked_list"
|
||||
version = "1.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4964518bd3b4a8190e832886cdc0da9794f12e8e6c1613a9e90ff331c4c8724b"
|
||||
|
||||
[[package]]
|
||||
name = "camino"
|
||||
version = "1.2.2"
|
||||
@@ -871,6 +897,22 @@ dependencies = [
|
||||
"shlex",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cert_manager"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"assert_cmd",
|
||||
"cidr",
|
||||
"env_logger",
|
||||
"harmony",
|
||||
"harmony_cli",
|
||||
"harmony_macros",
|
||||
"harmony_types",
|
||||
"log",
|
||||
"tokio",
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cfg-if"
|
||||
version = "1.0.4"
|
||||
@@ -1226,7 +1268,7 @@ dependencies = [
|
||||
"parking_lot",
|
||||
"signal-hook",
|
||||
"signal-hook-mio",
|
||||
"winapi 0.3.9",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1242,7 +1284,7 @@ dependencies = [
|
||||
"parking_lot",
|
||||
"signal-hook",
|
||||
"signal-hook-mio",
|
||||
"winapi 0.3.9",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1259,7 +1301,7 @@ dependencies = [
|
||||
"rustix 0.38.44",
|
||||
"signal-hook",
|
||||
"signal-hook-mio",
|
||||
"winapi 0.3.9",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1268,7 +1310,7 @@ version = "0.9.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "acdd7c62a3665c7f6830a51635d9ac9b23ed385797f70a83bb8bafe9c572ab2b"
|
||||
dependencies = [
|
||||
"winapi 0.3.9",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1669,19 +1711,6 @@ dependencies = [
|
||||
"syn 2.0.117",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dmidecode"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4e529c1bd93d69804dc1e0a0c73aacd12bb13c7a18c659497411abdc6acf5e5f"
|
||||
dependencies = [
|
||||
"aho-corasick 0.6.10",
|
||||
"bitflags 1.3.2",
|
||||
"failure",
|
||||
"failure_derive",
|
||||
"lazy_static",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dockerfile_builder"
|
||||
version = "0.1.6"
|
||||
@@ -1929,6 +1958,457 @@ dependencies = [
|
||||
name = "example"
|
||||
version = "0.0.0"
|
||||
|
||||
[[package]]
|
||||
name = "example-application-monitoring-with-tenant"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"env_logger",
|
||||
"harmony",
|
||||
"harmony_cli",
|
||||
"harmony_types",
|
||||
"logging",
|
||||
"tokio",
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "example-cli"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"assert_cmd",
|
||||
"cidr",
|
||||
"env_logger",
|
||||
"harmony",
|
||||
"harmony_cli",
|
||||
"harmony_macros",
|
||||
"harmony_types",
|
||||
"log",
|
||||
"tokio",
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "example-k8s-drain-node"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"assert_cmd",
|
||||
"cidr",
|
||||
"env_logger",
|
||||
"harmony",
|
||||
"harmony-k8s",
|
||||
"harmony_cli",
|
||||
"harmony_macros",
|
||||
"harmony_types",
|
||||
"inquire 0.7.5",
|
||||
"log",
|
||||
"tokio",
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "example-k8s-write-file-on-node"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"assert_cmd",
|
||||
"cidr",
|
||||
"env_logger",
|
||||
"harmony",
|
||||
"harmony-k8s",
|
||||
"harmony_cli",
|
||||
"harmony_macros",
|
||||
"harmony_types",
|
||||
"inquire 0.7.5",
|
||||
"log",
|
||||
"tokio",
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "example-kube-rs"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"cidr",
|
||||
"env_logger",
|
||||
"harmony",
|
||||
"harmony_macros",
|
||||
"http 1.4.0",
|
||||
"inquire 0.7.5",
|
||||
"k8s-openapi",
|
||||
"kube",
|
||||
"log",
|
||||
"serde_yaml",
|
||||
"tokio",
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "example-lamp"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"cidr",
|
||||
"env_logger",
|
||||
"harmony",
|
||||
"harmony_cli",
|
||||
"harmony_macros",
|
||||
"harmony_types",
|
||||
"log",
|
||||
"tokio",
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "example-monitoring"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"harmony",
|
||||
"harmony_cli",
|
||||
"harmony_macros",
|
||||
"harmony_types",
|
||||
"tokio",
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "example-monitoring-with-tenant"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"cidr",
|
||||
"harmony",
|
||||
"harmony_cli",
|
||||
"harmony_types",
|
||||
"tokio",
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "example-multisite-postgres"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"cidr",
|
||||
"env_logger",
|
||||
"harmony",
|
||||
"harmony_cli",
|
||||
"harmony_macros",
|
||||
"harmony_types",
|
||||
"log",
|
||||
"tokio",
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "example-nats"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"cidr",
|
||||
"env_logger",
|
||||
"harmony",
|
||||
"harmony_cli",
|
||||
"harmony_macros",
|
||||
"harmony_types",
|
||||
"log",
|
||||
"tokio",
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "example-nats-module-supercluster"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"cidr",
|
||||
"env_logger",
|
||||
"harmony",
|
||||
"harmony_cli",
|
||||
"harmony_macros",
|
||||
"harmony_types",
|
||||
"k8s-openapi",
|
||||
"log",
|
||||
"tokio",
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "example-nats-supercluster"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"cidr",
|
||||
"env_logger",
|
||||
"harmony",
|
||||
"harmony_cli",
|
||||
"harmony_macros",
|
||||
"harmony_types",
|
||||
"k8s-openapi",
|
||||
"log",
|
||||
"tokio",
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "example-node-health"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"env_logger",
|
||||
"harmony",
|
||||
"harmony_cli",
|
||||
"harmony_macros",
|
||||
"harmony_types",
|
||||
"log",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "example-ntfy"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"harmony",
|
||||
"harmony_cli",
|
||||
"tokio",
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "example-okd-cluster-alerts"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"brocade",
|
||||
"cidr",
|
||||
"env_logger",
|
||||
"harmony",
|
||||
"harmony_cli",
|
||||
"harmony_macros",
|
||||
"harmony_secret",
|
||||
"harmony_secret_derive",
|
||||
"harmony_types",
|
||||
"log",
|
||||
"serde",
|
||||
"tokio",
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "example-okd-install"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"brocade",
|
||||
"cidr",
|
||||
"env_logger",
|
||||
"harmony",
|
||||
"harmony_cli",
|
||||
"harmony_macros",
|
||||
"harmony_secret",
|
||||
"harmony_secret_derive",
|
||||
"harmony_types",
|
||||
"log",
|
||||
"schemars 0.8.22",
|
||||
"serde",
|
||||
"tokio",
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "example-openbao"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"harmony",
|
||||
"harmony_cli",
|
||||
"harmony_macros",
|
||||
"harmony_types",
|
||||
"tokio",
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "example-operatorhub-catalogsource"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"cidr",
|
||||
"env_logger",
|
||||
"harmony",
|
||||
"harmony_cli",
|
||||
"harmony_macros",
|
||||
"harmony_types",
|
||||
"log",
|
||||
"tokio",
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "example-opnsense"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"brocade",
|
||||
"cidr",
|
||||
"env_logger",
|
||||
"harmony",
|
||||
"harmony_cli",
|
||||
"harmony_macros",
|
||||
"harmony_secret",
|
||||
"harmony_types",
|
||||
"log",
|
||||
"schemars 0.8.22",
|
||||
"serde",
|
||||
"tokio",
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "example-opnsense-node-exporter"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"cidr",
|
||||
"env_logger",
|
||||
"harmony",
|
||||
"harmony_cli",
|
||||
"harmony_macros",
|
||||
"harmony_secret",
|
||||
"harmony_secret_derive",
|
||||
"harmony_types",
|
||||
"log",
|
||||
"serde",
|
||||
"tokio",
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "example-postgresql"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"cidr",
|
||||
"env_logger",
|
||||
"harmony",
|
||||
"harmony_cli",
|
||||
"harmony_macros",
|
||||
"harmony_types",
|
||||
"log",
|
||||
"tokio",
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "example-public-postgres"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"cidr",
|
||||
"env_logger",
|
||||
"harmony",
|
||||
"harmony_cli",
|
||||
"harmony_macros",
|
||||
"harmony_types",
|
||||
"log",
|
||||
"tokio",
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "example-pxe"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"brocade",
|
||||
"cidr",
|
||||
"env_logger",
|
||||
"harmony",
|
||||
"harmony_cli",
|
||||
"harmony_macros",
|
||||
"harmony_secret",
|
||||
"harmony_secret_derive",
|
||||
"harmony_types",
|
||||
"log",
|
||||
"schemars 0.8.22",
|
||||
"serde",
|
||||
"tokio",
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "example-remove-rook-osd"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"harmony",
|
||||
"harmony_cli",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "example-rust"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"base64 0.22.1",
|
||||
"env_logger",
|
||||
"harmony",
|
||||
"harmony_cli",
|
||||
"harmony_macros",
|
||||
"harmony_types",
|
||||
"log",
|
||||
"tokio",
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "example-tenant"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"cidr",
|
||||
"env_logger",
|
||||
"harmony",
|
||||
"harmony_cli",
|
||||
"harmony_macros",
|
||||
"harmony_types",
|
||||
"log",
|
||||
"tokio",
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "example-try-rust-webapp"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"base64 0.22.1",
|
||||
"env_logger",
|
||||
"harmony",
|
||||
"harmony_cli",
|
||||
"harmony_macros",
|
||||
"harmony_types",
|
||||
"log",
|
||||
"tokio",
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "example-tui"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"cidr",
|
||||
"env_logger",
|
||||
"harmony",
|
||||
"harmony_macros",
|
||||
"harmony_tui",
|
||||
"harmony_types",
|
||||
"log",
|
||||
"tokio",
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "example-zitadel"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"harmony",
|
||||
"harmony_cli",
|
||||
"harmony_macros",
|
||||
"harmony_types",
|
||||
"tokio",
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "example_validate_ceph_cluster_health"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"harmony",
|
||||
"harmony_cli",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "eyre"
|
||||
version = "0.6.12"
|
||||
@@ -1939,28 +2419,6 @@ dependencies = [
|
||||
"once_cell",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "failure"
|
||||
version = "0.1.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d32e9bd16cc02eae7db7ef620b392808b89f6a5e16bb3497d159c6b92a0f4f86"
|
||||
dependencies = [
|
||||
"backtrace",
|
||||
"failure_derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "failure_derive"
|
||||
version = "0.1.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "aa4da3c766cd7a0db8242e326e9e4e081edd567072893ed320008189715366a4"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 1.0.109",
|
||||
"synstructure 0.12.6",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fastrand"
|
||||
version = "2.3.0"
|
||||
@@ -2187,12 +2645,6 @@ dependencies = [
|
||||
"byteorder",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "gcc"
|
||||
version = "0.3.55"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8f5f3913fa0bfe7ee1fd8248b6b9f42a5af4b9d65ec2dd2c3c26132b950ecfc2"
|
||||
|
||||
[[package]]
|
||||
name = "generic-array"
|
||||
version = "0.14.7"
|
||||
@@ -2204,28 +2656,6 @@ dependencies = [
|
||||
"zeroize",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "get_if_addrs"
|
||||
version = "0.5.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "abddb55a898d32925f3148bd281174a68eeb68bbfd9a5938a57b18f506ee4ef7"
|
||||
dependencies = [
|
||||
"c_linked_list",
|
||||
"get_if_addrs-sys",
|
||||
"libc",
|
||||
"winapi 0.2.8",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "get_if_addrs-sys"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0d04f9fb746cf36b191c00f3ede8bde9c8e64f9f4b05ae2694a9ccf5e3f5ab48"
|
||||
dependencies = [
|
||||
"gcc",
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "getrandom"
|
||||
version = "0.2.17"
|
||||
@@ -2540,6 +2970,36 @@ dependencies = [
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "harmony_config"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"directories",
|
||||
"harmony_config_derive",
|
||||
"harmony_secret",
|
||||
"inquire 0.7.5",
|
||||
"interactive-parse",
|
||||
"log",
|
||||
"pretty_assertions",
|
||||
"schemars 0.8.22",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"tempfile",
|
||||
"thiserror 2.0.18",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "harmony_config_derive"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"proc-macro-crate",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.117",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "harmony_execution"
|
||||
version = "0.1.0"
|
||||
@@ -2560,7 +3020,7 @@ dependencies = [
|
||||
"harmony_types",
|
||||
"local-ip-address",
|
||||
"log",
|
||||
"mdns-sd 0.14.1 (git+https://github.com/jggc/mdns-sd.git?branch=patch-1)",
|
||||
"mdns-sd",
|
||||
"reqwest 0.12.28",
|
||||
"serde",
|
||||
"serde_json",
|
||||
@@ -2569,6 +3029,19 @@ dependencies = [
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "harmony_inventory_builder"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"cidr",
|
||||
"harmony",
|
||||
"harmony_cli",
|
||||
"harmony_macros",
|
||||
"harmony_types",
|
||||
"tokio",
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "harmony_macros"
|
||||
version = "0.1.0"
|
||||
@@ -2945,7 +3418,7 @@ dependencies = [
|
||||
"pin-project-lite",
|
||||
"tokio",
|
||||
"tower-service",
|
||||
"winapi 0.3.9",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -3732,6 +4205,15 @@ dependencies = [
|
||||
"log",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "logging"
|
||||
version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "461a8beca676e8ab1bd468c92e9b4436d6368e11e96ae038209e520cfe665e46"
|
||||
dependencies = [
|
||||
"ansi_term",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lru"
|
||||
version = "0.12.5"
|
||||
@@ -3763,36 +4245,6 @@ version = "0.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"
|
||||
|
||||
[[package]]
|
||||
name = "mdns"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"clap",
|
||||
"dmidecode",
|
||||
"env_logger",
|
||||
"futures",
|
||||
"get_if_addrs",
|
||||
"local-ip-address",
|
||||
"log",
|
||||
"mdns-sd 0.14.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "mdns-sd"
|
||||
version = "0.14.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8e0a59b04e17a195b0674198b3182931801c4759d00f36acad51b5a97210a692"
|
||||
dependencies = [
|
||||
"fastrand",
|
||||
"flume",
|
||||
"if-addrs",
|
||||
"log",
|
||||
"mio 1.1.1",
|
||||
"socket-pktinfo",
|
||||
"socket2 0.6.3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "mdns-sd"
|
||||
version = "0.14.1"
|
||||
@@ -3930,7 +4382,7 @@ version = "0.4.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c3b335231dfd352ffb0f8017f3b6027a4917f7df785ea2143d8af2adc66980ae"
|
||||
dependencies = [
|
||||
"winapi 0.3.9",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4831,7 +5283,7 @@ version = "1.12.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276"
|
||||
dependencies = [
|
||||
"aho-corasick 1.1.4",
|
||||
"aho-corasick",
|
||||
"memchr",
|
||||
"regex-automata",
|
||||
"regex-syntax",
|
||||
@@ -4843,7 +5295,7 @@ version = "0.4.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f"
|
||||
dependencies = [
|
||||
"aho-corasick 1.1.4",
|
||||
"aho-corasick",
|
||||
"memchr",
|
||||
"regex-syntax",
|
||||
]
|
||||
@@ -4954,6 +5406,21 @@ dependencies = [
|
||||
"subtle",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rhob-application-monitoring"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"base64 0.22.1",
|
||||
"env_logger",
|
||||
"harmony",
|
||||
"harmony_cli",
|
||||
"harmony_macros",
|
||||
"harmony_types",
|
||||
"log",
|
||||
"tokio",
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ring"
|
||||
version = "0.17.14"
|
||||
@@ -5039,7 +5506,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fadd2c0ab350e21c66556f94ee06f766d8bdae3213857ba7610bfd8e10e51880"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"winapi 0.3.9",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -6208,6 +6675,26 @@ dependencies = [
|
||||
"syn 2.0.117",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "sttest"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"brocade",
|
||||
"cidr",
|
||||
"env_logger",
|
||||
"harmony",
|
||||
"harmony_cli",
|
||||
"harmony_macros",
|
||||
"harmony_secret",
|
||||
"harmony_secret_derive",
|
||||
"harmony_types",
|
||||
"log",
|
||||
"schemars 0.8.22",
|
||||
"serde",
|
||||
"tokio",
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "subtle"
|
||||
version = "2.6.1"
|
||||
@@ -7220,12 +7707,6 @@ dependencies = [
|
||||
"wasite",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "winapi"
|
||||
version = "0.2.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "167dc9d6949a9b857f3451275e911c3f44255842c1f7a76f33c55103a909087a"
|
||||
|
||||
[[package]]
|
||||
name = "winapi"
|
||||
version = "0.3.9"
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
[workspace]
|
||||
resolver = "2"
|
||||
members = [
|
||||
"examples/*",
|
||||
"private_repos/*",
|
||||
"harmony",
|
||||
"harmony_types",
|
||||
@@ -15,10 +16,13 @@ members = [
|
||||
"harmony_inventory_agent",
|
||||
"harmony_secret_derive",
|
||||
"harmony_secret",
|
||||
"adr/agent_discovery/mdns",
|
||||
"harmony_config_derive",
|
||||
"harmony_config",
|
||||
"brocade",
|
||||
"harmony_agent",
|
||||
"harmony_agent/deploy", "harmony_node_readiness", "harmony-k8s",
|
||||
"harmony_agent/deploy",
|
||||
"harmony_node_readiness",
|
||||
"harmony-k8s",
|
||||
]
|
||||
|
||||
[workspace.package]
|
||||
|
||||
272
README.md
272
README.md
@@ -1,101 +1,121 @@
|
||||
# Harmony
|
||||
|
||||
Open-source infrastructure orchestration that treats your platform like first-class code.
|
||||
**Infrastructure orchestration that treats your platform like first-class code.**
|
||||
|
||||
In other words, Harmony is a **next-generation platform engineering framework**.
|
||||
Harmony is an open-source framework that brings the rigor of software engineering to infrastructure management. Write Rust code to define what you want, and Harmony handles the rest — from local development to production clusters.
|
||||
|
||||
_By [NationTech](https://nationtech.io)_
|
||||
|
||||
[](https://git.nationtech.io/nationtech/harmony)
|
||||
[](https://git.nationtech.io/NationTech/harmony)
|
||||
[](LICENSE)
|
||||
|
||||
### Unify
|
||||
---
|
||||
|
||||
- **Project Scaffolding**
|
||||
- **Infrastructure Provisioning**
|
||||
- **Application Deployment**
|
||||
- **Day-2 operations**
|
||||
## The Problem Harmony Solves
|
||||
|
||||
All in **one strongly-typed Rust codebase**.
|
||||
Modern infrastructure is messy. Your Kubernetes cluster needs monitoring. Your bare-metal servers need provisioning. Your applications need deployments. Each comes with its own tooling, its own configuration format, and its own failure modes.
|
||||
|
||||
### Deploy anywhere
|
||||
**What if you could describe your entire platform in one consistent language?**
|
||||
|
||||
From a **developer laptop** to a **global production cluster**, a single **source of truth** drives the **full software lifecycle.**
|
||||
That's Harmony. It unifies project scaffolding, infrastructure provisioning, application deployment, and day-2 operations into a single strongly-typed Rust codebase.
|
||||
|
||||
## The Harmony Philosophy
|
||||
---
|
||||
|
||||
Infrastructure is essential, but it shouldn’t be your core business. Harmony is built on three guiding principles that make modern platforms reliable, repeatable, and easy to reason about.
|
||||
## Three Principles That Make the Difference
|
||||
|
||||
| Principle | What it means for you |
|
||||
| -------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| **Infrastructure as Resilient Code** | Replace sprawling YAML and bash scripts with type-safe Rust. Test, refactor, and version your platform just like application code. |
|
||||
| **Prove It Works — Before You Deploy** | Harmony uses the compiler to verify that your application’s needs match the target environment’s capabilities at **compile-time**, eliminating an entire class of runtime outages. |
|
||||
| **One Unified Model** | Software and infrastructure are a single system. Harmony models them together, enabling deep automation—from bare-metal servers to Kubernetes workloads—with zero context switching. |
|
||||
| Principle | What It Means |
|
||||
|-----------|---------------|
|
||||
| **Infrastructure as Resilient Code** | Stop fighting with YAML and bash. Write type-safe Rust that you can test, version, and refactor like any other code. |
|
||||
| **Prove It Works Before You Deploy** | Harmony verifies at _compile time_ that your application can actually run on your target infrastructure. No more "the config looks right but it doesn't work" surprises. |
|
||||
| **One Unified Model** | Software and infrastructure are one system. Deploy from laptop to production cluster without switching contexts or tools. |
|
||||
|
||||
These principles surface as simple, ergonomic Rust APIs that let teams focus on their product while trusting the platform underneath.
|
||||
---
|
||||
|
||||
## Where to Start
|
||||
## How It Works: The Core Concepts
|
||||
|
||||
We have a comprehensive set of documentation right here in the repository.
|
||||
Harmony is built around three concepts that work together:
|
||||
|
||||
| I want to... | Start Here |
|
||||
| ----------------- | ------------------------------------------------------------------ |
|
||||
| Get Started | [Getting Started Guide](./docs/guides/getting-started.md) |
|
||||
| See an Example | [Use Case: Deploy a Rust Web App](./docs/use-cases/rust-webapp.md) |
|
||||
| Explore | [Documentation Hub](./docs/README.md) |
|
||||
| See Core Concepts | [Core Concepts Explained](./docs/concepts.md) |
|
||||
### Score — "What You Want"
|
||||
|
||||
## Quick Look: Deploy a Rust Webapp
|
||||
A `Score` is a declarative description of desired state. Think of it as a "recipe" that says _what_ you want without specifying _how_ to get there.
|
||||
|
||||
The snippet below spins up a complete **production-grade Rust + Leptos Webapp** with monitoring. Swap it for your own scores to deploy anything from microservices to machine-learning pipelines.
|
||||
```rust
|
||||
// "I want a PostgreSQL cluster running with default settings"
|
||||
let postgres = PostgreSQLScore {
|
||||
config: PostgreSQLConfig {
|
||||
cluster_name: "harmony-postgres-example".to_string(),
|
||||
namespace: "harmony-postgres-example".to_string(),
|
||||
..Default::default()
|
||||
},
|
||||
};
|
||||
```
|
||||
|
||||
### Topology — "Where It Goes"
|
||||
|
||||
A `Topology` represents your infrastructure environment and its capabilities. It answers the question: "What can this environment actually do?"
|
||||
|
||||
```rust
|
||||
// Deploy to a local K3D cluster, or any Kubernetes cluster via environment variables
|
||||
K8sAnywhereTopology::from_env()
|
||||
```
|
||||
|
||||
### Interpret — "How It Happens"
|
||||
|
||||
An `Interpret` is the execution logic that connects your `Score` to your `Topology`. It translates "what you want" into "what the infrastructure does."
|
||||
|
||||
**The Compile-Time Check:** Before your code ever runs, Harmony verifies that your `Score` is compatible with your `Topology`. If your application needs a feature your infrastructure doesn't provide, you get a compile error — not a runtime failure.
|
||||
|
||||
---
|
||||
|
||||
## What You Can Deploy
|
||||
|
||||
Harmony ships with ready-made Scores for:
|
||||
|
||||
**Data Services**
|
||||
- PostgreSQL clusters (via CloudNativePG operator)
|
||||
- Multi-site PostgreSQL with failover
|
||||
|
||||
**Kubernetes**
|
||||
- Namespaces, Deployments, Ingress
|
||||
- Helm charts
|
||||
- cert-manager for TLS
|
||||
- Monitoring (Prometheus, alerting, ntfy)
|
||||
|
||||
**Bare Metal / Infrastructure**
|
||||
- OKD clusters from scratch
|
||||
- OPNsense firewalls
|
||||
- Network services (DNS, DHCP, TFTP)
|
||||
- Brocade switch configuration
|
||||
|
||||
**And more:** Application deployment, tenant management, load balancing, and more.
|
||||
|
||||
---
|
||||
|
||||
## Quick Start: Deploy a PostgreSQL Cluster
|
||||
|
||||
This example provisions a local Kubernetes cluster (K3D) and deploys a PostgreSQL cluster on it — no external infrastructure required.
|
||||
|
||||
```rust
|
||||
use harmony::{
|
||||
inventory::Inventory,
|
||||
modules::{
|
||||
application::{
|
||||
ApplicationScore, RustWebFramework, RustWebapp,
|
||||
features::{PackagingDeployment, rhob_monitoring::Monitoring},
|
||||
},
|
||||
monitoring::alert_channel::discord_alert_channel::DiscordWebhook,
|
||||
},
|
||||
modules::postgresql::{PostgreSQLScore, capability::PostgreSQLConfig},
|
||||
topology::K8sAnywhereTopology,
|
||||
};
|
||||
use harmony_macros::hurl;
|
||||
use std::{path::PathBuf, sync::Arc};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() {
|
||||
let application = Arc::new(RustWebapp {
|
||||
name: "harmony-example-leptos".to_string(),
|
||||
project_root: PathBuf::from(".."), // <== Your project root, usually .. if you use the standard `/harmony` folder
|
||||
framework: Some(RustWebFramework::Leptos),
|
||||
service_port: 8080,
|
||||
});
|
||||
|
||||
// Define your Application deployment and the features you want
|
||||
let app = ApplicationScore {
|
||||
features: vec![
|
||||
Box::new(PackagingDeployment {
|
||||
application: application.clone(),
|
||||
}),
|
||||
Box::new(Monitoring {
|
||||
application: application.clone(),
|
||||
alert_receiver: vec![
|
||||
Box::new(DiscordWebhook {
|
||||
name: "test-discord".to_string(),
|
||||
url: hurl!("https://discord.doesnt.exist.com"), // <== Get your discord webhook url
|
||||
}),
|
||||
],
|
||||
}),
|
||||
],
|
||||
application,
|
||||
let postgres = PostgreSQLScore {
|
||||
config: PostgreSQLConfig {
|
||||
cluster_name: "harmony-postgres-example".to_string(),
|
||||
namespace: "harmony-postgres-example".to_string(),
|
||||
..Default::default()
|
||||
},
|
||||
};
|
||||
|
||||
harmony_cli::run(
|
||||
Inventory::autoload(),
|
||||
K8sAnywhereTopology::from_env(), // <== Deploy to local automatically provisioned local k3d by default or connect to any kubernetes cluster
|
||||
vec![Box::new(app)],
|
||||
K8sAnywhereTopology::from_env(),
|
||||
vec![Box::new(postgres)],
|
||||
None,
|
||||
)
|
||||
.await
|
||||
@@ -103,40 +123,128 @@ async fn main() {
|
||||
}
|
||||
```
|
||||
|
||||
To run this:
|
||||
### What this actually does
|
||||
|
||||
- Clone the repository: `git clone https://git.nationtech.io/nationtech/harmony`
|
||||
- Install dependencies: `cargo build --release`
|
||||
- Run the example: `cargo run --example try_rust_webapp`
|
||||
When you compile and run this program:
|
||||
|
||||
1. **Compiles** the Harmony Score into an executable
|
||||
2. **Connects** to `K8sAnywhereTopology` — which auto-provisions a local K3D cluster if none exists
|
||||
3. **Installs** the CloudNativePG operator into the cluster (one-time setup)
|
||||
4. **Creates** a PostgreSQL cluster with 1 instance and 1 GiB of storage
|
||||
5. **Exposes** the PostgreSQL instance as a Kubernetes Service
|
||||
|
||||
### Prerequisites
|
||||
|
||||
- [Rust](https://rust-lang.org/tools/install) (edition 2024)
|
||||
- [Docker](https://docs.docker.com/get-docker/) (for the local K3D cluster)
|
||||
- [kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl/) (optional, for inspecting the cluster)
|
||||
|
||||
### Run it
|
||||
|
||||
```bash
|
||||
# Clone the repository
|
||||
git clone https://git.nationtech.io/nationtech/harmony
|
||||
cd harmony
|
||||
|
||||
# Build the project
|
||||
cargo build --release
|
||||
|
||||
# Run the example
|
||||
cargo run -p example-postgresql
|
||||
```
|
||||
|
||||
Harmony will print its progress as it sets up the cluster and deploys PostgreSQL. When complete, you can inspect the deployment:
|
||||
|
||||
```bash
|
||||
kubectl get pods -n harmony-postgres-example
|
||||
kubectl get secret -n harmony-postgres-example harmony-postgres-example-db-user -o jsonpath='{.data.password}' | base64 -d
|
||||
```
|
||||
|
||||
To connect to the database, forward the port:
|
||||
```bash
|
||||
kubectl port-forward -n harmony-postgres-example svc/harmony-postgres-example-rw 5432:5432
|
||||
psql -h localhost -p 5432 -U postgres
|
||||
```
|
||||
|
||||
To clean up, delete the K3D cluster:
|
||||
```bash
|
||||
k3d cluster delete harmony-postgres-example
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Environment Variables
|
||||
|
||||
`K8sAnywhereTopology::from_env()` reads the following environment variables to determine where and how to connect:
|
||||
|
||||
| Variable | Default | Description |
|
||||
|----------|---------|-------------|
|
||||
| `KUBECONFIG` | `~/.kube/config` | Path to your kubeconfig file |
|
||||
| `HARMONY_AUTOINSTALL` | `true` | Auto-provision a local K3D cluster if none found |
|
||||
| `HARMONY_USE_LOCAL_K3D` | `true` | Always prefer local K3D over remote clusters |
|
||||
| `HARMONY_PROFILE` | `dev` | Deployment profile: `dev`, `staging`, or `prod` |
|
||||
| `HARMONY_K8S_CONTEXT` | _none_ | Use a specific kubeconfig context |
|
||||
| `HARMONY_PUBLIC_DOMAIN` | _none_ | Public domain for ingress endpoints |
|
||||
|
||||
To connect to an existing Kubernetes cluster instead of provisioning K3D:
|
||||
|
||||
```bash
|
||||
# Point to your kubeconfig
|
||||
export KUBECONFIG=/path/to/your/kubeconfig
|
||||
export HARMONY_USE_LOCAL_K3D=false
|
||||
export HARMONY_AUTOINSTALL=false
|
||||
|
||||
# Then run
|
||||
cargo run -p example-postgresql
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Documentation
|
||||
|
||||
All documentation is in the `/docs` directory.
|
||||
| I want to... | Start here |
|
||||
|--------------|------------|
|
||||
| Understand the core concepts | [Core Concepts](./docs/concepts.md) |
|
||||
| Deploy my first application | [Getting Started Guide](./docs/guides/getting-started.md) |
|
||||
| Explore available components | [Scores Catalog](./docs/catalogs/scores.md) · [Topologies Catalog](./docs/catalogs/topologies.md) |
|
||||
| See a complete bare-metal deployment | [OKD on Bare Metal](./docs/use-cases/okd-on-bare-metal.md) |
|
||||
| Build my own Score or Topology | [Developer Guide](./docs/guides/developer-guide.md) |
|
||||
|
||||
- [Documentation Hub](./docs/README.md): The main entry point for all documentation.
|
||||
- [Core Concepts](./docs/concepts.md): A detailed look at Score, Topology, Capability, Inventory, and Interpret.
|
||||
- [Component Catalogs](./docs/catalogs/README.md): Discover all available Scores, Topologies, and Capabilities.
|
||||
- [Developer Guide](./docs/guides/developer-guide.md): Learn how to write your own Scores and Topologies.
|
||||
---
|
||||
|
||||
## Architectural Decision Records
|
||||
## Why Rust?
|
||||
|
||||
- [ADR-001 · Why Rust](adr/001-rust.md)
|
||||
- [ADR-003 · Infrastructure Abstractions](adr/003-infrastructure-abstractions.md)
|
||||
- [ADR-006 · Secret Management](adr/006-secret-management.md)
|
||||
- [ADR-011 · Multi-Tenant Cluster](adr/011-multi-tenant-cluster.md)
|
||||
We chose Rust for the same reason you might: **reliability through type safety**.
|
||||
|
||||
## Contribute
|
||||
Infrastructure code runs in production. It needs to be correct. Rust's ownership model and type system let us build a framework where:
|
||||
|
||||
Discussions and roadmap live in [Issues](https://git.nationtech.io/nationtech/harmony/-/issues). PRs, ideas, and feedback are welcome!
|
||||
- Invalid configurations fail at compile time, not at 3 AM
|
||||
- Refactoring infrastructure is as safe as refactoring application code
|
||||
- The compiler verifies that your platform can actually fulfill your requirements
|
||||
|
||||
See [ADR-001 · Why Rust](./adr/001-rust.md) for our full rationale.
|
||||
|
||||
---
|
||||
|
||||
## Architecture Decisions
|
||||
|
||||
Harmony's design is documented through Architecture Decision Records (ADRs):
|
||||
|
||||
- [ADR-001 · Why Rust](./adr/001-rust.md)
|
||||
- [ADR-003 · Infrastructure Abstractions](./adr/003-infrastructure-abstractions.md)
|
||||
- [ADR-006 · Secret Management](./adr/006-secret-management.md)
|
||||
- [ADR-011 · Multi-Tenant Cluster](./adr/011-multi-tenant-cluster.md)
|
||||
|
||||
---
|
||||
|
||||
## License
|
||||
|
||||
Harmony is released under the **GNU AGPL v3**.
|
||||
|
||||
> We choose a strong copyleft license to ensure the project—and every improvement to it—remains open and benefits the entire community. Fork it, enhance it, even out-innovate us; just keep it open.
|
||||
> We choose a strong copyleft license to ensure the project—and every improvement to it—remains open and benefits the entire community.
|
||||
|
||||
See [LICENSE](LICENSE) for the full text.
|
||||
|
||||
---
|
||||
|
||||
_Made with ❤️ & 🦀 by the NationTech and the Harmony community_
|
||||
_Made with ❤️ & 🦀 by NationTech and the Harmony community_
|
||||
|
||||
9
book.toml
Normal file
9
book.toml
Normal file
@@ -0,0 +1,9 @@
|
||||
[book]
|
||||
title = "Harmony"
|
||||
description = "Infrastructure orchestration that treats your platform like first-class code"
|
||||
src = "docs"
|
||||
build-dir = "book"
|
||||
authors = ["NationTech"]
|
||||
|
||||
[output.html]
|
||||
mathjax-support = false
|
||||
11
build/book.sh
Executable file
11
build/book.sh
Executable file
@@ -0,0 +1,11 @@
|
||||
#!/bin/sh
|
||||
set -e
|
||||
|
||||
cd "$(dirname "$0")/.."
|
||||
|
||||
cargo install mdbook --locked
|
||||
mdbook build
|
||||
|
||||
test -f book/index.html || (echo "ERROR: book/index.html not found" && exit 1)
|
||||
test -f book/concepts.html || (echo "ERROR: book/concepts.html not found" && exit 1)
|
||||
test -f book/guides/getting-started.html || (echo "ERROR: book/guides/getting-started.html not found" && exit 1)
|
||||
@@ -1,6 +1,8 @@
|
||||
#!/bin/sh
|
||||
set -e
|
||||
|
||||
cd "$(dirname "$0")/.."
|
||||
|
||||
rustc --version
|
||||
cargo check --all-targets --all-features --keep-going
|
||||
cargo fmt --check
|
||||
16
build/ci.sh
Executable file
16
build/ci.sh
Executable file
@@ -0,0 +1,16 @@
|
||||
#!/bin/sh
|
||||
set -e
|
||||
|
||||
cd "$(dirname "$0")/.."
|
||||
|
||||
BRANCH="${1:-main}"
|
||||
|
||||
echo "=== Running CI for branch: $BRANCH ==="
|
||||
|
||||
echo "--- Checking code ---"
|
||||
./build/check.sh
|
||||
|
||||
echo "--- Building book ---"
|
||||
./build/book.sh
|
||||
|
||||
echo "=== CI passed ==="
|
||||
@@ -13,8 +13,8 @@ If you're new to Harmony, start here:
|
||||
|
||||
See how to use Harmony to solve real-world problems.
|
||||
|
||||
- [**PostgreSQL on Local K3D**](./use-cases/postgresql-on-local-k3d.md): Deploy a production-grade PostgreSQL cluster on a local K3D cluster. The fastest way to get started.
|
||||
- [**OKD on Bare Metal**](./use-cases/okd-on-bare-metal.md): A detailed walkthrough of bootstrapping a high-availability OKD cluster from physical hardware.
|
||||
- [**Deploy a Rust Web App**](./use-cases/deploy-rust-webapp.md): A quick guide to deploying a monitored, containerized web application to a Kubernetes cluster.
|
||||
|
||||
## 3. Component Catalogs
|
||||
|
||||
@@ -31,3 +31,7 @@ Ready to build your own components? These guides show you how.
|
||||
- [**Writing a Score**](./guides/writing-a-score.md): Learn how to create your own `Score` and `Interpret` logic to define a new desired state.
|
||||
- [**Writing a Topology**](./guides/writing-a-topology.md): Learn how to model a new environment (like AWS, GCP, or custom hardware) as a `Topology`.
|
||||
- [**Adding Capabilities**](./guides/adding-capabilities.md): See how to add a `Capability` to your custom `Topology`.
|
||||
|
||||
## 5. Architecture Decision Records
|
||||
|
||||
Harmony's design is documented through Architecture Decision Records (ADRs). See the [ADR Overview](./adr/README.md) for a complete index of all decisions.
|
||||
|
||||
53
docs/SUMMARY.md
Normal file
53
docs/SUMMARY.md
Normal file
@@ -0,0 +1,53 @@
|
||||
# Summary
|
||||
|
||||
[Harmony Documentation](./README.md)
|
||||
|
||||
- [Core Concepts](./concepts.md)
|
||||
- [Getting Started Guide](./guides/getting-started.md)
|
||||
|
||||
## Use Cases
|
||||
|
||||
- [PostgreSQL on Local K3D](./use-cases/postgresql-on-local-k3d.md)
|
||||
- [OKD on Bare Metal](./use-cases/okd-on-bare-metal.md)
|
||||
|
||||
## Component Catalogs
|
||||
|
||||
- [Scores Catalog](./catalogs/scores.md)
|
||||
- [Topologies Catalog](./catalogs/topologies.md)
|
||||
- [Capabilities Catalog](./catalogs/capabilities.md)
|
||||
|
||||
## Developer Guides
|
||||
|
||||
- [Developer Guide](./guides/developer-guide.md)
|
||||
- [Writing a Score](./guides/writing-a-score.md)
|
||||
- [Writing a Topology](./guides/writing-a-topology.md)
|
||||
- [Adding Capabilities](./guides/adding-capabilities.md)
|
||||
|
||||
## Configuration
|
||||
|
||||
- [Configuration](./concepts/configuration.md)
|
||||
|
||||
## Architecture Decision Records
|
||||
|
||||
- [ADR Overview](./adr/README.md)
|
||||
- [000 · ADR Template](./adr/000-ADR-Template.md)
|
||||
- [001 · Why Rust](./adr/001-rust.md)
|
||||
- [002 · Hexagonal Architecture](./adr/002-hexagonal-architecture.md)
|
||||
- [003 · Infrastructure Abstractions](./adr/003-infrastructure-abstractions.md)
|
||||
- [004 · iPXE](./adr/004-ipxe.md)
|
||||
- [005 · Interactive Project](./adr/005-interactive-project.md)
|
||||
- [006 · Secret Management](./adr/006-secret-management.md)
|
||||
- [007 · Default Runtime](./adr/007-default-runtime.md)
|
||||
- [008 · Score Display Formatting](./adr/008-score-display-formatting.md)
|
||||
- [009 · Helm and Kustomize Handling](./adr/009-helm-and-kustomize-handling.md)
|
||||
- [010 · Monitoring and Alerting](./adr/010-monitoring-and-alerting.md)
|
||||
- [011 · Multi-Tenant Cluster](./adr/011-multi-tenant-cluster.md)
|
||||
- [012 · Project Delivery Automation](./adr/012-project-delivery-automation.md)
|
||||
- [013 · Monitoring Notifications](./adr/013-monitoring-notifications.md)
|
||||
- [015 · Higher Order Topologies](./adr/015-higher-order-topologies.md)
|
||||
- [016 · Harmony Agent and Global Mesh](./adr/016-Harmony-Agent-And-Global-Mesh-For-Decentralized-Workload-Management.md)
|
||||
- [017-1 · NATS Clusters Interconnection](./adr/017-1-Nats-Clusters-Interconnection-Topology.md)
|
||||
- [018 · Template Hydration for Workload Deployment](./adr/018-Template-Hydration-For-Workload-Deployment.md)
|
||||
- [019 · Network Bond Setup](./adr/019-Network-bond-setup.md)
|
||||
- [020 · Interactive Configuration Crate](./adr/020-interactive-configuration-crate.md)
|
||||
- [020-1 · Zitadel + OpenBao Secure Config Store](./adr/020-1-zitadel-openbao-secure-config-store.md)
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
## Status
|
||||
|
||||
Proposed
|
||||
Rejected : See ADR 020 ./020-interactive-configuration-crate.md
|
||||
|
||||
### TODO [#3](https://git.nationtech.io/NationTech/harmony/issues/3):
|
||||
|
||||
233
docs/adr/020-1-zitadel-openbao-secure-config-store.md
Normal file
233
docs/adr/020-1-zitadel-openbao-secure-config-store.md
Normal file
@@ -0,0 +1,233 @@
|
||||
# ADR 020-1: Zitadel OIDC and OpenBao Integration for the Config Store
|
||||
|
||||
Author: Jean-Gabriel Gill-Couture
|
||||
|
||||
Date: 2026-03-18
|
||||
|
||||
## Status
|
||||
|
||||
Proposed
|
||||
|
||||
## Context
|
||||
|
||||
ADR 020 defines a unified `harmony_config` crate with a `ConfigStore` trait. The default team-oriented backend is OpenBao, which provides encrypted storage, versioned KV, audit logging, and fine-grained access control.
|
||||
|
||||
OpenBao requires authentication. The question is how developers authenticate without introducing new credentials to manage.
|
||||
|
||||
The goals are:
|
||||
|
||||
- **Zero new credentials.** Developers log in with their existing corporate identity (Google Workspace, GitHub, or Microsoft Entra ID / Azure AD).
|
||||
- **Headless compatibility.** The flow must work over SSH, inside containers, and in CI — environments with no browser or localhost listener.
|
||||
- **Minimal friction.** After a one-time login, authentication should be invisible for weeks of active use.
|
||||
- **Centralized offboarding.** Revoking a user in the identity provider must immediately revoke their access to the config store.
|
||||
|
||||
## Decision
|
||||
|
||||
Developers authenticate to OpenBao through a two-step process: first, they obtain an OIDC token from Zitadel (`sso.nationtech.io`) using the OAuth 2.0 Device Authorization Grant (RFC 8628); then, they exchange that token for a short-lived OpenBao client token via OpenBao's JWT auth method.
|
||||
|
||||
### The authentication flow
|
||||
|
||||
#### Step 1: Trigger
|
||||
|
||||
The `ConfigManager` attempts to resolve a value via the `StoreSource`. The `StoreSource` checks for a cached OpenBao token in `~/.local/share/harmony/session.json`. If the token is missing or expired, authentication begins.
|
||||
|
||||
#### Step 2: Device Authorization Request
|
||||
|
||||
Harmony sends a `POST` to Zitadel's device authorization endpoint:
|
||||
|
||||
```
|
||||
POST https://sso.nationtech.io/oauth/v2/device_authorization
|
||||
Content-Type: application/x-www-form-urlencoded
|
||||
|
||||
client_id=<harmony_client_id>&scope=openid email profile offline_access
|
||||
```
|
||||
|
||||
Zitadel responds with:
|
||||
|
||||
```json
|
||||
{
|
||||
"device_code": "dOcbPeysDhT26ZatRh9n7Q",
|
||||
"user_code": "GQWC-FWFK",
|
||||
"verification_uri": "https://sso.nationtech.io/device",
|
||||
"verification_uri_complete": "https://sso.nationtech.io/device?user_code=GQWC-FWFK",
|
||||
"expires_in": 300,
|
||||
"interval": 5
|
||||
}
|
||||
```
|
||||
|
||||
#### Step 3: User prompt
|
||||
|
||||
Harmony prints the code and URL to the terminal:
|
||||
|
||||
```
|
||||
[Harmony] To authenticate, open your browser to:
|
||||
https://sso.nationtech.io/device
|
||||
and enter code: GQWC-FWFK
|
||||
|
||||
Or visit: https://sso.nationtech.io/device?user_code=GQWC-FWFK
|
||||
```
|
||||
|
||||
If a desktop environment is detected, Harmony also calls `open` / `xdg-open` to launch the browser automatically. The `verification_uri_complete` URL pre-fills the code, so the user only needs to click "Confirm" after logging in.
|
||||
|
||||
There is no localhost HTTP listener. The CLI does not need to bind a port or receive a callback. This is what makes the device flow work over SSH, in containers, and through corporate firewalls — unlike the `oc login` approach which spins up a temporary web server to catch a redirect.
|
||||
|
||||
#### Step 4: User login
|
||||
|
||||
The developer logs in through Zitadel's web UI using one of the configured identity providers:
|
||||
|
||||
- **Google Workspace** — for teams using Google as their corporate identity.
|
||||
- **GitHub** — for open-source or GitHub-centric teams.
|
||||
- **Microsoft Entra ID (Azure AD)** — for enterprise clients, particularly common in Quebec and the broader Canadian public sector.
|
||||
|
||||
Zitadel federates the login to the chosen provider. The developer authenticates with their existing corporate credentials. No new password is created.
|
||||
|
||||
#### Step 5: Polling
|
||||
|
||||
While the user is authenticating in the browser, Harmony polls Zitadel's token endpoint at the interval specified in the device authorization response (typically 5 seconds):
|
||||
|
||||
```
|
||||
POST https://sso.nationtech.io/oauth/v2/token
|
||||
Content-Type: application/x-www-form-urlencoded
|
||||
|
||||
grant_type=urn:ietf:params:oauth:grant-type:device_code
|
||||
&device_code=dOcbPeysDhT26ZatRh9n7Q
|
||||
&client_id=<harmony_client_id>
|
||||
```
|
||||
|
||||
Before the user completes login, Zitadel responds with `authorization_pending`. Once the user consents, Zitadel returns:
|
||||
|
||||
```json
|
||||
{
|
||||
"access_token": "...",
|
||||
"token_type": "Bearer",
|
||||
"expires_in": 3600,
|
||||
"refresh_token": "...",
|
||||
"id_token": "eyJhbGciOiJSUzI1NiIs..."
|
||||
}
|
||||
```
|
||||
|
||||
The `scope=offline_access` in the initial request is what causes Zitadel to issue a `refresh_token`.
|
||||
|
||||
#### Step 6: OpenBao JWT exchange
|
||||
|
||||
Harmony sends the `id_token` (a JWT signed by Zitadel) to OpenBao's JWT auth method:
|
||||
|
||||
```
|
||||
POST https://secrets.nationtech.io/v1/auth/jwt/login
|
||||
Content-Type: application/json
|
||||
|
||||
{
|
||||
"role": "harmony-developer",
|
||||
"jwt": "eyJhbGciOiJSUzI1NiIs..."
|
||||
}
|
||||
```
|
||||
|
||||
OpenBao validates the JWT:
|
||||
|
||||
1. It fetches Zitadel's public keys from `https://sso.nationtech.io/oauth/v2/keys` (the JWKS endpoint).
|
||||
2. It verifies the JWT signature.
|
||||
3. It reads the claims (`email`, `groups`, and any custom claims mapped from the upstream identity provider, such as Azure AD tenant or Google Workspace org).
|
||||
4. It evaluates the claims against the `bound_claims` and `bound_audiences` configured on the `harmony-developer` role.
|
||||
5. If validation passes, OpenBao returns a client token:
|
||||
|
||||
```json
|
||||
{
|
||||
"auth": {
|
||||
"client_token": "hvs.CAES...",
|
||||
"policies": ["harmony-dev"],
|
||||
"metadata": { "role": "harmony-developer" },
|
||||
"lease_duration": 14400,
|
||||
"renewable": true
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Harmony caches the OpenBao token, the OIDC refresh token, and the token expiry timestamps to `~/.local/share/harmony/session.json` with `0600` file permissions.
|
||||
|
||||
### OpenBao storage structure
|
||||
|
||||
All configuration and secret state is stored in an OpenBao Versioned KV v2 engine.
|
||||
|
||||
Path taxonomy:
|
||||
|
||||
```
|
||||
harmony/<organization>/<project>/<environment>/<key>
|
||||
```
|
||||
|
||||
Examples:
|
||||
|
||||
```
|
||||
harmony/nationtech/my-app/staging/PostgresConfig
|
||||
harmony/nationtech/my-app/production/PostgresConfig
|
||||
harmony/nationtech/my-app/local-shared/PostgresConfig
|
||||
```
|
||||
|
||||
The `ConfigClass` (Standard vs. Secret) can influence OpenBao policy structure — for example, `Secret`-class paths could require stricter ACLs or additional audit backends — but the path taxonomy itself does not change. This is an operational concern configured in OpenBao policies, not a structural one enforced by path naming.
|
||||
|
||||
### Token lifecycle and silent refresh
|
||||
|
||||
The system manages three tokens with different lifetimes:
|
||||
|
||||
| Token | TTL | Max TTL | Purpose |
|
||||
|---|---|---|---|
|
||||
| OpenBao client token | 4 hours | 24 hours | Read/write config store |
|
||||
| OIDC ID token | 1 hour | — | Exchange for OpenBao token |
|
||||
| OIDC refresh token | 90 days absolute, 30 days inactivity | — | Obtain new ID tokens silently |
|
||||
|
||||
The refresh flow, from the developer's perspective:
|
||||
|
||||
1. **Same session (< 4 hours since last use).** The cached OpenBao token is still valid. No network call to Zitadel. Fastest path.
|
||||
2. **Next day (OpenBao token expired, refresh token valid).** Harmony uses the OIDC `refresh_token` to request a new `id_token` from Zitadel's token endpoint (`grant_type=refresh_token`). It then exchanges the new `id_token` for a fresh OpenBao token. This happens silently. The developer sees no prompt.
|
||||
3. **OpenBao token near max TTL (approaching 24 hours of cumulative renewals).** Instead of renewing, Harmony re-authenticates using the refresh token to get a completely fresh OpenBao token. Transparent to the user.
|
||||
4. **After 30 days of inactivity.** The OIDC refresh token expires. Harmony falls back to the device flow (Step 2 above) and prompts the user to re-authenticate in the browser. This is the only scenario where a returning developer sees a login prompt.
|
||||
5. **User offboarded.** An administrator revokes the user's account or group membership in Zitadel. The next time the refresh token is used, Zitadel rejects it. The device flow also fails because the user can no longer authenticate. Access is terminated without any action needed on the OpenBao side.
|
||||
|
||||
OpenBao token renewal uses the `/auth/token/renew-self` endpoint with the `X-Vault-Token` header. Harmony renews proactively at ~75% of the TTL to avoid race conditions.
|
||||
|
||||
### OpenBao role configuration
|
||||
|
||||
The OpenBao JWT auth role for Harmony developers:
|
||||
|
||||
```bash
|
||||
bao write auth/jwt/config \
|
||||
oidc_discovery_url="https://sso.nationtech.io" \
|
||||
bound_issuer="https://sso.nationtech.io"
|
||||
|
||||
bao write auth/jwt/role/harmony-developer \
|
||||
role_type="jwt" \
|
||||
bound_audiences="<harmony_client_id>" \
|
||||
user_claim="email" \
|
||||
groups_claim="urn:zitadel:iam:org:project:roles" \
|
||||
policies="harmony-dev" \
|
||||
ttl="4h" \
|
||||
max_ttl="24h" \
|
||||
token_type="service"
|
||||
```
|
||||
|
||||
The `bound_audiences` claim ties the role to the specific Harmony Zitadel application. The `groups_claim` allows mapping Zitadel project roles to OpenBao policies for per-team or per-project access control.
|
||||
|
||||
### Self-hosted deployments
|
||||
|
||||
For organizations running their own infrastructure, the same architecture applies. The operator deploys Zitadel and OpenBao using Harmony's existing `ZitadelScore` and `OpenbaoScore`. The only configuration needed is three environment variables (or their equivalents in the bootstrap config):
|
||||
|
||||
- `HARMONY_SSO_URL` — the Zitadel instance URL.
|
||||
- `HARMONY_SECRETS_URL` — the OpenBao instance URL.
|
||||
- `HARMONY_SSO_CLIENT_ID` — the Zitadel application client ID.
|
||||
|
||||
None of these are secrets. They can be committed to an infrastructure repository or distributed via any convenient channel.
|
||||
|
||||
## Consequences
|
||||
|
||||
### Positive
|
||||
|
||||
- Developers authenticate with existing corporate credentials. No new passwords, no static tokens to distribute.
|
||||
- The device flow works in every environment: local terminal, SSH, containers, CI runners, corporate VPNs.
|
||||
- Silent token refresh keeps developers authenticated for weeks without any manual intervention.
|
||||
- User offboarding is a single action in Zitadel. No OpenBao token rotation or manual revocation required.
|
||||
- Azure AD / Microsoft Entra ID support addresses the enterprise and public sector market.
|
||||
|
||||
### Negative
|
||||
|
||||
- The OAuth state machine (device code polling, token refresh, error handling) adds implementation complexity compared to a static token approach.
|
||||
- Developers must have network access to `sso.nationtech.io` and `secrets.nationtech.io` to pull or push configuration state. True offline work falls back to the local file store, which does not sync with the team.
|
||||
- The first login per machine requires a browser interaction. Fully headless first-run scenarios (e.g., a fresh CI runner with no pre-seeded tokens) must use `EnvSource` overrides or a service account JWT.
|
||||
177
docs/adr/020-interactive-configuration-crate.md
Normal file
177
docs/adr/020-interactive-configuration-crate.md
Normal file
@@ -0,0 +1,177 @@
|
||||
# ADR 020: Unified Configuration and Secret Management
|
||||
|
||||
Author: Jean-Gabriel Gill-Couture
|
||||
|
||||
Date: 2026-03-18
|
||||
|
||||
## Status
|
||||
|
||||
Proposed
|
||||
|
||||
## Context
|
||||
|
||||
Harmony's orchestration logic depends on runtime data that falls into two categories:
|
||||
|
||||
1. **Secrets** — credentials, tokens, private keys.
|
||||
2. **Operational configuration** — deployment targets, host selections, port assignments, reboot decisions, and similar contextual choices.
|
||||
|
||||
Both categories share the same fundamental lifecycle: a value must be acquired before execution can proceed, it may come from several backends (environment variable, remote store, interactive prompt), and it must be shareable across a team without polluting the Git repository.
|
||||
|
||||
Treating these categories as separate subsystems forces developers to choose between a "config API" and a "secret API" at every call site. The only meaningful difference between the two is how the storage backend handles the data (plaintext vs. encrypted, audited vs. unaudited) and how the CLI displays it (visible vs. masked). That difference belongs in the backend, not in the application code.
|
||||
|
||||
Three concrete problems drive this change:
|
||||
|
||||
- **Async terminal corruption.** `inquire` prompts assume exclusive terminal ownership. Background tokio tasks emitting log output during a prompt corrupt the terminal state. This is inherent to Harmony's concurrent orchestration model.
|
||||
- **Untestable code paths.** Any function containing an inline `inquire` call requires a real TTY to execute. Unit testing is impossible without ignoring the test entirely.
|
||||
- **No backend integration.** Inline prompts cannot be answered from a remote store, an environment variable, or a CI pipeline. Every automated deployment that passes through a prompting code path requires a human operator at a terminal.
|
||||
|
||||
## Decision
|
||||
|
||||
A single workspace crate, `harmony_config`, provides all configuration and secret acquisition for Harmony. It replaces both `harmony_secret` and all inline `inquire` usage.
|
||||
|
||||
### Schema in Git, state in the store
|
||||
|
||||
The Rust type system serves as the configuration schema. Developers declare what configuration is needed by defining structs:
|
||||
|
||||
```rust
|
||||
#[derive(Config, Serialize, Deserialize, JsonSchema, InteractiveParse)]
|
||||
struct PostgresConfig {
|
||||
pub host: String,
|
||||
pub port: u16,
|
||||
#[config(secret)]
|
||||
pub password: String,
|
||||
}
|
||||
```
|
||||
|
||||
These structs live in Git and evolve with the code. When a branch introduces a new field, Git tracks that schema change. The actual values live in an external store — OpenBao by default. No `.env` files, no JSON config files, no YAML in the repository.
|
||||
|
||||
### Data classification
|
||||
|
||||
```rust
|
||||
/// Tells the storage backend how to handle the data.
|
||||
pub enum ConfigClass {
|
||||
/// Plaintext storage is acceptable.
|
||||
Standard,
|
||||
/// Must be encrypted at rest, masked in UI, subject to audit logging.
|
||||
Secret,
|
||||
}
|
||||
```
|
||||
|
||||
Classification is determined at the struct level. A struct with no `#[config(secret)]` fields has `ConfigClass::Standard`. A struct with one or more `#[config(secret)]` fields is elevated to `ConfigClass::Secret`. The struct is always stored as a single cohesive JSON blob; field-level splitting across backends is not a concern of the trait.
|
||||
|
||||
The `#[config(secret)]` attribute also instructs the `PromptSource` to mask terminal input for that field during interactive prompting.
|
||||
|
||||
### The Config trait
|
||||
|
||||
```rust
|
||||
pub trait Config: Serialize + DeserializeOwned + JsonSchema + InteractiveParseObj + Sized {
|
||||
/// Stable lookup key. By default, the struct name.
|
||||
const KEY: &'static str;
|
||||
|
||||
/// How the backend should treat this data.
|
||||
const CLASS: ConfigClass;
|
||||
}
|
||||
```
|
||||
|
||||
A `#[derive(Config)]` proc macro generates the implementation. The macro inspects field attributes to determine `CLASS`.
|
||||
|
||||
### The ConfigStore trait
|
||||
|
||||
```rust
|
||||
#[async_trait]
|
||||
pub trait ConfigStore: Send + Sync {
|
||||
async fn get(
|
||||
&self,
|
||||
class: ConfigClass,
|
||||
namespace: &str,
|
||||
key: &str,
|
||||
) -> Result<Option<serde_json::Value>, ConfigError>;
|
||||
|
||||
async fn set(
|
||||
&self,
|
||||
class: ConfigClass,
|
||||
namespace: &str,
|
||||
key: &str,
|
||||
value: &serde_json::Value,
|
||||
) -> Result<(), ConfigError>;
|
||||
}
|
||||
```
|
||||
|
||||
The `class` parameter is a hint. The store implementation decides what to do with it. An OpenBao store may route `Secret` data to a different path prefix or apply stricter ACLs. A future store could split fields across backends — that is an implementation concern, not a trait concern.
|
||||
|
||||
### Resolution chain
|
||||
|
||||
The `ConfigManager` tries sources in priority order:
|
||||
|
||||
1. **`EnvSource`** — reads `HARMONY_CONFIG_{KEY}` as a JSON string. Override hatch for CI/CD pipelines and containerized environments.
|
||||
2. **`StoreSource`** — wraps a `ConfigStore` implementation. For teams, this is the OpenBao backend authenticated via Zitadel OIDC (see ADR 020-1).
|
||||
3. **`PromptSource`** — presents an `interactive-parse` prompt on the terminal. Acquires a process-wide async mutex before rendering to prevent log output corruption.
|
||||
|
||||
When `PromptSource` obtains a value, the `ConfigManager` persists it back to the `StoreSource` so that subsequent runs — by the same developer or any teammate — resolve without prompting.
|
||||
|
||||
Callers that do not include `PromptSource` in their source list never block on a TTY. Test code passes empty source lists and constructs config structs directly.
|
||||
|
||||
### Schema versioning
|
||||
|
||||
The Rust struct is the schema. When a developer renames a field, removes a field, or changes a type on a branch, the store may still contain data shaped for a previous version of the struct. If another team member who does not yet have that commit runs the code, `serde_json::from_value` will fail on the stale entry.
|
||||
|
||||
In the initial implementation, the resolution chain handles this gracefully: a deserialization failure is treated as a cache miss, and the `PromptSource` fires. The prompted value overwrites the stale entry in the store.
|
||||
|
||||
This is sufficient for small teams working on short-lived branches. It is not sufficient at scale, where silent re-prompting could mask real configuration drift.
|
||||
|
||||
A future iteration will introduce a compile-time schema migration mechanism, similar to how `sqlx` verifies queries against a live database at compile time. The mechanism will:
|
||||
|
||||
- Detect schema drift between the Rust struct and the stored JSON.
|
||||
- Apply named, ordered migration functions to transform stored data forward.
|
||||
- Reject ambiguous migrations at compile time rather than silently corrupting state.
|
||||
|
||||
Until that mechanism exists, teams should treat store entries as soft caches: the struct definition is always authoritative, and the store is best-effort.
|
||||
|
||||
## Rationale
|
||||
|
||||
**Why merge secrets and config into one crate?** Separate crates with nearly identical trait shapes (`Secret` vs `Config`, `SecretStore` vs `ConfigStore`) force developers to make a classification decision at every call site. A unified crate with a `ConfigClass` discriminator moves that decision to the struct definition, where it belongs.
|
||||
|
||||
**Why OpenBao as the default backend?** OpenBao is a fully open-source Vault fork under the Linux Foundation. It runs on-premises with no phone-home requirement — a hard constraint for private cloud and regulated environments. Harmony already deploys OpenBao for clients (`OpenbaoScore`), so no new infrastructure is introduced.
|
||||
|
||||
**Why not store values in Git (e.g., encrypted YAML)?** Git-tracked config files create merge conflicts, require re-encryption on team membership changes, and leak metadata (file names, key names) even when values are encrypted. Storing state in OpenBao avoids all of these issues and provides audit logging, access control, and versioned KV out of the box.
|
||||
|
||||
**Why keep `PromptSource`?** Removing interactive prompts entirely would break the zero-infrastructure bootstrapping path and eliminate human-confirmation safety gates for destructive operations (interface reconfiguration, node reboot). The problem was never that prompts exist — it is that they were unavoidable and untestable. Making `PromptSource` an explicit, opt-in entry in the source list restores control.
|
||||
|
||||
## Consequences
|
||||
|
||||
### Positive
|
||||
|
||||
- A single API surface for all runtime data acquisition.
|
||||
- All currently-ignored tests become runnable without TTY access.
|
||||
- Async terminal corruption is eliminated by the process-wide prompt mutex.
|
||||
- The bootstrapping path requires no infrastructure for a first run; `PromptSource` alone is sufficient.
|
||||
- The team path (OpenBao + Zitadel) reuses infrastructure Harmony already deploys.
|
||||
- User offboarding is a single Zitadel action.
|
||||
|
||||
### Negative
|
||||
|
||||
- Migrating all inline `inquire` and `harmony_secret` call sites is a significant refactoring effort.
|
||||
- Until the schema migration mechanism is built, store entries for renamed or removed fields become stale and must be re-prompted.
|
||||
- The Zitadel device flow introduces a browser step on first login per machine.
|
||||
|
||||
## Implementation Plan
|
||||
|
||||
### Phase 1: Trait design and crate restructure
|
||||
|
||||
Refactor `harmony_config` to define the final `Config`, `ConfigClass`, and `ConfigStore` traits. Update the derive macro to support `#[config(secret)]` and generate the correct `CLASS` constant. Implement `EnvSource` and `PromptSource` against the new traits. Write comprehensive unit tests using mock stores.
|
||||
|
||||
### Phase 2: Absorb `harmony_secret`
|
||||
|
||||
Migrate the `OpenbaoSecretStore`, `InfisicalSecretStore`, and `LocalFileSecretStore` implementations from `harmony_secret` into `harmony_config` as `ConfigStore` backends. Update all call sites that use `SecretManager::get`, `SecretManager::get_or_prompt`, or `SecretManager::set` to use `harmony_config` equivalents.
|
||||
|
||||
### Phase 3: Migrate inline prompts
|
||||
|
||||
Replace all inline `inquire` call sites in the `harmony` crate (`infra/brocade.rs`, `infra/network_manager.rs`, `modules/okd/host_network.rs`, and others) with `harmony_config` structs and `get_or_prompt` calls. Un-ignore the affected tests.
|
||||
|
||||
### Phase 4: Zitadel and OpenBao integration
|
||||
|
||||
Implement the authentication flow described in ADR 020-1. Wire `StoreSource` to use Zitadel OIDC tokens for OpenBao access. Implement token caching and silent refresh.
|
||||
|
||||
### Phase 5: Remove `harmony_secret`
|
||||
|
||||
Delete the `harmony_secret` and `harmony_secret_derive` crates from the workspace. All functionality now lives in `harmony_config`.
|
||||
63
docs/adr/README.md
Normal file
63
docs/adr/README.md
Normal file
@@ -0,0 +1,63 @@
|
||||
# Architecture Decision Records
|
||||
|
||||
An Architecture Decision Record (ADR) documents a significant architectural decision made during the development of Harmony — along with its context, rationale, and consequences.
|
||||
|
||||
## Why We Use ADRs
|
||||
|
||||
As a platform engineering framework used by a team, Harmony accumulates technical decisions over time. ADRs help us:
|
||||
|
||||
- **Track rationale** — understand _why_ a decision was made, not just _what_ was decided
|
||||
- **Onboard new contributors** — the "why" is preserved even when team membership changes
|
||||
- **Avoid repeating past mistakes** — previous decisions and their context are searchable
|
||||
- **Manage technical debt** — ADRs make it easier to revisit and revise past choices
|
||||
|
||||
An ADR captures a decision at a point in time. It is not a specification — it is a record of reasoning.
|
||||
|
||||
## ADR Format
|
||||
|
||||
Every ADR follows this structure:
|
||||
|
||||
| Section | Purpose |
|
||||
|---------|---------|
|
||||
| **Status** | Proposed / Pending / Accepted / Implemented / Deprecated |
|
||||
| **Context** | The problem or background — the "why" behind this decision |
|
||||
| **Decision** | The chosen solution or direction |
|
||||
| **Rationale** | Reasoning behind the decision |
|
||||
| **Consequences** | Both positive and negative outcomes |
|
||||
| **Alternatives considered** | Other options that were evaluated |
|
||||
| **Additional Notes** | Supplementary context, links, or open questions |
|
||||
|
||||
## ADR Index
|
||||
|
||||
| Number | Title | Status |
|
||||
|--------|-------|--------|
|
||||
| [000](./000-ADR-Template.md) | ADR Template | Reference |
|
||||
| [001](./001-rust.md) | Why Rust | Accepted |
|
||||
| [002](./002-hexagonal-architecture.md) | Hexagonal Architecture | Accepted |
|
||||
| [003](./003-infrastructure-abstractions.md) | Infrastructure Abstractions | Accepted |
|
||||
| [004](./004-ipxe.md) | iPXE | Accepted |
|
||||
| [005](./005-interactive-project.md) | Interactive Project | Proposed |
|
||||
| [006](./006-secret-management.md) | Secret Management | Accepted |
|
||||
| [007](./007-default-runtime.md) | Default Runtime | Accepted |
|
||||
| [008](./008-score-display-formatting.md) | Score Display Formatting | Proposed |
|
||||
| [009](./009-helm-and-kustomize-handling.md) | Helm and Kustomize Handling | Accepted |
|
||||
| [010](./010-monitoring-and-alerting.md) | Monitoring and Alerting | Accepted |
|
||||
| [011](./011-multi-tenant-cluster.md) | Multi-Tenant Cluster | Accepted |
|
||||
| [012](./012-project-delivery-automation.md) | Project Delivery Automation | Proposed |
|
||||
| [013](./013-monitoring-notifications.md) | Monitoring Notifications | Accepted |
|
||||
| [015](./015-higher-order-topologies.md) | Higher Order Topologies | Proposed |
|
||||
| [016](./016-Harmony-Agent-And-Global-Mesh-For-Decentralized-Workload-Management.md) | Harmony Agent and Global Mesh | Proposed |
|
||||
| [017-1](./017-1-Nats-Clusters-Interconnection-Topology.md) | NATS Clusters Interconnection Topology | Proposed |
|
||||
| [018](./018-Template-Hydration-For-Workload-Deployment.md) | Template Hydration for Workload Deployment | Proposed |
|
||||
| [019](./019-Network-bond-setup.md) | Network Bond Setup | Proposed |
|
||||
| [020-1](./020-1-zitadel-openbao-secure-config-store.md) | Zitadel + OpenBao Secure Config Store | Accepted |
|
||||
| [020](./020-interactive-configuration-crate.md) | Interactive Configuration Crate | Proposed |
|
||||
|
||||
## Contributing
|
||||
|
||||
When making a significant technical change:
|
||||
|
||||
1. **Check existing ADRs** — the decision may already be documented
|
||||
2. **Create a new ADR** using the [template](./000-ADR-Template.md) if the change warrants architectural discussion
|
||||
3. **Set status to Proposed** and open it for team review
|
||||
4. Once accepted and implemented, update the status accordingly
|
||||
@@ -84,7 +84,7 @@ Network services that run inside the cluster or as part of the topology.
|
||||
- **OKDLoadBalancerScore**: Configures the high-availability load balancers for the OKD API and ingress.
|
||||
- **OKDBootstrapLoadBalancerScore**: Configures the load balancer specifically for the bootstrap-time API endpoint.
|
||||
- **K8sIngressScore**: Configures an Ingress controller or resource.
|
||||
- [HighAvailabilityHostNetworkScore](../../harmony/src/modules/okd/host_network.rs): Configures network bonds on a host and the corresponding port-channels on the switch stack for high-availability.
|
||||
- **HighAvailabilityHostNetworkScore**: Configures network bonds on a host and the corresponding port-channels on the switch stack for high-availability.
|
||||
|
||||
## Tenant Management
|
||||
|
||||
|
||||
@@ -28,6 +28,11 @@ Harmony's design is based on a few key concepts. Understanding them is the key t
|
||||
- **What it is:** An **Inventory** is the physical material (the "what") used in a cluster. This is most relevant for bare-metal or on-premise topologies.
|
||||
- **Example:** A list of nodes with their roles (control plane, worker), CPU, RAM, and network interfaces. For the `K8sAnywhereTopology`, the inventory might be empty or autoloaded, as the infrastructure is more abstract.
|
||||
|
||||
### 6. Configuration & Secrets
|
||||
|
||||
- **What it is:** Configuration represents the runtime data required to deploy your `Scores`. This includes both non-sensitive state (like cluster hostnames, deployment profiles) and sensitive secrets (like API keys, database passwords).
|
||||
- **How it works:** See the [Configuration Concept Guide](./concepts/configuration.md) to understand Harmony's unified approach to managing schema in Git and state in OpenBao.
|
||||
|
||||
---
|
||||
|
||||
### How They Work Together (The Compile-Time Check)
|
||||
|
||||
107
docs/concepts/configuration.md
Normal file
107
docs/concepts/configuration.md
Normal file
@@ -0,0 +1,107 @@
|
||||
# Configuration and Secrets
|
||||
|
||||
Harmony treats configuration and secrets as a single concern. Developers use one crate, `harmony_config`, to declare, store, and retrieve all runtime data — whether it is a public hostname or a database password.
|
||||
|
||||
## The mental model: schema in Git, state in the store
|
||||
|
||||
### Schema
|
||||
|
||||
In Harmony, the Rust code is the configuration schema. You declare what your module needs by defining a struct:
|
||||
|
||||
```rust
|
||||
#[derive(Config, Serialize, Deserialize, JsonSchema, InteractiveParse)]
|
||||
struct PostgresConfig {
|
||||
pub host: String,
|
||||
pub port: u16,
|
||||
#[config(secret)]
|
||||
pub password: String,
|
||||
}
|
||||
```
|
||||
|
||||
This struct is tracked in Git. When a branch adds a new field, Git tracks that the branch requires a new value. When a branch removes a field, the old value in the store becomes irrelevant. The struct is always authoritative.
|
||||
|
||||
### State
|
||||
|
||||
The actual values live in a config store — by default, OpenBao. No `.env` files, no JSON, no YAML in the repository.
|
||||
|
||||
When you run your code, Harmony reads the struct (schema) and resolves values from the store (state):
|
||||
|
||||
- If the store has the value, it is injected seamlessly.
|
||||
- If the store does not have it, Harmony prompts you in the terminal. Your answer is pushed back to the store automatically.
|
||||
- When a teammate runs the same code, they are not prompted — you already provided the value.
|
||||
|
||||
### How branch switching works
|
||||
|
||||
Because the schema is just Rust code tracked in Git, branch switching works naturally:
|
||||
|
||||
1. You check out `feat/redis`. The code now requires `RedisConfig`.
|
||||
2. You run `cargo run`. Harmony detects that `RedisConfig` has no value in the store. It prompts you.
|
||||
3. You provide the values. Harmony pushes them to OpenBao.
|
||||
4. Your teammate checks out `feat/redis` and runs `cargo run`. No prompt — the values are already in the store.
|
||||
5. You switch back to `main`. `RedisConfig` does not exist in that branch's code. The store entry is ignored.
|
||||
|
||||
## Secrets vs. standard configuration
|
||||
|
||||
From your application code, there is no difference. You always call `harmony_config::get_or_prompt::<T>()`.
|
||||
|
||||
The difference is in the struct definition:
|
||||
|
||||
```rust
|
||||
// Standard config — stored in plaintext, displayed during prompting.
|
||||
#[derive(Config)]
|
||||
struct ClusterConfig {
|
||||
pub api_url: String,
|
||||
pub namespace: String,
|
||||
}
|
||||
|
||||
// Contains a secret field — the entire struct is stored encrypted,
|
||||
// and the password field is masked during terminal prompting.
|
||||
#[derive(Config)]
|
||||
struct DatabaseConfig {
|
||||
pub host: String,
|
||||
#[config(secret)]
|
||||
pub password: String,
|
||||
}
|
||||
```
|
||||
|
||||
If a struct contains any `#[config(secret)]` field, Harmony elevates the entire struct to `ConfigClass::Secret`. The storage backend decides what that means in practice — in the case of OpenBao, it may route the data to a path with stricter ACLs or audit policies.
|
||||
|
||||
## Authentication and team sharing
|
||||
|
||||
Harmony uses Zitadel (hosted at `sso.nationtech.io`) for identity and OpenBao (hosted at `secrets.nationtech.io`) for storage.
|
||||
|
||||
**First run on a new machine:**
|
||||
|
||||
1. Harmony detects that you are not logged in.
|
||||
2. It prints a short code and URL to your terminal, and opens your browser if possible.
|
||||
3. You log in with your corporate identity (Google, GitHub, or Microsoft Entra ID / Azure AD).
|
||||
4. Harmony receives an OIDC token, exchanges it for an OpenBao token, and caches the session locally.
|
||||
|
||||
**Subsequent runs:**
|
||||
|
||||
- Harmony silently refreshes your tokens in the background. You do not need to log in again for up to 90 days of active use.
|
||||
- If you are inactive for 30 days, or if an administrator revokes your access in Zitadel, you will be prompted to re-authenticate.
|
||||
|
||||
**Offboarding:**
|
||||
|
||||
Revoking a user in Zitadel immediately invalidates their ability to refresh tokens or obtain new ones. No manual secret rotation is required.
|
||||
|
||||
## Resolution chain
|
||||
|
||||
When Harmony resolves a config value, it tries sources in order:
|
||||
|
||||
1. **Environment variable** (`HARMONY_CONFIG_{KEY}`) — highest priority. Use this in CI/CD to override any value without touching the store.
|
||||
2. **Config store** (OpenBao for teams, local file for solo/offline use) — the primary source for shared team state.
|
||||
3. **Interactive prompt** — last resort. Prompts the developer and persists the answer back to the store.
|
||||
|
||||
## Schema versioning
|
||||
|
||||
The Rust struct is the single source of truth for what configuration looks like. If a developer renames or removes a field on a branch, the store may still contain data shaped for the old version of the struct. When another developer who does not have that change runs the code, deserialization will fail.
|
||||
|
||||
In the current implementation, this is handled gracefully: a deserialization failure is treated as a miss, and Harmony re-prompts. The new answer overwrites the stale entry.
|
||||
|
||||
A compile-time migration mechanism is planned for a future release to handle this more rigorously at scale.
|
||||
|
||||
## Offline and local development
|
||||
|
||||
If you are working offline or evaluating Harmony without a team OpenBao instance, the `StoreSource` falls back to a local file store at `~/.local/share/harmony/config/`. The developer experience is identical — prompting, caching, and resolution all work the same way. The only difference is that the state is local to your machine and not shared with teammates.
|
||||
135
docs/guides/adding-capabilities.md
Normal file
135
docs/guides/adding-capabilities.md
Normal file
@@ -0,0 +1,135 @@
|
||||
# Adding Capabilities
|
||||
|
||||
`Capabilities` are trait methods that a `Topology` exposes to Scores. They are the "how" — the specific APIs and features that let a Score translate intent into infrastructure actions.
|
||||
|
||||
## How Capabilities Work
|
||||
|
||||
When a Score declares it needs certain Capabilities:
|
||||
|
||||
```rust
|
||||
impl<T: Topology + K8sclient + HelmCommand> Score<T> for MyScore {
|
||||
// ...
|
||||
}
|
||||
```
|
||||
|
||||
The compiler verifies that the target `Topology` implements both `K8sclient` and `HelmCommand`. If it doesn't, compilation fails. This is the compile-time safety check that prevents invalid configurations from reaching production.
|
||||
|
||||
## Built-in Capabilities
|
||||
|
||||
Harmony provides a set of standard Capabilities:
|
||||
|
||||
| Capability | What it provides |
|
||||
|------------|------------------|
|
||||
| `K8sclient` | A Kubernetes API client |
|
||||
| `HelmCommand` | A configured `helm` CLI invocation |
|
||||
| `TlsRouter` | TLS certificate management |
|
||||
| `NetworkManager` | Host network configuration |
|
||||
| `SwitchClient` | Network switch configuration |
|
||||
| `CertificateManagement` | Certificate issuance via cert-manager |
|
||||
|
||||
## Implementing a Capability
|
||||
|
||||
Capabilities are implemented as trait methods on your Topology:
|
||||
|
||||
```rust
|
||||
use std::sync::Arc;
|
||||
use harmony_k8s::K8sClient;
|
||||
use harmony::topology::K8sclient;
|
||||
|
||||
pub struct MyTopology {
|
||||
kubeconfig: Option<String>,
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl K8sclient for MyTopology {
|
||||
async fn k8s_client(&self) -> Result<Arc<K8sClient>, String> {
|
||||
let client = match &self.kubeconfig {
|
||||
Some(path) => K8sClient::from_kubeconfig(path).await?,
|
||||
None => K8sClient::try_default().await?,
|
||||
};
|
||||
Ok(Arc::new(client))
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Adding a Custom Capability
|
||||
|
||||
For specialized infrastructure needs, add your own Capability as a trait:
|
||||
|
||||
```rust
|
||||
use async_trait::async_trait;
|
||||
use crate::executors::ExecutorError;
|
||||
|
||||
/// A capability for configuring network switches
|
||||
#[async_trait]
|
||||
pub trait SwitchClient: Send + Sync {
|
||||
async fn configure_port(
|
||||
&self,
|
||||
switch: &str,
|
||||
port: &str,
|
||||
vlan: u16,
|
||||
) -> Result<(), ExecutorError>;
|
||||
|
||||
async fn configure_port_channel(
|
||||
&self,
|
||||
switch: &str,
|
||||
name: &str,
|
||||
ports: &[&str],
|
||||
) -> Result<(), ExecutorError>;
|
||||
}
|
||||
```
|
||||
|
||||
Then implement it on your Topology:
|
||||
|
||||
```rust
|
||||
use harmony_infra::brocade::BrocadeClient;
|
||||
|
||||
pub struct MyTopology {
|
||||
switch_client: Arc<dyn SwitchClient>,
|
||||
}
|
||||
|
||||
impl SwitchClient for MyTopology {
|
||||
async fn configure_port(&self, switch: &str, port: &str, vlan: u16) -> Result<(), ExecutorError> {
|
||||
self.switch_client.configure_port(switch, port, vlan).await
|
||||
}
|
||||
|
||||
async fn configure_port_channel(&self, switch: &str, name: &str, ports: &[&str]) -> Result<(), ExecutorError> {
|
||||
self.switch_client.configure_port_channel(switch, name, ports).await
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Now Scores that need `SwitchClient` can run on `MyTopology`.
|
||||
|
||||
## Capability Composition
|
||||
|
||||
Topologies often compose multiple Capabilities to support complex Scores:
|
||||
|
||||
```rust
|
||||
pub struct HAClusterTopology {
|
||||
pub kubeconfig: Option<String>,
|
||||
pub router: Arc<dyn Router>,
|
||||
pub load_balancer: Arc<dyn LoadBalancer>,
|
||||
pub switch_client: Arc<dyn SwitchClient>,
|
||||
pub dhcp_server: Arc<dyn DhcpServer>,
|
||||
pub dns_server: Arc<dyn DnsServer>,
|
||||
// ...
|
||||
}
|
||||
|
||||
impl K8sclient for HAClusterTopology { ... }
|
||||
impl HelmCommand for HAClusterTopology { ... }
|
||||
impl SwitchClient for HAClusterTopology { ... }
|
||||
impl DhcpServer for HAClusterTopology { ... }
|
||||
impl DnsServer for HAClusterTopology { ... }
|
||||
impl Router for HAClusterTopology { ... }
|
||||
impl LoadBalancer for HAClusterTopology { ... }
|
||||
```
|
||||
|
||||
A Score that needs all of these can run on `HAClusterTopology` because the Topology provides all of them.
|
||||
|
||||
## Best Practices
|
||||
|
||||
- **Keep Capabilities focused** — one Capability per concern (Kubernetes client, Helm, switch config)
|
||||
- **Return meaningful errors** — use specific error types so Scores can handle failures appropriately
|
||||
- **Make Capabilities optional where sensible** — not every Topology needs every Capability; use `Option<T>` or a separate trait for optional features
|
||||
- **Document preconditions** — if a Capability requires the infrastructure to be in a specific state, document it in the trait doc comments
|
||||
40
docs/guides/developer-guide.md
Normal file
40
docs/guides/developer-guide.md
Normal file
@@ -0,0 +1,40 @@
|
||||
# Developer Guide
|
||||
|
||||
This section covers how to extend Harmony by building your own `Score`, `Topology`, and `Capability` implementations.
|
||||
|
||||
## Writing a Score
|
||||
|
||||
A `Score` is a declarative description of desired state. To create your own:
|
||||
|
||||
1. Define a struct that represents your desired state
|
||||
2. Implement the `Score<T>` trait, where `T` is your target `Topology`
|
||||
3. Implement the `Interpret<T>` trait to define how the Score translates to infrastructure actions
|
||||
|
||||
See the [Writing a Score](./writing-a-score.md) guide for a step-by-step walkthrough.
|
||||
|
||||
## Writing a Topology
|
||||
|
||||
A `Topology` models your infrastructure environment. To create your own:
|
||||
|
||||
1. Define a struct that holds your infrastructure configuration
|
||||
2. Implement the `Topology` trait
|
||||
3. Implement the `Capability` traits your Score needs
|
||||
|
||||
See the [Writing a Topology](./writing-a-topology.md) guide for details.
|
||||
|
||||
## Adding Capabilities
|
||||
|
||||
`Capabilities` are the specific APIs or features a `Topology` exposes. They are the bridge between Scores and the actual infrastructure.
|
||||
|
||||
See the [Adding Capabilities](./adding-capabilities.md) guide for details on implementing and exposing Capabilities.
|
||||
|
||||
## Core Traits Reference
|
||||
|
||||
| Trait | Purpose |
|
||||
|-------|---------|
|
||||
| `Score<T>` | Declares desired state ("what") |
|
||||
| `Topology` | Represents infrastructure ("where") |
|
||||
| `Interpret<T>` | Execution logic ("how") |
|
||||
| `Capability` | A feature exposed by a Topology |
|
||||
|
||||
See [Core Concepts](../concepts.md) for the conceptual foundation.
|
||||
@@ -1,42 +1,230 @@
|
||||
# Getting Started Guide
|
||||
|
||||
Welcome to Harmony! This guide will walk you through installing the Harmony framework, setting up a new project, and deploying your first application.
|
||||
This guide walks you through deploying your first application with Harmony — a PostgreSQL cluster on a local Kubernetes cluster (K3D). By the end, you'll understand the core workflow: compile a Score, run it through the Harmony CLI, and verify the result.
|
||||
|
||||
We will build and deploy the "Rust Web App" example, which automatically:
|
||||
## What you'll deploy
|
||||
|
||||
1. Provisions a local K3D (Kubernetes in Docker) cluster.
|
||||
2. Deploys a sample Rust web application.
|
||||
3. Sets up monitoring for the application.
|
||||
A fully functional PostgreSQL cluster running in a local K3D cluster, managed by the CloudNativePG operator. This demonstrates the full Harmony pattern:
|
||||
|
||||
1. Provision a local Kubernetes cluster (K3D)
|
||||
2. Install the required operator (CloudNativePG)
|
||||
3. Create a PostgreSQL cluster
|
||||
4. Expose it as a Kubernetes Service
|
||||
|
||||
## Prerequisites
|
||||
|
||||
Before you begin, you'll need a few tools installed on your system:
|
||||
Before you begin, install the following tools:
|
||||
|
||||
- **Rust & Cargo:** [Install Rust](https://www.rust-lang.org/tools/install)
|
||||
- **Docker:** [Install Docker](https://docs.docker.com/get-docker/) (Required for the K3D local cluster)
|
||||
- **kubectl:** [Install kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl/) (For inspecting the cluster)
|
||||
- **Rust & Cargo:** [Install Rust](https://www.rust-lang.org/tools/install) (edition 2024)
|
||||
- **Docker:** [Install Docker](https://docs.docker.com/get-docker/) (required for the local K3D cluster)
|
||||
- **kubectl:** [Install kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl/) (optional, for inspecting the cluster)
|
||||
|
||||
## 1. Install Harmony
|
||||
|
||||
First, clone the Harmony repository and build the project. This gives you the `harmony` CLI and all the core libraries.
|
||||
## Step 1: Clone and build
|
||||
|
||||
```bash
|
||||
# Clone the main repository
|
||||
# Clone the repository
|
||||
git clone https://git.nationtech.io/nationtech/harmony
|
||||
cd harmony
|
||||
|
||||
# Build the project (this may take a few minutes)
|
||||
# Build the project (this may take a few minutes on first run)
|
||||
cargo build --release
|
||||
```
|
||||
|
||||
...
|
||||
## Step 2: Run the PostgreSQL example
|
||||
|
||||
## Next Steps
|
||||
```bash
|
||||
cargo run -p example-postgresql
|
||||
```
|
||||
|
||||
Congratulations, you've just deployed an application using true infrastructure-as-code!
|
||||
Harmony will output its progress as it:
|
||||
|
||||
From here, you can:
|
||||
1. **Creates a K3D cluster** named `harmony-postgres-example` (first run only)
|
||||
2. **Installs the CloudNativePG operator** into the cluster
|
||||
3. **Creates a PostgreSQL cluster** with 1 instance and 1 GiB of storage
|
||||
4. **Prints connection details** for your new database
|
||||
|
||||
- [Explore the Catalogs](../catalogs/README.md): See what other [Scores](../catalogs/scores.md) and [Topologies](../catalogs/topologies.md) are available.
|
||||
- [Read the Use Cases](../use-cases/README.md): Check out the [OKD on Bare Metal](./use-cases/okd-on-bare-metal.md) guide for a more advanced scenario.
|
||||
- [Write your own Score](../guides/writing-a-score.md): Dive into the [Developer Guide](./guides/developer-guide.md) to start building your own components.
|
||||
Expected output (abbreviated):
|
||||
|
||||
```
|
||||
[+] Cluster created
|
||||
[+] Installing CloudNativePG operator
|
||||
[+] Creating PostgreSQL cluster
|
||||
[+] PostgreSQL cluster is ready
|
||||
Namespace: harmony-postgres-example
|
||||
Service: harmony-postgres-example-rw
|
||||
Username: postgres
|
||||
Password: <stored in secret harmony-postgres-example-db-user>
|
||||
```
|
||||
|
||||
## Step 3: Verify the deployment
|
||||
|
||||
Check that the PostgreSQL pods are running:
|
||||
|
||||
```bash
|
||||
kubectl get pods -n harmony-postgres-example
|
||||
```
|
||||
|
||||
You should see something like:
|
||||
|
||||
```
|
||||
NAME READY STATUS RESTARTS AGE
|
||||
harmony-postgres-example-1 1/1 Running 0 2m
|
||||
```
|
||||
|
||||
Get the database password:
|
||||
|
||||
```bash
|
||||
kubectl get secret -n harmony-postgres-example harmony-postgres-example-db-user -o jsonpath='{.data.password}' | base64 -d
|
||||
```
|
||||
|
||||
## Step 4: Connect to the database
|
||||
|
||||
Forward the PostgreSQL port to your local machine:
|
||||
|
||||
```bash
|
||||
kubectl port-forward -n harmony-postgres-example svc/harmony-postgres-example-rw 5432:5432
|
||||
```
|
||||
|
||||
In another terminal, connect with `psql`:
|
||||
|
||||
```bash
|
||||
psql -h localhost -p 5432 -U postgres
|
||||
# Enter the password from Step 3 when prompted
|
||||
```
|
||||
|
||||
Try a simple query:
|
||||
|
||||
```sql
|
||||
SELECT version();
|
||||
```
|
||||
|
||||
## Step 5: Clean up
|
||||
|
||||
To delete the PostgreSQL cluster and the local K3D cluster:
|
||||
|
||||
```bash
|
||||
k3d cluster delete harmony-postgres-example
|
||||
```
|
||||
|
||||
Alternatively, just delete the PostgreSQL cluster without removing K3D:
|
||||
|
||||
```bash
|
||||
kubectl delete namespace harmony-postgres-example
|
||||
```
|
||||
|
||||
## How it works
|
||||
|
||||
The example code (`examples/postgresql/src/main.rs`) is straightforward:
|
||||
|
||||
```rust
|
||||
use harmony::{
|
||||
inventory::Inventory,
|
||||
modules::postgresql::{PostgreSQLScore, capability::PostgreSQLConfig},
|
||||
topology::K8sAnywhereTopology,
|
||||
};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() {
|
||||
let postgres = PostgreSQLScore {
|
||||
config: PostgreSQLConfig {
|
||||
cluster_name: "harmony-postgres-example".to_string(),
|
||||
namespace: "harmony-postgres-example".to_string(),
|
||||
..Default::default()
|
||||
},
|
||||
};
|
||||
|
||||
harmony_cli::run(
|
||||
Inventory::autoload(),
|
||||
K8sAnywhereTopology::from_env(),
|
||||
vec![Box::new(postgres)],
|
||||
None,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
```
|
||||
|
||||
- **`Inventory::autoload()`** discovers the local environment (or uses an existing inventory)
|
||||
- **`K8sAnywhereTopology::from_env()`** connects to K3D if `HARMONY_AUTOINSTALL=true` (the default), or to any Kubernetes cluster via `KUBECONFIG`
|
||||
- **`harmony_cli::run(...)`** executes the Score against the Topology, managing the full lifecycle
|
||||
|
||||
## Connecting to an existing cluster
|
||||
|
||||
By default, Harmony provisions a local K3D cluster. To use an existing Kubernetes cluster instead:
|
||||
|
||||
```bash
|
||||
export KUBECONFIG=/path/to/your/kubeconfig
|
||||
export HARMONY_USE_LOCAL_K3D=false
|
||||
export HARMONY_AUTOINSTALL=false
|
||||
|
||||
cargo run -p example-postgresql
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Docker is not running
|
||||
|
||||
```
|
||||
Error: could not create cluster: docker is not running
|
||||
```
|
||||
|
||||
Start Docker and try again.
|
||||
|
||||
### K3D cluster creation fails
|
||||
|
||||
```
|
||||
Error: failed to create k3d cluster
|
||||
```
|
||||
|
||||
Ensure you have at least 2 CPU cores and 4 GiB of RAM available for Docker.
|
||||
|
||||
### `kubectl` cannot connect to the cluster
|
||||
|
||||
```
|
||||
error: unable to connect to a kubernetes cluster
|
||||
```
|
||||
|
||||
After Harmony creates the cluster, it writes the kubeconfig to `~/.kube/config` or to the path in `KUBECONFIG`. Verify:
|
||||
|
||||
```bash
|
||||
kubectl cluster-info --context k3d-harmony-postgres-example
|
||||
```
|
||||
|
||||
### Port forward fails
|
||||
|
||||
```
|
||||
error: unable to forward port
|
||||
```
|
||||
|
||||
Make sure no other process is using port 5432, or use a different local port:
|
||||
|
||||
```bash
|
||||
kubectl port-forward -n harmony-postgres-example svc/harmony-postgres-example-rw 15432:5432
|
||||
psql -h localhost -p 15432 -U postgres
|
||||
```
|
||||
|
||||
## Next steps
|
||||
|
||||
- [Explore the Scores Catalog](../catalogs/scores.md): See what other Scores are available
|
||||
- [Explore the Topologies Catalog](../catalogs/topologies.md): See what infrastructure Topologies are supported
|
||||
- [Read the Core Concepts](../concepts.md): Understand the Score / Topology / Interpret pattern in depth
|
||||
- [OKD on Bare Metal](../use-cases/okd-on-bare-metal.md): See a complete bare-metal deployment example
|
||||
|
||||
## Advanced examples
|
||||
|
||||
Once you're comfortable with the basics, these examples demonstrate more advanced use cases. Note that some require specific infrastructure (existing Kubernetes clusters, bare-metal hardware, or multi-cluster environments):
|
||||
|
||||
| Example | Description | Prerequisites |
|
||||
|---------|-------------|---------------|
|
||||
| `monitoring` | Deploy Prometheus alerting with Discord webhooks | Existing K8s cluster |
|
||||
| `ntfy` | Deploy ntfy notification server | Existing K8s cluster |
|
||||
| `tenant` | Create a multi-tenant namespace with quotas | Existing K8s cluster |
|
||||
| `cert_manager` | Provision TLS certificates | Existing K8s cluster |
|
||||
| `validate_ceph_cluster_health` | Check Ceph cluster health | Existing Rook/Ceph cluster |
|
||||
| `okd_pxe` / `okd_installation` | Provision OKD on bare metal | HAClusterTopology, bare-metal hardware |
|
||||
|
||||
To run any example:
|
||||
|
||||
```bash
|
||||
cargo run -p example-<example_name>
|
||||
```
|
||||
|
||||
164
docs/guides/writing-a-score.md
Normal file
164
docs/guides/writing-a-score.md
Normal file
@@ -0,0 +1,164 @@
|
||||
# Writing a Score
|
||||
|
||||
A `Score` declares _what_ you want to achieve. It is decoupled from _how_ it is achieved — that logic lives in an `Interpret`.
|
||||
|
||||
## The Pattern
|
||||
|
||||
A Score consists of two parts:
|
||||
|
||||
1. **A struct** — holds the configuration for your desired state
|
||||
2. **A `Score<T>` implementation** — returns an `Interpret` that knows how to execute
|
||||
|
||||
An `Interpret` contains the actual execution logic and connects your Score to the capabilities exposed by a `Topology`.
|
||||
|
||||
## Example: A Simple Score
|
||||
|
||||
Here's a simplified version of `NtfyScore` from the `ntfy` module:
|
||||
|
||||
```rust
|
||||
use async_trait::async_trait;
|
||||
use harmony::{
|
||||
interpret::{Interpret, InterpretError, Outcome},
|
||||
inventory::Inventory,
|
||||
score::Score,
|
||||
topology::{HelmCommand, K8sclient, Topology},
|
||||
};
|
||||
|
||||
/// MyScore declares "I want to install the ntfy server"
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct MyScore {
|
||||
pub namespace: String,
|
||||
pub host: String,
|
||||
}
|
||||
|
||||
impl<T: Topology + HelmCommand + K8sclient> Score<T> for MyScore {
|
||||
fn create_interpret(&self) -> Box<dyn Interpret<T>> {
|
||||
Box::new(MyInterpret { score: self.clone() })
|
||||
}
|
||||
|
||||
fn name(&self) -> String {
|
||||
"ntfy [MyScore]".into()
|
||||
}
|
||||
}
|
||||
|
||||
/// MyInterpret knows _how_ to install ntfy using the Topology's capabilities
|
||||
#[derive(Debug)]
|
||||
pub struct MyInterpret {
|
||||
pub score: MyScore,
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl<T: Topology + HelmCommand + K8sclient> Interpret<T> for MyInterpret {
|
||||
async fn execute(
|
||||
&self,
|
||||
inventory: &Inventory,
|
||||
topology: &T,
|
||||
) -> Result<Outcome, InterpretError> {
|
||||
// 1. Get a Kubernetes client from the Topology
|
||||
let client = topology.k8s_client().await?;
|
||||
|
||||
// 2. Use Helm to install the ntfy chart
|
||||
// (via topology's HelmCommand capability)
|
||||
|
||||
// 3. Wait for the deployment to be ready
|
||||
client
|
||||
.wait_until_deployment_ready("ntfy", Some(&self.score.namespace), None)
|
||||
.await?;
|
||||
|
||||
Ok(Outcome::success("ntfy installed".to_string()))
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## The Compile-Time Safety Check
|
||||
|
||||
The generic `Score<T>` trait is bounded by `T: Topology`. This means the compiler enforces that your Score only runs on Topologies that expose the capabilities your Interpret needs:
|
||||
|
||||
```rust
|
||||
// This only compiles if K8sAnywhereTopology (or any T)
|
||||
// implements HelmCommand and K8sclient
|
||||
impl<T: Topology + HelmCommand + K8sclient> Score<T> for MyScore { ... }
|
||||
```
|
||||
|
||||
If you try to run this Score against a Topology that doesn't expose `HelmCommand`, you get a compile error — before any code runs.
|
||||
|
||||
## Using Your Score
|
||||
|
||||
Once defined, your Score integrates with the Harmony CLI:
|
||||
|
||||
```rust
|
||||
use harmony::{
|
||||
inventory::Inventory,
|
||||
topology::K8sAnywhereTopology,
|
||||
};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() {
|
||||
let my_score = MyScore {
|
||||
namespace: "monitoring".to_string(),
|
||||
host: "ntfy.example.com".to_string(),
|
||||
};
|
||||
|
||||
harmony_cli::run(
|
||||
Inventory::autoload(),
|
||||
K8sAnywhereTopology::from_env(),
|
||||
vec![Box::new(my_score)],
|
||||
None,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
```
|
||||
|
||||
## Key Patterns
|
||||
|
||||
### Composing Scores
|
||||
|
||||
Scores can include other Scores via features:
|
||||
|
||||
```rust
|
||||
let app = ApplicationScore {
|
||||
features: vec![
|
||||
Box::new(PackagingDeployment { application: app.clone() }),
|
||||
Box::new(Monitoring { application: app.clone(), alert_receiver: vec![] }),
|
||||
],
|
||||
application: app,
|
||||
};
|
||||
```
|
||||
|
||||
### Reusing Interpret Logic
|
||||
|
||||
Many Scores delegate to shared `Interpret` implementations. For example, `HelmChartScore` provides a reusable Interpret for any Helm-based deployment. Your Score can wrap it:
|
||||
|
||||
```rust
|
||||
impl<T: Topology + HelmCommand> Score<T> for MyScore {
|
||||
fn create_interpret(&self) -> Box<dyn Interpret<T>> {
|
||||
Box::new(HelmChartInterpret { /* your config */ })
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Accessing Topology Capabilities
|
||||
|
||||
Your Interpret accesses infrastructure through Capabilities exposed by the Topology:
|
||||
|
||||
```rust
|
||||
// Via the Topology trait directly
|
||||
let k8s_client = topology.k8s_client().await?;
|
||||
let helm = topology.get_helm_command();
|
||||
|
||||
// Or via Capability traits
|
||||
impl<T: Topology + K8sclient> Interpret<T> for MyInterpret {
|
||||
async fn execute(...) {
|
||||
let client = topology.k8s_client().await?;
|
||||
// use client...
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
- **Keep Scores focused** — one Score per concern (deployment, monitoring, networking)
|
||||
- **Use `..Default::default()`** for optional fields so callers only need to specify what they care about
|
||||
- **Return `Outcome`** — use `Outcome::success`, `Outcome::failure`, or `Outcome::success_with_details` to communicate results clearly
|
||||
- **Handle errors gracefully** — return meaningful `InterpretError` messages that help operators debug issues
|
||||
176
docs/guides/writing-a-topology.md
Normal file
176
docs/guides/writing-a-topology.md
Normal file
@@ -0,0 +1,176 @@
|
||||
# Writing a Topology
|
||||
|
||||
A `Topology` models your infrastructure environment and exposes `Capability` traits that Scores use to interact with it. Where a Score declares _what_ you want, a Topology exposes _what_ it can do.
|
||||
|
||||
## The Minimum Implementation
|
||||
|
||||
At minimum, a Topology needs:
|
||||
|
||||
```rust
|
||||
use async_trait::async_trait;
|
||||
use harmony::{
|
||||
topology::{PreparationError, PreparationOutcome, Topology},
|
||||
};
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct MyTopology {
|
||||
pub name: String,
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl Topology for MyTopology {
|
||||
fn name(&self) -> &str {
|
||||
"MyTopology"
|
||||
}
|
||||
|
||||
async fn ensure_ready(&self) -> Result<PreparationOutcome, PreparationError> {
|
||||
// Verify the infrastructure is accessible and ready
|
||||
Ok(PreparationOutcome::Success { details: "ready".to_string() })
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Implementing Capabilities
|
||||
|
||||
Scores express dependencies on Capabilities through trait bounds. For example, if your Topology should support Scores that deploy Helm charts, implement `HelmCommand`:
|
||||
|
||||
```rust
|
||||
use std::process::Command;
|
||||
use harmony::topology::HelmCommand;
|
||||
|
||||
impl HelmCommand for MyTopology {
|
||||
fn get_helm_command(&self) -> Command {
|
||||
let mut cmd = Command::new("helm");
|
||||
if let Some(kubeconfig) = &self.kubeconfig {
|
||||
cmd.arg("--kubeconfig").arg(kubeconfig);
|
||||
}
|
||||
cmd
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
For Scores that need a Kubernetes client, implement `K8sclient`:
|
||||
|
||||
```rust
|
||||
use std::sync::Arc;
|
||||
use harmony_k8s::K8sClient;
|
||||
use harmony::topology::K8sclient;
|
||||
|
||||
#[async_trait]
|
||||
impl K8sclient for MyTopology {
|
||||
async fn k8s_client(&self) -> Result<Arc<K8sClient>, String> {
|
||||
let client = if let Some(kubeconfig) = &self.kubeconfig {
|
||||
K8sClient::from_kubeconfig(kubeconfig).await?
|
||||
} else {
|
||||
K8sClient::try_default().await?
|
||||
};
|
||||
Ok(Arc::new(client))
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Loading Topology from Environment
|
||||
|
||||
For flexibility, implement `from_env()` to read configuration from environment variables:
|
||||
|
||||
```rust
|
||||
impl MyTopology {
|
||||
pub fn from_env() -> Self {
|
||||
Self {
|
||||
name: std::env::var("MY_TOPOLOGY_NAME")
|
||||
.unwrap_or_else(|_| "default".to_string()),
|
||||
kubeconfig: std::env::var("KUBECONFIG").ok(),
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
This pattern lets operators switch between environments without recompiling:
|
||||
|
||||
```bash
|
||||
export KUBECONFIG=/path/to/prod-cluster.kubeconfig
|
||||
cargo run --example my_example
|
||||
```
|
||||
|
||||
## Complete Example: K8sAnywhereTopology
|
||||
|
||||
The `K8sAnywhereTopology` is the most commonly used Topology and handles both local (K3D) and remote Kubernetes clusters:
|
||||
|
||||
```rust
|
||||
pub struct K8sAnywhereTopology {
|
||||
pub k8s_state: Arc<OnceCell<K8sState>>,
|
||||
pub tenant_manager: Arc<OnceCell<TenantManager>>,
|
||||
pub config: Arc<K8sAnywhereConfig>,
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl Topology for K8sAnywhereTopology {
|
||||
fn name(&self) -> &str {
|
||||
"K8sAnywhereTopology"
|
||||
}
|
||||
|
||||
async fn ensure_ready(&self) -> Result<PreparationOutcome, PreparationError> {
|
||||
// 1. If autoinstall is enabled and no cluster exists, provision K3D
|
||||
// 2. Verify kubectl connectivity
|
||||
// 3. Optionally wait for cluster operators to be ready
|
||||
Ok(PreparationOutcome::Success { details: "cluster ready".to_string() })
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Key Patterns
|
||||
|
||||
### Lazy Initialization
|
||||
|
||||
Use `OnceCell` for expensive resources like Kubernetes clients:
|
||||
|
||||
```rust
|
||||
pub struct K8sAnywhereTopology {
|
||||
k8s_state: Arc<OnceCell<K8sState>>,
|
||||
}
|
||||
```
|
||||
|
||||
### Multi-Target Topologies
|
||||
|
||||
For Scores that span multiple clusters (like NATS supercluster), implement `MultiTargetTopology`:
|
||||
|
||||
```rust
|
||||
pub trait MultiTargetTopology: Topology {
|
||||
fn current_target(&self) -> &str;
|
||||
fn set_target(&mut self, target: &str);
|
||||
}
|
||||
```
|
||||
|
||||
### Composing Topologies
|
||||
|
||||
Complex topologies combine multiple infrastructure components:
|
||||
|
||||
```rust
|
||||
pub struct HAClusterTopology {
|
||||
pub router: Arc<dyn Router>,
|
||||
pub load_balancer: Arc<dyn LoadBalancer>,
|
||||
pub firewall: Arc<dyn Firewall>,
|
||||
pub dhcp_server: Arc<dyn DhcpServer>,
|
||||
pub dns_server: Arc<dyn DnsServer>,
|
||||
pub kubeconfig: Option<String>,
|
||||
// ...
|
||||
}
|
||||
```
|
||||
|
||||
## Testing Your Topology
|
||||
|
||||
Test Topologies in isolation by implementing them against mock infrastructure:
|
||||
|
||||
```rust
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_topology_ensure_ready() {
|
||||
let topo = MyTopology::from_env();
|
||||
let result = topo.ensure_ready().await;
|
||||
assert!(result.is_ok());
|
||||
}
|
||||
}
|
||||
```
|
||||
17
docs/use-cases/README.md
Normal file
17
docs/use-cases/README.md
Normal file
@@ -0,0 +1,17 @@
|
||||
# Use Cases
|
||||
|
||||
Real-world scenarios demonstrating Harmony in action.
|
||||
|
||||
## Available Use Cases
|
||||
|
||||
### [PostgreSQL on Local K3D](./postgresql-on-local-k3d.md)
|
||||
|
||||
Deploy a fully functional PostgreSQL cluster on a local K3D cluster in under 10 minutes. The quickest way to see Harmony in action.
|
||||
|
||||
### [OKD on Bare Metal](./okd-on-bare-metal.md)
|
||||
|
||||
A complete walkthrough of bootstrapping a high-availability OKD cluster from physical hardware. Covers inventory discovery, bootstrap, control plane, and worker provisioning.
|
||||
|
||||
---
|
||||
|
||||
_These use cases are community-tested scenarios. For questions or contributions, open an issue on the [Harmony repository](https://git.nationtech.io/NationTech/harmony/issues)._
|
||||
159
docs/use-cases/okd-on-bare-metal.md
Normal file
159
docs/use-cases/okd-on-bare-metal.md
Normal file
@@ -0,0 +1,159 @@
|
||||
# Use Case: OKD on Bare Metal
|
||||
|
||||
Provision a production-grade OKD (OpenShift Kubernetes Distribution) cluster from physical hardware using Harmony. This use case covers the full lifecycle: hardware discovery, bootstrap, control plane, workers, and post-install validation.
|
||||
|
||||
## What you'll have at the end
|
||||
|
||||
A highly-available OKD cluster with:
|
||||
- 3 control plane nodes
|
||||
- 2+ worker nodes
|
||||
- Network bonding configured on nodes and switches
|
||||
- Load balancer routing API and ingress traffic
|
||||
- DNS and DHCP services for the cluster
|
||||
- Post-install health validation
|
||||
|
||||
## Target hardware model
|
||||
|
||||
This setup assumes a typical lab environment:
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────┐
|
||||
│ Network 192.168.x.0/24 (flat, DHCP + PXE capable) │
|
||||
│ │
|
||||
│ ┌──────────┐ ┌──────────┐ ┌──────────┐ │
|
||||
│ │ cp0 │ │ cp1 │ │ cp2 │ (control) │
|
||||
│ └──────────┘ └──────────┘ └──────────┘ │
|
||||
│ ┌──────────┐ ┌──────────┐ │
|
||||
│ │ wk0 │ │ wk1 │ ... (workers) │
|
||||
│ └──────────┘ └──────────┘ │
|
||||
│ ┌──────────┐ │
|
||||
│ │ bootstrap│ (temporary, can be repurposed) │
|
||||
│ └──────────┘ │
|
||||
│ │
|
||||
│ ┌──────────┐ ┌──────────┐ │
|
||||
│ │ firewall │ │ switch │ (OPNsense + Brocade) │
|
||||
│ └──────────┘ └──────────┘ │
|
||||
└─────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## Required infrastructure
|
||||
|
||||
Harmony models this as an `HAClusterTopology`, which requires these capabilities:
|
||||
|
||||
| Capability | Implementation |
|
||||
|------------|---------------|
|
||||
| **Router** | OPNsense firewall |
|
||||
| **Load Balancer** | OPNsense HAProxy |
|
||||
| **Firewall** | OPNsense |
|
||||
| **DHCP Server** | OPNsense |
|
||||
| **TFTP Server** | OPNsense |
|
||||
| **HTTP Server** | OPNsense |
|
||||
| **DNS Server** | OPNsense |
|
||||
| **Node Exporter** | Prometheus node_exporter on OPNsense |
|
||||
| **Switch Client** | Brocade SNMP |
|
||||
|
||||
See `examples/okd_installation/` for a reference topology implementation.
|
||||
|
||||
## The Provisioning Pipeline
|
||||
|
||||
Harmony orchestrates OKD installation in ordered stages:
|
||||
|
||||
### Stage 1: Inventory Discovery (`OKDSetup01InventoryScore`)
|
||||
|
||||
Harmony boots all nodes via PXE into a CentOS Stream live environment, runs an inventory agent on each, and collects:
|
||||
- MAC addresses and NIC details
|
||||
- IP addresses assigned by DHCP
|
||||
- Hardware profile (CPU, RAM, storage)
|
||||
|
||||
This is the "discovery-first" approach: no pre-configuration required on nodes.
|
||||
|
||||
### Stage 2: Bootstrap Node (`OKDSetup02BootstrapScore`)
|
||||
|
||||
The user selects one discovered node to serve as the bootstrap node. Harmony:
|
||||
- Renders per-MAC iPXE boot configuration with OKD 4.19 SCOS live assets + ignition
|
||||
- Reboots the bootstrap node via SSH
|
||||
- Waits for the bootstrap process to complete (API server becomes available)
|
||||
|
||||
### Stage 3: Control Plane (`OKDSetup03ControlPlaneScore`)
|
||||
|
||||
With bootstrap complete, Harmony provisions the control plane nodes:
|
||||
- Renders per-MAC iPXE for each control plane node
|
||||
- Reboots via SSH and waits for node to join the cluster
|
||||
- Applies network bond configuration via NMState MachineConfig where relevant
|
||||
|
||||
### Stage 4: Network Bonding (`OKDSetupPersistNetworkBondScore`)
|
||||
|
||||
Configures LACP bonds on nodes and corresponding port-channels on the switch stack for high-availability.
|
||||
|
||||
### Stage 5: Worker Nodes (`OKDSetup04WorkersScore`)
|
||||
|
||||
Provisions worker nodes similarly to control plane, joining them to the cluster.
|
||||
|
||||
### Stage 6: Sanity Check (`OKDSetup05SanityCheckScore`)
|
||||
|
||||
Validates:
|
||||
- API server is reachable
|
||||
- Ingress controller is operational
|
||||
- Cluster operators are healthy
|
||||
- SDN (software-defined networking) is functional
|
||||
|
||||
### Stage 7: Installation Report (`OKDSetup06InstallationReportScore`)
|
||||
|
||||
Produces a machine-readable JSON report and human-readable summary of the installation.
|
||||
|
||||
## Network notes
|
||||
|
||||
**During discovery:** Ports must be in access mode (no LACP). DHCP succeeds; iPXE loads CentOS Stream live with Kickstart and starts the inventory endpoint.
|
||||
|
||||
**During provisioning:** After SCOS is on disk and Ignition/MachineConfig can be applied, bonds are set persistently. This avoids the PXE/DHCP recovery race condition that occurs if bonding is configured too early.
|
||||
|
||||
**PXE limitation:** The generic discovery path cannot use bonded networks for PXE boot because the DHCP recovery process conflicts with bond formation.
|
||||
|
||||
## Configuration knobs
|
||||
|
||||
When using `OKDInstallationPipeline`, configure these domains:
|
||||
|
||||
| Parameter | Example | Description |
|
||||
|-----------|---------|-------------|
|
||||
| `public_domain` | `apps.example.com` | Wildcard domain for application ingress |
|
||||
| `internal_domain` | `cluster.local` | Internal cluster DNS domain |
|
||||
|
||||
## Running the example
|
||||
|
||||
See `examples/okd_installation/` for a complete reference. The topology must be configured with your infrastructure details:
|
||||
|
||||
```bash
|
||||
# Configure the example with your hardware/network specifics
|
||||
# See examples/okd_installation/src/topology.rs
|
||||
|
||||
cargo run -p example-okd_installation
|
||||
```
|
||||
|
||||
This example requires:
|
||||
- Physical hardware configured as described above
|
||||
- OPNsense firewall with SSH access
|
||||
- Brocade switch with SNMP access
|
||||
- All nodes connected to the same Layer 2 network
|
||||
|
||||
## Post-install
|
||||
|
||||
After the cluster is bootstrapped, `~/.kube/config` is updated with the cluster credentials. Verify:
|
||||
|
||||
```bash
|
||||
kubectl get nodes
|
||||
kubectl get pods -n openshift-monitoring
|
||||
oc get routes -n openshift-console
|
||||
```
|
||||
|
||||
## Next steps
|
||||
|
||||
- Enable monitoring with `PrometheusAlertScore` or `OpenshiftClusterAlertScore`
|
||||
- Configure TLS certificates with `CertManagerHelmScore`
|
||||
- Add storage with Rook Ceph
|
||||
- Scale workers with `OKDSetup04WorkersScore`
|
||||
|
||||
## Further reading
|
||||
|
||||
- [OKD Installation Module](../../harmony/src/modules/okd/installation.rs) — source of truth for pipeline stages
|
||||
- [HAClusterTopology](../../harmony/src/domain/topology/ha_cluster.rs) — infrastructure capability model
|
||||
- [Scores Catalog](../catalogs/scores.md) — all available Scores including OKD-specific ones
|
||||
115
docs/use-cases/postgresql-on-local-k3d.md
Normal file
115
docs/use-cases/postgresql-on-local-k3d.md
Normal file
@@ -0,0 +1,115 @@
|
||||
# Use Case: PostgreSQL on Local K3D
|
||||
|
||||
Deploy a production-grade PostgreSQL cluster on a local Kubernetes cluster (K3D) using Harmony. This is the fastest way to get started with Harmony and requires no external infrastructure.
|
||||
|
||||
## What you'll have at the end
|
||||
|
||||
A fully operational PostgreSQL cluster with:
|
||||
- 1 primary instance with 1 GiB of storage
|
||||
- CloudNativePG operator managing the cluster lifecycle
|
||||
- Automatic failover support (foundation for high-availability)
|
||||
- Exposed as a Kubernetes Service for easy connection
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- Rust 2024 edition
|
||||
- Docker running locally
|
||||
- ~5 minutes
|
||||
|
||||
## The Score
|
||||
|
||||
The entire deployment is expressed in ~20 lines of Rust:
|
||||
|
||||
```rust
|
||||
use harmony::{
|
||||
inventory::Inventory,
|
||||
modules::postgresql::{PostgreSQLScore, capability::PostgreSQLConfig},
|
||||
topology::K8sAnywhereTopology,
|
||||
};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() {
|
||||
let postgres = PostgreSQLScore {
|
||||
config: PostgreSQLConfig {
|
||||
cluster_name: "harmony-postgres-example".to_string(),
|
||||
namespace: "harmony-postgres-example".to_string(),
|
||||
..Default::default()
|
||||
},
|
||||
};
|
||||
|
||||
harmony_cli::run(
|
||||
Inventory::autoload(),
|
||||
K8sAnywhereTopology::from_env(),
|
||||
vec![Box::new(postgres)],
|
||||
None,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
```
|
||||
|
||||
## What Harmony does
|
||||
|
||||
When you run this, Harmony:
|
||||
|
||||
1. **Connects to K8sAnywhereTopology** — this auto-provisions a K3D cluster if none exists
|
||||
2. **Installs the CloudNativePG operator** — one-time setup that enables PostgreSQL cluster management in Kubernetes
|
||||
3. **Creates a PostgreSQL cluster** — Harmony translates the Score into a `Cluster` CRD and applies it
|
||||
4. **Exposes the database** — creates a Kubernetes Service for the PostgreSQL primary
|
||||
|
||||
## Running it
|
||||
|
||||
```bash
|
||||
cargo run -p example-postgresql
|
||||
```
|
||||
|
||||
## Verifying the deployment
|
||||
|
||||
```bash
|
||||
# Check pods
|
||||
kubectl get pods -n harmony-postgres-example
|
||||
|
||||
# Get the password
|
||||
PASSWORD=$(kubectl get secret -n harmony-postgres-example \
|
||||
harmony-postgres-example-db-user \
|
||||
-o jsonpath='{.data.password}' | base64 -d)
|
||||
|
||||
# Connect via port-forward
|
||||
kubectl port-forward -n harmony-postgres-example svc/harmony-postgres-example-rw 5432:5432
|
||||
PGPASSWORD="$PASSWORD" psql -h localhost -p 5432 -U postgres
|
||||
```
|
||||
|
||||
## Customizing the deployment
|
||||
|
||||
The `PostgreSQLConfig` struct supports:
|
||||
|
||||
| Field | Default | Description |
|
||||
|-------|---------|-------------|
|
||||
| `cluster_name` | — | Name of the PostgreSQL cluster |
|
||||
| `namespace` | — | Kubernetes namespace to deploy to |
|
||||
| `instances` | `1` | Number of instances |
|
||||
| `storage_size` | `1Gi` | Persistent storage size per instance |
|
||||
|
||||
Example with custom settings:
|
||||
|
||||
```rust
|
||||
let postgres = PostgreSQLScore {
|
||||
config: PostgreSQLConfig {
|
||||
cluster_name: "my-prod-db".to_string(),
|
||||
namespace: "database".to_string(),
|
||||
instances: 3,
|
||||
storage_size: "10Gi".to_string().into(),
|
||||
..Default::default()
|
||||
},
|
||||
};
|
||||
```
|
||||
|
||||
## Extending the pattern
|
||||
|
||||
This pattern extends to any Kubernetes-native workload:
|
||||
|
||||
- Add **monitoring** by including a `Monitoring` feature alongside your Score
|
||||
- Add **TLS certificates** by including a `CertificateScore`
|
||||
- Add **tenant isolation** by wrapping in a `TenantScore`
|
||||
|
||||
See [Scores Catalog](../catalogs/scores.md) for the full list.
|
||||
127
examples/README.md
Normal file
127
examples/README.md
Normal file
@@ -0,0 +1,127 @@
|
||||
# Examples
|
||||
|
||||
This directory contains runnable examples demonstrating Harmony's capabilities. Each example is a self-contained program that can be run with `cargo run -p example-<name>`.
|
||||
|
||||
## Quick Reference
|
||||
|
||||
| Example | Description | Local K3D | Existing Cluster | Hardware Needed |
|
||||
|---------|-------------|:---------:|:----------------:|:---------------:|
|
||||
| `postgresql` | Deploy a PostgreSQL cluster | ✅ | ✅ | — |
|
||||
| `ntfy` | Deploy ntfy notification server | ✅ | ✅ | — |
|
||||
| `tenant` | Create a multi-tenant namespace | ✅ | ✅ | — |
|
||||
| `cert_manager` | Provision TLS certificates | ✅ | ✅ | — |
|
||||
| `node_health` | Check Kubernetes node health | ✅ | ✅ | — |
|
||||
| `monitoring` | Deploy Prometheus alerting | ✅ | ✅ | — |
|
||||
| `monitoring_with_tenant` | Monitoring + tenant isolation | ✅ | ✅ | — |
|
||||
| `operatorhub_catalog` | Install OperatorHub catalog | ✅ | ✅ | — |
|
||||
| `validate_ceph_cluster_health` | Verify Ceph cluster health | — | ✅ | Rook/Ceph |
|
||||
| `remove_rook_osd` | Remove a Rook OSD | — | ✅ | Rook/Ceph |
|
||||
| `brocade_snmp_server` | Configure Brocade switch SNMP | — | ✅ | Brocade switch |
|
||||
| `opnsense_node_exporter` | Node exporter on OPNsense | — | ✅ | OPNsense firewall |
|
||||
| `okd_pxe` | PXE boot configuration for OKD | — | — | ✅ |
|
||||
| `okd_installation` | Full OKD bare-metal install | — | — | ✅ |
|
||||
| `okd_cluster_alerts` | OKD cluster monitoring alerts | — | ✅ | OKD cluster |
|
||||
| `multisite_postgres` | Multi-site PostgreSQL failover | — | ✅ | Multi-cluster |
|
||||
| `nats` | Deploy NATS messaging | — | ✅ | Multi-cluster |
|
||||
| `nats-supercluster` | NATS supercluster across sites | — | ✅ | Multi-cluster |
|
||||
| `lamp` | LAMP stack deployment | ✅ | ✅ | — |
|
||||
| `openbao` | Deploy OpenBao vault | ✅ | ✅ | — |
|
||||
| `zitadel` | Deploy Zitadel identity provider | ✅ | ✅ | — |
|
||||
| `try_rust_webapp` | Rust webapp with packaging | ✅ | ✅ | Submodule |
|
||||
| `rust` | Rust webapp with full monitoring | ✅ | ✅ | — |
|
||||
| `rhob_application_monitoring` | RHOB monitoring setup | ✅ | ✅ | — |
|
||||
| `sttest` | Full OKD stack test | — | — | ✅ |
|
||||
| `application_monitoring_with_tenant` | App monitoring + tenant | — | ✅ | OKD cluster |
|
||||
| `kube-rs` | Direct kube-rs client usage | ✅ | ✅ | — |
|
||||
| `k8s_drain_node` | Drain a Kubernetes node | ✅ | ✅ | — |
|
||||
| `k8s_write_file_on_node` | Write files to K8s nodes | ✅ | ✅ | — |
|
||||
| `harmony_inventory_builder` | Discover hosts via subnet scan | ✅ | — | — |
|
||||
| `cli` | CLI tool with inventory discovery | ✅ | — | — |
|
||||
| `tui` | Terminal UI demonstration | ✅ | — | — |
|
||||
|
||||
## Status Legend
|
||||
|
||||
| Symbol | Meaning |
|
||||
|--------|---------|
|
||||
| ✅ | Works out-of-the-box |
|
||||
| — | Not applicable or requires specific setup |
|
||||
|
||||
## By Category
|
||||
|
||||
### Data Services
|
||||
- **`postgresql`** — Deploy a PostgreSQL cluster via CloudNativePG
|
||||
- **`multisite_postgres`** — Multi-site PostgreSQL with failover
|
||||
- **`public_postgres`** — Public-facing PostgreSQL (⚠️ uses NationTech DNS)
|
||||
|
||||
### Kubernetes Utilities
|
||||
- **`node_health`** — Check node health in a cluster
|
||||
- **`k8s_drain_node`** — Drain and reboot a node
|
||||
- **`k8s_write_file_on_node`** — Write files to nodes
|
||||
- **`validate_ceph_cluster_health`** — Verify Ceph/Rook cluster health
|
||||
- **`remove_rook_osd`** — Remove an OSD from Rook/Ceph
|
||||
- **`kube-rs`** — Direct Kubernetes client usage demo
|
||||
|
||||
### Monitoring & Alerting
|
||||
- **`monitoring`** — Deploy Prometheus alerting with Discord webhooks
|
||||
- **`monitoring_with_tenant`** — Monitoring with tenant isolation
|
||||
- **`ntfy`** — Deploy ntfy notification server
|
||||
- **`okd_cluster_alerts`** — OKD-specific cluster alerts
|
||||
|
||||
### Application Deployment
|
||||
- **`try_rust_webapp`** — Deploy a Rust webapp with packaging (⚠️ requires `tryrust.org` submodule)
|
||||
- **`rust`** — Rust webapp with full monitoring features
|
||||
- **`rhob_application_monitoring`** — Red Hat Observability Stack monitoring
|
||||
- **`lamp`** — LAMP stack deployment (⚠️ uses NationTech DNS)
|
||||
- **`application_monitoring_with_tenant`** — App monitoring with tenant isolation
|
||||
|
||||
### Infrastructure & Bare Metal
|
||||
- **`okd_installation`** — Full OKD cluster from scratch
|
||||
- **`okd_pxe`** — PXE boot configuration for OKD
|
||||
- **`sttest`** — Full OKD stack test with specific hardware
|
||||
- **`brocade_snmp_server`** — Configure Brocade switch via SNMP
|
||||
- **`opnsense_node_exporter`** — Node exporter on OPNsense firewall
|
||||
|
||||
### Multi-Cluster
|
||||
- **`nats`** — NATS deployment on a cluster
|
||||
- **`nats-supercluster`** — NATS supercluster across multiple sites
|
||||
- **`multisite_postgres`** — PostgreSQL with multi-site failover
|
||||
|
||||
### Identity & Secrets
|
||||
- **`openbao`** — Deploy OpenBao vault (⚠️ uses NationTech DNS)
|
||||
- **`zitadel`** — Deploy Zitadel identity provider (⚠️ uses NationTech DNS)
|
||||
|
||||
### Cluster Services
|
||||
- **`cert_manager`** — Provision TLS certificates
|
||||
- **`tenant`** — Create a multi-tenant namespace
|
||||
- **`operatorhub_catalog`** — Install OperatorHub catalog sources
|
||||
|
||||
### Development & Testing
|
||||
- **`cli`** — CLI tool with inventory discovery
|
||||
- **`tui`** — Terminal UI demonstration
|
||||
- **`harmony_inventory_builder`** — Host discovery via subnet scan
|
||||
|
||||
## Running Examples
|
||||
|
||||
```bash
|
||||
# Build first
|
||||
cargo build --release
|
||||
|
||||
# Run any example
|
||||
cargo run -p example-postgresql
|
||||
cargo run -p example-ntfy
|
||||
cargo run -p example-tenant
|
||||
```
|
||||
|
||||
For examples that need an existing Kubernetes cluster:
|
||||
|
||||
```bash
|
||||
export KUBECONFIG=/path/to/your/kubeconfig
|
||||
export HARMONY_USE_LOCAL_K3D=false
|
||||
export HARMONY_AUTOINSTALL=false
|
||||
|
||||
cargo run -p example-monitoring
|
||||
```
|
||||
|
||||
## Notes on Private Infrastructure
|
||||
|
||||
Some examples use NationTech-hosted infrastructure by default (DNS domains like `*.nationtech.io`, `*.harmony.mcd`). These are not suitable for public use without modification. See the [Getting Started Guide](../docs/guides/getting-started.md) for the recommended public examples.
|
||||
@@ -1,16 +0,0 @@
|
||||
[workspace]
|
||||
|
||||
[package]
|
||||
name = "example-cluster-dashboards"
|
||||
edition = "2021"
|
||||
version = "0.1.0"
|
||||
license = "GNU AGPL v3"
|
||||
publish = false
|
||||
|
||||
[dependencies]
|
||||
harmony = { path = "../../harmony" }
|
||||
harmony_cli = { path = "../../harmony_cli" }
|
||||
harmony_types = { path = "../../harmony_types" }
|
||||
tokio = { version = "1.40", features = ["macros", "rt-multi-thread"] }
|
||||
log = "0.4"
|
||||
env_logger = "0.11"
|
||||
@@ -1,21 +0,0 @@
|
||||
use harmony::{
|
||||
inventory::Inventory,
|
||||
modules::monitoring::cluster_dashboards::ClusterDashboardsScore,
|
||||
topology::K8sAnywhereTopology,
|
||||
};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() {
|
||||
harmony_cli::cli_logger::init();
|
||||
|
||||
let cluster_dashboards_score = ClusterDashboardsScore::default();
|
||||
|
||||
harmony_cli::run(
|
||||
Inventory::autoload(),
|
||||
K8sAnywhereTopology::from_env(),
|
||||
vec![Box::new(cluster_dashboards_score)],
|
||||
None,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
@@ -14,7 +14,6 @@ async fn main() {
|
||||
..Default::default() // Use harmony defaults, they are based on CNPG's default values :
|
||||
// "default" namespace, 1 instance, 1Gi storage
|
||||
},
|
||||
hostname: "postgrestest.sto1.nationtech.io".to_string(),
|
||||
};
|
||||
|
||||
harmony_cli::run(
|
||||
|
||||
@@ -15,7 +15,6 @@ async fn main() {
|
||||
..Default::default() // Use harmony defaults, they are based on CNPG's default values :
|
||||
// 1 instance, 1Gi storage
|
||||
},
|
||||
hostname: "postgrestest.sto1.nationtech.io".to_string(),
|
||||
};
|
||||
|
||||
let test_connection = PostgreSQLConnectionScore {
|
||||
|
||||
@@ -52,7 +52,7 @@
|
||||
//! }
|
||||
//! ```
|
||||
|
||||
use kube::{Error, Resource, ResourceExt, api::DynamicObject};
|
||||
use kube::{Error, Resource, ResourceExt, api::DynamicObject, core::ErrorResponse};
|
||||
use serde::Serialize;
|
||||
use serde_json;
|
||||
|
||||
@@ -117,16 +117,13 @@ impl ResourceBundle {
|
||||
/// Delete all resources in this bundle from the cluster.
|
||||
/// Resources are deleted in reverse order to respect dependencies.
|
||||
pub async fn delete(&self, client: &K8sClient) -> Result<(), Error> {
|
||||
// FIXME delete all in parallel and retry using kube::client::retry::RetryPolicy
|
||||
for res in self.resources.iter().rev() {
|
||||
let api = client.get_api_for_dynamic_object(res, res.namespace().as_deref())?;
|
||||
let name = res.name_any();
|
||||
// FIXME this swallows all errors. Swallowing a 404 is ok but other errors must be
|
||||
// handled properly (such as retrying). A normal error case is when we delete a
|
||||
// resource bundle with dependencies between various resources. Such as a pod with a
|
||||
// dependency on a ClusterRoleBinding. Trying to delete the ClusterRoleBinding first
|
||||
// is expected to fail
|
||||
let _ = api.delete(&name, &kube::api::DeleteParams::default()).await;
|
||||
match api.delete(&name, &kube::api::DeleteParams::default()).await {
|
||||
Ok(_) | Err(Error::Api(ErrorResponse { code: 404, .. })) => {}
|
||||
Err(e) => return Err(e),
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -2,13 +2,14 @@ use std::collections::HashMap;
|
||||
|
||||
use k8s_openapi::api::{
|
||||
apps::v1::Deployment,
|
||||
core::v1::{Node, ServiceAccount},
|
||||
core::v1::{Namespace, Node, ServiceAccount},
|
||||
};
|
||||
use k8s_openapi::apiextensions_apiserver::pkg::apis::apiextensions::v1::CustomResourceDefinition;
|
||||
use kube::api::ApiResource;
|
||||
use kube::{
|
||||
Error, Resource,
|
||||
api::{Api, DynamicObject, GroupVersionKind, ListParams, ObjectList},
|
||||
core::ErrorResponse,
|
||||
runtime::conditions,
|
||||
runtime::wait::await_condition,
|
||||
};
|
||||
@@ -313,4 +314,65 @@ impl K8sClient {
|
||||
) -> Result<ObjectList<Node>, Error> {
|
||||
self.list_resources(None, list_params).await
|
||||
}
|
||||
|
||||
pub async fn namespace_exists(&self, name: &str) -> Result<bool, Error> {
|
||||
let api: Api<Namespace> = Api::all(self.client.clone());
|
||||
match api.get_opt(name).await? {
|
||||
Some(_) => Ok(true),
|
||||
None => Ok(false),
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn create_namespace(&self, name: &str) -> Result<Namespace, Error> {
|
||||
let namespace = Namespace {
|
||||
metadata: k8s_openapi::apimachinery::pkg::apis::meta::v1::ObjectMeta {
|
||||
name: Some(name.to_string()),
|
||||
..Default::default()
|
||||
},
|
||||
..Default::default()
|
||||
};
|
||||
let api: Api<Namespace> = Api::all(self.client.clone());
|
||||
api.create(&kube::api::PostParams::default(), &namespace)
|
||||
.await
|
||||
}
|
||||
|
||||
pub async fn wait_for_namespace(
|
||||
&self,
|
||||
name: &str,
|
||||
timeout: Option<Duration>,
|
||||
) -> Result<(), Error> {
|
||||
let api: Api<Namespace> = Api::all(self.client.clone());
|
||||
let timeout = timeout.unwrap_or(Duration::from_secs(60));
|
||||
let start = std::time::Instant::now();
|
||||
|
||||
loop {
|
||||
if start.elapsed() > timeout {
|
||||
return Err(Error::Api(ErrorResponse {
|
||||
status: "Timeout".to_string(),
|
||||
message: format!("Namespace '{}' not ready within timeout", name),
|
||||
reason: "Timeout".to_string(),
|
||||
code: 408,
|
||||
}));
|
||||
}
|
||||
|
||||
match api.get_opt(name).await? {
|
||||
Some(ns) => {
|
||||
if let Some(status) = ns.status {
|
||||
if status.phase == Some("Active".to_string()) {
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
}
|
||||
None => {
|
||||
return Err(Error::Api(ErrorResponse {
|
||||
status: "NotFound".to_string(),
|
||||
message: format!("Namespace '{}' not found", name),
|
||||
reason: "NotFound".to_string(),
|
||||
code: 404,
|
||||
}));
|
||||
}
|
||||
}
|
||||
tokio::time::sleep(Duration::from_millis(500)).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -42,7 +42,7 @@ impl Default for DrainOptions {
|
||||
Self {
|
||||
delete_emptydir_data: false,
|
||||
ignore_daemonsets: true,
|
||||
timeout: Duration::from_secs(1),
|
||||
timeout: Duration::from_secs(120),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -109,6 +109,13 @@ impl K8sclient for K8sAnywhereTopology {
|
||||
|
||||
#[async_trait]
|
||||
impl TlsRouter for K8sAnywhereTopology {
|
||||
async fn get_public_domain(&self) -> Result<String, String> {
|
||||
match &self.config.public_domain {
|
||||
Some(public_domain) => Ok(public_domain.to_string()),
|
||||
None => Err("Public domain not available".to_string()),
|
||||
}
|
||||
}
|
||||
|
||||
async fn get_internal_domain(&self) -> Result<Option<String>, String> {
|
||||
match self.get_k8s_distribution().await.map_err(|e| {
|
||||
format!(
|
||||
@@ -1124,6 +1131,7 @@ pub struct K8sAnywhereConfig {
|
||||
///
|
||||
/// If the context name is not found, it will fail to initialize.
|
||||
pub k8s_context: Option<String>,
|
||||
public_domain: Option<String>,
|
||||
}
|
||||
|
||||
impl K8sAnywhereConfig {
|
||||
@@ -1151,6 +1159,7 @@ impl K8sAnywhereConfig {
|
||||
|
||||
let mut kubeconfig: Option<String> = None;
|
||||
let mut k8s_context: Option<String> = None;
|
||||
let mut public_domain: Option<String> = None;
|
||||
|
||||
for part in env_var_value.split(',') {
|
||||
let kv: Vec<&str> = part.splitn(2, '=').collect();
|
||||
@@ -1158,6 +1167,7 @@ impl K8sAnywhereConfig {
|
||||
match kv[0].trim() {
|
||||
"kubeconfig" => kubeconfig = Some(kv[1].trim().to_string()),
|
||||
"context" => k8s_context = Some(kv[1].trim().to_string()),
|
||||
"public_domain" => public_domain = Some(kv[1].trim().to_string()),
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
@@ -1175,6 +1185,7 @@ impl K8sAnywhereConfig {
|
||||
K8sAnywhereConfig {
|
||||
kubeconfig,
|
||||
k8s_context,
|
||||
public_domain,
|
||||
use_system_kubeconfig,
|
||||
autoinstall: false,
|
||||
use_local_k3d: false,
|
||||
@@ -1217,6 +1228,7 @@ impl K8sAnywhereConfig {
|
||||
use_local_k3d: std::env::var("HARMONY_USE_LOCAL_K3D")
|
||||
.map_or_else(|_| true, |v| v.parse().ok().unwrap_or(true)),
|
||||
k8s_context: std::env::var("HARMONY_K8S_CONTEXT").ok(),
|
||||
public_domain: std::env::var("HARMONY_PUBLIC_DOMAIN").ok(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -122,4 +122,6 @@ pub trait TlsRouter: Send + Sync {
|
||||
|
||||
/// Returns the port that this router exposes externally.
|
||||
async fn get_router_port(&self) -> u16;
|
||||
|
||||
async fn get_public_domain(&self) -> Result<String, String>;
|
||||
}
|
||||
|
||||
@@ -5,7 +5,7 @@ use harmony_types::{
|
||||
net::{IpAddress, MacAddress},
|
||||
switch::{PortDeclaration, PortLocation},
|
||||
};
|
||||
use log::info;
|
||||
use log::{info, warn};
|
||||
use option_ext::OptionExt;
|
||||
|
||||
use crate::{
|
||||
@@ -44,12 +44,16 @@ impl SwitchClient for BrocadeSwitchClient {
|
||||
.await
|
||||
.map_err(|e| SwitchError::new(e.to_string()))?;
|
||||
|
||||
info!("Brocade found stack topology {stack_topology:#?}");
|
||||
|
||||
let interfaces = self
|
||||
.brocade
|
||||
.get_interfaces()
|
||||
.await
|
||||
.map_err(|e| SwitchError::new(e.to_string()))?;
|
||||
|
||||
info!("Brocade found interfaces {interfaces:#?}");
|
||||
|
||||
let interfaces: Vec<(String, PortOperatingMode)> = interfaces
|
||||
.into_iter()
|
||||
.filter(|interface| {
|
||||
@@ -69,9 +73,9 @@ impl SwitchClient for BrocadeSwitchClient {
|
||||
}
|
||||
|
||||
info!("About to configure interfaces {interfaces:?}");
|
||||
// inquire::Confirm::new("Do you wish to configures interfaces now?")
|
||||
// .prompt()
|
||||
// .map_err(|e| SwitchError::new(e.to_string()))?;
|
||||
inquire::Confirm::new("Do you wish to configures interfaces now?")
|
||||
.prompt()
|
||||
.map_err(|e| SwitchError::new(e.to_string()))?;
|
||||
|
||||
self.brocade
|
||||
.configure_interfaces(&interfaces)
|
||||
@@ -113,16 +117,47 @@ impl SwitchClient for BrocadeSwitchClient {
|
||||
channel_name: &str,
|
||||
switch_ports: Vec<PortLocation>,
|
||||
) -> Result<u8, SwitchError> {
|
||||
let channel_id = self
|
||||
let mut channel_id = self
|
||||
.brocade
|
||||
.find_available_channel_id()
|
||||
.await
|
||||
.map_err(|e| SwitchError::new(format!("{e}")))?;
|
||||
|
||||
self.brocade
|
||||
info!("Found next available channel id : {channel_id}");
|
||||
|
||||
loop {
|
||||
match self
|
||||
.brocade
|
||||
.create_port_channel(channel_id, channel_name, &switch_ports)
|
||||
.await
|
||||
.map_err(|e| SwitchError::new(format!("{e}")))?;
|
||||
.map_err(|e| SwitchError::new(format!("{e}")))
|
||||
{
|
||||
Ok(_) => {
|
||||
info!(
|
||||
"Successfully configured port channel {channel_id} {channel_name} for ports {switch_ports:?}"
|
||||
);
|
||||
break;
|
||||
}
|
||||
Err(e) => {
|
||||
warn!(
|
||||
"Could not configure port channel {channel_id} {channel_name} for ports {switch_ports:?}"
|
||||
);
|
||||
let previous_id = channel_id;
|
||||
|
||||
while previous_id == channel_id {
|
||||
channel_id = inquire::Text::new(
|
||||
"Type the port channel number to use (or CTRL+C to exit) :",
|
||||
)
|
||||
.prompt()
|
||||
.map_err(|e| {
|
||||
SwitchError::new(format!("Failed to prompt for channel id : {e}"))
|
||||
})?
|
||||
.parse()
|
||||
.unwrap_or(channel_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(channel_id)
|
||||
}
|
||||
@@ -202,6 +237,7 @@ mod tests {
|
||||
use crate::{infra::brocade::BrocadeSwitchClient, topology::SwitchClient};
|
||||
|
||||
#[tokio::test]
|
||||
#[ignore = "requires interactive TTY for confirmation prompt"]
|
||||
async fn setup_should_configure_ethernet_interfaces_as_access_ports() {
|
||||
let first_interface = given_interface()
|
||||
.with_port_location(PortLocation(1, 0, 1))
|
||||
|
||||
@@ -192,6 +192,9 @@ impl NetworkManager for OpenShiftNmStateNetworkManager {
|
||||
"Writing NetworkManager configuration files to node '{}'...",
|
||||
node_name
|
||||
);
|
||||
|
||||
debug!("Files to write : {files:#?}");
|
||||
|
||||
self.k8s_client
|
||||
.write_files_to_node(&node_name, &files)
|
||||
.await
|
||||
@@ -226,6 +229,15 @@ impl NetworkManager for OpenShiftNmStateNetworkManager {
|
||||
}
|
||||
}
|
||||
|
||||
let reboot_now = "Reboot now";
|
||||
let continue_without_reboot = "Continue process without rebooting";
|
||||
let options = vec![reboot_now, continue_without_reboot];
|
||||
|
||||
let should_reboot_answer = inquire::Select::new("NetworkManager configuration files written, inspect output and node state and confirm to go ahead with reboot", options)
|
||||
.prompt()
|
||||
.map_err(|e| NetworkError::new(format!("Failed to get confirmation from user : {e}")))?;
|
||||
|
||||
if should_reboot_answer == reboot_now {
|
||||
// 4. Reboot the node with full verification
|
||||
// The reboot_node function handles: drain, boot_id capture, reboot, NotReady wait,
|
||||
// Ready wait, boot_id verification, and uncordon
|
||||
@@ -246,6 +258,7 @@ impl NetworkManager for OpenShiftNmStateNetworkManager {
|
||||
.map_err(|e| {
|
||||
NetworkError::new(format!("Failed to reboot node '{}': {}", node_name, e))
|
||||
})?;
|
||||
}
|
||||
|
||||
info!(
|
||||
"Successfully configured bond on primary interface for host '{}' (node '{}')",
|
||||
|
||||
@@ -267,10 +267,16 @@ pub(crate) fn harmony_load_balancer_service_to_haproxy_xml(
|
||||
SSL::Default => "".into(),
|
||||
SSL::Other(other) => other.as_str().into(),
|
||||
};
|
||||
let path_without_query = path.split_once('?').map_or(path.as_str(), |(p, _)| p);
|
||||
let (port, port_name) = match port {
|
||||
Some(port) => (Some(port.to_string()), port.to_string()),
|
||||
None => (None, "serverport".to_string()),
|
||||
};
|
||||
|
||||
let haproxy_check = HAProxyHealthCheck {
|
||||
name: format!("HTTP_{http_method}_{path}"),
|
||||
name: format!("HTTP_{http_method}_{path_without_query}_{port_name}"),
|
||||
uuid: Uuid::new_v4().to_string(),
|
||||
http_method: http_method.to_string().into(),
|
||||
http_method: http_method.to_string().to_lowercase().into(),
|
||||
health_check_type: "http".to_string(),
|
||||
http_uri: path.clone().into(),
|
||||
interval: "2s".to_string(),
|
||||
@@ -314,7 +320,10 @@ pub(crate) fn harmony_load_balancer_service_to_haproxy_xml(
|
||||
let mut backend = HAProxyBackend {
|
||||
uuid: Uuid::new_v4().to_string(),
|
||||
enabled: 1,
|
||||
name: format!("backend_{}", service.listening_port),
|
||||
name: format!(
|
||||
"backend_{}",
|
||||
service.listening_port.to_string().replace(':', "_")
|
||||
),
|
||||
algorithm: "roundrobin".to_string(),
|
||||
random_draws: Some(2),
|
||||
stickiness_expire: "30m".to_string(),
|
||||
@@ -346,10 +355,22 @@ pub(crate) fn harmony_load_balancer_service_to_haproxy_xml(
|
||||
let frontend = Frontend {
|
||||
uuid: uuid::Uuid::new_v4().to_string(),
|
||||
enabled: 1,
|
||||
name: format!("frontend_{}", service.listening_port),
|
||||
name: format!(
|
||||
"frontend_{}",
|
||||
service.listening_port.to_string().replace(':', "_")
|
||||
),
|
||||
bind: service.listening_port.to_string(),
|
||||
mode: "tcp".to_string(), // TODO do not depend on health check here
|
||||
default_backend: Some(backend.uuid.clone()),
|
||||
stickiness_expire: "30m".to_string().into(),
|
||||
stickiness_size: "50k".to_string().into(),
|
||||
stickiness_conn_rate_period: "10s".to_string().into(),
|
||||
stickiness_sess_rate_period: "10s".to_string().into(),
|
||||
stickiness_http_req_rate_period: "10s".to_string().into(),
|
||||
stickiness_http_err_rate_period: "10s".to_string().into(),
|
||||
stickiness_bytes_in_rate_period: "1m".to_string().into(),
|
||||
stickiness_bytes_out_rate_period: "1m".to_string().into(),
|
||||
ssl_hsts_max_age: 15768000,
|
||||
..Default::default()
|
||||
};
|
||||
info!("HAPRoxy frontend and backend mode currently hardcoded to tcp");
|
||||
|
||||
@@ -1,6 +0,0 @@
|
||||
apiVersion: v1
|
||||
kind: Namespace
|
||||
metadata:
|
||||
name: observability
|
||||
labels:
|
||||
openshift.io/cluster-monitoring: "true"
|
||||
@@ -1,43 +0,0 @@
|
||||
apiVersion: v1
|
||||
kind: ServiceAccount
|
||||
metadata:
|
||||
name: cluster-grafana-sa
|
||||
namespace: observability
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRole
|
||||
metadata:
|
||||
name: grafana-prometheus-api-access
|
||||
rules:
|
||||
- apiGroups:
|
||||
- monitoring.coreos.com
|
||||
resources:
|
||||
- prometheuses/api
|
||||
verbs:
|
||||
- get
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRoleBinding
|
||||
metadata:
|
||||
name: grafana-prometheus-api-access-binding
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: ClusterRole
|
||||
name: grafana-prometheus-api-access
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: cluster-grafana-sa
|
||||
namespace: observability
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRoleBinding
|
||||
metadata:
|
||||
name: grafana-cluster-monitoring-view
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: ClusterRole
|
||||
name: cluster-monitoring-view
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: cluster-grafana-sa
|
||||
namespace: observability
|
||||
@@ -1,43 +0,0 @@
|
||||
apiVersion: grafana.integreatly.org/v1beta1
|
||||
kind: Grafana
|
||||
metadata:
|
||||
name: cluster-grafana
|
||||
namespace: observability
|
||||
labels:
|
||||
dashboards: "grafana"
|
||||
spec:
|
||||
serviceAccountName: cluster-grafana-sa
|
||||
automountServiceAccountToken: true
|
||||
|
||||
config:
|
||||
log:
|
||||
mode: console
|
||||
|
||||
security:
|
||||
admin_user: admin
|
||||
admin_password: paul
|
||||
|
||||
users:
|
||||
viewers_can_edit: "false"
|
||||
|
||||
auth:
|
||||
disable_login_form: "false"
|
||||
|
||||
auth.anonymous:
|
||||
enabled: "true"
|
||||
org_role: Viewer
|
||||
|
||||
deployment:
|
||||
spec:
|
||||
replicas: 1
|
||||
template:
|
||||
spec:
|
||||
containers:
|
||||
- name: grafana
|
||||
resources:
|
||||
requests:
|
||||
cpu: 500m
|
||||
memory: 1Gi
|
||||
limits:
|
||||
cpu: 1
|
||||
memory: 2Gi
|
||||
@@ -1,8 +0,0 @@
|
||||
apiVersion: v1
|
||||
kind: Secret
|
||||
metadata:
|
||||
name: grafana-prometheus-token
|
||||
namespace: observability
|
||||
annotations:
|
||||
kubernetes.io/service-account.name: cluster-grafana-sa
|
||||
type: kubernetes.io/service-account-token
|
||||
@@ -1,27 +0,0 @@
|
||||
apiVersion: grafana.integreatly.org/v1beta1
|
||||
kind: GrafanaDatasource
|
||||
metadata:
|
||||
name: prometheus-cluster
|
||||
namespace: observability
|
||||
spec:
|
||||
instanceSelector:
|
||||
matchLabels:
|
||||
dashboards: "grafana"
|
||||
valuesFrom:
|
||||
- targetPath: "secureJsonData.httpHeaderValue1"
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: grafana-prometheus-token
|
||||
key: token
|
||||
datasource:
|
||||
name: Prometheus-Cluster
|
||||
type: prometheus
|
||||
access: proxy
|
||||
url: https://prometheus-k8s.openshift-monitoring.svc:9091
|
||||
isDefault: true
|
||||
jsonData:
|
||||
httpHeaderName1: "Authorization"
|
||||
tlsSkipVerify: true
|
||||
timeInterval: "30s"
|
||||
secureJsonData:
|
||||
httpHeaderValue1: "Bearer ${token}"
|
||||
@@ -1,14 +0,0 @@
|
||||
apiVersion: route.openshift.io/v1
|
||||
kind: Route
|
||||
metadata:
|
||||
name: grafana
|
||||
namespace: observability
|
||||
spec:
|
||||
to:
|
||||
kind: Service
|
||||
name: cluster-grafana-service
|
||||
port:
|
||||
targetPort: 3000
|
||||
tls:
|
||||
termination: edge
|
||||
insecureEdgeTerminationPolicy: Redirect
|
||||
@@ -1,97 +0,0 @@
|
||||
apiVersion: grafana.integreatly.org/v1beta1
|
||||
kind: GrafanaDashboard
|
||||
metadata:
|
||||
name: cluster-overview
|
||||
namespace: observability
|
||||
spec:
|
||||
instanceSelector:
|
||||
matchLabels:
|
||||
dashboards: "grafana"
|
||||
|
||||
json: |
|
||||
{
|
||||
"title": "Cluster Overview",
|
||||
"schemaVersion": 36,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"time": {
|
||||
"from": "now-1h",
|
||||
"to": "now"
|
||||
},
|
||||
"panels": [
|
||||
{
|
||||
"type": "stat",
|
||||
"title": "Ready Nodes",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "Prometheus-Cluster"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"true\"})",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"gridPos": { "h": 6, "w": 6, "x": 0, "y": 0 }
|
||||
},
|
||||
{
|
||||
"type": "stat",
|
||||
"title": "Running Pods",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "Prometheus-Cluster"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(kube_pod_status_phase{phase=\"Running\"})",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"gridPos": { "h": 6, "w": 6, "x": 6, "y": 0 }
|
||||
},
|
||||
{
|
||||
"type": "timeseries",
|
||||
"title": "Cluster CPU Usage (%)",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "Prometheus-Cluster"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 }
|
||||
},
|
||||
{
|
||||
"type": "timeseries",
|
||||
"title": "Cluster Memory Usage (%)",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "Prometheus-Cluster"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (sum(node_memory_MemAvailable_bytes) / sum(node_memory_MemTotal_bytes)))",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 }
|
||||
}
|
||||
]
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,769 +0,0 @@
|
||||
apiVersion: grafana.integreatly.org/v1beta1
|
||||
kind: GrafanaDashboard
|
||||
metadata:
|
||||
name: okd-cluster-overview
|
||||
namespace: observability
|
||||
spec:
|
||||
instanceSelector:
|
||||
matchLabels:
|
||||
dashboards: "grafana"
|
||||
json: |
|
||||
{
|
||||
"title": "Cluster Overview",
|
||||
"uid": "okd-cluster-overview",
|
||||
"schemaVersion": 36,
|
||||
"version": 2,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"tags": ["okd", "cluster", "overview"],
|
||||
"panels": [
|
||||
|
||||
{
|
||||
"id": 1,
|
||||
"type": "stat",
|
||||
"title": "Ready Nodes",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"true\"} == 1)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "green", "value": 1 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 2,
|
||||
"type": "stat",
|
||||
"title": "Not Ready Nodes",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"false\"} == 1) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 3,
|
||||
"type": "stat",
|
||||
"title": "Running Pods",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(kube_pod_status_phase{phase=\"Running\"} == 1)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "green", "value": 1 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 4,
|
||||
"type": "stat",
|
||||
"title": "Pending Pods",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(kube_pod_status_phase{phase=\"Pending\"} == 1) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 5 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 5,
|
||||
"type": "stat",
|
||||
"title": "Failed Pods",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(kube_pod_status_phase{phase=\"Failed\"} == 1) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 6,
|
||||
"type": "stat",
|
||||
"title": "CrashLoopBackOff",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\"} == 1) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 7,
|
||||
"type": "stat",
|
||||
"title": "Critical Alerts",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(ALERTS{alertstate=\"firing\",severity=\"critical\"}) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 8,
|
||||
"type": "stat",
|
||||
"title": "Warning Alerts",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(ALERTS{alertstate=\"firing\",severity=\"warning\"}) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 10 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 9,
|
||||
"type": "gauge",
|
||||
"title": "CPU Usage",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))",
|
||||
"refId": "A",
|
||||
"legendFormat": "CPU"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 70 },
|
||||
{ "color": "red", "value": 85 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"showThresholdLabels": false,
|
||||
"showThresholdMarkers": true,
|
||||
"orientation": "auto"
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 5, "x": 0, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 10,
|
||||
"type": "gauge",
|
||||
"title": "Memory Usage",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (sum(node_memory_MemAvailable_bytes) / sum(node_memory_MemTotal_bytes)))",
|
||||
"refId": "A",
|
||||
"legendFormat": "Memory"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 75 },
|
||||
{ "color": "red", "value": 90 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"showThresholdLabels": false,
|
||||
"showThresholdMarkers": true,
|
||||
"orientation": "auto"
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 5, "x": 5, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 11,
|
||||
"type": "gauge",
|
||||
"title": "Root Disk Usage",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (sum(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"})))",
|
||||
"refId": "A",
|
||||
"legendFormat": "Disk"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 70 },
|
||||
{ "color": "red", "value": 85 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"showThresholdLabels": false,
|
||||
"showThresholdMarkers": true,
|
||||
"orientation": "auto"
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 4, "x": 10, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 12,
|
||||
"type": "stat",
|
||||
"title": "etcd Has Leader",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "min(etcd_server_has_leader)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "green", "value": 1 }
|
||||
]
|
||||
},
|
||||
"mappings": [
|
||||
{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"0": { "text": "NO LEADER", "color": "red" },
|
||||
"1": { "text": "LEADER OK", "color": "green" }
|
||||
}
|
||||
}
|
||||
],
|
||||
"unit": "short",
|
||||
"noValue": "?"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 3, "w": 5, "x": 14, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 13,
|
||||
"type": "stat",
|
||||
"title": "API Servers Up",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(up{job=\"apiserver\"})",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "green", "value": 2 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 3, "w": 5, "x": 19, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 14,
|
||||
"type": "stat",
|
||||
"title": "etcd Members Up",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(up{job=\"etcd\"})",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 2 },
|
||||
{ "color": "green", "value": 3 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 3, "w": 5, "x": 14, "y": 7 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 15,
|
||||
"type": "stat",
|
||||
"title": "Operators Degraded",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(cluster_operator_conditions{condition=\"Degraded\",status=\"True\"} == 1) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]
|
||||
},
|
||||
"unit": "short",
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 3, "w": 5, "x": 19, "y": 7 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 16,
|
||||
"type": "timeseries",
|
||||
"title": "CPU Usage per Node (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 10,
|
||||
"spanNulls": false,
|
||||
"showPoints": "never"
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": {
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"calcs": ["mean", "max"]
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 10 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 17,
|
||||
"type": "timeseries",
|
||||
"title": "Memory Usage per Node (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 10,
|
||||
"spanNulls": false,
|
||||
"showPoints": "never"
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": {
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"calcs": ["mean", "max"]
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 10 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 18,
|
||||
"type": "timeseries",
|
||||
"title": "Network Traffic — Cluster Total",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo|veth.*|tun.*|ovn.*|br-int|br-ex\"}[5m]))",
|
||||
"refId": "A",
|
||||
"legendFormat": "Receive"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo|veth.*|tun.*|ovn.*|br-int|br-ex\"}[5m]))",
|
||||
"refId": "B",
|
||||
"legendFormat": "Transmit"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps",
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 10,
|
||||
"spanNulls": false,
|
||||
"showPoints": "never"
|
||||
}
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Receive" },
|
||||
"properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Transmit" },
|
||||
"properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "none" },
|
||||
"legend": {
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"calcs": ["mean", "max"]
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 18 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 19,
|
||||
"type": "timeseries",
|
||||
"title": "Pod Phases Over Time",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(kube_pod_status_phase{phase=\"Running\"} == 1)",
|
||||
"refId": "A",
|
||||
"legendFormat": "Running"
|
||||
},
|
||||
{
|
||||
"expr": "count(kube_pod_status_phase{phase=\"Pending\"} == 1) or vector(0)",
|
||||
"refId": "B",
|
||||
"legendFormat": "Pending"
|
||||
},
|
||||
{
|
||||
"expr": "count(kube_pod_status_phase{phase=\"Failed\"} == 1) or vector(0)",
|
||||
"refId": "C",
|
||||
"legendFormat": "Failed"
|
||||
},
|
||||
{
|
||||
"expr": "count(kube_pod_status_phase{phase=\"Unknown\"} == 1) or vector(0)",
|
||||
"refId": "D",
|
||||
"legendFormat": "Unknown"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short",
|
||||
"custom": {
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 15,
|
||||
"spanNulls": false,
|
||||
"showPoints": "never"
|
||||
}
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Running" },
|
||||
"properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Pending" },
|
||||
"properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Failed" },
|
||||
"properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Unknown" },
|
||||
"properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "none" },
|
||||
"legend": {
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"calcs": ["lastNotNull"]
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 18 }
|
||||
}
|
||||
|
||||
]
|
||||
}
|
||||
@@ -1,637 +0,0 @@
|
||||
apiVersion: grafana.integreatly.org/v1beta1
|
||||
kind: GrafanaDashboard
|
||||
metadata:
|
||||
name: okd-node-health
|
||||
namespace: observability
|
||||
spec:
|
||||
instanceSelector:
|
||||
matchLabels:
|
||||
dashboards: "grafana"
|
||||
json: |
|
||||
{
|
||||
"title": "Node Health",
|
||||
"uid": "okd-node-health",
|
||||
"schemaVersion": 36,
|
||||
"version": 2,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"tags": ["okd", "node", "health"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "node",
|
||||
"type": "query",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"query": { "query": "label_values(kube_node_info, node)", "refId": "A" },
|
||||
"refresh": 2,
|
||||
"includeAll": true,
|
||||
"multi": true,
|
||||
"allValue": ".*",
|
||||
"label": "Node",
|
||||
"sort": 1,
|
||||
"current": {},
|
||||
"options": []
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
|
||||
{
|
||||
"id": 1,
|
||||
"type": "stat",
|
||||
"title": "Total Nodes",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_node_info{node=~\"$node\"})", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 2,
|
||||
"type": "stat",
|
||||
"title": "Ready Nodes",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"$node\"} == 1)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 3,
|
||||
"type": "stat",
|
||||
"title": "Not Ready Nodes",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"false\",node=~\"$node\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 4,
|
||||
"type": "stat",
|
||||
"title": "Memory Pressure",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"MemoryPressure\",status=\"true\",node=~\"$node\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 5,
|
||||
"type": "stat",
|
||||
"title": "Disk Pressure",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"DiskPressure\",status=\"true\",node=~\"$node\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 6,
|
||||
"type": "stat",
|
||||
"title": "PID Pressure",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"PIDPressure\",status=\"true\",node=~\"$node\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "orange", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 7,
|
||||
"type": "stat",
|
||||
"title": "Unschedulable",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_node_spec_unschedulable{node=~\"$node\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 8,
|
||||
"type": "stat",
|
||||
"title": "Kubelet Up",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(up{job=\"kubelet\",metrics_path=\"/metrics\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 9,
|
||||
"type": "table",
|
||||
"title": "Node Conditions",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(node) (kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"$node\"})",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{node}}",
|
||||
"instant": true
|
||||
},
|
||||
{
|
||||
"expr": "sum by(node) (kube_node_status_condition{condition=\"MemoryPressure\",status=\"true\",node=~\"$node\"})",
|
||||
"refId": "B",
|
||||
"legendFormat": "{{node}}",
|
||||
"instant": true
|
||||
},
|
||||
{
|
||||
"expr": "sum by(node) (kube_node_status_condition{condition=\"DiskPressure\",status=\"true\",node=~\"$node\"})",
|
||||
"refId": "C",
|
||||
"legendFormat": "{{node}}",
|
||||
"instant": true
|
||||
},
|
||||
{
|
||||
"expr": "sum by(node) (kube_node_status_condition{condition=\"PIDPressure\",status=\"true\",node=~\"$node\"})",
|
||||
"refId": "D",
|
||||
"legendFormat": "{{node}}",
|
||||
"instant": true
|
||||
},
|
||||
{
|
||||
"expr": "sum by(node) (kube_node_spec_unschedulable{node=~\"$node\"})",
|
||||
"refId": "E",
|
||||
"legendFormat": "{{node}}",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "labelsToFields",
|
||||
"options": { "mode": "columns" }
|
||||
},
|
||||
{
|
||||
"id": "joinByField",
|
||||
"options": { "byField": "node", "mode": "outer" }
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"Time": true,
|
||||
"Time 1": true,
|
||||
"Time 2": true,
|
||||
"Time 3": true,
|
||||
"Time 4": true,
|
||||
"Time 5": true
|
||||
},
|
||||
"renameByName": {
|
||||
"node": "Node",
|
||||
"Value #A": "Ready",
|
||||
"Value #B": "Mem Pressure",
|
||||
"Value #C": "Disk Pressure",
|
||||
"Value #D": "PID Pressure",
|
||||
"Value #E": "Unschedulable"
|
||||
},
|
||||
"indexByName": {
|
||||
"node": 0,
|
||||
"Value #A": 1,
|
||||
"Value #B": 2,
|
||||
"Value #C": 3,
|
||||
"Value #D": 4,
|
||||
"Value #E": 5
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": { "displayMode": "color-background", "align": "center" }
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Node" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "auto" },
|
||||
{ "id": "custom.align", "value": "left" },
|
||||
{ "id": "custom.width", "value": 200 }
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Ready" },
|
||||
"properties": [
|
||||
{
|
||||
"id": "thresholds",
|
||||
"value": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] }
|
||||
},
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{
|
||||
"id": "mappings",
|
||||
"value": [
|
||||
{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"0": { "text": "✗ Not Ready", "color": "red", "index": 0 },
|
||||
"1": { "text": "✓ Ready", "color": "green", "index": 1 }
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byRegexp", "options": ".*Pressure" },
|
||||
"properties": [
|
||||
{
|
||||
"id": "thresholds",
|
||||
"value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] }
|
||||
},
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{
|
||||
"id": "mappings",
|
||||
"value": [
|
||||
{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"0": { "text": "✓ OK", "color": "green", "index": 0 },
|
||||
"1": { "text": "⚠ Active", "color": "red", "index": 1 }
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Unschedulable" },
|
||||
"properties": [
|
||||
{
|
||||
"id": "thresholds",
|
||||
"value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 1 }] }
|
||||
},
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{
|
||||
"id": "mappings",
|
||||
"value": [
|
||||
{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"0": { "text": "✓ Schedulable", "color": "green", "index": 0 },
|
||||
"1": { "text": "⚠ Cordoned", "color": "yellow", "index": 1 }
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": { "sortBy": [{ "displayName": "Node", "desc": false }] },
|
||||
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 10,
|
||||
"type": "timeseries",
|
||||
"title": "CPU Usage per Node (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 100,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 85 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 16, "x": 0, "y": 12 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 11,
|
||||
"type": "bargauge",
|
||||
"title": "CPU Usage \u2014 Current",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 85 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true,
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 12 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 12,
|
||||
"type": "timeseries",
|
||||
"title": "Memory Usage per Node (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 100,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 75 }, { "color": "red", "value": 90 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 16, "x": 0, "y": 20 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 13,
|
||||
"type": "bargauge",
|
||||
"title": "Memory Usage \u2014 Current",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 75 }, { "color": "red", "value": 90 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true,
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 20 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 14,
|
||||
"type": "timeseries",
|
||||
"title": "Root Disk Usage per Node (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"}))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 100,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 85 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 16, "x": 0, "y": 28 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 15,
|
||||
"type": "bargauge",
|
||||
"title": "Root Disk Usage \u2014 Current",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"}))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 85 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true,
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 28 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 16,
|
||||
"type": "timeseries",
|
||||
"title": "Network Traffic per Node",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(instance) (rate(node_network_receive_bytes_total{device!~\"lo|veth.*|tun.*|ovn.*|br.*\"}[5m]))",
|
||||
"refId": "A",
|
||||
"legendFormat": "rx {{instance}}"
|
||||
},
|
||||
{
|
||||
"expr": "sum by(instance) (rate(node_network_transmit_bytes_total{device!~\"lo|veth.*|tun.*|ovn.*|br.*\"}[5m]))",
|
||||
"refId": "B",
|
||||
"legendFormat": "tx {{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps",
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 36 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 17,
|
||||
"type": "bargauge",
|
||||
"title": "Pods per Node",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count by(node) (kube_pod_info{node=~\"$node\"})",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{node}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short",
|
||||
"min": 0,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 100 },
|
||||
{ "color": "red", "value": 200 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true,
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 36 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 18,
|
||||
"type": "timeseries",
|
||||
"title": "System Load Average (1m) per Node",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "node_load1",
|
||||
"refId": "A",
|
||||
"legendFormat": "1m \u2014 {{instance}}"
|
||||
},
|
||||
{
|
||||
"expr": "node_load5",
|
||||
"refId": "B",
|
||||
"legendFormat": "5m \u2014 {{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short",
|
||||
"min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 44 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 19,
|
||||
"type": "bargauge",
|
||||
"title": "Node Uptime",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "time() - node_boot_time_seconds",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s",
|
||||
"min": 0,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 300 },
|
||||
{ "color": "green", "value": 3600 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": false,
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 44 }
|
||||
}
|
||||
|
||||
]
|
||||
}
|
||||
@@ -1,783 +0,0 @@
|
||||
apiVersion: grafana.integreatly.org/v1beta1
|
||||
kind: GrafanaDashboard
|
||||
metadata:
|
||||
name: okd-workload-health
|
||||
namespace: observability
|
||||
spec:
|
||||
instanceSelector:
|
||||
matchLabels:
|
||||
dashboards: "grafana"
|
||||
json: |
|
||||
{
|
||||
"title": "Workload Health",
|
||||
"uid": "okd-workload-health",
|
||||
"schemaVersion": 36,
|
||||
"version": 3,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"tags": ["okd", "workload", "health"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "namespace",
|
||||
"type": "query",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"query": { "query": "label_values(kube_pod_info, namespace)", "refId": "A" },
|
||||
"refresh": 2,
|
||||
"includeAll": true,
|
||||
"multi": true,
|
||||
"allValue": ".*",
|
||||
"label": "Namespace",
|
||||
"sort": 1,
|
||||
"current": {},
|
||||
"options": []
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
|
||||
{
|
||||
"id": 1, "type": "stat", "title": "Total Pods",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_pod_info{namespace=~\"$namespace\"})", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 2, "type": "stat", "title": "Running Pods",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_pod_status_phase{phase=\"Running\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 3, "type": "stat", "title": "Pending Pods",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_pod_status_phase{phase=\"Pending\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 4, "type": "stat", "title": "Failed Pods",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_pod_status_phase{phase=\"Failed\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 5, "type": "stat", "title": "CrashLoopBackOff",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 6, "type": "stat", "title": "OOMKilled",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_pod_container_status_last_terminated_reason{reason=\"OOMKilled\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "orange", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 7, "type": "stat", "title": "Deployments Available",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_deployment_status_condition{condition=\"Available\",status=\"true\",namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 8, "type": "stat", "title": "Deployments Degraded",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_deployment_status_replicas_unavailable{namespace=~\"$namespace\"} > 0) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 9, "type": "row", "title": "Deployments", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 10,
|
||||
"type": "table",
|
||||
"title": "Deployment Status",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace,deployment)(kube_deployment_spec_replicas{namespace=~\"$namespace\"})",
|
||||
"refId": "A",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,deployment)(kube_deployment_status_replicas_ready{namespace=~\"$namespace\"})",
|
||||
"refId": "B",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,deployment)(kube_deployment_status_replicas_available{namespace=~\"$namespace\"})",
|
||||
"refId": "C",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,deployment)(kube_deployment_status_replicas_unavailable{namespace=~\"$namespace\"})",
|
||||
"refId": "D",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,deployment)(kube_deployment_status_replicas_updated{namespace=~\"$namespace\"})",
|
||||
"refId": "E",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "filterFieldsByName",
|
||||
"options": {
|
||||
"include": {
|
||||
"names": ["namespace", "deployment", "Value"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "joinByField",
|
||||
"options": {
|
||||
"byField": "deployment",
|
||||
"mode": "outer"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"namespace 1": true,
|
||||
"namespace 2": true,
|
||||
"namespace 3": true,
|
||||
"namespace 4": true
|
||||
},
|
||||
"renameByName": {
|
||||
"namespace": "Namespace",
|
||||
"deployment": "Deployment",
|
||||
"Value": "Desired",
|
||||
"Value 1": "Ready",
|
||||
"Value 2": "Available",
|
||||
"Value 3": "Unavailable",
|
||||
"Value 4": "Up-to-date"
|
||||
},
|
||||
"indexByName": {
|
||||
"namespace": 0,
|
||||
"deployment": 1,
|
||||
"Value": 2,
|
||||
"Value 1": 3,
|
||||
"Value 2": 4,
|
||||
"Value 3": 5,
|
||||
"Value 4": 6
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "sortBy",
|
||||
"options": {
|
||||
"fields": [{ "displayName": "Namespace", "desc": false }]
|
||||
}
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Namespace" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 200 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Deployment" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 220 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Unavailable" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{
|
||||
"id": "thresholds",
|
||||
"value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] }
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Ready" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{
|
||||
"id": "thresholds",
|
||||
"value": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] }
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": { "sortBy": [{ "displayName": "Namespace", "desc": false }] },
|
||||
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 11, "type": "row", "title": "StatefulSets & DaemonSets", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 12,
|
||||
"type": "table",
|
||||
"title": "StatefulSet Status",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace,statefulset)(kube_statefulset_replicas{namespace=~\"$namespace\"})",
|
||||
"refId": "A",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,statefulset)(kube_statefulset_status_replicas_ready{namespace=~\"$namespace\"})",
|
||||
"refId": "B",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,statefulset)(kube_statefulset_status_replicas_current{namespace=~\"$namespace\"})",
|
||||
"refId": "C",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,statefulset)(kube_statefulset_status_replicas_updated{namespace=~\"$namespace\"})",
|
||||
"refId": "D",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "filterFieldsByName",
|
||||
"options": {
|
||||
"include": {
|
||||
"names": ["namespace", "statefulset", "Value"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "joinByField",
|
||||
"options": {
|
||||
"byField": "statefulset",
|
||||
"mode": "outer"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"namespace 1": true,
|
||||
"namespace 2": true,
|
||||
"namespace 3": true
|
||||
},
|
||||
"renameByName": {
|
||||
"namespace": "Namespace",
|
||||
"statefulset": "StatefulSet",
|
||||
"Value": "Desired",
|
||||
"Value 1": "Ready",
|
||||
"Value 2": "Current",
|
||||
"Value 3": "Up-to-date"
|
||||
},
|
||||
"indexByName": {
|
||||
"namespace": 0,
|
||||
"statefulset": 1,
|
||||
"Value": 2,
|
||||
"Value 1": 3,
|
||||
"Value 2": 4,
|
||||
"Value 3": 5
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "sortBy",
|
||||
"options": { "fields": [{ "displayName": "Namespace", "desc": false }] }
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Namespace" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 180 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "StatefulSet" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 200 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Ready" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] } }
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 14 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 13,
|
||||
"type": "table",
|
||||
"title": "DaemonSet Status",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace,daemonset)(kube_daemonset_status_desired_number_scheduled{namespace=~\"$namespace\"})",
|
||||
"refId": "A",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,daemonset)(kube_daemonset_status_number_ready{namespace=~\"$namespace\"})",
|
||||
"refId": "B",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,daemonset)(kube_daemonset_status_number_unavailable{namespace=~\"$namespace\"})",
|
||||
"refId": "C",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,daemonset)(kube_daemonset_status_number_misscheduled{namespace=~\"$namespace\"})",
|
||||
"refId": "D",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "filterFieldsByName",
|
||||
"options": {
|
||||
"include": {
|
||||
"names": ["namespace", "daemonset", "Value"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "joinByField",
|
||||
"options": {
|
||||
"byField": "daemonset",
|
||||
"mode": "outer"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"namespace 1": true,
|
||||
"namespace 2": true,
|
||||
"namespace 3": true
|
||||
},
|
||||
"renameByName": {
|
||||
"namespace": "Namespace",
|
||||
"daemonset": "DaemonSet",
|
||||
"Value": "Desired",
|
||||
"Value 1": "Ready",
|
||||
"Value 2": "Unavailable",
|
||||
"Value 3": "Misscheduled"
|
||||
},
|
||||
"indexByName": {
|
||||
"namespace": 0,
|
||||
"daemonset": 1,
|
||||
"Value": 2,
|
||||
"Value 1": 3,
|
||||
"Value 2": 4,
|
||||
"Value 3": 5
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "sortBy",
|
||||
"options": { "fields": [{ "displayName": "Namespace", "desc": false }] }
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Namespace" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 180 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "DaemonSet" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 200 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Ready" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] } }
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Unavailable" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] } }
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Misscheduled" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "orange", "value": 1 }] } }
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 14 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 14, "type": "row", "title": "Pods", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 22 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 15,
|
||||
"type": "timeseries",
|
||||
"title": "Pod Phase over Time",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(phase)(kube_pod_status_phase{namespace=~\"$namespace\"})",
|
||||
"refId": "A", "legendFormat": "{{phase}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "Running" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Pending" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Failed" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Succeeded" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Unknown" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 16, "x": 0, "y": 23 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 16,
|
||||
"type": "piechart",
|
||||
"title": "Pod Phase — Now",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(phase)(kube_pod_status_phase{namespace=~\"$namespace\"})",
|
||||
"refId": "A", "instant": true, "legendFormat": "{{phase}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "unit": "short", "color": { "mode": "palette-classic" } },
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "Running" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Pending" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Failed" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Succeeded" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Unknown" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"pieType": "donut",
|
||||
"tooltip": { "mode": "single" },
|
||||
"legend": { "displayMode": "table", "placement": "right", "values": ["value", "percent"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 23 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 17,
|
||||
"type": "timeseries",
|
||||
"title": "Container Restarts over Time (total counter, top 10)",
|
||||
"description": "Absolute restart counter — each vertical step = a restart event. Flat line = healthy.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "topk(10,\n sum by(namespace, pod) (\n kube_pod_container_status_restarts_total{namespace=~\"$namespace\"}\n ) > 0\n)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}} / {{pod}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "auto", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 31 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 18,
|
||||
"type": "table",
|
||||
"title": "Container Total Restarts (non-zero)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace, pod, container) (kube_pod_container_status_restarts_total{namespace=~\"$namespace\"}) > 0",
|
||||
"refId": "A",
|
||||
"instant": true,
|
||||
"format": "table",
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "filterFieldsByName",
|
||||
"options": {
|
||||
"include": { "names": ["namespace", "pod", "container", "Value"] }
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {},
|
||||
"renameByName": {
|
||||
"namespace": "Namespace",
|
||||
"pod": "Pod",
|
||||
"container": "Container",
|
||||
"Value": "Total Restarts"
|
||||
},
|
||||
"indexByName": { "namespace": 0, "pod": 1, "container": 2, "Value": 3 }
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "sortBy",
|
||||
"options": { "fields": [{ "displayName": "Total Restarts", "desc": true }] }
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "Namespace" }, "properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 160 }] },
|
||||
{ "matcher": { "id": "byName", "options": "Pod" }, "properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 260 }] },
|
||||
{ "matcher": { "id": "byName", "options": "Container" }, "properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 160 }] },
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Total Restarts" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "yellow", "value": null }, { "color": "orange", "value": 5 }, { "color": "red", "value": 20 }] } }
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 31 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 19, "type": "row", "title": "Resource Usage", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 39 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 20,
|
||||
"type": "timeseries",
|
||||
"title": "CPU Usage by Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace)(rate(container_cpu_usage_seconds_total{namespace=~\"$namespace\",container!=\"\",container!=\"POD\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "cores", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 40 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 21,
|
||||
"type": "timeseries",
|
||||
"title": "Memory Usage by Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace)(container_memory_working_set_bytes{namespace=~\"$namespace\",container!=\"\",container!=\"POD\"})",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 40 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 22,
|
||||
"type": "bargauge",
|
||||
"title": "CPU — Actual vs Requested (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace)(rate(container_cpu_usage_seconds_total{namespace=~\"$namespace\",container!=\"\",container!=\"POD\"}[5m]))\n/\nsum by(namespace)(kube_pod_container_resource_requests{resource=\"cpu\",namespace=~\"$namespace\",container!=\"\"})\n* 100",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 150,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 80 }, { "color": "red", "value": 100 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal", "displayMode": "gradient", "showUnfilled": true,
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 48 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 23,
|
||||
"type": "bargauge",
|
||||
"title": "Memory — Actual vs Requested (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace)(container_memory_working_set_bytes{namespace=~\"$namespace\",container!=\"\",container!=\"POD\"})\n/\nsum by(namespace)(kube_pod_container_resource_requests{resource=\"memory\",namespace=~\"$namespace\",container!=\"\"})\n* 100",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 150,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 80 }, { "color": "red", "value": 100 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal", "displayMode": "gradient", "showUnfilled": true,
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 48 }
|
||||
}
|
||||
|
||||
]
|
||||
}
|
||||
@@ -1,955 +0,0 @@
|
||||
apiVersion: grafana.integreatly.org/v1beta1
|
||||
kind: GrafanaDashboard
|
||||
metadata:
|
||||
name: okd-networking
|
||||
namespace: observability
|
||||
spec:
|
||||
instanceSelector:
|
||||
matchLabels:
|
||||
dashboards: "grafana"
|
||||
json: |
|
||||
{
|
||||
"title": "Networking",
|
||||
"uid": "okd-networking",
|
||||
"schemaVersion": 36,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"tags": ["okd", "networking"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "namespace",
|
||||
"type": "query",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"query": { "query": "label_values(kube_pod_info, namespace)", "refId": "A" },
|
||||
"refresh": 2,
|
||||
"includeAll": true,
|
||||
"multi": true,
|
||||
"allValue": ".*",
|
||||
"label": "Namespace",
|
||||
"sort": 1,
|
||||
"current": {},
|
||||
"options": []
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
|
||||
{
|
||||
"id": 1, "type": "stat", "title": "Network RX Rate",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(container_network_receive_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] },
|
||||
"unit": "Bps", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 2, "type": "stat", "title": "Network TX Rate",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(container_network_transmit_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] },
|
||||
"unit": "Bps", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 3, "type": "stat", "title": "RX Errors/s",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(container_network_receive_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "pps", "noValue": "0", "decimals": 2
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 4, "type": "stat", "title": "TX Errors/s",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(container_network_transmit_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "pps", "noValue": "0", "decimals": 2
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 5, "type": "stat", "title": "RX Drops/s",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(container_network_receive_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "orange", "value": 1 }] },
|
||||
"unit": "pps", "noValue": "0", "decimals": 2
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 6, "type": "stat", "title": "TX Drops/s",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(container_network_transmit_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "orange", "value": 1 }] },
|
||||
"unit": "pps", "noValue": "0", "decimals": 2
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 7, "type": "stat", "title": "DNS Queries/s",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(coredns_dns_requests_total[5m]))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] },
|
||||
"unit": "reqps", "noValue": "0", "decimals": 1
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 8, "type": "stat", "title": "DNS Error %",
|
||||
"description": "Percentage of DNS responses with non-NOERROR rcode over the last 5 minutes.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(coredns_dns_responses_total{rcode!=\"NOERROR\"}[5m])) / sum(rate(coredns_dns_responses_total[5m])) * 100",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 5 }
|
||||
]},
|
||||
"unit": "percent", "noValue": "0", "decimals": 2
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 9, "type": "row", "title": "Network I/O", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 10, "type": "timeseries", "title": "Receive Rate by Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(namespace)(rate(container_network_receive_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 11, "type": "timeseries", "title": "Transmit Rate by Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(namespace)(rate(container_network_transmit_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 12, "type": "row", "title": "Top Pod Consumers", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 13, "type": "timeseries", "title": "Top 10 Pods — RX Rate",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "topk(10, sum by(namespace,pod)(rate(container_network_receive_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m])))",
|
||||
"refId": "A", "legendFormat": "{{namespace}} / {{pod}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "auto", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 14 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 14, "type": "timeseries", "title": "Top 10 Pods — TX Rate",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "topk(10, sum by(namespace,pod)(rate(container_network_transmit_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m])))",
|
||||
"refId": "A", "legendFormat": "{{namespace}} / {{pod}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "auto", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 14 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 15,
|
||||
"type": "table",
|
||||
"title": "Pod Network I/O Summary",
|
||||
"description": "Current RX/TX rates, errors and drops per pod. Sorted by RX rate descending.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace,pod)(rate(container_network_receive_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "instant": true, "format": "table", "legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,pod)(rate(container_network_transmit_bytes_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "B", "instant": true, "format": "table", "legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,pod)(rate(container_network_receive_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "C", "instant": true, "format": "table", "legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,pod)(rate(container_network_transmit_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "D", "instant": true, "format": "table", "legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,pod)(rate(container_network_receive_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "E", "instant": true, "format": "table", "legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,pod)(rate(container_network_transmit_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "F", "instant": true, "format": "table", "legendFormat": ""
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "filterFieldsByName",
|
||||
"options": { "include": { "names": ["namespace", "pod", "Value"] } }
|
||||
},
|
||||
{
|
||||
"id": "joinByField",
|
||||
"options": { "byField": "pod", "mode": "outer" }
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"namespace 1": true,
|
||||
"namespace 2": true,
|
||||
"namespace 3": true,
|
||||
"namespace 4": true,
|
||||
"namespace 5": true
|
||||
},
|
||||
"renameByName": {
|
||||
"namespace": "Namespace",
|
||||
"pod": "Pod",
|
||||
"Value": "RX Rate",
|
||||
"Value 1": "TX Rate",
|
||||
"Value 2": "RX Errors/s",
|
||||
"Value 3": "TX Errors/s",
|
||||
"Value 4": "RX Drops/s",
|
||||
"Value 5": "TX Drops/s"
|
||||
},
|
||||
"indexByName": {
|
||||
"namespace": 0,
|
||||
"pod": 1,
|
||||
"Value": 2,
|
||||
"Value 1": 3,
|
||||
"Value 2": 4,
|
||||
"Value 3": 5,
|
||||
"Value 4": 6,
|
||||
"Value 5": 7
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "sortBy",
|
||||
"options": { "fields": [{ "displayName": "RX Rate", "desc": true }] }
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Namespace" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 160 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Pod" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 260 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byRegexp", "options": "^RX Rate$|^TX Rate$" },
|
||||
"properties": [
|
||||
{ "id": "unit", "value": "Bps" },
|
||||
{ "id": "custom.displayMode", "value": "color-background-solid" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 10000000 },
|
||||
{ "color": "orange", "value": 100000000 },
|
||||
{ "color": "red", "value": 500000000 }
|
||||
]}}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byRegexp", "options": "^RX Errors/s$|^TX Errors/s$" },
|
||||
"properties": [
|
||||
{ "id": "unit", "value": "pps" },
|
||||
{ "id": "decimals", "value": 3 },
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 0.001 }
|
||||
]}}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byRegexp", "options": "^RX Drops/s$|^TX Drops/s$" },
|
||||
"properties": [
|
||||
{ "id": "unit", "value": "pps" },
|
||||
{ "id": "decimals", "value": 3 },
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "orange", "value": 0.001 }
|
||||
]}}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {},
|
||||
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 22 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 16, "type": "row", "title": "Errors & Packet Loss", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 30 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 17, "type": "timeseries", "title": "RX Errors by Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(namespace)(rate(container_network_receive_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "pps", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 12, "x": 0, "y": 31 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 18, "type": "timeseries", "title": "TX Errors by Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(namespace)(rate(container_network_transmit_errors_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "pps", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 12, "x": 12, "y": 31 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 19, "type": "timeseries", "title": "RX Packet Drops by Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(namespace)(rate(container_network_receive_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "pps", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 12, "x": 0, "y": 38 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 20, "type": "timeseries", "title": "TX Packet Drops by Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(namespace)(rate(container_network_transmit_packets_dropped_total{namespace=~\"$namespace\",pod!=\"\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "pps", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 12, "x": 12, "y": 38 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 21, "type": "row", "title": "DNS (CoreDNS)", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 45 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 22, "type": "timeseries", "title": "DNS Request Rate by Query Type",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(type)(rate(coredns_dns_requests_total[5m]))",
|
||||
"refId": "A", "legendFormat": "{{type}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 46 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 23, "type": "timeseries", "title": "DNS Response Rate by Rcode",
|
||||
"description": "NOERROR = healthy. NXDOMAIN = name not found. SERVFAIL = upstream error.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(rcode)(rate(coredns_dns_responses_total[5m]))",
|
||||
"refId": "A", "legendFormat": "{{rcode}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "NOERROR" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "NXDOMAIN" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "SERVFAIL" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "REFUSED" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 46 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 24, "type": "timeseries", "title": "DNS Request Latency (p50 / p95 / p99)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.50, sum(rate(coredns_dns_request_duration_seconds_bucket[5m])) by (le))",
|
||||
"refId": "A", "legendFormat": "p50"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, sum(rate(coredns_dns_request_duration_seconds_bucket[5m])) by (le))",
|
||||
"refId": "B", "legendFormat": "p95"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(coredns_dns_request_duration_seconds_bucket[5m])) by (le))",
|
||||
"refId": "C", "legendFormat": "p99"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 46 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 25, "type": "timeseries", "title": "DNS Cache Hit Ratio (%)",
|
||||
"description": "High hit ratio = CoreDNS is serving responses from cache, reducing upstream load.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(rate(coredns_cache_hits_total[5m])) / (sum(rate(coredns_cache_hits_total[5m])) + sum(rate(coredns_cache_misses_total[5m]))) * 100",
|
||||
"refId": "A", "legendFormat": "Cache Hit %"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0, "max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 50 },
|
||||
{ "color": "green", "value": 80 }
|
||||
]},
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 20, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "single" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "lastNotNull"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 12, "x": 0, "y": 54 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 26, "type": "timeseries", "title": "DNS Forward Request Rate",
|
||||
"description": "Queries CoreDNS is forwarding upstream. Spike here with cache miss spike = upstream DNS pressure.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(coredns_forward_requests_total[5m]))",
|
||||
"refId": "A", "legendFormat": "Forward Requests/s"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(coredns_forward_responses_duration_seconds_count[5m]))",
|
||||
"refId": "B", "legendFormat": "Forward Responses/s"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 12, "x": 12, "y": 54 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 27, "type": "row", "title": "Services & Endpoints", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 61 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 28, "type": "stat", "title": "Total Services",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "count(kube_service_info{namespace=~\"$namespace\"})",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 8, "x": 0, "y": 62 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 29, "type": "stat", "title": "Endpoint Addresses Available",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(kube_endpoint_address_available{namespace=~\"$namespace\"})",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 8, "x": 8, "y": 62 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 30, "type": "stat", "title": "Endpoint Addresses Not Ready",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum(kube_endpoint_address_not_ready{namespace=~\"$namespace\"}) or vector(0)",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 8, "x": 16, "y": 62 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 31,
|
||||
"type": "table",
|
||||
"title": "Endpoint Availability",
|
||||
"description": "Per-endpoint available vs not-ready address counts. Red Not Ready = pods backing this service are unhealthy.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(namespace,endpoint)(kube_endpoint_address_available{namespace=~\"$namespace\"})",
|
||||
"refId": "A", "instant": true, "format": "table", "legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by(namespace,endpoint)(kube_endpoint_address_not_ready{namespace=~\"$namespace\"})",
|
||||
"refId": "B", "instant": true, "format": "table", "legendFormat": ""
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "filterFieldsByName",
|
||||
"options": { "include": { "names": ["namespace", "endpoint", "Value"] } }
|
||||
},
|
||||
{
|
||||
"id": "joinByField",
|
||||
"options": { "byField": "endpoint", "mode": "outer" }
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": { "namespace 1": true },
|
||||
"renameByName": {
|
||||
"namespace": "Namespace",
|
||||
"endpoint": "Endpoint",
|
||||
"Value": "Available",
|
||||
"Value 1": "Not Ready"
|
||||
},
|
||||
"indexByName": {
|
||||
"namespace": 0,
|
||||
"endpoint": 1,
|
||||
"Value": 2,
|
||||
"Value 1": 3
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "sortBy",
|
||||
"options": { "fields": [{ "displayName": "Not Ready", "desc": true }] }
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Namespace" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 180 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Endpoint" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 220 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Available" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] } }
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Not Ready" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] } }
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {},
|
||||
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 66 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 32, "type": "row", "title": "OKD Router / Ingress (HAProxy)", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 74 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 33, "type": "timeseries", "title": "Router HTTP Request Rate by Code",
|
||||
"description": "Requires HAProxy router metrics to be scraped (port 1936). OKD exposes these via the openshift-ingress ServiceMonitor.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(code)(rate(haproxy_backend_http_responses_total[5m]))",
|
||||
"refId": "A", "legendFormat": "HTTP {{code}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "HTTP 2xx" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "HTTP 4xx" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "HTTP 5xx" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 75 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 34, "type": "timeseries", "title": "Router 4xx + 5xx Error Rate (%)",
|
||||
"description": "Client error (4xx) and server error (5xx) rates as a percentage of all requests.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(haproxy_backend_http_responses_total{code=\"4xx\"}[5m])) / sum(rate(haproxy_backend_http_responses_total[5m])) * 100",
|
||||
"refId": "A", "legendFormat": "4xx %"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(haproxy_backend_http_responses_total{code=\"5xx\"}[5m])) / sum(rate(haproxy_backend_http_responses_total[5m])) * 100",
|
||||
"refId": "B", "legendFormat": "5xx %"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 5 }
|
||||
]}
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "4xx %" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "5xx %" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 75 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 35, "type": "timeseries", "title": "Router Bytes In / Out",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(haproxy_frontend_bytes_in_total[5m]))",
|
||||
"refId": "A", "legendFormat": "Bytes In"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(haproxy_frontend_bytes_out_total[5m]))",
|
||||
"refId": "B", "legendFormat": "Bytes Out"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "Bytes In" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Bytes Out" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 83 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 36,
|
||||
"type": "table",
|
||||
"title": "Router Backend Server Status",
|
||||
"description": "HAProxy backend servers (routes). Value 0 = DOWN, 1 = UP.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "haproxy_server_up",
|
||||
"refId": "A", "instant": true, "format": "table", "legendFormat": ""
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "filterFieldsByName",
|
||||
"options": { "include": { "names": ["proxy", "server", "Value"] } }
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {},
|
||||
"renameByName": {
|
||||
"proxy": "Backend",
|
||||
"server": "Server",
|
||||
"Value": "Status"
|
||||
},
|
||||
"indexByName": { "proxy": 0, "server": 1, "Value": 2 }
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "sortBy",
|
||||
"options": { "fields": [{ "displayName": "Status", "desc": false }] }
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "custom": { "align": "center", "displayMode": "auto" } },
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Backend" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 260 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Server" },
|
||||
"properties": [{ "id": "custom.align", "value": "left" }, { "id": "custom.width", "value": 180 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Status" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "mappings", "value": [
|
||||
{ "type": "value", "options": { "0": { "text": "DOWN", "color": "red" } } },
|
||||
{ "type": "value", "options": { "1": { "text": "UP", "color": "green" } } }
|
||||
]},
|
||||
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "green", "value": 1 }
|
||||
]}}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 83 }
|
||||
}
|
||||
|
||||
]
|
||||
}
|
||||
@@ -1,607 +0,0 @@
|
||||
apiVersion: grafana.integreatly.org/v1beta1
|
||||
kind: GrafanaDashboard
|
||||
metadata:
|
||||
name: storage-health
|
||||
namespace: observability
|
||||
spec:
|
||||
instanceSelector:
|
||||
matchLabels:
|
||||
dashboards: "grafana"
|
||||
|
||||
json: |
|
||||
{
|
||||
"title": "Storage Health",
|
||||
"uid": "storage-health",
|
||||
"schemaVersion": 36,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"panels": [
|
||||
|
||||
{
|
||||
"type": "row",
|
||||
"id": 1,
|
||||
"title": "PVC / PV Status",
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 2,
|
||||
"title": "Bound PVCs",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Bound\"}) or vector(0)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 4, "x": 0, "y": 1 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 3,
|
||||
"title": "Pending PVCs",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Pending\"}) or vector(0)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 4, "x": 4, "y": 1 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 4,
|
||||
"title": "Lost PVCs",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Lost\"}) or vector(0)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 4, "x": 8, "y": 1 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 5,
|
||||
"title": "Bound PVs / Available PVs",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(kube_persistentvolume_status_phase{phase=\"Bound\"}) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": "Bound"
|
||||
},
|
||||
{
|
||||
"expr": "sum(kube_persistentvolume_status_phase{phase=\"Available\"}) or vector(0)",
|
||||
"refId": "B",
|
||||
"legendFormat": "Available"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "blue", "value": null }]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 4, "x": 12, "y": 1 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 6,
|
||||
"title": "Ceph Cluster Health",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "ceph_health_status",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 2 }
|
||||
]
|
||||
},
|
||||
"mappings": [
|
||||
{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"0": { "text": "HEALTH_OK", "index": 0 },
|
||||
"1": { "text": "HEALTH_WARN", "index": 1 },
|
||||
"2": { "text": "HEALTH_ERR", "index": 2 }
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"textMode": "value"
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 4, "x": 16, "y": 1 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 7,
|
||||
"title": "OSDs Up / Total",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(ceph_osd_up) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": "Up"
|
||||
},
|
||||
{
|
||||
"expr": "count(ceph_osd_metadata) or vector(0)",
|
||||
"refId": "B",
|
||||
"legendFormat": "Total"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"textMode": "auto"
|
||||
},
|
||||
"gridPos": { "h": 5, "w": 4, "x": 20, "y": 1 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "row",
|
||||
"id": 8,
|
||||
"title": "Cluster Capacity",
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 6 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "gauge",
|
||||
"id": 9,
|
||||
"title": "Ceph Cluster Used (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes) / ceph_cluster_total_bytes",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 70 },
|
||||
{ "color": "red", "value": 85 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"showThresholdLabels": true,
|
||||
"showThresholdMarkers": true
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 5, "x": 0, "y": 7 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 10,
|
||||
"title": "Ceph Capacity — Total / Available",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "ceph_cluster_total_bytes",
|
||||
"refId": "A",
|
||||
"legendFormat": "Total"
|
||||
},
|
||||
{
|
||||
"expr": "ceph_cluster_total_bytes - (ceph_cluster_total_used_raw_bytes or ceph_cluster_total_used_bytes)",
|
||||
"refId": "B",
|
||||
"legendFormat": "Available"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes",
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "blue", "value": null }]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"colorMode": "value",
|
||||
"graphMode": "none",
|
||||
"textMode": "auto",
|
||||
"orientation": "vertical"
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 4, "x": 5, "y": 7 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "bargauge",
|
||||
"id": 11,
|
||||
"title": "PV Allocated Capacity by Storage Class (Bound)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by (storageclass) (\n kube_persistentvolume_capacity_bytes\n * on(persistentvolume) group_left(storageclass)\n kube_persistentvolume_status_phase{phase=\"Bound\"}\n)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{storageclass}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes",
|
||||
"color": { "mode": "palette-classic" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "blue", "value": null }]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 7, "x": 9, "y": 7 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "piechart",
|
||||
"id": 12,
|
||||
"title": "PVC Phase Distribution",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Bound\"}) or vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": "Bound"
|
||||
},
|
||||
{
|
||||
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Pending\"}) or vector(0)",
|
||||
"refId": "B",
|
||||
"legendFormat": "Pending"
|
||||
},
|
||||
{
|
||||
"expr": "sum(kube_persistentvolumeclaim_status_phase{phase=\"Lost\"}) or vector(0)",
|
||||
"refId": "C",
|
||||
"legendFormat": "Lost"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "color": { "mode": "palette-classic" } }
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"pieType": "pie",
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"values": ["value", "percent"]
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 7 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "row",
|
||||
"id": 13,
|
||||
"title": "Ceph Performance",
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 15 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "timeseries",
|
||||
"id": 14,
|
||||
"title": "Ceph Pool IOPS (Read / Write)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(ceph_pool_rd[5m])",
|
||||
"refId": "A",
|
||||
"legendFormat": "Read — pool {{pool_id}}"
|
||||
},
|
||||
{
|
||||
"expr": "rate(ceph_pool_wr[5m])",
|
||||
"refId": "B",
|
||||
"legendFormat": "Write — pool {{pool_id}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ops",
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 8 }
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "timeseries",
|
||||
"id": 15,
|
||||
"title": "Ceph Pool Throughput (Read / Write)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(ceph_pool_rd_bytes[5m])",
|
||||
"refId": "A",
|
||||
"legendFormat": "Read — pool {{pool_id}}"
|
||||
},
|
||||
{
|
||||
"expr": "rate(ceph_pool_wr_bytes[5m])",
|
||||
"refId": "B",
|
||||
"legendFormat": "Write — pool {{pool_id}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps",
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 8 }
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "row",
|
||||
"id": 16,
|
||||
"title": "Ceph OSD & Pool Details",
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 24 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "timeseries",
|
||||
"id": 17,
|
||||
"title": "Ceph Pool Space Used (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * ceph_pool_bytes_used / (ceph_pool_bytes_used + ceph_pool_max_avail)",
|
||||
"refId": "A",
|
||||
"legendFormat": "Pool {{pool_id}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 70 },
|
||||
{ "color": "red", "value": 85 }
|
||||
]
|
||||
},
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10 }
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 25 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "bargauge",
|
||||
"id": 18,
|
||||
"title": "OSD Status per Daemon (green = Up, red = Down)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "ceph_osd_up",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{ceph_daemon}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"min": 0,
|
||||
"max": 1,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "green", "value": 1 }
|
||||
]
|
||||
},
|
||||
"mappings": [
|
||||
{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"0": { "text": "DOWN", "index": 0 },
|
||||
"1": { "text": "UP", "index": 1 }
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"displayMode": "basic",
|
||||
"showUnfilled": true
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 25 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "row",
|
||||
"id": 19,
|
||||
"title": "Node Disk Usage",
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 33 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "timeseries",
|
||||
"id": 20,
|
||||
"title": "Node Root Disk Usage Over Time (%)",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} * 100)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 70 },
|
||||
{ "color": "red", "value": 85 }
|
||||
]
|
||||
},
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10 }
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 34 }
|
||||
},
|
||||
|
||||
{
|
||||
"type": "bargauge",
|
||||
"id": 21,
|
||||
"title": "Current Disk Usage — All Nodes & Mountpoints",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 - (node_filesystem_avail_bytes{fstype!~\"tmpfs|overlay|squashfs\"} / node_filesystem_size_bytes{fstype!~\"tmpfs|overlay|squashfs\"} * 100)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{instance}} — {{mountpoint}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 70 },
|
||||
{ "color": "red", "value": 85 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 34 }
|
||||
}
|
||||
|
||||
]
|
||||
}
|
||||
@@ -1,744 +0,0 @@
|
||||
apiVersion: grafana.integreatly.org/v1beta1
|
||||
kind: GrafanaDashboard
|
||||
metadata:
|
||||
name: okd-etcd
|
||||
namespace: observability
|
||||
spec:
|
||||
instanceSelector:
|
||||
matchLabels:
|
||||
dashboards: "grafana"
|
||||
json: |
|
||||
{
|
||||
"title": "etcd",
|
||||
"uid": "okd-etcd",
|
||||
"schemaVersion": 36,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"tags": ["okd", "etcd"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "instance",
|
||||
"type": "query",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"query": { "query": "label_values(etcd_server_has_leader, instance)", "refId": "A" },
|
||||
"refresh": 2,
|
||||
"includeAll": true,
|
||||
"multi": true,
|
||||
"allValue": ".*",
|
||||
"label": "Instance",
|
||||
"sort": 1,
|
||||
"current": {},
|
||||
"options": []
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
|
||||
{
|
||||
"id": 1, "type": "stat", "title": "Cluster Members",
|
||||
"description": "Total number of etcd members currently reporting metrics.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(etcd_server_has_leader)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "green", "value": 3 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 2, "type": "stat", "title": "Has Leader",
|
||||
"description": "min() across all members. 0 = at least one member has no quorum — cluster is degraded.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "min(etcd_server_has_leader)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "green", "value": 1 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0",
|
||||
"mappings": [
|
||||
{ "type": "value", "options": {
|
||||
"0": { "text": "NO LEADER", "color": "red" },
|
||||
"1": { "text": "OK", "color": "green" }
|
||||
}}
|
||||
]
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 3, "type": "stat", "title": "Leader Changes (1h)",
|
||||
"description": "Number of leader elections in the last hour. ≥3 indicates cluster instability.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "sum(changes(etcd_server_leader_changes_seen_total[1h]))", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 3 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 4, "type": "stat", "title": "DB Size (Max)",
|
||||
"description": "Largest boltdb file size across all members. Default etcd quota is 8 GiB.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "max(etcd_mvcc_db_total_size_in_bytes)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 2147483648 },
|
||||
{ "color": "orange", "value": 5368709120 },
|
||||
{ "color": "red", "value": 7516192768 }
|
||||
]},
|
||||
"unit": "bytes", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 5, "type": "stat", "title": "DB Fragmentation (Max)",
|
||||
"description": "% of DB space that is allocated but unused. >50% → run etcdctl defrag.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "max((etcd_mvcc_db_total_size_in_bytes - etcd_mvcc_db_total_size_in_use_in_bytes) / etcd_mvcc_db_total_size_in_bytes * 100)",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 25 },
|
||||
{ "color": "orange", "value": 50 },
|
||||
{ "color": "red", "value": 75 }
|
||||
]},
|
||||
"unit": "percent", "noValue": "0", "decimals": 1
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 6, "type": "stat", "title": "Failed Proposals/s",
|
||||
"description": "Rate of rejected Raft proposals. Any sustained non-zero value = cluster health problem.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "sum(rate(etcd_server_proposals_failed_total[5m]))", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 0.001 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0", "decimals": 3
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 7, "type": "stat", "title": "WAL Fsync p99",
|
||||
"description": "99th percentile WAL flush-to-disk time. >10ms is concerning; >100ms = serious I/O bottleneck.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) by (le))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 0.01 },
|
||||
{ "color": "orange", "value": 0.1 },
|
||||
{ "color": "red", "value": 0.5 }
|
||||
]},
|
||||
"unit": "s", "noValue": "0", "decimals": 4
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 8, "type": "stat", "title": "Backend Commit p99",
|
||||
"description": "99th percentile boltdb commit time. >25ms = warning; >100ms = critical backend I/O pressure.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) by (le))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 0.025 },
|
||||
{ "color": "orange", "value": 0.1 },
|
||||
{ "color": "red", "value": 0.25 }
|
||||
]},
|
||||
"unit": "s", "noValue": "0", "decimals": 4
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 9, "type": "row", "title": "Cluster Health", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 10, "type": "timeseries", "title": "Has Leader per Instance",
|
||||
"description": "1 = member has a leader; 0 = member lost quorum. A dip to 0 marks the exact moment of a leader election.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "etcd_server_has_leader{instance=~\"$instance\"}",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0, "max": 1.1,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 20, "showPoints": "never", "spanNulls": false },
|
||||
"mappings": [
|
||||
{ "type": "value", "options": {
|
||||
"0": { "text": "0 — no leader" },
|
||||
"1": { "text": "1 — ok" }
|
||||
}}
|
||||
]
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "none" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": [] }
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 8, "x": 0, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 11, "type": "timeseries", "title": "Leader Changes (cumulative)",
|
||||
"description": "Monotonically increasing counter per member. A step jump = one leader election. Correlated jumps across members = cluster-wide event.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "etcd_server_leader_changes_seen_total{instance=~\"$instance\"}",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "auto", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "none" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull"] }
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 8, "x": 8, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 12, "type": "timeseries", "title": "Slow Operations",
|
||||
"description": "slow_apply: proposals applied slower than expected. slow_read_index: linearizable reads timing out. heartbeat_failures: Raft heartbeat send errors (network partition indicator).",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "rate(etcd_server_slow_apply_total{instance=~\"$instance\"}[5m])", "refId": "A", "legendFormat": "Slow Apply — {{instance}}" },
|
||||
{ "expr": "rate(etcd_server_slow_read_indexes_total{instance=~\"$instance\"}[5m])", "refId": "B", "legendFormat": "Slow Read Index — {{instance}}" },
|
||||
{ "expr": "rate(etcd_server_heartbeat_send_failures_total{instance=~\"$instance\"}[5m])", "refId": "C", "legendFormat": "Heartbeat Failures — {{instance}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 8, "x": 16, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 13, "type": "row", "title": "gRPC Traffic", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 11 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 14, "type": "timeseries", "title": "gRPC Request Rate by Method",
|
||||
"description": "Unary calls/s per RPC method. High Put/Txn = heavy write load. High Range = heavy read load. High Watch = many controller watchers.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(grpc_method)(rate(grpc_server_started_total{job=~\".*etcd.*\",grpc_type=\"unary\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{grpc_method}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 12 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 15, "type": "timeseries", "title": "gRPC Error Rate by Status Code",
|
||||
"description": "Non-OK responses by gRPC status code. RESOURCE_EXHAUSTED = overloaded. UNAVAILABLE = leader election. DEADLINE_EXCEEDED = latency spike.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(grpc_code)(rate(grpc_server_handled_total{job=~\".*etcd.*\",grpc_code!=\"OK\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{grpc_code}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 12 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 16, "type": "timeseries", "title": "gRPC Request Latency (p50 / p95 / p99)",
|
||||
"description": "Unary call handling duration. p99 > 100ms for Put/Txn indicates disk or CPU pressure. p99 > 500ms will cause kube-apiserver timeouts.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum(rate(grpc_server_handling_seconds_bucket{job=~\".*etcd.*\",grpc_type=\"unary\"}[5m])) by (le))", "refId": "A", "legendFormat": "p50" },
|
||||
{ "expr": "histogram_quantile(0.95, sum(rate(grpc_server_handling_seconds_bucket{job=~\".*etcd.*\",grpc_type=\"unary\"}[5m])) by (le))", "refId": "B", "legendFormat": "p95" },
|
||||
{ "expr": "histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~\".*etcd.*\",grpc_type=\"unary\"}[5m])) by (le))", "refId": "C", "legendFormat": "p99" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 12 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 17, "type": "row", "title": "Raft Proposals", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 20 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 18, "type": "timeseries", "title": "Proposals Committed vs Applied",
|
||||
"description": "Committed = agreed by Raft quorum. Applied = persisted to boltdb. A widening gap between the two = backend apply backlog (disk too slow to keep up).",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "rate(etcd_server_proposals_committed_total{instance=~\"$instance\"}[5m])", "refId": "A", "legendFormat": "Committed — {{instance}}" },
|
||||
{ "expr": "rate(etcd_server_proposals_applied_total{instance=~\"$instance\"}[5m])", "refId": "B", "legendFormat": "Applied — {{instance}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 0, "y": 21 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 19, "type": "timeseries", "title": "Proposals Pending",
|
||||
"description": "In-flight Raft proposals not yet committed. Consistently high (>5) = cluster cannot keep up with write throughput.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "etcd_server_proposals_pending{instance=~\"$instance\"}",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false,
|
||||
"thresholdsStyle": { "mode": "line+area" }
|
||||
},
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 5 },
|
||||
{ "color": "red", "value": 10 }
|
||||
]}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 8, "y": 21 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 20, "type": "timeseries", "title": "Failed Proposals Rate",
|
||||
"description": "Raft proposals that were rejected. Root causes: quorum loss, leader timeout, network partition between members.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "rate(etcd_server_proposals_failed_total{instance=~\"$instance\"}[5m])",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2, "fillOpacity": 20, "showPoints": "never", "spanNulls": false,
|
||||
"thresholdsStyle": { "mode": "line" }
|
||||
},
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 0.001 }
|
||||
]}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 16, "y": 21 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 21, "type": "row", "title": "Disk I/O", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 28 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 22, "type": "timeseries", "title": "WAL Fsync Duration (p50 / p95 / p99) per Instance",
|
||||
"description": "Time to flush the write-ahead log to disk. etcd is extremely sensitive to WAL latency. >10ms p99 = storage is the bottleneck. Correlates directly with Raft commit latency.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum by(le,instance)(rate(etcd_disk_wal_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "A", "legendFormat": "p50 — {{instance}}" },
|
||||
{ "expr": "histogram_quantile(0.95, sum by(le,instance)(rate(etcd_disk_wal_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "B", "legendFormat": "p95 — {{instance}}" },
|
||||
{ "expr": "histogram_quantile(0.99, sum by(le,instance)(rate(etcd_disk_wal_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "C", "legendFormat": "p99 — {{instance}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 29 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 23, "type": "timeseries", "title": "Backend Commit Duration (p50 / p95 / p99) per Instance",
|
||||
"description": "Time for boltdb to commit a batch transaction. A spike here while WAL is healthy = backend I/O saturation or boltdb lock contention. Triggers apply backlog.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum by(le,instance)(rate(etcd_disk_backend_commit_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "A", "legendFormat": "p50 — {{instance}}" },
|
||||
{ "expr": "histogram_quantile(0.95, sum by(le,instance)(rate(etcd_disk_backend_commit_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "B", "legendFormat": "p95 — {{instance}}" },
|
||||
{ "expr": "histogram_quantile(0.99, sum by(le,instance)(rate(etcd_disk_backend_commit_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "C", "legendFormat": "p99 — {{instance}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 29 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 24, "type": "row", "title": "Network (Peer & Client)", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 37 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 25, "type": "timeseries", "title": "Peer RX Rate",
|
||||
"description": "Bytes received from Raft peers (log replication + heartbeats). A burst during a quiet period = large snapshot being streamed to a recovering member.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "rate(etcd_network_peer_received_bytes_total{instance=~\"$instance\"}[5m])",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 6, "x": 0, "y": 38 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 26, "type": "timeseries", "title": "Peer TX Rate",
|
||||
"description": "Bytes sent to Raft peers. Leader will have higher TX than followers (it replicates entries to all members).",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "rate(etcd_network_peer_sent_bytes_total{instance=~\"$instance\"}[5m])",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 6, "x": 6, "y": 38 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 27, "type": "timeseries", "title": "Client gRPC Received",
|
||||
"description": "Bytes received from API clients (kube-apiserver, operators). Spike = large write burst from controllers or kubectl apply.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "rate(etcd_network_client_grpc_received_bytes_total{instance=~\"$instance\"}[5m])",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 6, "x": 12, "y": 38 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 28, "type": "timeseries", "title": "Client gRPC Sent",
|
||||
"description": "Bytes sent to API clients (responses + watch events). Persistently high = many active Watch streams or large objects being served.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "rate(etcd_network_client_grpc_sent_bytes_total{instance=~\"$instance\"}[5m])",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 6, "x": 18, "y": 38 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 29, "type": "row", "title": "DB Size & Process Resources", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 45 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 30, "type": "timeseries", "title": "DB Total vs In-Use Size per Instance",
|
||||
"description": "Total = allocated boltdb file size. In Use = live key data. The gap between them = fragmentation. Steady growth of Total = compaction not keeping up with key churn.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "etcd_mvcc_db_total_size_in_bytes{instance=~\"$instance\"}", "refId": "A", "legendFormat": "Total — {{instance}}" },
|
||||
{ "expr": "etcd_mvcc_db_total_size_in_use_in_bytes{instance=~\"$instance\"}", "refId": "B", "legendFormat": "In Use — {{instance}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 46 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 31, "type": "timeseries", "title": "Process Resident Memory (RSS)",
|
||||
"description": "Physical RAM consumed by the etcd process. Monotonically growing RSS = memory leak or oversized watch cache. Typical healthy range: 500 MiB–2 GiB depending on cluster size.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "etcd_process_resident_memory_bytes{instance=~\"$instance\"}",
|
||||
"refId": "A", "legendFormat": "{{instance}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 46 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 32, "type": "timeseries", "title": "Open File Descriptors vs Limit",
|
||||
"description": "Open FD count (solid) and process FD limit (dashed). Approaching the limit will cause WAL file creation and new client connections to fail.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "etcd_process_open_fds{instance=~\"$instance\"}", "refId": "A", "legendFormat": "Open — {{instance}}" },
|
||||
{ "expr": "etcd_process_max_fds{instance=~\"$instance\"}", "refId": "B", "legendFormat": "Limit — {{instance}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byRegexp", "options": "^Limit.*" },
|
||||
"properties": [
|
||||
{ "id": "custom.lineWidth", "value": 1 },
|
||||
{ "id": "custom.lineStyle", "value": { "fill": "dash", "dash": [6, 4] } },
|
||||
{ "id": "custom.fillOpacity","value": 0 }
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 46 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 33, "type": "row", "title": "Snapshots", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 54 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 34, "type": "timeseries", "title": "Snapshot Save Duration (p50 / p95 / p99)",
|
||||
"description": "Time to write a full snapshot of the boltdb to disk. Slow saves delay Raft log compaction, causing the WAL to grow unboundedly and members to fall further behind.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum by(le)(rate(etcd_debugging_snap_save_total_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "A", "legendFormat": "p50" },
|
||||
{ "expr": "histogram_quantile(0.95, sum by(le)(rate(etcd_debugging_snap_save_total_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "B", "legendFormat": "p95" },
|
||||
{ "expr": "histogram_quantile(0.99, sum by(le)(rate(etcd_debugging_snap_save_total_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "C", "legendFormat": "p99" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 12, "x": 0, "y": 55 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 35, "type": "timeseries", "title": "Snapshot DB Fsync Duration (p50 / p95 / p99)",
|
||||
"description": "Time to fsync the snapshot file itself. Distinct from WAL fsync: this is flushing the entire boltdb copy to disk after a snapshot is taken.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum by(le)(rate(etcd_snap_db_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "A", "legendFormat": "p50" },
|
||||
{ "expr": "histogram_quantile(0.95, sum by(le)(rate(etcd_snap_db_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "B", "legendFormat": "p95" },
|
||||
{ "expr": "histogram_quantile(0.99, sum by(le)(rate(etcd_snap_db_fsync_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "refId": "C", "legendFormat": "p99" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 12, "x": 12, "y": 55 }
|
||||
}
|
||||
|
||||
]
|
||||
}
|
||||
@@ -1,752 +0,0 @@
|
||||
apiVersion: grafana.integreatly.org/v1beta1
|
||||
kind: GrafanaDashboard
|
||||
metadata:
|
||||
name: okd-control-plane-health
|
||||
namespace: observability
|
||||
spec:
|
||||
instanceSelector:
|
||||
matchLabels:
|
||||
dashboards: "grafana"
|
||||
json: |
|
||||
{
|
||||
"title": "Control Plane Health",
|
||||
"uid": "okd-control-plane",
|
||||
"schemaVersion": 36,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"tags": ["okd", "control-plane"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "instance",
|
||||
"type": "query",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"query": { "query": "label_values(apiserver_request_total, instance)", "refId": "A" },
|
||||
"refresh": 2,
|
||||
"includeAll": true,
|
||||
"multi": true,
|
||||
"allValue": ".*",
|
||||
"label": "API Server Instance",
|
||||
"sort": 1,
|
||||
"current": {},
|
||||
"options": []
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
|
||||
{
|
||||
"id": 1, "type": "stat", "title": "API Servers Up",
|
||||
"description": "Number of kube-apiserver instances currently scraped and up. Healthy HA cluster = 3.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(up{job=~\".*apiserver.*\"} == 1)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "green", "value": 3 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 2, "type": "stat", "title": "Controller Managers Up",
|
||||
"description": "kube-controller-manager instances up. In OKD only one holds the leader lease at a time; others are hot standbys.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(up{job=~\".*controller-manager.*\"} == 1)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "green", "value": 3 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 3, "type": "stat", "title": "Schedulers Up",
|
||||
"description": "kube-scheduler instances up. One holds the leader lease; rest are standbys. 0 = no scheduling of new pods.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(up{job=~\".*scheduler.*\"} == 1)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "green", "value": 3 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 4, "type": "stat", "title": "API 5xx Rate",
|
||||
"description": "Server-side errors (5xx) across all apiserver instances per second. Any sustained non-zero value = apiserver internal fault.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "sum(rate(apiserver_request_total{code=~\"5..\"}[5m]))", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 0.01 },
|
||||
{ "color": "red", "value": 1 }
|
||||
]},
|
||||
"unit": "reqps", "noValue": "0", "decimals": 3
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 5, "type": "stat", "title": "Inflight — Mutating",
|
||||
"description": "Current in-flight mutating requests (POST/PUT/PATCH/DELETE). Default OKD limit is ~1000. Hitting the limit = 429 errors for writes.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "sum(apiserver_current_inflight_requests{request_kind=\"mutating\"})", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 500 },
|
||||
{ "color": "orange", "value": 750 },
|
||||
{ "color": "red", "value": 900 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 6, "type": "stat", "title": "Inflight — Read-Only",
|
||||
"description": "Current in-flight non-mutating requests (GET/LIST/WATCH). Default OKD limit is ~3000. Hitting it = 429 for reads, impacting controllers and kubectl.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "sum(apiserver_current_inflight_requests{request_kind=\"readOnly\"})", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1500 },
|
||||
{ "color": "orange", "value": 2200 },
|
||||
{ "color": "red", "value": 2700 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 7, "type": "stat", "title": "API Request p99 (non-WATCH)",
|
||||
"description": "Overall p99 latency for all non-streaming verbs. >1s = noticeable kubectl sluggishness. >10s = controllers timing out on LIST/GET.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{verb!~\"WATCH|CONNECT\"}[5m])) by (le))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 0.5 },
|
||||
{ "color": "orange", "value": 1 },
|
||||
{ "color": "red", "value": 5 }
|
||||
]},
|
||||
"unit": "s", "noValue": "0", "decimals": 3
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 8, "type": "stat", "title": "APIServer → etcd p99",
|
||||
"description": "p99 time apiserver spends waiting on etcd calls. Spike here while WAL fsync is healthy = serialization or large object overhead.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(apiserver_storage_request_duration_seconds_bucket[5m])) by (le))",
|
||||
"refId": "A", "legendFormat": ""
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 0.05 },
|
||||
{ "color": "orange", "value": 0.2 },
|
||||
{ "color": "red", "value": 0.5 }
|
||||
]},
|
||||
"unit": "s", "noValue": "0", "decimals": 4
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 9, "type": "row", "title": "API Server — Request Rates & Errors", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 10, "type": "timeseries", "title": "Request Rate by Verb",
|
||||
"description": "Non-streaming calls per second broken down by verb. GET/LIST = read load from controllers. POST/PUT/PATCH/DELETE = write throughput. A sudden LIST spike = controller cache resync storm.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(verb)(rate(apiserver_request_total{instance=~\"$instance\",verb!~\"WATCH|CONNECT\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{verb}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 11, "type": "timeseries", "title": "Error Rate by HTTP Status Code",
|
||||
"description": "4xx/5xx responses per second by code. 429 = inflight limit hit (throttling). 422 = admission rejection or invalid object. 500/503 = internal apiserver fault or etcd unavailability.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(code)(rate(apiserver_request_total{instance=~\"$instance\",code=~\"[45]..\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "HTTP {{code}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 12, "type": "timeseries", "title": "In-Flight Requests — Mutating vs Read-Only",
|
||||
"description": "Instantaneous count of requests being actively handled. The two series correspond to the two inflight limit buckets enforced by the apiserver's Priority and Fairness (APF) or legacy inflight settings.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "sum by(request_kind)(apiserver_current_inflight_requests{instance=~\"$instance\"})", "refId": "A", "legendFormat": "{{request_kind}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 20, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 13, "type": "row", "title": "API Server — Latency", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 14, "type": "timeseries", "title": "Request Latency — p50 / p95 / p99 (non-WATCH)",
|
||||
"description": "Aggregated end-to-end request duration across all verbs except WATCH/CONNECT (which are unbounded streaming). A rising p99 without a matching rise in etcd latency = CPU saturation, admission webhook slowness, or serialization overhead.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum(rate(apiserver_request_duration_seconds_bucket{instance=~\"$instance\",verb!~\"WATCH|CONNECT\"}[5m])) by (le))", "refId": "A", "legendFormat": "p50" },
|
||||
{ "expr": "histogram_quantile(0.95, sum(rate(apiserver_request_duration_seconds_bucket{instance=~\"$instance\",verb!~\"WATCH|CONNECT\"}[5m])) by (le))", "refId": "B", "legendFormat": "p95" },
|
||||
{ "expr": "histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{instance=~\"$instance\",verb!~\"WATCH|CONNECT\"}[5m])) by (le))", "refId": "C", "legendFormat": "p99" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 14 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 15, "type": "timeseries", "title": "Request p99 Latency by Verb",
|
||||
"description": "p99 latency broken out per verb. LIST is inherently slower than GET due to serializing full collections. A POST/PUT spike = heavy admission webhook chain or large object writes. DELETE spikes are usually caused by cascading GC finalizer storms.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "histogram_quantile(0.99, sum by(verb,le)(rate(apiserver_request_duration_seconds_bucket{instance=~\"$instance\",verb!~\"WATCH|CONNECT\"}[5m])))",
|
||||
"refId": "A", "legendFormat": "{{verb}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 14 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 16, "type": "timeseries", "title": "APIServer → etcd Latency by Operation",
|
||||
"description": "Time apiserver spends waiting on etcd, split by operation type (get, list, create, update, delete, watch). Elevated get/list = etcd read pressure. Elevated create/update = write bottleneck, likely correlated with WAL fsync latency.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum by(operation,le)(rate(apiserver_storage_request_duration_seconds_bucket[5m])))", "refId": "A", "legendFormat": "p50 — {{operation}}" },
|
||||
{ "expr": "histogram_quantile(0.99, sum by(operation,le)(rate(apiserver_storage_request_duration_seconds_bucket[5m])))", "refId": "B", "legendFormat": "p99 — {{operation}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 14 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 17, "type": "row", "title": "API Server — Watches & Long-Running Requests", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 22 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 18, "type": "timeseries", "title": "Active Long-Running Requests (Watches) by Resource",
|
||||
"description": "Instantaneous count of open WATCH streams grouped by resource. Each controller typically holds one WATCH per resource type per apiserver instance. A sudden drop = controller restart; a runaway climb = operator creating watches without cleanup.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(resource)(apiserver_longrunning_requests{instance=~\"$instance\",verb=\"WATCH\"})",
|
||||
"refId": "A", "legendFormat": "{{resource}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 0, "y": 23 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 19, "type": "timeseries", "title": "Watch Events Dispatched Rate by Kind",
|
||||
"description": "Watch events sent to all active watchers per second, by object kind. Persistent high rate for a specific kind = that resource type is churning heavily, increasing etcd load and controller reconcile frequency.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(kind)(rate(apiserver_watch_events_total{instance=~\"$instance\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{kind}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 8, "y": 23 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 20, "type": "timeseries", "title": "Watch Event Size — p50 / p95 / p99 by Kind",
|
||||
"description": "Size of individual watch events dispatched to clients. Large events (MiB-scale) for Secrets or ConfigMaps = objects being stored with oversized data. Contributes to apiserver memory pressure and network saturation.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum by(kind,le)(rate(apiserver_watch_events_sizes_bucket{instance=~\"$instance\"}[5m])))", "refId": "A", "legendFormat": "p50 — {{kind}}" },
|
||||
{ "expr": "histogram_quantile(0.99, sum by(kind,le)(rate(apiserver_watch_events_sizes_bucket{instance=~\"$instance\"}[5m])))", "refId": "B", "legendFormat": "p99 — {{kind}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 16, "y": 23 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 21, "type": "row", "title": "Admission Webhooks", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 30 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 22, "type": "timeseries", "title": "Webhook Call Rate by Name",
|
||||
"description": "Mutating and validating admission webhook invocations per second by webhook name. A webhook invoked on every write (e.g., a mutating webhook with no object selector) can be a major source of write latency amplification.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(name,type)(rate(apiserver_admission_webhook_request_total{instance=~\"$instance\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{type}} — {{name}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 0, "y": 31 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 23, "type": "timeseries", "title": "Webhook Latency p99 by Name",
|
||||
"description": "p99 round-trip time per webhook call (network + webhook server processing). Default apiserver timeout is 10s; a webhook consistently near that limit causes cascading write latency for all resources it intercepts.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "histogram_quantile(0.99, sum by(name,le)(rate(apiserver_admission_webhook_admission_duration_seconds_bucket{instance=~\"$instance\"}[5m])))",
|
||||
"refId": "A", "legendFormat": "{{name}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 0.5 },
|
||||
{ "color": "red", "value": 2.0 }
|
||||
]}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 8, "y": 31 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 24, "type": "timeseries", "title": "Webhook Rejection Rate by Name",
|
||||
"description": "Rate of admission denials per webhook. A validating webhook rejecting requests is expected behaviour; a sudden surge indicates either a newly enforced policy or a misbehaving webhook rejecting valid objects.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(name,error_type)(rate(apiserver_admission_webhook_rejection_count{instance=~\"$instance\"}[5m]))",
|
||||
"refId": "A", "legendFormat": "{{name}} ({{error_type}})"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 16, "y": 31 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 25, "type": "row", "title": "kube-controller-manager", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 38 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 26, "type": "timeseries", "title": "Work Queue Depth by Controller",
|
||||
"description": "Items waiting to be reconciled in each controller's work queue. Persistent non-zero depth = controller cannot keep up with the event rate. Identifies which specific controller is the bottleneck during overload incidents.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "topk(15, sum by(name)(workqueue_depth{job=~\".*controller-manager.*\"}))",
|
||||
"refId": "A", "legendFormat": "{{name}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false,
|
||||
"thresholdsStyle": { "mode": "line" }
|
||||
},
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 10 },
|
||||
{ "color": "red", "value": 50 }
|
||||
]}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 39 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 27, "type": "timeseries", "title": "Work Queue Item Processing Duration p99 by Controller",
|
||||
"description": "p99 time a work item spends being actively reconciled (inside the reconcile loop, excludes queue wait time). A slow reconcile = either the controller is doing expensive API calls or the etcd write path is slow.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "histogram_quantile(0.99, sum by(name,le)(rate(workqueue_work_duration_seconds_bucket{job=~\".*controller-manager.*\"}[5m])))",
|
||||
"refId": "A", "legendFormat": "{{name}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 39 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 28, "type": "timeseries", "title": "Work Queue Retry Rate by Controller",
|
||||
"description": "Rate of items being re-queued after a failed reconciliation. A persistently high retry rate for a controller = it is encountering recurring errors on the same objects (e.g., API permission errors, webhook rejections, or resource conflicts).",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "topk(15, sum by(name)(rate(workqueue_retries_total{job=~\".*controller-manager.*\"}[5m])))",
|
||||
"refId": "A", "legendFormat": "{{name}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 39 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 29, "type": "row", "title": "kube-scheduler", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 47 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 30, "type": "timeseries", "title": "Scheduling Attempt Rate by Result",
|
||||
"description": "Outcomes of scheduling cycles per second. scheduled = pod successfully bound to a node. unschedulable = no node met the pod's constraints. error = scheduler internal failure (API error, timeout). Persistent unschedulable = cluster capacity or taints/affinity misconfiguration.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(result)(rate(scheduler_schedule_attempts_total[5m]))",
|
||||
"refId": "A", "legendFormat": "{{result}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "scheduled" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "unschedulable" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "error" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 0, "y": 48 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 31, "type": "timeseries", "title": "Scheduling Latency — p50 / p95 / p99",
|
||||
"description": "Time from when a pod enters the active queue to when a binding decision is made (does not include bind API call time). Includes filter, score, and reserve plugin execution time. Spike = expensive affinity rules, large number of nodes, or slow extender webhooks.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.50, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket[5m])) by (le))", "refId": "A", "legendFormat": "p50" },
|
||||
{ "expr": "histogram_quantile(0.95, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket[5m])) by (le))", "refId": "B", "legendFormat": "p95" },
|
||||
{ "expr": "histogram_quantile(0.99, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket[5m])) by (le))", "refId": "C", "legendFormat": "p99" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 8, "y": 48 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 32, "type": "timeseries", "title": "Pending Pods by Queue",
|
||||
"description": "Pods waiting to be scheduled, split by internal queue. active = ready to be attempted now. backoff = recently failed, in exponential back-off. unschedulable = parked until cluster state changes. A growing unschedulable queue = systemic capacity or constraint problem.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(queue)(scheduler_pending_pods)",
|
||||
"refId": "A", "legendFormat": "{{queue}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false,
|
||||
"thresholdsStyle": { "mode": "line" }
|
||||
},
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 10 },
|
||||
{ "color": "red", "value": 50 }
|
||||
]}
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "unschedulable" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "backoff" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "active" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 16, "y": 48 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 33, "type": "row", "title": "Process Resources — All Control Plane Components", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 55 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 34, "type": "timeseries", "title": "CPU Usage by Component",
|
||||
"description": "Rate of CPU seconds consumed by each control plane process. apiserver CPU spike = surge in request volume or list serialization. controller-manager CPU spike = reconcile storm. scheduler CPU spike = large node count with complex affinity.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "sum by(job)(rate(process_cpu_seconds_total{job=~\".*apiserver.*\"}[5m]))", "refId": "A", "legendFormat": "apiserver — {{job}}" },
|
||||
{ "expr": "sum by(job)(rate(process_cpu_seconds_total{job=~\".*controller-manager.*\"}[5m]))", "refId": "B", "legendFormat": "controller-manager — {{job}}" },
|
||||
{ "expr": "sum by(job)(rate(process_cpu_seconds_total{job=~\".*scheduler.*\"}[5m]))", "refId": "C", "legendFormat": "scheduler — {{job}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percentunit", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 0, "y": 56 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 35, "type": "timeseries", "title": "RSS Memory by Component",
|
||||
"description": "Resident set size of each control plane process. apiserver memory is dominated by the watch cache size and serialisation buffers. controller-manager memory = informer caches. Monotonically growing RSS without restarts = memory leak or unbounded cache growth.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "sum by(job)(process_resident_memory_bytes{job=~\".*apiserver.*\"})", "refId": "A", "legendFormat": "apiserver — {{job}}" },
|
||||
{ "expr": "sum by(job)(process_resident_memory_bytes{job=~\".*controller-manager.*\"})", "refId": "B", "legendFormat": "controller-manager — {{job}}" },
|
||||
{ "expr": "sum by(job)(process_resident_memory_bytes{job=~\".*scheduler.*\"})", "refId": "C", "legendFormat": "scheduler — {{job}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 8, "y": 56 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 36, "type": "timeseries", "title": "Goroutines by Component",
|
||||
"description": "Number of live goroutines in each control plane process. Gradual upward drift = goroutine leak (often tied to unclosed watch streams or context leaks). A step-down = process restart. apiserver typically runs 200–600 goroutines; spikes above 1000 warrant investigation.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "sum by(job)(go_goroutines{job=~\".*apiserver.*\"})", "refId": "A", "legendFormat": "apiserver — {{job}}" },
|
||||
{ "expr": "sum by(job)(go_goroutines{job=~\".*controller-manager.*\"})", "refId": "B", "legendFormat": "controller-manager — {{job}}" },
|
||||
{ "expr": "sum by(job)(go_goroutines{job=~\".*scheduler.*\"})", "refId": "C", "legendFormat": "scheduler — {{job}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 16, "y": 56 }
|
||||
}
|
||||
|
||||
]
|
||||
}
|
||||
@@ -1,741 +0,0 @@
|
||||
apiVersion: grafana.integreatly.org/v1beta1
|
||||
kind: GrafanaDashboard
|
||||
metadata:
|
||||
name: okd-alerts-events
|
||||
namespace: observability
|
||||
spec:
|
||||
instanceSelector:
|
||||
matchLabels:
|
||||
dashboards: "grafana"
|
||||
json: |
|
||||
{
|
||||
"title": "Alerts & Events — Active Problems",
|
||||
"uid": "okd-alerts-events",
|
||||
"schemaVersion": 36,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-3h", "to": "now" },
|
||||
"tags": ["okd", "alerts", "events"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "severity",
|
||||
"type": "custom",
|
||||
"label": "Severity Filter",
|
||||
"query": "critical,warning,info",
|
||||
"current": { "selected": true, "text": "All", "value": "$__all" },
|
||||
"includeAll": true,
|
||||
"allValue": "critical|warning|info",
|
||||
"multi": false,
|
||||
"options": [
|
||||
{ "selected": true, "text": "All", "value": "$__all" },
|
||||
{ "selected": false, "text": "Critical", "value": "critical" },
|
||||
{ "selected": false, "text": "Warning", "value": "warning" },
|
||||
{ "selected": false, "text": "Info", "value": "info" }
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "namespace",
|
||||
"type": "query",
|
||||
"label": "Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"query": { "query": "label_values(ALERTS{alertstate=\"firing\"}, namespace)", "refId": "A" },
|
||||
"refresh": 2,
|
||||
"includeAll": true,
|
||||
"allValue": ".*",
|
||||
"multi": true,
|
||||
"sort": 1,
|
||||
"current": {},
|
||||
"options": []
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
|
||||
{
|
||||
"id": 1, "type": "stat", "title": "Critical Alerts Firing",
|
||||
"description": "Alerting rule instances currently in the firing state with severity=\"critical\". Any non-zero value represents a breached SLO or infrastructure condition requiring immediate on-call response. The ALERTS metric is generated by Prometheus directly from your alerting rules — it reflects what Prometheus knows, before Alertmanager routing or silencing.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(ALERTS{alertstate=\"firing\",severity=\"critical\"}) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 2, "type": "stat", "title": "Warning Alerts Firing",
|
||||
"description": "Firing alerts at severity=\"warning\". Warnings indicate a degraded or elevated-risk condition that has not yet crossed the critical threshold. A sustained or growing warning count often precedes a critical fire — treat them as early-warning signals, not background noise.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(ALERTS{alertstate=\"firing\",severity=\"warning\"}) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "orange", "value": 5 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 3, "type": "stat", "title": "Info / Unclassified Alerts Firing",
|
||||
"description": "Firing alerts with severity=\"info\" or no severity label. These are informational and do not normally require immediate action. A sudden large jump may reveal noisy alerting rules generating alert fatigue — rules worth reviewing for threshold tuning or adding inhibition rules.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(ALERTS{alertstate=\"firing\",severity!~\"critical|warning\"}) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "blue", "value": 1 },
|
||||
{ "color": "blue", "value": 25 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 4, "type": "stat", "title": "Alerts Silenced (Suppressed)",
|
||||
"description": "Alerts currently matched by an active Alertmanager silence rule and therefore not routed to receivers. Silences are intentional during maintenance windows, but a large suppressed count outside of planned maintenance = an overly broad silence masking real problems. Zero silences when a maintenance window is active = the silence has expired or was misconfigured.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "sum(alertmanager_alerts{state=\"suppressed\"}) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 20 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 5, "type": "stat", "title": "CrashLoopBackOff Pods",
|
||||
"description": "Container instances currently waiting in the CrashLoopBackOff state — the container crashed and Kubernetes is retrying with exponential back-off. Each instance is a pod that cannot stay running. Common root causes: OOM kill, bad entrypoint, missing Secret or ConfigMap, an unavailable init dependency, or a broken image layer.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 3 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 6, "type": "stat", "title": "OOMKilled Containers",
|
||||
"description": "Containers whose most recent termination reason was OOMKilled. This is a current-state snapshot: a container that was OOMKilled, restarted, and is now Running will still appear here until its next termination occurs for a different reason. Non-zero and stable = recurring OOM, likely a workload memory leak or under-provisioned memory limit.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_pod_container_status_last_terminated_reason{reason=\"OOMKilled\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "orange", "value": 1 },
|
||||
{ "color": "red", "value": 5 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 7, "type": "stat", "title": "NotReady Nodes",
|
||||
"description": "Nodes where the Ready condition is currently not True (False or Unknown). A NotReady node stops receiving new pod scheduling and, after the node eviction timeout (~5 min default), pods on it will be evicted. Control plane nodes going NotReady simultaneously = potential quorum loss. Any non-zero value is a tier-1 incident signal.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"true\"} == 0) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 8, "type": "stat", "title": "Degraded Cluster Operators (OKD)",
|
||||
"description": "OKD ClusterOperators currently reporting Degraded=True. Each ClusterOperator owns a core platform component — authentication, networking, image-registry, monitoring, ingress, storage, etc. A degraded operator means its managed component is impaired or unavailable. Zero is the only acceptable steady-state value outside of an active upgrade.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(cluster_operator_conditions{condition=\"Degraded\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 9, "type": "row", "title": "Alert Overview", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 10, "type": "timeseries", "title": "Firing Alert Count by Severity Over Time",
|
||||
"description": "Instantaneous count of firing ALERTS series grouped by severity over the selected window. A vertical rise = new alerting condition emerged. A horizontal plateau = a persistent, unresolved problem. A step-down = alert resolved or Prometheus rule evaluation stopped matching. Use the Severity Filter variable to narrow scope during triage.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "count by(severity)(ALERTS{alertstate=\"firing\",severity=~\"$severity\",namespace=~\"$namespace\"})",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{severity}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "critical" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "warning" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "info" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["max", "lastNotNull"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 11, "type": "timeseries", "title": "Alertmanager Notification Rate by Integration",
|
||||
"description": "Rate of notification delivery attempts from Alertmanager per second, split by integration type (slack, pagerduty, email, webhook, etc.). Solid lines = successful deliveries; dashed red lines = failed deliveries. A drop to zero on all integrations = Alertmanager is not processing or the cluster is completely quiet. Persistent failures on one integration = check that receiver's credentials or endpoint availability.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "sum by(integration)(rate(alertmanager_notifications_total[5m]))", "refId": "A", "legendFormat": "✓ {{integration}}" },
|
||||
{ "expr": "sum by(integration)(rate(alertmanager_notifications_failed_total[5m]))", "refId": "B", "legendFormat": "✗ {{integration}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byFrameRefID", "options": "B" },
|
||||
"properties": [
|
||||
{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } },
|
||||
{ "id": "custom.lineStyle", "value": { "dash": [6, 4], "fill": "dash" } },
|
||||
{ "id": "custom.lineWidth", "value": 1 }
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 12, "type": "bargauge", "title": "Longest-Firing Active Alerts",
|
||||
"description": "Duration (now - ALERTS_FOR_STATE timestamp) for each currently firing alert, sorted descending. Alerts at the top have been firing longest and are the most likely candidates for known-but-unresolved issues, stale firing conditions, or alerts that should have a silence applied. Red bars (> 2 hours) strongly suggest a problem that has been acknowledged but not resolved.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sort_desc(time() - ALERTS_FOR_STATE{alertstate=\"firing\",severity=~\"$severity\",namespace=~\"$namespace\"})",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{alertname}} · {{severity}} · {{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 300 },
|
||||
{ "color": "orange", "value": 1800 },
|
||||
{ "color": "red", "value": 7200 }
|
||||
]}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true,
|
||||
"valueMode": "color"
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 13, "type": "row", "title": "Active Firing Alerts — Full Detail", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 14, "type": "table", "title": "All Firing Alerts",
|
||||
"description": "Instant-query table of every currently firing alert visible to Prometheus, filtered by the Namespace and Severity variables above. Each row is one alert instance (unique label combination). The value column is omitted — by definition every row here is firing. Use the built-in column filter (funnel icon) to further narrow to a specific alertname, pod, or node. Columns are sparse: labels not defined in a given alert rule will show '—'.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "ALERTS{alertstate=\"firing\",severity=~\"$severity\",namespace=~\"$namespace\"}",
|
||||
"refId": "A",
|
||||
"instant": true,
|
||||
"legendFormat": ""
|
||||
}],
|
||||
"transformations": [
|
||||
{ "id": "labelsToFields", "options": { "mode": "columns" } },
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"alertstate": true,
|
||||
"__name__": true,
|
||||
"Value": true,
|
||||
"Time": true
|
||||
},
|
||||
"renameByName": {
|
||||
"alertname": "Alert Name",
|
||||
"severity": "Severity",
|
||||
"namespace": "Namespace",
|
||||
"pod": "Pod",
|
||||
"node": "Node",
|
||||
"container": "Container",
|
||||
"job": "Job",
|
||||
"service": "Service",
|
||||
"reason": "Reason",
|
||||
"instance": "Instance"
|
||||
},
|
||||
"indexByName": {
|
||||
"severity": 0,
|
||||
"alertname": 1,
|
||||
"namespace": 2,
|
||||
"pod": 3,
|
||||
"node": 4,
|
||||
"container": 5,
|
||||
"job": 6,
|
||||
"service": 7,
|
||||
"reason": 8,
|
||||
"instance": 9
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": { "align": "left", "filterable": true },
|
||||
"noValue": "—"
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Severity" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "custom.width", "value": 110 },
|
||||
{
|
||||
"id": "mappings",
|
||||
"value": [{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"critical": { "text": "CRITICAL", "color": "dark-red", "index": 0 },
|
||||
"warning": { "text": "WARNING", "color": "dark-yellow", "index": 1 },
|
||||
"info": { "text": "INFO", "color": "dark-blue", "index": 2 }
|
||||
}
|
||||
}]
|
||||
}
|
||||
]
|
||||
},
|
||||
{ "matcher": { "id": "byName", "options": "Alert Name" }, "properties": [{ "id": "custom.width", "value": 300 }] },
|
||||
{ "matcher": { "id": "byName", "options": "Namespace" }, "properties": [{ "id": "custom.width", "value": 180 }] },
|
||||
{ "matcher": { "id": "byName", "options": "Pod" }, "properties": [{ "id": "custom.width", "value": 200 }] },
|
||||
{ "matcher": { "id": "byName", "options": "Node" }, "properties": [{ "id": "custom.width", "value": 200 }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"sortBy": [{ "desc": false, "displayName": "Severity" }],
|
||||
"footer": { "show": false }
|
||||
},
|
||||
"gridPos": { "h": 12, "w": 24, "x": 0, "y": 14 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 15, "type": "row", "title": "Kubernetes Warning Events", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 26 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 16, "type": "timeseries", "title": "Warning Event Rate by Reason",
|
||||
"description": "Rate of Kubernetes Warning-type events per second grouped by reason code. BackOff = container is CrashLooping. FailedScheduling = no node satisfies pod constraints. FailedMount = volume attachment or CSI failure. Evicted = kubelet evicted a pod due to memory or disk pressure. NodeNotReady = node lost contact. A spike in a single reason narrows the incident root-cause immediately without needing to read raw event logs. Requires kube-state-metrics with --resources=events.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "topk(10, sum by(reason)(rate(kube_event_count{type=\"Warning\",namespace=~\"$namespace\"}[5m])))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{reason}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 27 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 17, "type": "bargauge", "title": "Warning Events — Top Namespaces (Accumulated Count)",
|
||||
"description": "Total accumulated Warning event count (the count field on the Kubernetes Event object) per namespace, showing the top 15 most active. A namespace dominating this chart is generating significantly more abnormal conditions than its peers, useful for identifying noisy tenants, misconfigured deployments, or namespaces experiencing a persistent infrastructure problem. Note this is the raw Event.count field — it resets if the event object is deleted and recreated.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "topk(15, sum by(namespace)(kube_event_count{type=\"Warning\"}))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 10 },
|
||||
{ "color": "orange", "value": 50 },
|
||||
{ "color": "red", "value": 200 }
|
||||
]}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 27 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 18, "type": "timeseries", "title": "Warning Events — Accumulated Count by Reason Over Time",
|
||||
"description": "Raw accumulated event count gauge over time, split by reason. Unlike the rate panel this shows total volume and slope simultaneously. A line that climbs steeply = events are occurring frequently right now. A line that plateaus = the condition causing that reason has stopped. A line that drops to zero = the event object was deleted and recreated or the condition fully resolved.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "topk(10, sum by(reason)(kube_event_count{type=\"Warning\",namespace=~\"$namespace\"}))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{reason}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 8, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 27 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 19, "type": "row", "title": "Pod Problems", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 35 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 20, "type": "timeseries", "title": "CrashLoopBackOff Pods by Namespace",
|
||||
"description": "Count of container instances in CrashLoopBackOff waiting state over time, broken down by namespace. A sudden rise in one namespace = a workload deployment is failing. A persistent baseline across many namespaces = a shared dependency (Secret, ConfigMap, network policy, or an upstream service) has become unavailable. Unlike restart rate, this panel shows the steady-state count of pods currently stuck — not flapping.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sum by(namespace)(kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\",namespace=~\"$namespace\"} == 1)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false,
|
||||
"thresholdsStyle": { "mode": "line" }
|
||||
},
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 5 }
|
||||
]}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 0, "y": 36 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 21, "type": "timeseries", "title": "Container Restart Rate by Namespace",
|
||||
"description": "Rate of container restarts per second across all reasons (OOMKill, liveness probe failure, process exit) grouped by namespace. A namespace with a rising restart rate that has not yet entered CrashLoopBackOff is in the early failure window before the exponential back-off penalty kicks in. Cross-reference with the OOMKilled stat tile and the last-terminated-reason to separate crash types.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "topk(10, sum by(namespace)(rate(kube_pod_container_status_restarts_total{namespace=~\"$namespace\"}[5m])))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0, "decimals": 4,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 8, "y": 36 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 22, "type": "timeseries", "title": "Pods by Problem Phase (Failed / Pending / Unknown)",
|
||||
"description": "Count of pods in Failed, Pending, or Unknown phase over time. Failed = container terminated with a non-zero exit code or was evicted and not rescheduled. Pending for more than a few minutes = scheduler unable to bind the pod (check FailedScheduling events, node capacity, and taint/toleration mismatches). Unknown = kubelet is not reporting to the apiserver, typically indicating a node network partition or kubelet crash.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "sum by(phase)(kube_pod_status_phase{phase=~\"Failed|Unknown\",namespace=~\"$namespace\"} == 1)", "refId": "A", "legendFormat": "{{phase}}" },
|
||||
{ "expr": "sum(kube_pod_status_phase{phase=\"Pending\",namespace=~\"$namespace\"} == 1)", "refId": "B", "legendFormat": "Pending" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false,
|
||||
"thresholdsStyle": { "mode": "line" }
|
||||
},
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]}
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "Failed" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Pending" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "Unknown" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 7, "w": 8, "x": 16, "y": 36 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 23, "type": "row", "title": "Node & Cluster Operator Conditions", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 43 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 24, "type": "table", "title": "Node Condition Status Matrix",
|
||||
"description": "Instant snapshot of every active node condition across all nodes. Each row is one (node, condition, status) triple where value=1, meaning that combination is currently true. Ready=true is the normal healthy state; MemoryPressure=true, DiskPressure=true, PIDPressure=true, and NetworkUnavailable=true all indicate problem states that will affect pod scheduling on that node. Use the column filter to show only conditions where status=\"true\" and condition != \"Ready\" to isolate problems quickly.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "kube_node_status_condition == 1",
|
||||
"refId": "A",
|
||||
"instant": true,
|
||||
"legendFormat": ""
|
||||
}],
|
||||
"transformations": [
|
||||
{ "id": "labelsToFields", "options": { "mode": "columns" } },
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"Time": true,
|
||||
"Value": true,
|
||||
"__name__": true,
|
||||
"endpoint": true,
|
||||
"job": true,
|
||||
"service": true,
|
||||
"instance": true
|
||||
},
|
||||
"renameByName": {
|
||||
"node": "Node",
|
||||
"condition": "Condition",
|
||||
"status": "Status"
|
||||
},
|
||||
"indexByName": { "node": 0, "condition": 1, "status": 2 }
|
||||
}
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": { "align": "left", "filterable": true },
|
||||
"noValue": "—"
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Status" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "custom.width", "value": 90 },
|
||||
{
|
||||
"id": "mappings",
|
||||
"value": [{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"true": { "text": "true", "color": "green", "index": 0 },
|
||||
"false": { "text": "false", "color": "dark-red", "index": 1 },
|
||||
"unknown": { "text": "unknown", "color": "dark-orange", "index": 2 }
|
||||
}
|
||||
}]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Condition" },
|
||||
"properties": [
|
||||
{ "id": "custom.width", "value": 190 },
|
||||
{ "id": "custom.displayMode", "value": "color-text" },
|
||||
{
|
||||
"id": "mappings",
|
||||
"value": [{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"Ready": { "color": "green", "index": 0 },
|
||||
"MemoryPressure": { "color": "red", "index": 1 },
|
||||
"DiskPressure": { "color": "red", "index": 2 },
|
||||
"PIDPressure": { "color": "red", "index": 3 },
|
||||
"NetworkUnavailable": { "color": "red", "index": 4 }
|
||||
}
|
||||
}]
|
||||
}
|
||||
]
|
||||
},
|
||||
{ "matcher": { "id": "byName", "options": "Node" }, "properties": [{ "id": "custom.width", "value": 230 }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"sortBy": [{ "desc": false, "displayName": "Node" }],
|
||||
"footer": { "show": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 44 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 25, "type": "table", "title": "Cluster Operator Conditions — Degraded & Progressing (OKD)",
|
||||
"description": "Shows only ClusterOperator conditions that indicate a problem state: Degraded=True (operator has failed to achieve its desired state) or Progressing=True (operator is actively reconciling — normal during upgrades but alarming in steady state). Operators not appearing in this table are healthy. The reason column gives the operator's own explanation for the condition, which maps directly to the relevant operator log stream and OpenShift runbook.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "cluster_operator_conditions{condition=\"Degraded\"} == 1",
|
||||
"refId": "A",
|
||||
"instant": true,
|
||||
"legendFormat": ""
|
||||
},
|
||||
{
|
||||
"expr": "cluster_operator_conditions{condition=\"Progressing\"} == 1",
|
||||
"refId": "B",
|
||||
"instant": true,
|
||||
"legendFormat": ""
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{ "id": "labelsToFields", "options": { "mode": "columns" } },
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"Time": true,
|
||||
"Value": true,
|
||||
"__name__": true,
|
||||
"endpoint": true,
|
||||
"job": true,
|
||||
"service": true,
|
||||
"instance": true,
|
||||
"namespace": true
|
||||
},
|
||||
"renameByName": {
|
||||
"name": "Operator",
|
||||
"condition": "Condition",
|
||||
"reason": "Reason"
|
||||
},
|
||||
"indexByName": { "name": 0, "condition": 1, "reason": 2 }
|
||||
}
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": { "align": "left", "filterable": true },
|
||||
"noValue": "—"
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Condition" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "custom.width", "value": 140 },
|
||||
{
|
||||
"id": "mappings",
|
||||
"value": [{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"Degraded": { "text": "Degraded", "color": "dark-red", "index": 0 },
|
||||
"Progressing": { "text": "Progressing", "color": "dark-yellow", "index": 1 }
|
||||
}
|
||||
}]
|
||||
}
|
||||
]
|
||||
},
|
||||
{ "matcher": { "id": "byName", "options": "Operator" }, "properties": [{ "id": "custom.width", "value": 240 }] },
|
||||
{ "matcher": { "id": "byName", "options": "Reason" }, "properties": [{ "id": "custom.width", "value": 220 }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"sortBy": [{ "desc": false, "displayName": "Condition" }],
|
||||
"footer": { "show": false }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 44 }
|
||||
}
|
||||
|
||||
]
|
||||
}
|
||||
@@ -1,49 +0,0 @@
|
||||
# These are probably already created by rook-ceph operator, not sure, needs to validate.
|
||||
# in fact, 100% sure for the second one (rook-ceph-exporter)
|
||||
# i over-wrote the first one (rook-ceph-mgr) with what is here, it was probably already working
|
||||
# all what was missing was a label on the rook-ceph namespace to tell prometheus to look for monitors in this namespace
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: ServiceMonitor
|
||||
metadata:
|
||||
name: rook-ceph-mgr
|
||||
namespace: rook-ceph
|
||||
labels:
|
||||
# This specific label is what tells OKD's Prometheus to pick this up
|
||||
openshift.io/cluster-monitoring: "true"
|
||||
spec:
|
||||
namespaceSelector:
|
||||
matchNames:
|
||||
- rook-ceph
|
||||
selector:
|
||||
matchLabels:
|
||||
# This matches your 'rook-ceph-mgr' service
|
||||
app: rook-ceph-mgr
|
||||
endpoints:
|
||||
- port: ""
|
||||
# The port name in your service is empty/integers, so we use targetPort
|
||||
targetPort: 9283
|
||||
path: /metrics
|
||||
interval: 30s
|
||||
---
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: ServiceMonitor
|
||||
metadata:
|
||||
name: rook-ceph-exporter
|
||||
namespace: rook-ceph
|
||||
labels:
|
||||
# This label is required for OKD cluster-wide monitoring to pick it up
|
||||
openshift.io/cluster-monitoring: "true"
|
||||
team: rook
|
||||
spec:
|
||||
endpoints:
|
||||
- honorLabels: true
|
||||
interval: 10s
|
||||
path: /metrics
|
||||
port: ceph-exporter-http-metrics
|
||||
namespaceSelector:
|
||||
matchNames:
|
||||
- rook-ceph
|
||||
selector:
|
||||
matchLabels:
|
||||
app: rook-ceph-exporter
|
||||
rook_cluster: rook-ceph
|
||||
@@ -1,23 +0,0 @@
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: Role
|
||||
metadata:
|
||||
name: rook-ceph-metrics-viewer
|
||||
namespace: rook-ceph
|
||||
rules:
|
||||
- apiGroups: [""]
|
||||
resources: ["services", "endpoints", "pods"]
|
||||
verbs: ["get", "list", "watch"]
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: RoleBinding
|
||||
metadata:
|
||||
name: rook-ceph-metrics-viewer
|
||||
namespace: rook-ceph
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: Role
|
||||
name: rook-ceph-metrics-viewer
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: prometheus-k8s
|
||||
namespace: openshift-monitoring
|
||||
@@ -1,7 +0,0 @@
|
||||
apiVersion: v1
|
||||
kind: Namespace
|
||||
metadata:
|
||||
name: rook-ceph
|
||||
labels:
|
||||
# This is the critical label that allows OKD Prometheus to see the namespace
|
||||
openshift.io/cluster-monitoring: "true"
|
||||
@@ -1,731 +0,0 @@
|
||||
{
|
||||
"title": "Alerts & Events — Active Problems",
|
||||
"uid": "okd-alerts-events",
|
||||
"schemaVersion": 36,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-3h", "to": "now" },
|
||||
"tags": ["okd", "alerts", "events"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "severity",
|
||||
"type": "custom",
|
||||
"label": "Severity Filter",
|
||||
"query": "critical,warning,info",
|
||||
"current": { "selected": true, "text": "All", "value": "$__all" },
|
||||
"includeAll": true,
|
||||
"allValue": "critical|warning|info",
|
||||
"multi": false,
|
||||
"options": [
|
||||
{ "selected": true, "text": "All", "value": "$__all" },
|
||||
{ "selected": false, "text": "Critical", "value": "critical" },
|
||||
{ "selected": false, "text": "Warning", "value": "warning" },
|
||||
{ "selected": false, "text": "Info", "value": "info" }
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "namespace",
|
||||
"type": "query",
|
||||
"label": "Namespace",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"query": { "query": "label_values(ALERTS{alertstate=\"firing\"}, namespace)", "refId": "A" },
|
||||
"refresh": 2,
|
||||
"includeAll": true,
|
||||
"allValue": ".*",
|
||||
"multi": true,
|
||||
"sort": 1,
|
||||
"current": {},
|
||||
"options": []
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
|
||||
{
|
||||
"id": 1, "type": "stat", "title": "Critical Alerts Firing",
|
||||
"description": "Alerting rule instances currently in the firing state with severity=\"critical\". Any non-zero value represents a breached SLO or infrastructure condition requiring immediate on-call response. The ALERTS metric is generated by Prometheus directly from your alerting rules — it reflects what Prometheus knows, before Alertmanager routing or silencing.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(ALERTS{alertstate=\"firing\",severity=\"critical\"}) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 2, "type": "stat", "title": "Warning Alerts Firing",
|
||||
"description": "Firing alerts at severity=\"warning\". Warnings indicate a degraded or elevated-risk condition that has not yet crossed the critical threshold. A sustained or growing warning count often precedes a critical fire — treat them as early-warning signals, not background noise.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(ALERTS{alertstate=\"firing\",severity=\"warning\"}) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "orange", "value": 5 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 3, "type": "stat", "title": "Info / Unclassified Alerts Firing",
|
||||
"description": "Firing alerts with severity=\"info\" or no severity label. These are informational and do not normally require immediate action. A sudden large jump may reveal noisy alerting rules generating alert fatigue — rules worth reviewing for threshold tuning or adding inhibition rules.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(ALERTS{alertstate=\"firing\",severity!~\"critical|warning\"}) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "blue", "value": 1 },
|
||||
{ "color": "blue", "value": 25 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 4, "type": "stat", "title": "Alerts Silenced (Suppressed)",
|
||||
"description": "Alerts currently matched by an active Alertmanager silence rule and therefore not routed to receivers. Silences are intentional during maintenance windows, but a large suppressed count outside of planned maintenance = an overly broad silence masking real problems. Zero silences when a maintenance window is active = the silence has expired or was misconfigured.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "sum(alertmanager_alerts{state=\"suppressed\"}) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 20 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 5, "type": "stat", "title": "CrashLoopBackOff Pods",
|
||||
"description": "Container instances currently waiting in the CrashLoopBackOff state — the container crashed and Kubernetes is retrying with exponential back-off. Each instance is a pod that cannot stay running. Common root causes: OOM kill, bad entrypoint, missing Secret or ConfigMap, an unavailable init dependency, or a broken image layer.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 3 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 6, "type": "stat", "title": "OOMKilled Containers",
|
||||
"description": "Containers whose most recent termination reason was OOMKilled. This is a current-state snapshot: a container that was OOMKilled, restarted, and is now Running will still appear here until its next termination occurs for a different reason. Non-zero and stable = recurring OOM, likely a workload memory leak or under-provisioned memory limit.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_pod_container_status_last_terminated_reason{reason=\"OOMKilled\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "orange", "value": 1 },
|
||||
{ "color": "red", "value": 5 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 7, "type": "stat", "title": "NotReady Nodes",
|
||||
"description": "Nodes where the Ready condition is currently not True (False or Unknown). A NotReady node stops receiving new pod scheduling and, after the node eviction timeout (~5 min default), pods on it will be evicted. Control plane nodes going NotReady simultaneously = potential quorum loss. Any non-zero value is a tier-1 incident signal.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"true\"} == 0) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 8, "type": "stat", "title": "Degraded Cluster Operators (OKD)",
|
||||
"description": "OKD ClusterOperators currently reporting Degraded=True. Each ClusterOperator owns a core platform component — authentication, networking, image-registry, monitoring, ingress, storage, etc. A degraded operator means its managed component is impaired or unavailable. Zero is the only acceptable steady-state value outside of an active upgrade.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{ "expr": "count(cluster_operator_conditions{condition=\"Degraded\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]},
|
||||
"unit": "short", "noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
|
||||
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 9, "type": "row", "title": "Alert Overview", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 10, "type": "timeseries", "title": "Firing Alert Count by Severity Over Time",
|
||||
"description": "Instantaneous count of firing ALERTS series grouped by severity over the selected window. A vertical rise = new alerting condition emerged. A horizontal plateau = a persistent, unresolved problem. A step-down = alert resolved or Prometheus rule evaluation stopped matching. Use the Severity Filter variable to narrow scope during triage.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "count by(severity)(ALERTS{alertstate=\"firing\",severity=~\"$severity\",namespace=~\"$namespace\"})",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{severity}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short", "min": 0,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "critical" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "warning" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
|
||||
{ "matcher": { "id": "byName", "options": "info" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["max", "lastNotNull"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 11, "type": "timeseries", "title": "Alertmanager Notification Rate by Integration",
|
||||
"description": "Rate of notification delivery attempts from Alertmanager per second, split by integration type (slack, pagerduty, email, webhook, etc.). Solid lines = successful deliveries; dashed red lines = failed deliveries. A drop to zero on all integrations = Alertmanager is not processing or the cluster is completely quiet. Persistent failures on one integration = check that receiver's credentials or endpoint availability.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [
|
||||
{ "expr": "sum by(integration)(rate(alertmanager_notifications_total[5m]))", "refId": "A", "legendFormat": "✓ {{integration}}" },
|
||||
{ "expr": "sum by(integration)(rate(alertmanager_notifications_failed_total[5m]))", "refId": "B", "legendFormat": "✗ {{integration}}" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps", "min": 0, "decimals": 3,
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byFrameRefID", "options": "B" },
|
||||
"properties": [
|
||||
{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } },
|
||||
{ "id": "custom.lineStyle", "value": { "dash": [6, 4], "fill": "dash" } },
|
||||
{ "id": "custom.lineWidth", "value": 1 }
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": { "mode": "multi", "sort": "desc" },
|
||||
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 12, "type": "bargauge", "title": "Longest-Firing Active Alerts",
|
||||
"description": "Duration (now - ALERTS_FOR_STATE timestamp) for each currently firing alert, sorted descending. Alerts at the top have been firing longest and are the most likely candidates for known-but-unresolved issues, stale firing conditions, or alerts that should have a silence applied. Red bars (> 2 hours) strongly suggest a problem that has been acknowledged but not resolved.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "sort_desc(time() - ALERTS_FOR_STATE{alertstate=\"firing\",severity=~\"$severity\",namespace=~\"$namespace\"})",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{alertname}} · {{severity}} · {{namespace}}"
|
||||
}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s", "min": 0,
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": { "mode": "absolute", "steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 300 },
|
||||
{ "color": "orange", "value": 1800 },
|
||||
{ "color": "red", "value": 7200 }
|
||||
]}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true,
|
||||
"valueMode": "color"
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 5 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 13, "type": "row", "title": "Active Firing Alerts — Full Detail", "collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 }
|
||||
},
|
||||
|
||||
{
|
||||
"id": 14, "type": "table", "title": "All Firing Alerts",
|
||||
"description": "Instant-query table of every currently firing alert visible to Prometheus, filtered by the Namespace and Severity variables above. Each row is one alert instance (unique label combination). The value column is omitted — by definition every row here is firing. Use the built-in column filter (funnel icon) to further narrow to a specific alertname, pod, or node. Columns are sparse: labels not defined in a given alert rule will show '—'.",
|
||||
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
|
||||
"targets": [{
|
||||
"expr": "ALERTS{alertstate=\"firing\",severity=~\"$severity\",namespace=~\"$namespace\"}",
|
||||
"refId": "A",
|
||||
"instant": true,
|
||||
"legendFormat": ""
|
||||
}],
|
||||
"transformations": [
|
||||
{ "id": "labelsToFields", "options": { "mode": "columns" } },
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"alertstate": true,
|
||||
"__name__": true,
|
||||
"Value": true,
|
||||
"Time": true
|
||||
},
|
||||
"renameByName": {
|
||||
"alertname": "Alert Name",
|
||||
"severity": "Severity",
|
||||
"namespace": "Namespace",
|
||||
"pod": "Pod",
|
||||
"node": "Node",
|
||||
"container": "Container",
|
||||
"job": "Job",
|
||||
"service": "Service",
|
||||
"reason": "Reason",
|
||||
"instance": "Instance"
|
||||
},
|
||||
"indexByName": {
|
||||
"severity": 0,
|
||||
"alertname": 1,
|
||||
"namespace": 2,
|
||||
"pod": 3,
|
||||
"node": 4,
|
||||
"container": 5,
|
||||
"job": 6,
|
||||
"service": 7,
|
||||
"reason": 8,
|
||||
"instance": 9
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": { "align": "left", "filterable": true },
|
||||
"noValue": "—"
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Severity" },
|
||||
"properties": [
|
||||
{ "id": "custom.displayMode", "value": "color-background" },
|
||||
{ "id": "custom.width", "value": 110 },
|
||||
{
|
||||
"id": "mappings",
|
||||
"value": [{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"critical": { "text": "CRITICAL", "color": "dark-red", "index": 0 },
|
||||
"warning": { "text": "WARNING", "color": "dark-yellow", "index": 1 },
|
||||
"info": { "text": "INFO", "color": "dark-blue", "index": 2 }
|
||||
}
|
||||
}]
|
||||
}
|
||||
]
|
||||
},
|
||||
{ "matcher": { "id": "byName", "options": "Alert Name" }, "properties": [{ "id": "custom.width", "value": 300 }] },
|
||||
{ "matcher": { "id": "byName", "options": "Namespace" }, "properties": [{ "id": "custom.width", "value": 180 }] },
|
||||
{ "matcher": { "id": "byName", "options": "Pod" }, "properties": [{ "id": "custom.width", "value": 200 }] },
|
||||
{ "matcher": { "id": "byName", "options": "Node" }, "properties": [{ "id": "custom.width", "value": 200 }] }
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"sortBy": [{ "desc": false, "displayName": "Severity" }],
|
||||
"footer": { "show": false }
|
||||
},
|
||||
"gridPos": { "h": 12, "w": 24, "x": 0, "y": 14 }
|
||||
},
|
||||

    {
      "id": 15, "type": "row", "title": "Kubernetes Warning Events", "collapsed": false,
      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 26 }
    },

    {
      "id": 16, "type": "timeseries", "title": "Warning Event Rate by Reason",
      "description": "Rate of Kubernetes Warning-type events per second grouped by reason code. BackOff = container is CrashLooping. FailedScheduling = no node satisfies pod constraints. FailedMount = volume attachment or CSI failure. Evicted = kubelet evicted a pod due to memory or disk pressure. NodeNotReady = node lost contact. A spike in a single reason narrows the incident root-cause immediately without needing to read raw event logs. Requires kube-state-metrics with --resources=events.",
      "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
      "targets": [{
        "expr": "topk(10, sum by(reason)(rate(kube_event_count{type=\"Warning\",namespace=~\"$namespace\"}[5m])))",
        "refId": "A",
        "legendFormat": "{{reason}}"
      }],
      "fieldConfig": {
        "defaults": {
          "unit": "reqps", "min": 0, "decimals": 4,
          "color": { "mode": "palette-classic" },
          "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
        }
      },
      "options": {
        "tooltip": { "mode": "multi", "sort": "desc" },
        "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
      },
      "gridPos": { "h": 8, "w": 8, "x": 0, "y": 27 }
    },

    {
      "id": 17, "type": "bargauge", "title": "Warning Events — Top Namespaces (Accumulated Count)",
      "description": "Total accumulated Warning event count (the count field on the Kubernetes Event object) per namespace, showing the top 15 most active. A namespace dominating this chart is generating significantly more abnormal conditions than its peers, useful for identifying noisy tenants, misconfigured deployments, or namespaces experiencing a persistent infrastructure problem. Note this is the raw Event.count field — it resets if the event object is deleted and recreated.",
      "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
      "targets": [{
        "expr": "topk(15, sum by(namespace)(kube_event_count{type=\"Warning\"}))",
        "refId": "A",
        "legendFormat": "{{namespace}}"
      }],
      "fieldConfig": {
        "defaults": {
          "unit": "short", "min": 0,
          "color": { "mode": "thresholds" },
          "thresholds": { "mode": "absolute", "steps": [
            { "color": "green", "value": null },
            { "color": "yellow", "value": 10 },
            { "color": "orange", "value": 50 },
            { "color": "red", "value": 200 }
          ]}
        }
      },
      "options": {
        "orientation": "horizontal",
        "reduceOptions": { "calcs": ["lastNotNull"] },
        "displayMode": "gradient",
        "showUnfilled": true
      },
      "gridPos": { "h": 8, "w": 8, "x": 8, "y": 27 }
    },

    {
      "id": 18, "type": "timeseries", "title": "Warning Events — Accumulated Count by Reason Over Time",
      "description": "Raw accumulated event count gauge over time, split by reason. Unlike the rate panel this shows total volume and slope simultaneously. A line that climbs steeply = events are occurring frequently right now. A line that plateaus = the condition causing that reason has stopped. A line that drops to zero = the event object was deleted and recreated or the condition fully resolved.",
      "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
      "targets": [{
        "expr": "topk(10, sum by(reason)(kube_event_count{type=\"Warning\",namespace=~\"$namespace\"}))",
        "refId": "A",
        "legendFormat": "{{reason}}"
      }],
      "fieldConfig": {
        "defaults": {
          "unit": "short", "min": 0,
          "color": { "mode": "palette-classic" },
          "custom": { "lineWidth": 2, "fillOpacity": 8, "showPoints": "never", "spanNulls": false }
        }
      },
      "options": {
        "tooltip": { "mode": "multi", "sort": "desc" },
        "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
      },
      "gridPos": { "h": 8, "w": 8, "x": 16, "y": 27 }
    },

    {
      "id": 19, "type": "row", "title": "Pod Problems", "collapsed": false,
      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 35 }
    },

    {
      "id": 20, "type": "timeseries", "title": "CrashLoopBackOff Pods by Namespace",
      "description": "Count of container instances in CrashLoopBackOff waiting state over time, broken down by namespace. A sudden rise in one namespace = a workload deployment is failing. A persistent baseline across many namespaces = a shared dependency (Secret, ConfigMap, network policy, or an upstream service) has become unavailable. Unlike restart rate, this panel shows the steady-state count of pods currently stuck — not flapping.",
      "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
      "targets": [{
        "expr": "sum by(namespace)(kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\",namespace=~\"$namespace\"} == 1)",
        "refId": "A",
        "legendFormat": "{{namespace}}"
      }],
      "fieldConfig": {
        "defaults": {
          "unit": "short", "min": 0,
          "color": { "mode": "palette-classic" },
          "custom": {
            "lineWidth": 2, "fillOpacity": 15, "showPoints": "never", "spanNulls": false,
            "thresholdsStyle": { "mode": "line" }
          },
          "thresholds": { "mode": "absolute", "steps": [
            { "color": "green", "value": null },
            { "color": "yellow", "value": 1 },
            { "color": "red", "value": 5 }
          ]}
        }
      },
      "options": {
        "tooltip": { "mode": "multi", "sort": "desc" },
        "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
      },
      "gridPos": { "h": 7, "w": 8, "x": 0, "y": 36 }
    },

    {
      "id": 21, "type": "timeseries", "title": "Container Restart Rate by Namespace",
      "description": "Rate of container restarts per second across all reasons (OOMKill, liveness probe failure, process exit) grouped by namespace. A namespace with a rising restart rate that has not yet entered CrashLoopBackOff is in the early failure window before the exponential back-off penalty kicks in. Cross-reference with the OOMKilled stat tile and the last-terminated-reason to separate crash types.",
      "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
      "targets": [{
        "expr": "topk(10, sum by(namespace)(rate(kube_pod_container_status_restarts_total{namespace=~\"$namespace\"}[5m])))",
        "refId": "A",
        "legendFormat": "{{namespace}}"
      }],
      "fieldConfig": {
        "defaults": {
          "unit": "short", "min": 0, "decimals": 4,
          "color": { "mode": "palette-classic" },
          "custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false }
        }
      },
      "options": {
        "tooltip": { "mode": "multi", "sort": "desc" },
        "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
      },
      "gridPos": { "h": 7, "w": 8, "x": 8, "y": 36 }
    },

    {
      "id": 22, "type": "timeseries", "title": "Pods by Problem Phase (Failed / Pending / Unknown)",
      "description": "Count of pods in Failed, Pending, or Unknown phase over time. Failed = container terminated with a non-zero exit code or was evicted and not rescheduled. Pending for more than a few minutes = scheduler unable to bind the pod (check FailedScheduling events, node capacity, and taint/toleration mismatches). Unknown = kubelet is not reporting to the apiserver, typically indicating a node network partition or kubelet crash.",
      "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
      "targets": [
        { "expr": "sum by(phase)(kube_pod_status_phase{phase=~\"Failed|Unknown\",namespace=~\"$namespace\"} == 1)", "refId": "A", "legendFormat": "{{phase}}" },
        { "expr": "sum(kube_pod_status_phase{phase=\"Pending\",namespace=~\"$namespace\"} == 1)", "refId": "B", "legendFormat": "Pending" }
      ],
      "fieldConfig": {
        "defaults": {
          "unit": "short", "min": 0,
          "color": { "mode": "palette-classic" },
          "custom": {
            "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false,
            "thresholdsStyle": { "mode": "line" }
          },
          "thresholds": { "mode": "absolute", "steps": [
            { "color": "green", "value": null },
            { "color": "red", "value": 1 }
          ]}
        },
        "overrides": [
          { "matcher": { "id": "byName", "options": "Failed" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] },
          { "matcher": { "id": "byName", "options": "Pending" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
          { "matcher": { "id": "byName", "options": "Unknown" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] }
        ]
      },
      "options": {
        "tooltip": { "mode": "multi", "sort": "desc" },
        "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
      },
      "gridPos": { "h": 7, "w": 8, "x": 16, "y": 36 }
    },

    {
      "id": 23, "type": "row", "title": "Node & Cluster Operator Conditions", "collapsed": false,
      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 43 }
    },

    {
      "id": 24, "type": "table", "title": "Node Condition Status Matrix",
      "description": "Instant snapshot of every active node condition across all nodes. Each row is one (node, condition, status) triple where value=1, meaning that combination is currently true. Ready=true is the normal healthy state; MemoryPressure=true, DiskPressure=true, PIDPressure=true, and NetworkUnavailable=true all indicate problem states that will affect pod scheduling on that node. Use the column filter to show only conditions where status=\"true\" and condition != \"Ready\" to isolate problems quickly.",
      "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
      "targets": [{
        "expr": "kube_node_status_condition == 1",
        "refId": "A",
        "instant": true,
        "legendFormat": ""
      }],
      "transformations": [
        { "id": "labelsToFields", "options": { "mode": "columns" } },
        {
          "id": "organize",
          "options": {
            "excludeByName": {
              "Time": true,
              "Value": true,
              "__name__": true,
              "endpoint": true,
              "job": true,
              "service": true,
              "instance": true
            },
            "renameByName": {
              "node": "Node",
              "condition": "Condition",
              "status": "Status"
            },
            "indexByName": { "node": 0, "condition": 1, "status": 2 }
          }
        }
      ],
      "fieldConfig": {
        "defaults": {
          "custom": { "align": "left", "filterable": true },
          "noValue": "—"
        },
        "overrides": [
          {
            "matcher": { "id": "byName", "options": "Status" },
            "properties": [
              { "id": "custom.displayMode", "value": "color-background" },
              { "id": "custom.width", "value": 90 },
              {
                "id": "mappings",
                "value": [{
                  "type": "value",
                  "options": {
                    "true": { "text": "true", "color": "green", "index": 0 },
                    "false": { "text": "false", "color": "dark-red", "index": 1 },
                    "unknown": { "text": "unknown", "color": "dark-orange", "index": 2 }
                  }
                }]
              }
            ]
          },
          {
            "matcher": { "id": "byName", "options": "Condition" },
            "properties": [
              { "id": "custom.width", "value": 190 },
              { "id": "custom.displayMode", "value": "color-text" },
              {
                "id": "mappings",
                "value": [{
                  "type": "value",
                  "options": {
                    "Ready": { "color": "green", "index": 0 },
                    "MemoryPressure": { "color": "red", "index": 1 },
                    "DiskPressure": { "color": "red", "index": 2 },
                    "PIDPressure": { "color": "red", "index": 3 },
                    "NetworkUnavailable": { "color": "red", "index": 4 }
                  }
                }]
              }
            ]
          },
          { "matcher": { "id": "byName", "options": "Node" }, "properties": [{ "id": "custom.width", "value": 230 }] }
        ]
      },
      "options": {
        "sortBy": [{ "desc": false, "displayName": "Node" }],
        "footer": { "show": false }
      },
      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 44 }
    },

    {
      "id": 25, "type": "table", "title": "Cluster Operator Conditions — Degraded & Progressing (OKD)",
      "description": "Shows only ClusterOperator conditions that indicate a problem state: Degraded=True (operator has failed to achieve its desired state) or Progressing=True (operator is actively reconciling — normal during upgrades but alarming in steady state). Operators not appearing in this table are healthy. The reason column gives the operator's own explanation for the condition, which maps directly to the relevant operator log stream and OpenShift runbook.",
      "datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
      "targets": [
        {
          "expr": "cluster_operator_conditions{condition=\"Degraded\"} == 1",
          "refId": "A",
          "instant": true,
          "legendFormat": ""
        },
        {
          "expr": "cluster_operator_conditions{condition=\"Progressing\"} == 1",
          "refId": "B",
          "instant": true,
          "legendFormat": ""
        }
      ],
      "transformations": [
        { "id": "labelsToFields", "options": { "mode": "columns" } },
        {
          "id": "organize",
          "options": {
            "excludeByName": {
              "Time": true,
              "Value": true,
              "__name__": true,
              "endpoint": true,
              "job": true,
              "service": true,
              "instance": true,
              "namespace": true
            },
            "renameByName": {
              "name": "Operator",
              "condition": "Condition",
              "reason": "Reason"
            },
            "indexByName": { "name": 0, "condition": 1, "reason": 2 }
          }
        }
      ],
      "fieldConfig": {
        "defaults": {
          "custom": { "align": "left", "filterable": true },
          "noValue": "—"
        },
        "overrides": [
          {
            "matcher": { "id": "byName", "options": "Condition" },
            "properties": [
              { "id": "custom.displayMode", "value": "color-background" },
              { "id": "custom.width", "value": 140 },
              {
                "id": "mappings",
                "value": [{
                  "type": "value",
                  "options": {
                    "Degraded": { "text": "Degraded", "color": "dark-red", "index": 0 },
                    "Progressing": { "text": "Progressing", "color": "dark-yellow", "index": 1 }
                  }
                }]
              }
            ]
          },
          { "matcher": { "id": "byName", "options": "Operator" }, "properties": [{ "id": "custom.width", "value": 240 }] },
          { "matcher": { "id": "byName", "options": "Reason" }, "properties": [{ "id": "custom.width", "value": 220 }] }
        ]
      },
      "options": {
        "sortBy": [{ "desc": false, "displayName": "Condition" }],
        "footer": { "show": false }
      },
      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 44 }
    }

  ]
}