Compare commits

..

15 Commits

Author SHA1 Message Date
c4dd0b0cf2 chore: cleaned up some dead code, comments, etc
All checks were successful
Run Check Script / check (pull_request) Successful in 1m39s
2026-02-26 16:06:14 -05:00
b14b41d172 refactor: prometheus alert sender
All checks were successful
Run Check Script / check (pull_request) Successful in 1m40s
2026-02-26 15:10:28 -05:00
5e861cfc6d refactor: skeleton structure for grafana observability
All checks were successful
Run Check Script / check (pull_request) Successful in 1m36s
2026-02-26 14:38:28 -05:00
4fad077eb4 refactor(kubeprometheus): implemented Observability for KubePrometheus
All checks were successful
Run Check Script / check (pull_request) Successful in 1m38s
2026-02-26 13:07:28 -05:00
d80561e326 wip(kubeprometheus): created base scores for kubeprometheus alert receivers, scrape_targets and rules
Some checks failed
Run Check Script / check (pull_request) Failing after 37s
2026-02-25 16:16:33 -05:00
621aed4903 wip: refactoring kubeprometheus
Some checks failed
Run Check Script / check (pull_request) Failing after 10m18s
2026-02-25 15:48:12 -05:00
e68426cc3d feat: added implementation for prometheus node exporter external scrape target for openshift cluster alert sender. added alerting rule to return high http error rate
Some checks failed
Run Check Script / check (pull_request) Failing after 39s
2026-02-25 14:54:10 -05:00
0c1c8daf13 wip: working alert rule for okd
Some checks failed
Run Check Script / check (pull_request) Failing after 1m31s
2026-02-24 16:13:30 -05:00
4b5e3a52a1 feat: working example of enabling and adding an alert receiver for okd_cluster_alerts
All checks were successful
Run Check Script / check (pull_request) Successful in 1m42s
2026-02-24 11:14:47 -05:00
c54936d19f fix: added check to verify if cluster monitoring is enabled
Some checks failed
Run Check Script / check (pull_request) Failing after 40s
2026-02-23 16:07:52 -05:00
699822af74 chore: reorganized file location
All checks were successful
Run Check Script / check (pull_request) Successful in 2m14s
2026-02-23 15:03:55 -05:00
554c94f5a9 wip: compiles
All checks were successful
Run Check Script / check (pull_request) Successful in 2m9s
2026-02-23 14:48:05 -05:00
836db9e6b1 wip: refactored redhat cluster observability operator
Some checks failed
Run Check Script / check (pull_request) Failing after 41s
2026-02-23 13:18:40 -05:00
bc6a41d40c wip: removed use of installable trait, added all installation and ensure ready functions to the trait monitor, first impl of AlertReceiver for OpenshiftClusterAlertSender
Some checks failed
Run Check Script / check (pull_request) Failing after -22s
2026-02-20 12:49:55 -05:00
8d446ec2e4 wip: refactoring monitoring
Some checks failed
Run Check Script / check (pull_request) Failing after -14s
2026-02-19 16:25:59 -05:00
226 changed files with 8248 additions and 23415 deletions

Cargo.lock (generated) · 2600 changed lines

File diff suppressed because it is too large.

View File

@@ -2,6 +2,7 @@
resolver = "2"
members = [
"private_repos/*",
"examples/*",
"harmony",
"harmony_types",
"harmony_macros",
@@ -16,9 +17,9 @@ members = [
"harmony_secret_derive",
"harmony_secret",
"adr/agent_discovery/mdns",
"brocade",
"harmony_agent",
"harmony_agent/deploy", "harmony_node_readiness", "harmony-k8s",
"brocade",
"harmony_agent",
"harmony_agent/deploy",
]
[workspace.package]
@@ -37,8 +38,6 @@ tokio = { version = "1.40", features = [
"macros",
"rt-multi-thread",
] }
tokio-retry = "0.3.0"
tokio-util = "0.7.15"
cidr = { features = ["serde"], version = "0.2" }
russh = "0.45"
russh-keys = "0.45"

View File

@@ -1,7 +1,8 @@
use super::BrocadeClient;
use crate::{
BrocadeInfo, Error, ExecutionMode, InterSwitchLink, InterfaceInfo, MacAddressEntry,
PortChannelId, PortOperatingMode, parse_brocade_mac_address, shell::BrocadeShell,
PortChannelId, PortOperatingMode, SecurityLevel, parse_brocade_mac_address,
shell::BrocadeShell,
};
use async_trait::async_trait;

View File

@@ -8,7 +8,7 @@ use regex::Regex;
use crate::{
BrocadeClient, BrocadeInfo, Error, ExecutionMode, InterSwitchLink, InterfaceInfo,
InterfaceStatus, InterfaceType, MacAddressEntry, PortChannelId, PortOperatingMode,
parse_brocade_mac_address, shell::BrocadeShell,
SecurityLevel, parse_brocade_mac_address, shell::BrocadeShell,
};
#[derive(Debug)]

View File

@@ -1,8 +1,8 @@
use harmony::{
inventory::Inventory,
modules::cert_manager::{
capability::CertificateManagementConfig, score_certificate::CertificateScore,
score_issuer::CertificateIssuerScore,
capability::CertificateManagementConfig, score_cert_management::CertificateManagementScore,
score_certificate::CertificateScore, score_issuer::CertificateIssuerScore,
},
topology::K8sAnywhereTopology,
};

View File

@@ -1,16 +0,0 @@
[workspace]
[package]
name = "example-cluster-dashboards"
edition = "2021"
version = "0.1.0"
license = "GNU AGPL v3"
publish = false
[dependencies]
harmony = { path = "../../harmony" }
harmony_cli = { path = "../../harmony_cli" }
harmony_types = { path = "../../harmony_types" }
tokio = { version = "1.40", features = ["macros", "rt-multi-thread"] }
log = "0.4"
env_logger = "0.11"

View File

@@ -1,21 +0,0 @@
use harmony::{
inventory::Inventory,
modules::monitoring::cluster_dashboards::ClusterDashboardsScore,
topology::K8sAnywhereTopology,
};
#[tokio::main]
async fn main() {
harmony_cli::cli_logger::init();
let cluster_dashboards_score = ClusterDashboardsScore::default();
harmony_cli::run(
Inventory::autoload(),
K8sAnywhereTopology::from_env(),
vec![Box::new(cluster_dashboards_score)],
None,
)
.await
.unwrap();
}

View File

@@ -10,10 +10,9 @@ publish = false
harmony = { path = "../../harmony" }
harmony_cli = { path = "../../harmony_cli" }
harmony_types = { path = "../../harmony_types" }
harmony_macros = { path = "../../harmony_macros" }
harmony-k8s = { path = "../../harmony-k8s" }
cidr.workspace = true
tokio.workspace = true
harmony_macros = { path = "../../harmony_macros" }
log.workspace = true
env_logger.workspace = true
url.workspace = true

View File

@@ -1,6 +1,6 @@
use std::time::Duration;
use harmony_k8s::{DrainOptions, K8sClient};
use harmony::topology::k8s::{DrainOptions, K8sClient};
use log::{info, trace};
#[tokio::main]
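For context, a hypothetical sketch of how the relocated drain/reboot API might be exercised after this import change. It is based only on the harmony::topology::k8s import above and on the DrainOptions/reboot_node/try_default signatures visible in the deleted harmony-k8s sources further down in this diff; the node name, timeout, and overall flow are illustrative, not the actual example body.

use std::time::Duration;

use harmony::topology::k8s::{DrainOptions, K8sClient};
use log::info;

#[tokio::main]
async fn main() {
    env_logger::init();
    // Assumes a reachable cluster; try_default() mirrors the constructor used
    // in the (deleted) harmony-k8s tests shown later in this diff.
    let client = K8sClient::try_default().await.expect("kube client");
    let node = "worker-0"; // illustrative node name
    // reboot_node performs drain -> reboot -> wait for Ready -> uncordon.
    client
        .reboot_node(node, &DrainOptions::default(), Duration::from_secs(900))
        .await
        .expect("reboot failed");
    info!("node {node} rebooted");
}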

View File

@@ -10,10 +10,9 @@ publish = false
harmony = { path = "../../harmony" }
harmony_cli = { path = "../../harmony_cli" }
harmony_types = { path = "../../harmony_types" }
harmony_macros = { path = "../../harmony_macros" }
harmony-k8s = { path = "../../harmony-k8s" }
cidr.workspace = true
tokio.workspace = true
harmony_macros = { path = "../../harmony_macros" }
log.workspace = true
env_logger.workspace = true
url.workspace = true

View File

@@ -1,4 +1,4 @@
use harmony_k8s::{K8sClient, NodeFile};
use harmony::topology::k8s::{DrainOptions, K8sClient, NodeFile};
use log::{info, trace};
#[tokio::main]

View File

@@ -1,37 +1,45 @@
use std::collections::HashMap;
use std::{
collections::HashMap,
sync::{Arc, Mutex},
};
use harmony::{
inventory::Inventory,
modules::{
monitoring::{
alert_channel::discord_alert_channel::DiscordWebhook,
alert_rule::prometheus_alert_rule::AlertManagerRuleGroup,
kube_prometheus::{
helm_prometheus_alert_score::HelmPrometheusAlertingScore,
types::{
HTTPScheme, MatchExpression, Operator, Selector, ServiceMonitor,
ServiceMonitorEndpoint,
modules::monitoring::{
alert_channel::discord_alert_channel::DiscordReceiver,
alert_rule::{
alerts::{
infra::dell_server::{
alert_global_storage_status_critical,
alert_global_storage_status_non_recoverable,
global_storage_status_degraded_non_critical,
},
k8s::pvc::high_pvc_fill_rate_over_two_days,
},
prometheus_alert_rule::AlertManagerRuleGroup,
},
prometheus::alerts::{
infra::dell_server::{
alert_global_storage_status_critical, alert_global_storage_status_non_recoverable,
global_storage_status_degraded_non_critical,
kube_prometheus::{
helm::config::KubePrometheusConfig,
kube_prometheus_alerting_score::KubePrometheusAlertingScore,
types::{
HTTPScheme, MatchExpression, Operator, Selector, ServiceMonitor,
ServiceMonitorEndpoint,
},
k8s::pvc::high_pvc_fill_rate_over_two_days,
},
},
topology::K8sAnywhereTopology,
topology::{K8sAnywhereTopology, monitoring::AlertRoute},
};
use harmony_types::{k8s_name::K8sName, net::Url};
#[tokio::main]
async fn main() {
let discord_receiver = DiscordWebhook {
name: K8sName("test-discord".to_string()),
let receiver_name = "test-discord".to_string();
let discord_receiver = DiscordReceiver {
name: receiver_name.clone(),
url: Url::Url(url::Url::parse("https://discord.doesnt.exist.com").unwrap()),
selectors: vec![],
route: AlertRoute {
..AlertRoute::default(receiver_name)
},
};
let high_pvc_fill_rate_over_two_days_alert = high_pvc_fill_rate_over_two_days();
@@ -70,10 +78,15 @@ async fn main() {
endpoints: vec![service_monitor_endpoint],
..Default::default()
};
let alerting_score = HelmPrometheusAlertingScore {
let config = Arc::new(Mutex::new(KubePrometheusConfig::new()));
let alerting_score = KubePrometheusAlertingScore {
receivers: vec![Box::new(discord_receiver)],
rules: vec![Box::new(additional_rules), Box::new(additional_rules2)],
service_monitors: vec![service_monitor],
scrape_targets: None,
config,
};
harmony_cli::run(

View File

@@ -1,24 +1,32 @@
use std::{collections::HashMap, str::FromStr};
use std::{
collections::HashMap,
str::FromStr,
sync::{Arc, Mutex},
};
use harmony::{
inventory::Inventory,
modules::{
monitoring::{
alert_channel::discord_alert_channel::DiscordWebhook,
alert_rule::prometheus_alert_rule::AlertManagerRuleGroup,
alert_channel::discord_alert_channel::DiscordReceiver,
alert_rule::{
alerts::k8s::pvc::high_pvc_fill_rate_over_two_days,
prometheus_alert_rule::AlertManagerRuleGroup,
},
kube_prometheus::{
helm_prometheus_alert_score::HelmPrometheusAlertingScore,
helm::config::KubePrometheusConfig,
kube_prometheus_alerting_score::KubePrometheusAlertingScore,
types::{
HTTPScheme, MatchExpression, Operator, Selector, ServiceMonitor,
ServiceMonitorEndpoint,
},
},
},
prometheus::alerts::k8s::pvc::high_pvc_fill_rate_over_two_days,
tenant::TenantScore,
},
topology::{
K8sAnywhereTopology,
monitoring::AlertRoute,
tenant::{ResourceLimits, TenantConfig, TenantNetworkPolicy},
},
};
@@ -42,10 +50,13 @@ async fn main() {
},
};
let discord_receiver = DiscordWebhook {
name: K8sName("test-discord".to_string()),
let receiver_name = "test-discord".to_string();
let discord_receiver = DiscordReceiver {
name: receiver_name.clone(),
url: Url::Url(url::Url::parse("https://discord.doesnt.exist.com").unwrap()),
selectors: vec![],
route: AlertRoute {
..AlertRoute::default(receiver_name)
},
};
let high_pvc_fill_rate_over_two_days_alert = high_pvc_fill_rate_over_two_days();
@@ -74,10 +85,14 @@ async fn main() {
..Default::default()
};
let alerting_score = HelmPrometheusAlertingScore {
let config = Arc::new(Mutex::new(KubePrometheusConfig::new()));
let alerting_score = KubePrometheusAlertingScore {
receivers: vec![Box::new(discord_receiver)],
rules: vec![Box::new(additional_rules)],
service_monitors: vec![service_monitor],
scrape_targets: None,
config,
};
harmony_cli::run(

View File

@@ -1,16 +0,0 @@
[package]
name = "example-node-health"
edition = "2024"
version.workspace = true
readme.workspace = true
license.workspace = true
publish = false
[dependencies]
harmony = { path = "../../harmony" }
harmony_cli = { path = "../../harmony_cli" }
harmony_types = { path = "../../harmony_types" }
tokio = { workspace = true }
harmony_macros = { path = "../../harmony_macros" }
log = { workspace = true }
env_logger = { workspace = true }

View File

@@ -1,17 +0,0 @@
use harmony::{
inventory::Inventory, modules::node_health::NodeHealthScore, topology::K8sAnywhereTopology,
};
#[tokio::main]
async fn main() {
let node_health = NodeHealthScore {};
harmony_cli::run(
Inventory::autoload(),
K8sAnywhereTopology::from_env(),
vec![Box::new(node_health)],
None,
)
.await
.unwrap();
}

View File

@@ -1,35 +1,64 @@
use std::collections::HashMap;
use harmony::{
inventory::Inventory,
modules::monitoring::{
alert_channel::discord_alert_channel::DiscordWebhook,
okd::cluster_monitoring::OpenshiftClusterAlertScore,
alert_channel::discord_alert_channel::DiscordReceiver,
alert_rule::{
alerts::{
infra::opnsense::high_http_error_rate, k8s::pvc::high_pvc_fill_rate_over_two_days,
},
prometheus_alert_rule::AlertManagerRuleGroup,
},
okd::openshift_cluster_alerting_score::OpenshiftClusterAlertScore,
scrape_target::prometheus_node_exporter::PrometheusNodeExporter,
},
topology::{
K8sAnywhereTopology,
monitoring::{AlertMatcher, AlertRoute, MatchOp},
},
topology::K8sAnywhereTopology,
};
use harmony_macros::hurl;
use harmony_types::k8s_name::K8sName;
use harmony_macros::{hurl, ip};
#[tokio::main]
async fn main() {
let mut sel = HashMap::new();
sel.insert(
"openshift_io_alert_source".to_string(),
"platform".to_string(),
);
let mut sel2 = HashMap::new();
sel2.insert("openshift_io_alert_source".to_string(), "".to_string());
let selectors = vec![sel, sel2];
let platform_matcher = AlertMatcher {
label: "prometheus".to_string(),
operator: MatchOp::Eq,
value: "openshift-monitoring/k8s".to_string(),
};
let severity = AlertMatcher {
label: "severity".to_string(),
operator: MatchOp::Eq,
value: "critical".to_string(),
};
let high_http_error_rate = high_http_error_rate();
let additional_rules = AlertManagerRuleGroup::new("", vec![high_http_error_rate]);
let scrape_target = PrometheusNodeExporter {
job_name: "firewall".to_string(),
metrics_path: "/metrics".to_string(),
listen_address: ip!("127.0.0.1"),
port: 9100,
..Default::default()
};
harmony_cli::run(
Inventory::autoload(),
K8sAnywhereTopology::from_env(),
vec![Box::new(OpenshiftClusterAlertScore {
receivers: vec![Box::new(DiscordWebhook {
name: K8sName("wills-discord-webhook-example".to_string()),
url: hurl!("https://something.io"),
selectors: selectors,
receivers: vec![Box::new(DiscordReceiver {
name: "crit-wills-discord-channel-example".to_string(),
url: hurl!("https://test.io"),
route: AlertRoute {
matchers: vec![severity],
..AlertRoute::default("crit-wills-discord-channel-example".to_string())
},
})],
sender: harmony::modules::monitoring::okd::OpenshiftClusterAlertSender,
rules: vec![Box::new(additional_rules)],
scrape_targets: Some(vec![Box::new(scrape_target)]),
})],
None,
)

View File

@@ -6,10 +6,7 @@ use harmony::{
data::{FileContent, FilePath},
modules::{
inventory::HarmonyDiscoveryStrategy,
okd::{
installation::OKDInstallationPipeline, ipxe::OKDIpxeScore,
load_balancer::OKDLoadBalancerScore,
},
okd::{installation::OKDInstallationPipeline, ipxe::OKDIpxeScore},
},
score::Score,
topology::HAClusterTopology,
@@ -35,7 +32,6 @@ async fn main() {
scores
.append(&mut OKDInstallationPipeline::get_all_scores(HarmonyDiscoveryStrategy::MDNS).await);
scores.push(Box::new(OKDLoadBalancerScore::new(&topology)));
harmony_cli::run(inventory, topology, scores, None)
.await
.unwrap();

View File

@@ -1,13 +1,63 @@
use std::str::FromStr;
use harmony::{
inventory::Inventory, modules::openbao::OpenbaoScore, topology::K8sAnywhereTopology,
inventory::Inventory,
modules::helm::chart::{HelmChartScore, HelmRepository, NonBlankString},
topology::K8sAnywhereTopology,
};
use harmony_macros::hurl;
#[tokio::main]
async fn main() {
let openbao = OpenbaoScore {
host: "openbao.sebastien.sto1.nationtech.io".to_string(),
let values_yaml = Some(
r#"server:
standalone:
enabled: true
config: |
listener "tcp" {
tls_disable = true
address = "[::]:8200"
cluster_address = "[::]:8201"
}
storage "file" {
path = "/openbao/data"
}
service:
enabled: true
dataStorage:
enabled: true
size: 10Gi
storageClass: null
accessMode: ReadWriteOnce
auditStorage:
enabled: true
size: 10Gi
storageClass: null
accessMode: ReadWriteOnce"#
.to_string(),
);
let openbao = HelmChartScore {
namespace: Some(NonBlankString::from_str("openbao").unwrap()),
release_name: NonBlankString::from_str("openbao").unwrap(),
chart_name: NonBlankString::from_str("openbao/openbao").unwrap(),
chart_version: None,
values_overrides: None,
values_yaml,
create_namespace: true,
install_only: true,
repository: Some(HelmRepository::new(
"openbao".to_string(),
hurl!("https://openbao.github.io/openbao-helm"),
true,
)),
};
// TODO exec pod commands to initialize secret store if not already done
harmony_cli::run(
Inventory::autoload(),
K8sAnywhereTopology::from_env(),

View File

@@ -1,3 +1,5 @@
use std::str::FromStr;
use harmony::{
inventory::Inventory,
modules::{k8s::apps::OperatorHubCatalogSourceScore, postgresql::CloudNativePgOperatorScore},
@@ -7,7 +9,7 @@ use harmony::{
#[tokio::main]
async fn main() {
let operatorhub_catalog = OperatorHubCatalogSourceScore::default();
let cnpg_operator = CloudNativePgOperatorScore::default_openshift();
let cnpg_operator = CloudNativePgOperatorScore::default();
harmony_cli::run(
Inventory::autoload(),

View File

@@ -1,13 +1,22 @@
use std::sync::Arc;
use std::{
net::{IpAddr, Ipv4Addr},
sync::Arc,
};
use async_trait::async_trait;
use cidr::Ipv4Cidr;
use harmony::{
executors::ExecutorError,
hardware::{HostCategory, Location, PhysicalHost, SwitchGroup},
infra::opnsense::OPNSenseManagementInterface,
inventory::Inventory,
modules::opnsense::node_exporter::NodeExporterScore,
topology::{PreparationError, PreparationOutcome, Topology, node_exporter::NodeExporter},
topology::{
HAClusterTopology, LogicalHost, PreparationError, PreparationOutcome, Topology,
UnmanagedRouter, node_exporter::NodeExporter,
},
};
use harmony_macros::ip;
use harmony_macros::{ip, ipv4, mac_address};
#[derive(Debug)]
struct OpnSenseTopology {

View File

@@ -1,7 +1,8 @@
use harmony::{
inventory::Inventory,
modules::postgresql::{
PostgreSQLConnectionScore, PublicPostgreSQLScore, capability::PostgreSQLConfig,
K8sPostgreSQLScore, PostgreSQLConnectionScore, PublicPostgreSQLScore,
capability::PostgreSQLConfig,
},
topology::K8sAnywhereTopology,
};

View File

@@ -1,4 +1,4 @@
use std::{path::PathBuf, sync::Arc};
use std::{collections::HashMap, path::PathBuf, sync::Arc};
use harmony::{
inventory::Inventory,
@@ -6,9 +6,9 @@ use harmony::{
application::{
ApplicationScore, RustWebFramework, RustWebapp, features::rhob_monitoring::Monitoring,
},
monitoring::alert_channel::discord_alert_channel::DiscordWebhook,
monitoring::alert_channel::discord_alert_channel::DiscordReceiver,
},
topology::K8sAnywhereTopology,
topology::{K8sAnywhereTopology, monitoring::AlertRoute},
};
use harmony_types::{k8s_name::K8sName, net::Url};
@@ -22,18 +22,21 @@ async fn main() {
service_port: 3000,
});
let discord_receiver = DiscordWebhook {
name: K8sName("test-discord".to_string()),
let receiver_name = "test-discord".to_string();
let discord_receiver = DiscordReceiver {
name: receiver_name.clone(),
url: Url::Url(url::Url::parse("https://discord.doesnt.exist.com").unwrap()),
selectors: vec![],
route: AlertRoute {
..AlertRoute::default(receiver_name)
},
};
let app = ApplicationScore {
features: vec![
Box::new(Monitoring {
application: application.clone(),
alert_receiver: vec![Box::new(discord_receiver)],
}),
// Box::new(Monitoring {
// application: application.clone(),
// alert_receiver: vec![Box::new(discord_receiver)],
// }),
// TODO add backups, multisite ha, etc
],
application,

View File

@@ -1,4 +1,4 @@
use std::{path::PathBuf, sync::Arc};
use std::{collections::HashMap, path::PathBuf, sync::Arc};
use harmony::{
inventory::Inventory,
@@ -8,13 +8,13 @@ use harmony::{
features::{Monitoring, PackagingDeployment},
},
monitoring::alert_channel::{
discord_alert_channel::DiscordWebhook, webhook_receiver::WebhookReceiver,
discord_alert_channel::DiscordReceiver, webhook_receiver::WebhookReceiver,
},
},
topology::K8sAnywhereTopology,
topology::{K8sAnywhereTopology, monitoring::AlertRoute},
};
use harmony_macros::hurl;
use harmony_types::k8s_name::K8sName;
use harmony_types::{k8s_name::K8sName, net::Url};
#[tokio::main]
async fn main() {
@@ -26,10 +26,13 @@ async fn main() {
service_port: 3000,
});
let discord_receiver = DiscordWebhook {
name: K8sName("test-discord".to_string()),
url: hurl!("https://discord.doesnt.exist.com"),
selectors: vec![],
let receiver_name = "test-discord".to_string();
let discord_receiver = DiscordReceiver {
name: receiver_name.clone(),
url: Url::Url(url::Url::parse("https://discord.doesnt.exist.com").unwrap()),
route: AlertRoute {
..AlertRoute::default(receiver_name)
},
};
let webhook_receiver = WebhookReceiver {
@@ -42,10 +45,10 @@ async fn main() {
Box::new(PackagingDeployment {
application: application.clone(),
}),
Box::new(Monitoring {
application: application.clone(),
alert_receiver: vec![Box::new(discord_receiver), Box::new(webhook_receiver)],
}),
// Box::new(Monitoring {
// application: application.clone(),
// alert_receiver: vec![Box::new(discord_receiver), Box::new(webhook_receiver)],
// }),
// TODO add backups, multisite ha, etc
],
application,

View File

@@ -1,11 +1,8 @@
use harmony::{
inventory::Inventory,
modules::{
application::{
ApplicationScore, RustWebFramework, RustWebapp,
features::{Monitoring, PackagingDeployment},
},
monitoring::alert_channel::discord_alert_channel::DiscordWebhook,
modules::application::{
ApplicationScore, RustWebFramework, RustWebapp,
features::{Monitoring, PackagingDeployment},
},
topology::K8sAnywhereTopology,
};
@@ -30,14 +27,14 @@ async fn main() {
Box::new(PackagingDeployment {
application: application.clone(),
}),
Box::new(Monitoring {
application: application.clone(),
alert_receiver: vec![Box::new(DiscordWebhook {
name: K8sName("test-discord".to_string()),
url: hurl!("https://discord.doesnt.exist.com"),
selectors: vec![],
})],
}),
// Box::new(Monitoring {
// application: application.clone(),
// alert_receiver: vec![Box::new(DiscordWebhook {
// name: K8sName("test-discord".to_string()),
// url: hurl!("https://discord.doesnt.exist.com"),
// selectors: vec![],
// })],
// }),
],
application,
};

View File

@@ -44,7 +44,6 @@ fn build_large_score() -> LoadBalancerScore {
],
listening_port: SocketAddr::V4(SocketAddrV4::new(ipv4!("192.168.0.0"), 49387)),
health_check: Some(HealthCheck::HTTP(
Some(1993),
"/some_long_ass_path_to_see_how_it_is_displayed_but_it_has_to_be_even_longer"
.to_string(),
HttpMethod::GET,

View File

@@ -1,14 +0,0 @@
[package]
name = "example-zitadel"
edition = "2024"
version.workspace = true
readme.workspace = true
license.workspace = true
[dependencies]
harmony = { path = "../../harmony" }
harmony_cli = { path = "../../harmony_cli" }
harmony_macros = { path = "../../harmony_macros" }
harmony_types = { path = "../../harmony_types" }
tokio.workspace = true
url.workspace = true

View File

@@ -1,20 +0,0 @@
use harmony::{
inventory::Inventory, modules::zitadel::ZitadelScore, topology::K8sAnywhereTopology,
};
#[tokio::main]
async fn main() {
let zitadel = ZitadelScore {
host: "sso.sto1.nationtech.io".to_string(),
zitadel_version: "v4.12.1".to_string(),
};
harmony_cli::run(
Inventory::autoload(),
K8sAnywhereTopology::from_env(),
vec![Box::new(zitadel)],
None,
)
.await
.unwrap();
}

Binary file not shown.

View File

@@ -1,23 +0,0 @@
[package]
name = "harmony-k8s"
edition = "2024"
version.workspace = true
readme.workspace = true
license.workspace = true
[dependencies]
kube.workspace = true
k8s-openapi.workspace = true
tokio.workspace = true
tokio-retry.workspace = true
serde.workspace = true
serde_json.workspace = true
serde_yaml.workspace = true
log.workspace = true
similar.workspace = true
reqwest.workspace = true
url.workspace = true
inquire.workspace = true
[dev-dependencies]
pretty_assertions.workspace = true

View File

@@ -1,593 +0,0 @@
use kube::{
Client, Error, Resource,
api::{
Api, ApiResource, DynamicObject, GroupVersionKind, Patch, PatchParams, PostParams,
ResourceExt,
},
core::ErrorResponse,
discovery::Scope,
error::DiscoveryError,
};
use log::{debug, error, trace, warn};
use serde::{Serialize, de::DeserializeOwned};
use serde_json::Value;
use similar::TextDiff;
use url::Url;
use crate::client::K8sClient;
use crate::helper;
use crate::types::WriteMode;
/// The field-manager token sent with every server-side apply request.
pub const FIELD_MANAGER: &str = "harmony-k8s";
// ── Private helpers ──────────────────────────────────────────────────────────
/// Serialise any `Serialize` payload to a [`DynamicObject`] via JSON.
fn to_dynamic<T: Serialize>(payload: &T) -> Result<DynamicObject, Error> {
serde_json::from_value(serde_json::to_value(payload).map_err(Error::SerdeError)?)
.map_err(Error::SerdeError)
}
/// Fetch the current resource, display a unified diff against `payload`, and
/// return `()`. All output goes to stdout (same behaviour as before).
///
/// A 404 is treated as "resource would be created" — not an error.
async fn show_dry_run<T: Serialize>(
api: &Api<DynamicObject>,
name: &str,
payload: &T,
) -> Result<(), Error> {
let new_yaml = serde_yaml::to_string(payload)
.unwrap_or_else(|_| "Failed to serialize new resource".to_string());
match api.get(name).await {
Ok(current) => {
println!("\nDry-run for resource: '{name}'");
let mut current_val = serde_yaml::to_value(&current).unwrap_or(serde_yaml::Value::Null);
if let Some(map) = current_val.as_mapping_mut() {
map.remove(&serde_yaml::Value::String("status".to_string()));
}
let current_yaml = serde_yaml::to_string(&current_val)
.unwrap_or_else(|_| "Failed to serialize current resource".to_string());
if current_yaml == new_yaml {
println!("No changes detected.");
} else {
println!("Changes detected:");
let diff = TextDiff::from_lines(&current_yaml, &new_yaml);
for change in diff.iter_all_changes() {
let sign = match change.tag() {
similar::ChangeTag::Delete => "-",
similar::ChangeTag::Insert => "+",
similar::ChangeTag::Equal => " ",
};
print!("{sign}{change}");
}
}
Ok(())
}
Err(Error::Api(ErrorResponse { code: 404, .. })) => {
println!("\nDry-run for new resource: '{name}'");
println!("Resource does not exist. Would be created:");
for line in new_yaml.lines() {
println!("+{line}");
}
Ok(())
}
Err(e) => {
error!("Failed to fetch resource '{name}' for dry-run: {e}");
Err(e)
}
}
}
/// Execute the real (non-dry-run) apply, respecting [`WriteMode`].
async fn do_apply<T: Serialize + std::fmt::Debug>(
api: &Api<DynamicObject>,
name: &str,
payload: &T,
patch_params: &PatchParams,
write_mode: &WriteMode,
) -> Result<DynamicObject, Error> {
match write_mode {
WriteMode::CreateOrUpdate => {
// TODO refactor this arm to perform self.update and if fail with 404 self.create
// This will avoid the repetition of the api.patch and api.create calls within this
// function body. This makes the code more maintainable
match api.patch(name, patch_params, &Patch::Apply(payload)).await {
Ok(obj) => Ok(obj),
Err(Error::Api(ErrorResponse { code: 404, .. })) => {
debug!("Resource '{name}' not found via SSA, falling back to POST");
let dyn_obj = to_dynamic(payload)?;
api.create(&PostParams::default(), &dyn_obj)
.await
.map_err(|e| {
error!("Failed to create '{name}': {e}");
e
})
}
Err(e) => {
error!("Failed to apply '{name}': {e}");
Err(e)
}
}
}
WriteMode::Create => {
let dyn_obj = to_dynamic(payload)?;
api.create(&PostParams::default(), &dyn_obj)
.await
.map_err(|e| {
error!("Failed to create '{name}': {e}");
e
})
}
WriteMode::Update => match api.patch(name, patch_params, &Patch::Apply(payload)).await {
Ok(obj) => Ok(obj),
Err(Error::Api(ErrorResponse { code: 404, .. })) => Err(Error::Api(ErrorResponse {
code: 404,
message: format!("Resource '{name}' not found and WriteMode is UpdateOnly"),
reason: "NotFound".to_string(),
status: "Failure".to_string(),
})),
Err(e) => {
error!("Failed to update '{name}': {e}");
Err(e)
}
},
}
}
// ── Public API ───────────────────────────────────────────────────────────────
impl K8sClient {
/// Server-side apply: create if absent, update if present.
/// Equivalent to `kubectl apply`.
pub async fn apply<K>(&self, resource: &K, namespace: Option<&str>) -> Result<K, Error>
where
K: Resource + Clone + std::fmt::Debug + DeserializeOwned + Serialize,
<K as Resource>::DynamicType: Default,
{
self.apply_with_strategy(resource, namespace, WriteMode::CreateOrUpdate)
.await
}
/// POST only — returns an error if the resource already exists.
pub async fn create<K>(&self, resource: &K, namespace: Option<&str>) -> Result<K, Error>
where
K: Resource + Clone + std::fmt::Debug + DeserializeOwned + Serialize,
<K as Resource>::DynamicType: Default,
{
self.apply_with_strategy(resource, namespace, WriteMode::Create)
.await
}
/// Server-side apply only — returns an error if the resource does not exist.
pub async fn update<K>(&self, resource: &K, namespace: Option<&str>) -> Result<K, Error>
where
K: Resource + Clone + std::fmt::Debug + DeserializeOwned + Serialize,
<K as Resource>::DynamicType: Default,
{
self.apply_with_strategy(resource, namespace, WriteMode::Update)
.await
}
pub async fn apply_with_strategy<K>(
&self,
resource: &K,
namespace: Option<&str>,
write_mode: WriteMode,
) -> Result<K, Error>
where
K: Resource + Clone + std::fmt::Debug + DeserializeOwned + Serialize,
<K as Resource>::DynamicType: Default,
{
debug!(
"apply_with_strategy: {:?} ns={:?}",
resource.meta().name,
namespace
);
trace!("{:#}", serde_json::to_value(resource).unwrap_or_default());
let dyntype = K::DynamicType::default();
let gvk = GroupVersionKind {
group: K::group(&dyntype).to_string(),
version: K::version(&dyntype).to_string(),
kind: K::kind(&dyntype).to_string(),
};
let discovery = self.discovery().await?;
let (ar, caps) = discovery.resolve_gvk(&gvk).ok_or_else(|| {
Error::Discovery(DiscoveryError::MissingResource(format!(
"Cannot resolve GVK: {gvk:?}"
)))
})?;
let effective_ns = if caps.scope == Scope::Cluster {
None
} else {
namespace.or_else(|| resource.meta().namespace.as_deref())
};
let api: Api<DynamicObject> =
get_dynamic_api(ar, caps, self.client.clone(), effective_ns, false);
let name = resource
.meta()
.name
.as_deref()
.expect("Kubernetes resource must have a name");
if self.dry_run {
show_dry_run(&api, name, resource).await?;
return Ok(resource.clone());
}
let patch_params = PatchParams::apply(FIELD_MANAGER);
do_apply(&api, name, resource, &patch_params, &write_mode)
.await
.and_then(helper::dyn_to_typed)
}
/// Applies resources in order, one at a time
pub async fn apply_many<K>(&self, resources: &[K], ns: Option<&str>) -> Result<Vec<K>, Error>
where
K: Resource + Clone + std::fmt::Debug + DeserializeOwned + Serialize,
<K as Resource>::DynamicType: Default,
{
let mut result = Vec::new();
for r in resources.iter() {
let res = self.apply(r, ns).await;
if res.is_err() {
// NOTE: this may log sensitive data; downgrade to debug if needed.
warn!(
"Failed to apply k8s resource: {}",
serde_json::to_string_pretty(r).map_err(Error::SerdeError)?
);
}
result.push(res?);
}
Ok(result)
}
/// Apply a [`DynamicObject`] resource using server-side apply.
pub async fn apply_dynamic(
&self,
resource: &DynamicObject,
namespace: Option<&str>,
force_conflicts: bool,
) -> Result<DynamicObject, Error> {
trace!("apply_dynamic {resource:#?} ns={namespace:?} force={force_conflicts}");
let discovery = self.discovery().await?;
let type_meta = resource.types.as_ref().ok_or_else(|| {
Error::BuildRequest(kube::core::request::Error::Validation(
"DynamicObject must have types (apiVersion and kind)".to_string(),
))
})?;
let gvk = GroupVersionKind::try_from(type_meta).map_err(|_| {
Error::BuildRequest(kube::core::request::Error::Validation(format!(
"Invalid GVK in DynamicObject: {type_meta:?}"
)))
})?;
let (ar, caps) = discovery.resolve_gvk(&gvk).ok_or_else(|| {
Error::Discovery(DiscoveryError::MissingResource(format!(
"Cannot resolve GVK: {gvk:?}"
)))
})?;
let effective_ns = if caps.scope == Scope::Cluster {
None
} else {
namespace.or_else(|| resource.metadata.namespace.as_deref())
};
let api = get_dynamic_api(ar, caps, self.client.clone(), effective_ns, false);
let name = resource.metadata.name.as_deref().ok_or_else(|| {
Error::BuildRequest(kube::core::request::Error::Validation(
"DynamicObject must have metadata.name".to_string(),
))
})?;
debug!(
"apply_dynamic kind={:?} name='{name}' ns={effective_ns:?}",
resource.types.as_ref().map(|t| &t.kind),
);
// NOTE would be nice to improve cohesion between the dynamic and typed apis and avoid copy
// pasting the dry_run and some more logic
if self.dry_run {
show_dry_run(&api, name, resource).await?;
return Ok(resource.clone());
}
let mut patch_params = PatchParams::apply(FIELD_MANAGER);
patch_params.force = force_conflicts;
do_apply(
&api,
name,
resource,
&patch_params,
&WriteMode::CreateOrUpdate,
)
.await
}
pub async fn apply_dynamic_many(
&self,
resources: &[DynamicObject],
namespace: Option<&str>,
force_conflicts: bool,
) -> Result<Vec<DynamicObject>, Error> {
let mut result = Vec::new();
for r in resources.iter() {
result.push(self.apply_dynamic(r, namespace, force_conflicts).await?);
}
Ok(result)
}
pub async fn apply_yaml_many(
&self,
#[allow(clippy::ptr_arg)] yaml: &Vec<serde_yaml::Value>,
ns: Option<&str>,
) -> Result<(), Error> {
for y in yaml.iter() {
self.apply_yaml(y, ns).await?;
}
Ok(())
}
pub async fn apply_yaml(
&self,
yaml: &serde_yaml::Value,
ns: Option<&str>,
) -> Result<(), Error> {
// NOTE wouldn't it be possible to parse this into a DynamicObject and simply call
// apply_dynamic instead of reimplementing api interactions?
let obj: DynamicObject =
serde_yaml::from_value(yaml.clone()).expect("YAML must deserialise to DynamicObject");
let name = obj.metadata.name.as_ref().expect("YAML must have a name");
let api_version = yaml["apiVersion"].as_str().expect("missing apiVersion");
let kind = yaml["kind"].as_str().expect("missing kind");
let mut it = api_version.splitn(2, '/');
let first = it.next().unwrap();
let (g, v) = match it.next() {
Some(second) => (first, second),
None => ("", first),
};
let api_resource = ApiResource::from_gvk(&GroupVersionKind::gvk(g, v, kind));
let namespace = ns.unwrap_or_else(|| {
obj.metadata
.namespace
.as_deref()
.expect("YAML must have a namespace when ns is not provided")
});
let api: Api<DynamicObject> =
Api::namespaced_with(self.client.clone(), namespace, &api_resource);
println!("Applying '{name}' in namespace '{namespace}'...");
let patch_params = PatchParams::apply(FIELD_MANAGER);
let result = api.patch(name, &patch_params, &Patch::Apply(&obj)).await?;
println!("Successfully applied '{}'.", result.name_any());
Ok(())
}
/// Equivalent to `kubectl apply -f <url>`.
pub async fn apply_url(&self, url: Url, ns: Option<&str>) -> Result<(), Error> {
let patch_params = PatchParams::apply(FIELD_MANAGER);
let discovery = self.discovery().await?;
let yaml = reqwest::get(url)
.await
.expect("Could not fetch URL")
.text()
.await
.expect("Could not read response body");
for doc in multidoc_deserialize(&yaml).expect("Failed to parse YAML from URL") {
let obj: DynamicObject =
serde_yaml::from_value(doc).expect("YAML document is not a valid object");
let namespace = obj.metadata.namespace.as_deref().or(ns);
let type_meta = obj.types.as_ref().expect("Object is missing TypeMeta");
let gvk =
GroupVersionKind::try_from(type_meta).expect("Object has invalid GroupVersionKind");
let name = obj.name_any();
if let Some((ar, caps)) = discovery.resolve_gvk(&gvk) {
let api = get_dynamic_api(ar, caps, self.client.clone(), namespace, false);
trace!(
"Applying {}:\n{}",
gvk.kind,
serde_yaml::to_string(&obj).unwrap_or_default()
);
let data: Value = serde_json::to_value(&obj).expect("serialisation failed");
let _r = api.patch(&name, &patch_params, &Patch::Apply(data)).await?;
debug!("Applied {} '{name}'", gvk.kind);
} else {
warn!("Skipping document with unknown GVK: {gvk:?}");
}
}
Ok(())
}
/// Build a dynamic API client from a [`DynamicObject`]'s type metadata.
pub(crate) fn get_api_for_dynamic_object(
&self,
object: &DynamicObject,
ns: Option<&str>,
) -> Result<Api<DynamicObject>, Error> {
let ar = object
.types
.as_ref()
.and_then(|t| {
let parts: Vec<&str> = t.api_version.split('/').collect();
match parts.as_slice() {
[version] => Some(ApiResource::from_gvk(&GroupVersionKind::gvk(
"", version, &t.kind,
))),
[group, version] => Some(ApiResource::from_gvk(&GroupVersionKind::gvk(
group, version, &t.kind,
))),
_ => None,
}
})
.ok_or_else(|| {
Error::BuildRequest(kube::core::request::Error::Validation(format!(
"Invalid apiVersion in DynamicObject: {object:#?}"
)))
})?;
Ok(match ns {
Some(ns) => Api::namespaced_with(self.client.clone(), ns, &ar),
None => Api::default_namespaced_with(self.client.clone(), &ar),
})
}
}
// ── Free functions ───────────────────────────────────────────────────────────
pub(crate) fn get_dynamic_api(
resource: kube::api::ApiResource,
capabilities: kube::discovery::ApiCapabilities,
client: Client,
ns: Option<&str>,
all: bool,
) -> Api<DynamicObject> {
if capabilities.scope == Scope::Cluster || all {
Api::all_with(client, &resource)
} else if let Some(namespace) = ns {
Api::namespaced_with(client, namespace, &resource)
} else {
Api::default_namespaced_with(client, &resource)
}
}
pub(crate) fn multidoc_deserialize(
data: &str,
) -> Result<Vec<serde_yaml::Value>, serde_yaml::Error> {
use serde::Deserialize;
let mut docs = vec![];
for de in serde_yaml::Deserializer::from_str(data) {
docs.push(serde_yaml::Value::deserialize(de)?);
}
Ok(docs)
}
// ── Tests ────────────────────────────────────────────────────────────────────
#[cfg(test)]
mod apply_tests {
use std::collections::BTreeMap;
use std::time::{SystemTime, UNIX_EPOCH};
use k8s_openapi::api::core::v1::ConfigMap;
use k8s_openapi::apimachinery::pkg::apis::meta::v1::ObjectMeta;
use kube::api::{DeleteParams, TypeMeta};
use super::*;
#[tokio::test]
#[ignore = "requires kubernetes cluster"]
async fn apply_creates_new_configmap() {
let client = K8sClient::try_default().await.unwrap();
let ns = "default";
let name = format!(
"test-cm-{}",
SystemTime::now()
.duration_since(UNIX_EPOCH)
.unwrap()
.as_millis()
);
let cm = ConfigMap {
metadata: ObjectMeta {
name: Some(name.clone()),
namespace: Some(ns.to_string()),
..Default::default()
},
data: Some(BTreeMap::from([("key1".to_string(), "value1".to_string())])),
..Default::default()
};
assert!(client.apply(&cm, Some(ns)).await.is_ok());
let api: Api<ConfigMap> = Api::namespaced(client.client.clone(), ns);
let _ = api.delete(&name, &DeleteParams::default()).await;
}
#[tokio::test]
#[ignore = "requires kubernetes cluster"]
async fn apply_is_idempotent() {
let client = K8sClient::try_default().await.unwrap();
let ns = "default";
let name = format!(
"test-idem-{}",
SystemTime::now()
.duration_since(UNIX_EPOCH)
.unwrap()
.as_millis()
);
let cm = ConfigMap {
metadata: ObjectMeta {
name: Some(name.clone()),
namespace: Some(ns.to_string()),
..Default::default()
},
data: Some(BTreeMap::from([("key".to_string(), "value".to_string())])),
..Default::default()
};
assert!(
client.apply(&cm, Some(ns)).await.is_ok(),
"first apply failed"
);
assert!(
client.apply(&cm, Some(ns)).await.is_ok(),
"second apply failed (not idempotent)"
);
let api: Api<ConfigMap> = Api::namespaced(client.client.clone(), ns);
let _ = api.delete(&name, &DeleteParams::default()).await;
}
#[tokio::test]
#[ignore = "requires kubernetes cluster"]
async fn apply_dynamic_creates_new_resource() {
let client = K8sClient::try_default().await.unwrap();
let ns = "default";
let name = format!(
"test-dyn-{}",
SystemTime::now()
.duration_since(UNIX_EPOCH)
.unwrap()
.as_millis()
);
let obj = DynamicObject {
types: Some(TypeMeta {
api_version: "v1".to_string(),
kind: "ConfigMap".to_string(),
}),
metadata: ObjectMeta {
name: Some(name.clone()),
namespace: Some(ns.to_string()),
..Default::default()
},
data: serde_json::json!({}),
};
let result = client.apply_dynamic(&obj, Some(ns), false).await;
assert!(result.is_ok(), "apply_dynamic failed: {:?}", result.err());
let api: Api<ConfigMap> = Api::namespaced(client.client.clone(), ns);
let _ = api.delete(&name, &DeleteParams::default()).await;
}
}
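Since the whole apply.rs above is being removed, here is a minimal usage sketch of the server-side apply API it documented, assuming the same K8sClient methods remain available under harmony::topology::k8s (as the example import changes earlier in this diff suggest); the ConfigMap name and namespace are illustrative.

use std::collections::BTreeMap;

use harmony::topology::k8s::K8sClient;
use k8s_openapi::api::core::v1::ConfigMap;
use k8s_openapi::apimachinery::pkg::apis::meta::v1::ObjectMeta;

#[tokio::main]
async fn main() {
    let client = K8sClient::try_default().await.expect("kube client");
    let cm = ConfigMap {
        metadata: ObjectMeta {
            name: Some("demo-config".to_string()), // illustrative name
            namespace: Some("default".to_string()),
            ..Default::default()
        },
        data: Some(BTreeMap::from([("key".to_string(), "value".to_string())])),
        ..Default::default()
    };
    // Create-or-update via server-side apply, like `kubectl apply`.
    client.apply(&cm, Some("default")).await.expect("apply failed");
    // Strict update: errors with 404 if the resource does not already exist.
    client.update(&cm, Some("default")).await.expect("update failed");
}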

View File

@@ -1,99 +0,0 @@
use std::sync::Arc;
use kube::config::{KubeConfigOptions, Kubeconfig};
use kube::{Client, Config, Discovery, Error};
use log::error;
use serde::Serialize;
use tokio::sync::OnceCell;
use crate::types::KubernetesDistribution;
// TODO not cool, should use a proper configuration mechanism
// cli arg, env var, config file
fn read_dry_run_from_env() -> bool {
std::env::var("DRY_RUN")
.map(|v| v == "true" || v == "1")
.unwrap_or(false)
}
#[derive(Clone)]
pub struct K8sClient {
pub(crate) client: Client,
/// When `true` no mutation is sent to the API server; diffs are printed
/// to stdout instead. Initialised from the `DRY_RUN` environment variable.
pub(crate) dry_run: bool,
pub(crate) k8s_distribution: Arc<OnceCell<KubernetesDistribution>>,
pub(crate) discovery: Arc<OnceCell<Discovery>>,
}
impl Serialize for K8sClient {
fn serialize<S>(&self, _serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
todo!("K8sClient serialization is not meaningful; remove this impl if unused")
}
}
impl std::fmt::Debug for K8sClient {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.write_fmt(format_args!(
"K8sClient {{ namespace: {}, dry_run: {} }}",
self.client.default_namespace(),
self.dry_run,
))
}
}
impl K8sClient {
/// Create a client, reading `DRY_RUN` from the environment.
pub fn new(client: Client) -> Self {
Self {
dry_run: read_dry_run_from_env(),
client,
k8s_distribution: Arc::new(OnceCell::new()),
discovery: Arc::new(OnceCell::new()),
}
}
/// Create a client that always operates in dry-run mode, regardless of
/// the environment variable.
pub fn new_dry_run(client: Client) -> Self {
Self {
dry_run: true,
..Self::new(client)
}
}
/// Returns `true` if this client is operating in dry-run mode.
pub fn is_dry_run(&self) -> bool {
self.dry_run
}
pub async fn try_default() -> Result<Self, Error> {
Ok(Self::new(Client::try_default().await?))
}
pub async fn from_kubeconfig(path: &str) -> Option<Self> {
Self::from_kubeconfig_with_opts(path, &KubeConfigOptions::default()).await
}
pub async fn from_kubeconfig_with_context(path: &str, context: Option<String>) -> Option<Self> {
let mut opts = KubeConfigOptions::default();
opts.context = context;
Self::from_kubeconfig_with_opts(path, &opts).await
}
pub async fn from_kubeconfig_with_opts(path: &str, opts: &KubeConfigOptions) -> Option<Self> {
let k = match Kubeconfig::read_from(path) {
Ok(k) => k,
Err(e) => {
error!("Failed to load kubeconfig from {path}: {e}");
return None;
}
};
Some(Self::new(
Client::try_from(Config::from_custom_kubeconfig(k, opts).await.unwrap()).unwrap(),
))
}
}

View File

@@ -1,83 +0,0 @@
use std::time::Duration;
use kube::{Discovery, Error};
use log::{debug, error, info, trace, warn};
use tokio::sync::Mutex;
use tokio_retry::{Retry, strategy::ExponentialBackoff};
use crate::client::K8sClient;
use crate::types::KubernetesDistribution;
impl K8sClient {
pub async fn get_apiserver_version(
&self,
) -> Result<k8s_openapi::apimachinery::pkg::version::Info, Error> {
self.client.clone().apiserver_version().await
}
/// Runs (and caches) Kubernetes API discovery with exponential-backoff retries.
pub async fn discovery(&self) -> Result<&Discovery, Error> {
let retry_strategy = ExponentialBackoff::from_millis(1000)
.max_delay(Duration::from_secs(32))
.take(6);
let attempt = Mutex::new(0u32);
Retry::spawn(retry_strategy, || async {
let mut n = attempt.lock().await;
*n += 1;
match self
.discovery
.get_or_try_init(async || {
debug!("Running Kubernetes API discovery (attempt {})", *n);
let d = Discovery::new(self.client.clone()).run().await?;
debug!("Kubernetes API discovery completed");
Ok(d)
})
.await
{
Ok(d) => Ok(d),
Err(e) => {
warn!("Kubernetes API discovery failed (attempt {}): {}", *n, e);
Err(e)
}
}
})
.await
.map_err(|e| {
error!("Kubernetes API discovery failed after all retries: {}", e);
e
})
}
/// Detect which Kubernetes distribution is running. Result is cached for
/// the lifetime of the client.
pub async fn get_k8s_distribution(&self) -> Result<KubernetesDistribution, Error> {
self.k8s_distribution
.get_or_try_init(async || {
debug!("Detecting Kubernetes distribution");
let api_groups = self.client.list_api_groups().await?;
trace!("list_api_groups: {:?}", api_groups);
let version = self.get_apiserver_version().await?;
if api_groups
.groups
.iter()
.any(|g| g.name == "project.openshift.io")
{
info!("Detected distribution: OpenshiftFamily");
return Ok(KubernetesDistribution::OpenshiftFamily);
}
if version.git_version.contains("k3s") {
info!("Detected distribution: K3sFamily");
return Ok(KubernetesDistribution::K3sFamily);
}
info!("Distribution not identified, using Default");
Ok(KubernetesDistribution::Default)
})
.await
.cloned()
}
}

View File

@@ -1,13 +0,0 @@
pub mod apply;
pub mod bundle;
pub mod client;
pub mod config;
pub mod discovery;
pub mod helper;
pub mod node;
pub mod pod;
pub mod resources;
pub mod types;
pub use client::K8sClient;
pub use types::{DrainOptions, KubernetesDistribution, NodeFile, ScopeResolver, WriteMode};

View File

@@ -1,3 +0,0 @@
fn main() {
println!("Hello, world!");
}

View File

@@ -1,722 +0,0 @@
use std::collections::BTreeMap;
use std::time::{Duration, SystemTime, UNIX_EPOCH};
use k8s_openapi::api::core::v1::{
ConfigMap, ConfigMapVolumeSource, Node, Pod, Volume, VolumeMount,
};
use k8s_openapi::apimachinery::pkg::apis::meta::v1::ObjectMeta;
use kube::{
Error,
api::{Api, DeleteParams, EvictParams, ListParams, PostParams},
core::ErrorResponse,
error::DiscoveryError,
};
use log::{debug, error, info, warn};
use tokio::time::sleep;
use crate::client::K8sClient;
use crate::helper::{self, PrivilegedPodConfig};
use crate::types::{DrainOptions, NodeFile};
impl K8sClient {
pub async fn cordon_node(&self, node_name: &str) -> Result<(), Error> {
Api::<Node>::all(self.client.clone())
.cordon(node_name)
.await?;
Ok(())
}
pub async fn uncordon_node(&self, node_name: &str) -> Result<(), Error> {
Api::<Node>::all(self.client.clone())
.uncordon(node_name)
.await?;
Ok(())
}
pub async fn wait_for_node_ready(&self, node_name: &str) -> Result<(), Error> {
self.wait_for_node_ready_with_timeout(node_name, Duration::from_secs(600))
.await
}
async fn wait_for_node_ready_with_timeout(
&self,
node_name: &str,
timeout: Duration,
) -> Result<(), Error> {
let api: Api<Node> = Api::all(self.client.clone());
let start = tokio::time::Instant::now();
let poll = Duration::from_secs(5);
loop {
if start.elapsed() > timeout {
return Err(Error::Discovery(DiscoveryError::MissingResource(format!(
"Node '{node_name}' did not become Ready within {timeout:?}"
))));
}
match api.get(node_name).await {
Ok(node) => {
if node
.status
.as_ref()
.and_then(|s| s.conditions.as_ref())
.map(|conds| {
conds
.iter()
.any(|c| c.type_ == "Ready" && c.status == "True")
})
.unwrap_or(false)
{
debug!("Node '{node_name}' is Ready");
return Ok(());
}
}
Err(e) => debug!("Error polling node '{node_name}': {e}"),
}
sleep(poll).await;
}
}
async fn wait_for_node_not_ready(
&self,
node_name: &str,
timeout: Duration,
) -> Result<(), Error> {
let api: Api<Node> = Api::all(self.client.clone());
let start = tokio::time::Instant::now();
let poll = Duration::from_secs(5);
loop {
if start.elapsed() > timeout {
return Err(Error::Discovery(DiscoveryError::MissingResource(format!(
"Node '{node_name}' did not become NotReady within {timeout:?}"
))));
}
match api.get(node_name).await {
Ok(node) => {
let is_ready = node
.status
.as_ref()
.and_then(|s| s.conditions.as_ref())
.map(|conds| {
conds
.iter()
.any(|c| c.type_ == "Ready" && c.status == "True")
})
.unwrap_or(false);
if !is_ready {
debug!("Node '{node_name}' is NotReady");
return Ok(());
}
}
Err(e) => debug!("Error polling node '{node_name}': {e}"),
}
sleep(poll).await;
}
}
async fn list_pods_on_node(&self, node_name: &str) -> Result<Vec<Pod>, Error> {
let api: Api<Pod> = Api::all(self.client.clone());
Ok(api
.list(&ListParams::default().fields(&format!("spec.nodeName={node_name}")))
.await?
.items)
}
fn is_mirror_pod(pod: &Pod) -> bool {
pod.metadata
.annotations
.as_ref()
.map(|a| a.contains_key("kubernetes.io/config.mirror"))
.unwrap_or(false)
}
fn is_daemonset_pod(pod: &Pod) -> bool {
pod.metadata
.owner_references
.as_ref()
.map(|refs| refs.iter().any(|r| r.kind == "DaemonSet"))
.unwrap_or(false)
}
fn has_emptydir_volume(pod: &Pod) -> bool {
pod.spec
.as_ref()
.and_then(|s| s.volumes.as_ref())
.map(|vols| vols.iter().any(|v| v.empty_dir.is_some()))
.unwrap_or(false)
}
fn is_completed_pod(pod: &Pod) -> bool {
pod.status
.as_ref()
.and_then(|s| s.phase.as_deref())
.map(|phase| phase == "Succeeded" || phase == "Failed")
.unwrap_or(false)
}
fn classify_pods_for_drain(
pods: &[Pod],
options: &DrainOptions,
) -> Result<(Vec<Pod>, Vec<String>), String> {
let mut evictable = Vec::new();
let mut skipped = Vec::new();
let mut blocking = Vec::new();
for pod in pods {
let name = pod.metadata.name.as_deref().unwrap_or("<unknown>");
let ns = pod.metadata.namespace.as_deref().unwrap_or("<unknown>");
let qualified = format!("{ns}/{name}");
if Self::is_mirror_pod(pod) {
skipped.push(format!("{qualified} (mirror pod)"));
continue;
}
if Self::is_completed_pod(pod) {
skipped.push(format!("{qualified} (completed)"));
continue;
}
if Self::is_daemonset_pod(pod) {
if options.ignore_daemonsets {
skipped.push(format!("{qualified} (DaemonSet-managed)"));
} else {
blocking.push(format!(
"{qualified} is managed by a DaemonSet (set ignore_daemonsets to skip)"
));
}
continue;
}
if Self::has_emptydir_volume(pod) && !options.delete_emptydir_data {
blocking.push(format!(
"{qualified} uses emptyDir volumes (set delete_emptydir_data to allow eviction)"
));
continue;
}
evictable.push(pod.clone());
}
if !blocking.is_empty() {
return Err(format!(
"Cannot drain node — the following pods block eviction:\n - {}",
blocking.join("\n - ")
));
}
Ok((evictable, skipped))
}
async fn evict_pod(&self, pod: &Pod) -> Result<(), Error> {
let name = pod.metadata.name.as_deref().unwrap_or_default();
let ns = pod.metadata.namespace.as_deref().unwrap_or_default();
debug!("Evicting pod {ns}/{name}");
Api::<Pod>::namespaced(self.client.clone(), ns)
.evict(name, &EvictParams::default())
.await
.map(|_| ())
}
/// Drains a node: cordon → classify → evict & wait.
pub async fn drain_node(&self, node_name: &str, options: &DrainOptions) -> Result<(), Error> {
debug!("Cordoning '{node_name}'");
self.cordon_node(node_name).await?;
let pods = self.list_pods_on_node(node_name).await?;
debug!("Found {} pod(s) on '{node_name}'", pods.len());
let (evictable, skipped) =
Self::classify_pods_for_drain(&pods, options).map_err(|msg| {
error!("{msg}");
Error::Discovery(DiscoveryError::MissingResource(msg))
})?;
for s in &skipped {
info!("Skipping pod: {s}");
}
if evictable.is_empty() {
info!("No pods to evict on '{node_name}'");
return Ok(());
}
info!("Evicting {} pod(s) from '{node_name}'", evictable.len());
let mut start = tokio::time::Instant::now();
let poll = Duration::from_secs(5);
let mut pending = evictable;
loop {
for pod in &pending {
match self.evict_pod(pod).await {
Ok(()) => {}
Err(Error::Api(ErrorResponse { code: 404, .. })) => {}
Err(Error::Api(ErrorResponse { code: 429, .. })) => {
warn!(
"PDB blocked eviction of {}/{}; will retry",
pod.metadata.namespace.as_deref().unwrap_or(""),
pod.metadata.name.as_deref().unwrap_or("")
);
}
Err(e) => {
error!(
"Failed to evict {}/{}: {e}",
pod.metadata.namespace.as_deref().unwrap_or(""),
pod.metadata.name.as_deref().unwrap_or("")
);
return Err(e);
}
}
}
sleep(poll).await;
let mut still_present = Vec::new();
for pod in pending {
let ns = pod.metadata.namespace.as_deref().unwrap_or_default();
let name = pod.metadata.name.as_deref().unwrap_or_default();
match self.get_pod(name, Some(ns)).await? {
Some(_) => still_present.push(pod),
None => debug!("Pod {ns}/{name} evicted"),
}
}
pending = still_present;
if pending.is_empty() {
break;
}
if start.elapsed() > options.timeout {
match helper::prompt_drain_timeout_action(
node_name,
pending.len(),
options.timeout,
)? {
helper::DrainTimeoutAction::Accept => break,
helper::DrainTimeoutAction::Retry => {
start = tokio::time::Instant::now();
continue;
}
helper::DrainTimeoutAction::Abort => {
return Err(Error::Discovery(DiscoveryError::MissingResource(format!(
"Drain aborted. {} pod(s) remaining on '{node_name}'",
pending.len()
))));
}
}
}
debug!("Waiting for {} pod(s) on '{node_name}'", pending.len());
}
debug!("'{node_name}' drained successfully");
Ok(())
}
/// Safely reboots a node: drain → reboot → wait for Ready → uncordon.
pub async fn reboot_node(
&self,
node_name: &str,
drain_options: &DrainOptions,
timeout: Duration,
) -> Result<(), Error> {
info!("Starting reboot for '{node_name}'");
let node_api: Api<Node> = Api::all(self.client.clone());
let boot_id_before = node_api
.get(node_name)
.await?
.status
.as_ref()
.and_then(|s| s.node_info.as_ref())
.map(|ni| ni.boot_id.clone())
.ok_or_else(|| {
Error::Discovery(DiscoveryError::MissingResource(format!(
"Node '{node_name}' has no boot_id in status"
)))
})?;
info!("Draining '{node_name}'");
self.drain_node(node_name, drain_options).await?;
let start = tokio::time::Instant::now();
info!("Scheduling reboot for '{node_name}'");
let reboot_cmd =
"echo rebooting ; nohup bash -c 'sleep 5 && nsenter -t 1 -m -- systemctl reboot'";
match self
.run_privileged_command_on_node(node_name, reboot_cmd)
.await
{
Ok(_) => debug!("Reboot command dispatched"),
Err(e) => debug!("Reboot command error (expected if node began shutdown): {e}"),
}
info!("Waiting for '{node_name}' to begin shutdown");
self.wait_for_node_not_ready(node_name, timeout.saturating_sub(start.elapsed()))
.await?;
if start.elapsed() > timeout {
return Err(Error::Discovery(DiscoveryError::MissingResource(format!(
"Timeout during reboot of '{node_name}' (shutdown phase)"
))));
}
info!("Waiting for '{node_name}' to come back online");
self.wait_for_node_ready_with_timeout(node_name, timeout.saturating_sub(start.elapsed()))
.await?;
if start.elapsed() > timeout {
return Err(Error::Discovery(DiscoveryError::MissingResource(format!(
"Timeout during reboot of '{node_name}' (ready phase)"
))));
}
let boot_id_after = node_api
.get(node_name)
.await?
.status
.as_ref()
.and_then(|s| s.node_info.as_ref())
.map(|ni| ni.boot_id.clone())
.ok_or_else(|| {
Error::Discovery(DiscoveryError::MissingResource(format!(
"Node '{node_name}' has no boot_id after reboot"
)))
})?;
if boot_id_before == boot_id_after {
return Err(Error::Discovery(DiscoveryError::MissingResource(format!(
"Node '{node_name}' did not actually reboot (boot_id unchanged: {boot_id_before})"
))));
}
info!("'{node_name}' rebooted ({boot_id_before} → {boot_id_after})");
self.uncordon_node(node_name).await?;
info!("'{node_name}' reboot complete ({:?})", start.elapsed());
Ok(())
}
/// Write a set of files to a node's filesystem via a privileged ephemeral pod.
pub async fn write_files_to_node(
&self,
node_name: &str,
files: &[NodeFile],
) -> Result<String, Error> {
let ns = self.client.default_namespace();
let suffix = SystemTime::now()
.duration_since(UNIX_EPOCH)
.unwrap()
.as_millis();
let name = format!("harmony-k8s-writer-{suffix}");
debug!("Writing {} file(s) to '{node_name}'", files.len());
let mut data = BTreeMap::new();
let mut script = String::from("set -e\n");
for (i, file) in files.iter().enumerate() {
let key = format!("f{i}");
data.insert(key.clone(), file.content.clone());
script.push_str(&format!("mkdir -p \"$(dirname \"/host{}\")\"\n", file.path));
script.push_str(&format!("cp \"/payload/{key}\" \"/host{}\"\n", file.path));
script.push_str(&format!("chmod {:o} \"/host{}\"\n", file.mode, file.path));
}
let cm = ConfigMap {
metadata: ObjectMeta {
name: Some(name.clone()),
namespace: Some(ns.to_string()),
..Default::default()
},
data: Some(data),
..Default::default()
};
let cm_api: Api<ConfigMap> = Api::namespaced(self.client.clone(), ns);
cm_api.create(&PostParams::default(), &cm).await?;
debug!("Created ConfigMap '{name}'");
let (host_vol, host_mount) = helper::host_root_volume();
let payload_vol = Volume {
name: "payload".to_string(),
config_map: Some(ConfigMapVolumeSource {
name: name.clone(),
..Default::default()
}),
..Default::default()
};
let payload_mount = VolumeMount {
name: "payload".to_string(),
mount_path: "/payload".to_string(),
..Default::default()
};
let bundle = helper::build_privileged_bundle(
PrivilegedPodConfig {
name: name.clone(),
namespace: ns.to_string(),
node_name: node_name.to_string(),
container_name: "writer".to_string(),
command: vec!["/bin/bash".to_string(), "-c".to_string(), script],
volumes: vec![payload_vol, host_vol],
volume_mounts: vec![payload_mount, host_mount],
host_pid: false,
host_network: false,
},
&self.get_k8s_distribution().await?,
);
bundle.apply(self).await?;
debug!("Created privileged pod bundle '{name}'");
let result = self.wait_for_pod_completion(&name, ns).await;
debug!("Cleaning up '{name}'");
let _ = bundle.delete(self).await;
let _ = cm_api.delete(&name, &DeleteParams::default()).await;
result
}
/// Run a privileged command on a node via an ephemeral pod.
pub async fn run_privileged_command_on_node(
&self,
node_name: &str,
command: &str,
) -> Result<String, Error> {
let namespace = self.client.default_namespace();
let suffix = SystemTime::now()
.duration_since(UNIX_EPOCH)
.unwrap()
.as_millis();
let name = format!("harmony-k8s-cmd-{suffix}");
debug!("Running privileged command on '{node_name}': {command}");
let (host_vol, host_mount) = helper::host_root_volume();
let bundle = helper::build_privileged_bundle(
PrivilegedPodConfig {
name: name.clone(),
namespace: namespace.to_string(),
node_name: node_name.to_string(),
container_name: "runner".to_string(),
command: vec![
"/bin/bash".to_string(),
"-c".to_string(),
command.to_string(),
],
volumes: vec![host_vol],
volume_mounts: vec![host_mount],
host_pid: true,
host_network: true,
},
&self.get_k8s_distribution().await?,
);
bundle.apply(self).await?;
debug!("Privileged pod '{name}' created");
let result = self.wait_for_pod_completion(&name, namespace).await;
debug!("Cleaning up '{name}'");
let _ = bundle.delete(self).await;
result
}
}
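// Illustrative usage of the node helpers above. This is a sketch only: it
// assumes a connected `K8sClient` and an existing node, and the file path,
// content and command are placeholder values.
async fn example_node_file_and_command(client: &K8sClient, node: &str) -> Result<(), Error> {
    // Stage a small config file on the host with restrictive permissions.
    let files = vec![NodeFile {
        path: "/etc/harmony/example.conf".to_string(),
        content: "EXAMPLE_FLAG=1\n".to_string(),
        mode: 0o600,
    }];
    let writer_logs = client.write_files_to_node(node, &files).await?;
    debug!("writer pod logs: {writer_logs}");
    // Verify the file landed by running a privileged command on the same node.
    let output = client
        .run_privileged_command_on_node(node, "cat /etc/harmony/example.conf")
        .await?;
    debug!("command output: {output}");
    Ok(())
}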
// ── Tests ────────────────────────────────────────────────────────────────────
#[cfg(test)]
mod tests {
use k8s_openapi::api::core::v1::{EmptyDirVolumeSource, PodSpec, PodStatus, Volume};
use k8s_openapi::apimachinery::pkg::apis::meta::v1::{ObjectMeta, OwnerReference};
use super::*;
fn base_pod(name: &str, ns: &str) -> Pod {
Pod {
metadata: ObjectMeta {
name: Some(name.to_string()),
namespace: Some(ns.to_string()),
..Default::default()
},
spec: Some(PodSpec::default()),
status: Some(PodStatus {
phase: Some("Running".to_string()),
..Default::default()
}),
}
}
fn mirror_pod(name: &str, ns: &str) -> Pod {
let mut pod = base_pod(name, ns);
pod.metadata.annotations = Some(std::collections::BTreeMap::from([(
"kubernetes.io/config.mirror".to_string(),
"abc123".to_string(),
)]));
pod
}
fn daemonset_pod(name: &str, ns: &str) -> Pod {
let mut pod = base_pod(name, ns);
pod.metadata.owner_references = Some(vec![OwnerReference {
api_version: "apps/v1".to_string(),
kind: "DaemonSet".to_string(),
name: "some-ds".to_string(),
uid: "uid-ds".to_string(),
..Default::default()
}]);
pod
}
fn emptydir_pod(name: &str, ns: &str) -> Pod {
let mut pod = base_pod(name, ns);
pod.spec = Some(PodSpec {
volumes: Some(vec![Volume {
name: "scratch".to_string(),
empty_dir: Some(EmptyDirVolumeSource::default()),
..Default::default()
}]),
..Default::default()
});
pod
}
fn completed_pod(name: &str, ns: &str, phase: &str) -> Pod {
let mut pod = base_pod(name, ns);
pod.status = Some(PodStatus {
phase: Some(phase.to_string()),
..Default::default()
});
pod
}
fn default_opts() -> DrainOptions {
DrainOptions::default()
}
// All test bodies are identical to the original — only the module path changed.
#[test]
fn empty_pod_list_returns_empty_vecs() {
let (e, s) = K8sClient::classify_pods_for_drain(&[], &default_opts()).unwrap();
assert!(e.is_empty());
assert!(s.is_empty());
}
#[test]
fn normal_pod_is_evictable() {
let pods = vec![base_pod("web", "default")];
let (e, s) = K8sClient::classify_pods_for_drain(&pods, &default_opts()).unwrap();
assert_eq!(e.len(), 1);
assert!(s.is_empty());
}
#[test]
fn mirror_pod_is_skipped() {
let pods = vec![mirror_pod("kube-apiserver", "kube-system")];
let (e, s) = K8sClient::classify_pods_for_drain(&pods, &default_opts()).unwrap();
assert!(e.is_empty());
assert!(s[0].contains("mirror pod"));
}
#[test]
fn completed_pods_are_skipped() {
for phase in ["Succeeded", "Failed"] {
let pods = vec![completed_pod("job", "batch", phase)];
let (e, s) = K8sClient::classify_pods_for_drain(&pods, &default_opts()).unwrap();
assert!(e.is_empty());
assert!(s[0].contains("completed"));
}
}
#[test]
fn daemonset_skipped_when_ignored() {
let pods = vec![daemonset_pod("fluentd", "logging")];
let opts = DrainOptions {
ignore_daemonsets: true,
..default_opts()
};
let (e, s) = K8sClient::classify_pods_for_drain(&pods, &opts).unwrap();
assert!(e.is_empty());
assert!(s[0].contains("DaemonSet-managed"));
}
#[test]
fn daemonset_blocks_when_not_ignored() {
let pods = vec![daemonset_pod("fluentd", "logging")];
let opts = DrainOptions {
ignore_daemonsets: false,
..default_opts()
};
let err = K8sClient::classify_pods_for_drain(&pods, &opts).unwrap_err();
assert!(err.contains("DaemonSet") && err.contains("logging/fluentd"));
}
#[test]
fn emptydir_blocks_without_flag() {
let pods = vec![emptydir_pod("cache", "default")];
let opts = DrainOptions {
delete_emptydir_data: false,
..default_opts()
};
let err = K8sClient::classify_pods_for_drain(&pods, &opts).unwrap_err();
assert!(err.contains("emptyDir") && err.contains("default/cache"));
}
#[test]
fn emptydir_evictable_with_flag() {
let pods = vec![emptydir_pod("cache", "default")];
let opts = DrainOptions {
delete_emptydir_data: true,
..default_opts()
};
let (e, s) = K8sClient::classify_pods_for_drain(&pods, &opts).unwrap();
assert_eq!(e.len(), 1);
assert!(s.is_empty());
}
#[test]
fn multiple_blocking_all_reported() {
let pods = vec![daemonset_pod("ds", "ns1"), emptydir_pod("ed", "ns2")];
let opts = DrainOptions {
ignore_daemonsets: false,
delete_emptydir_data: false,
..default_opts()
};
let err = K8sClient::classify_pods_for_drain(&pods, &opts).unwrap_err();
assert!(err.contains("ns1/ds") && err.contains("ns2/ed"));
}
#[test]
fn mixed_pods_classified_correctly() {
let pods = vec![
base_pod("web", "default"),
mirror_pod("kube-apiserver", "kube-system"),
daemonset_pod("fluentd", "logging"),
completed_pod("job", "batch", "Succeeded"),
base_pod("api", "default"),
];
let (e, s) = K8sClient::classify_pods_for_drain(&pods, &default_opts()).unwrap();
let names: Vec<&str> = e
.iter()
.map(|p| p.metadata.name.as_deref().unwrap())
.collect();
assert_eq!(names, vec!["web", "api"]);
assert_eq!(s.len(), 3);
}
#[test]
fn mirror_checked_before_completed() {
let mut pod = mirror_pod("static-etcd", "kube-system");
pod.status = Some(PodStatus {
phase: Some("Succeeded".to_string()),
..Default::default()
});
let (_, s) = K8sClient::classify_pods_for_drain(&[pod], &default_opts()).unwrap();
assert!(s[0].contains("mirror pod"), "got: {}", s[0]);
}
#[test]
fn completed_checked_before_daemonset() {
let mut pod = daemonset_pod("collector", "monitoring");
pod.status = Some(PodStatus {
phase: Some("Failed".to_string()),
..Default::default()
});
let (_, s) = K8sClient::classify_pods_for_drain(&[pod], &default_opts()).unwrap();
assert!(s[0].contains("completed"), "got: {}", s[0]);
}
}

View File

@@ -1,193 +0,0 @@
use std::time::Duration;
use k8s_openapi::api::core::v1::Pod;
use kube::{
Error,
api::{Api, AttachParams, ListParams},
error::DiscoveryError,
runtime::reflector::Lookup,
};
use log::debug;
use tokio::io::AsyncReadExt;
use tokio::time::sleep;
use crate::client::K8sClient;
impl K8sClient {
pub async fn get_pod(&self, name: &str, namespace: Option<&str>) -> Result<Option<Pod>, Error> {
let api: Api<Pod> = match namespace {
Some(ns) => Api::namespaced(self.client.clone(), ns),
None => Api::default_namespaced(self.client.clone()),
};
api.get_opt(name).await
}
pub async fn wait_for_pod_ready(
&self,
pod_name: &str,
namespace: Option<&str>,
) -> Result<(), Error> {
let mut elapsed = 0u64;
let interval = 5u64;
let timeout_secs = 120u64;
loop {
if let Some(p) = self.get_pod(pod_name, namespace).await? {
if let Some(phase) = p.status.and_then(|s| s.phase) {
if phase.to_lowercase() == "running" {
return Ok(());
}
}
}
if elapsed >= timeout_secs {
return Err(Error::Discovery(DiscoveryError::MissingResource(format!(
"Pod '{}' in '{}' did not become ready within {timeout_secs}s",
pod_name,
namespace.unwrap_or("<default>"),
))));
}
sleep(Duration::from_secs(interval)).await;
elapsed += interval;
}
}
/// Polls a pod until it reaches `Succeeded` or `Failed`, then returns its
/// logs. Used internally by node operations.
pub(crate) async fn wait_for_pod_completion(
&self,
name: &str,
namespace: &str,
) -> Result<String, Error> {
let api: Api<Pod> = Api::namespaced(self.client.clone(), namespace);
let poll_interval = Duration::from_secs(2);
for _ in 0..60 {
sleep(poll_interval).await;
let p = api.get(name).await?;
match p.status.and_then(|s| s.phase).as_deref() {
Some("Succeeded") => {
let logs = api
.logs(name, &Default::default())
.await
.unwrap_or_default();
debug!("Pod {namespace}/{name} succeeded. Logs: {logs}");
return Ok(logs);
}
Some("Failed") => {
let logs = api
.logs(name, &Default::default())
.await
.unwrap_or_default();
debug!("Pod {namespace}/{name} failed. Logs: {logs}");
return Err(Error::Discovery(DiscoveryError::MissingResource(format!(
"Pod '{name}' failed.\n{logs}"
))));
}
_ => {}
}
}
Err(Error::Discovery(DiscoveryError::MissingResource(format!(
"Timed out waiting for pod '{name}'"
))))
}
/// Execute a command in the first pod matching `{label}={name}`.
pub async fn exec_app_capture_output(
&self,
name: String,
label: String,
namespace: Option<&str>,
command: Vec<&str>,
) -> Result<String, String> {
let api: Api<Pod> = match namespace {
Some(ns) => Api::namespaced(self.client.clone(), ns),
None => Api::default_namespaced(self.client.clone()),
};
let pod_list = api
.list(&ListParams::default().labels(&format!("{label}={name}")))
.await
.expect("Failed to list pods");
let pod_name = pod_list
.items
.first()
.expect("No matching pod")
.name()
.expect("Pod has no name")
.into_owned();
match api
.exec(
&pod_name,
command,
&AttachParams::default().stdout(true).stderr(true),
)
.await
{
Err(e) => Err(e.to_string()),
Ok(mut process) => {
let status = process
.take_status()
.expect("No status handle")
.await
.expect("Status channel closed");
if let Some(s) = status.status {
let mut buf = String::new();
if let Some(mut stdout) = process.stdout() {
stdout
.read_to_string(&mut buf)
.await
.map_err(|e| format!("Failed to read stdout: {e}"))?;
}
debug!("exec status: {} - {:?}", s, status.details);
if s == "Success" { Ok(buf) } else { Err(s) }
} else {
Err("No inner status from pod exec".to_string())
}
}
}
}
/// Execute a command in the first pod matching
/// `app.kubernetes.io/name={name}`.
pub async fn exec_app(
&self,
name: String,
namespace: Option<&str>,
command: Vec<&str>,
) -> Result<(), String> {
let api: Api<Pod> = match namespace {
Some(ns) => Api::namespaced(self.client.clone(), ns),
None => Api::default_namespaced(self.client.clone()),
};
let pod_list = api
.list(&ListParams::default().labels(&format!("app.kubernetes.io/name={name}")))
.await
.expect("Failed to list pods");
let pod_name = pod_list
.items
.first()
.expect("No matching pod")
.name()
.expect("Pod has no name")
.into_owned();
match api.exec(&pod_name, command, &AttachParams::default()).await {
Err(e) => Err(e.to_string()),
Ok(mut process) => {
let status = process
.take_status()
.expect("No status handle")
.await
.expect("Status channel closed");
if let Some(s) = status.status {
debug!("exec status: {} - {:?}", s, status.details);
if s == "Success" { Ok(()) } else { Err(s) }
} else {
Err("No inner status from pod exec".to_string())
}
}
}
}
}
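// Illustrative usage of the pod helpers above. A sketch only: the pod name,
// namespace and label are placeholders, and a connected `K8sClient` is assumed.
async fn example_pod_operations(client: &K8sClient) -> Result<(), Error> {
    // Wait for a specific pod to reach the Running phase, then inspect it.
    client.wait_for_pod_ready("my-app-0", Some("demo")).await?;
    if let Some(pod) = client.get_pod("my-app-0", Some("demo")).await? {
        debug!("pod phase: {:?}", pod.status.and_then(|s| s.phase));
    }
    // Exec into the first pod labelled `app=my-app` and capture stdout.
    match client
        .exec_app_capture_output(
            "my-app".to_string(),
            "app".to_string(),
            Some("demo"),
            vec!["cat", "/etc/hostname"],
        )
        .await
    {
        Ok(out) => debug!("hostname: {out}"),
        Err(e) => debug!("exec failed: {e}"),
    }
    Ok(())
}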

View File

@@ -1,316 +0,0 @@
use std::collections::HashMap;
use k8s_openapi::api::{
apps::v1::Deployment,
core::v1::{Node, ServiceAccount},
};
use k8s_openapi::apiextensions_apiserver::pkg::apis::apiextensions::v1::CustomResourceDefinition;
use kube::api::ApiResource;
use kube::{
Error, Resource,
api::{Api, DynamicObject, GroupVersionKind, ListParams, ObjectList},
runtime::conditions,
runtime::wait::await_condition,
};
use log::debug;
use serde::de::DeserializeOwned;
use serde_json::Value;
use std::time::Duration;
use crate::client::K8sClient;
use crate::types::ScopeResolver;
impl K8sClient {
pub async fn has_healthy_deployment_with_label(
&self,
namespace: &str,
label_selector: &str,
) -> Result<bool, Error> {
let api: Api<Deployment> = Api::namespaced(self.client.clone(), namespace);
let list = api
.list(&ListParams::default().labels(label_selector))
.await?;
for d in list.items {
let available = d
.status
.as_ref()
.and_then(|s| s.available_replicas)
.unwrap_or(0);
if available > 0 {
return Ok(true);
}
if let Some(conds) = d.status.as_ref().and_then(|s| s.conditions.as_ref()) {
if conds
.iter()
.any(|c| c.type_ == "Available" && c.status == "True")
{
return Ok(true);
}
}
}
Ok(false)
}
pub async fn list_namespaces_with_healthy_deployments(
&self,
label_selector: &str,
) -> Result<Vec<String>, Error> {
let api: Api<Deployment> = Api::all(self.client.clone());
let list = api
.list(&ListParams::default().labels(label_selector))
.await?;
let mut healthy_ns: HashMap<String, bool> = HashMap::new();
for d in list.items {
let ns = match d.metadata.namespace.clone() {
Some(n) => n,
None => continue,
};
let available = d
.status
.as_ref()
.and_then(|s| s.available_replicas)
.unwrap_or(0);
let is_healthy = if available > 0 {
true
} else {
d.status
.as_ref()
.and_then(|s| s.conditions.as_ref())
.map(|c| {
c.iter()
.any(|c| c.type_ == "Available" && c.status == "True")
})
.unwrap_or(false)
};
if is_healthy {
healthy_ns.insert(ns, true);
}
}
Ok(healthy_ns.into_keys().collect())
}
pub async fn get_controller_service_account_name(
&self,
ns: &str,
) -> Result<Option<String>, Error> {
let api: Api<Deployment> = Api::namespaced(self.client.clone(), ns);
let list = api
.list(&ListParams::default().labels("app.kubernetes.io/component=controller"))
.await?;
if let Some(dep) = list.items.first() {
if let Some(sa) = dep
.spec
.as_ref()
.and_then(|s| s.template.spec.as_ref())
.and_then(|s| s.service_account_name.clone())
{
return Ok(Some(sa));
}
}
Ok(None)
}
pub async fn list_clusterrolebindings_json(&self) -> Result<Vec<Value>, Error> {
let gvk = GroupVersionKind::gvk("rbac.authorization.k8s.io", "v1", "ClusterRoleBinding");
let ar = ApiResource::from_gvk(&gvk);
let api: Api<DynamicObject> = Api::all_with(self.client.clone(), &ar);
let list = api.list(&ListParams::default()).await?;
Ok(list
.items
.into_iter()
.map(|o| serde_json::to_value(&o).unwrap_or(Value::Null))
.collect())
}
pub async fn is_service_account_cluster_wide(&self, sa: &str, ns: &str) -> Result<bool, Error> {
let sa_user = format!("system:serviceaccount:{ns}:{sa}");
for crb in self.list_clusterrolebindings_json().await? {
if let Some(subjects) = crb.get("subjects").and_then(|s| s.as_array()) {
for subj in subjects {
let kind = subj.get("kind").and_then(|v| v.as_str()).unwrap_or("");
let name = subj.get("name").and_then(|v| v.as_str()).unwrap_or("");
let subj_ns = subj.get("namespace").and_then(|v| v.as_str()).unwrap_or("");
if (kind == "ServiceAccount" && name == sa && subj_ns == ns)
|| (kind == "User" && name == sa_user)
{
return Ok(true);
}
}
}
}
Ok(false)
}
pub async fn has_crd(&self, name: &str) -> Result<bool, Error> {
let api: Api<CustomResourceDefinition> = Api::all(self.client.clone());
let crds = api
.list(&ListParams::default().fields(&format!("metadata.name={name}")))
.await?;
Ok(!crds.items.is_empty())
}
pub async fn service_account_api(&self, namespace: &str) -> Api<ServiceAccount> {
Api::namespaced(self.client.clone(), namespace)
}
pub async fn get_resource_json_value(
&self,
name: &str,
namespace: Option<&str>,
gvk: &GroupVersionKind,
) -> Result<DynamicObject, Error> {
let ar = ApiResource::from_gvk(gvk);
let api: Api<DynamicObject> = match namespace {
Some(ns) => Api::namespaced_with(self.client.clone(), ns, &ar),
None => Api::default_namespaced_with(self.client.clone(), &ar),
};
api.get(name).await
}
pub async fn get_secret_json_value(
&self,
name: &str,
namespace: Option<&str>,
) -> Result<DynamicObject, Error> {
self.get_resource_json_value(
name,
namespace,
&GroupVersionKind {
group: String::new(),
version: "v1".to_string(),
kind: "Secret".to_string(),
},
)
.await
}
pub async fn get_deployment(
&self,
name: &str,
namespace: Option<&str>,
) -> Result<Option<Deployment>, Error> {
let api: Api<Deployment> = match namespace {
Some(ns) => {
debug!("Getting namespaced deployment '{name}' in '{ns}'");
Api::namespaced(self.client.clone(), ns)
}
None => {
debug!("Getting deployment '{name}' in default namespace");
Api::default_namespaced(self.client.clone())
}
};
api.get_opt(name).await
}
pub async fn scale_deployment(
&self,
name: &str,
namespace: Option<&str>,
replicas: u32,
) -> Result<(), Error> {
let api: Api<Deployment> = match namespace {
Some(ns) => Api::namespaced(self.client.clone(), ns),
None => Api::default_namespaced(self.client.clone()),
};
use kube::api::{Patch, PatchParams};
use serde_json::json;
let patch = json!({ "spec": { "replicas": replicas } });
api.patch_scale(name, &PatchParams::default(), &Patch::Merge(&patch))
.await?;
Ok(())
}
pub async fn delete_deployment(
&self,
name: &str,
namespace: Option<&str>,
) -> Result<(), Error> {
let api: Api<Deployment> = match namespace {
Some(ns) => Api::namespaced(self.client.clone(), ns),
None => Api::default_namespaced(self.client.clone()),
};
api.delete(name, &kube::api::DeleteParams::default())
.await?;
Ok(())
}
pub async fn wait_until_deployment_ready(
&self,
name: &str,
namespace: Option<&str>,
timeout: Option<Duration>,
) -> Result<(), String> {
let api: Api<Deployment> = match namespace {
Some(ns) => Api::namespaced(self.client.clone(), ns),
None => Api::default_namespaced(self.client.clone()),
};
let timeout = timeout.unwrap_or(Duration::from_secs(120));
let establish = await_condition(api, name, conditions::is_deployment_completed());
tokio::time::timeout(timeout, establish)
.await
.map(|_| ())
.map_err(|_| "Timed out waiting for deployment".to_string())
}
/// Gets a single named resource, using the correct API scope for `K`.
pub async fn get_resource<K>(
&self,
name: &str,
namespace: Option<&str>,
) -> Result<Option<K>, Error>
where
K: Resource + Clone + std::fmt::Debug + DeserializeOwned,
<K as Resource>::Scope: ScopeResolver<K>,
<K as Resource>::DynamicType: Default,
{
let api: Api<K> =
<<K as Resource>::Scope as ScopeResolver<K>>::get_api(&self.client, namespace);
api.get_opt(name).await
}
pub async fn list_resources<K>(
&self,
namespace: Option<&str>,
list_params: Option<ListParams>,
) -> Result<ObjectList<K>, Error>
where
K: Resource + Clone + std::fmt::Debug + DeserializeOwned,
<K as Resource>::Scope: ScopeResolver<K>,
<K as Resource>::DynamicType: Default,
{
let api: Api<K> =
<<K as Resource>::Scope as ScopeResolver<K>>::get_api(&self.client, namespace);
api.list(&list_params.unwrap_or_default()).await
}
pub async fn list_all_resources_with_labels<K>(&self, labels: &str) -> Result<Vec<K>, Error>
where
K: Resource + Clone + std::fmt::Debug + DeserializeOwned,
<K as Resource>::DynamicType: Default,
{
Api::<K>::all(self.client.clone())
.list(&ListParams::default().labels(labels))
.await
.map(|l| l.items)
}
pub async fn get_all_resource_in_all_namespace<K>(&self) -> Result<Vec<K>, Error>
where
K: Resource + Clone + std::fmt::Debug + DeserializeOwned,
<K as Resource>::Scope: ScopeResolver<K>,
<K as Resource>::DynamicType: Default,
{
Api::<K>::all(self.client.clone())
.list(&Default::default())
.await
.map(|l| l.items)
}
pub async fn get_nodes(
&self,
list_params: Option<ListParams>,
) -> Result<ObjectList<Node>, Error> {
self.list_resources(None, list_params).await
}
}
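// Illustrative usage of the generic resource helpers above. A sketch only: the
// names, namespace and label selector are placeholders, and a connected
// `K8sClient` is assumed.
async fn example_resource_operations(client: &K8sClient) -> Result<(), Error> {
    // Scope is resolved automatically: Node is cluster-scoped, Deployment is namespaced.
    let node: Option<Node> = client.get_resource("worker-0", None).await?;
    debug!("worker-0 present: {}", node.is_some());
    let workers = client
        .get_nodes(Some(
            ListParams::default().labels("node-role.kubernetes.io/worker"),
        ))
        .await?;
    debug!("{} worker node(s)", workers.items.len());
    // Scale a deployment and wait for the rollout to complete.
    client.scale_deployment("my-app", Some("demo"), 3).await?;
    if let Err(e) = client
        .wait_until_deployment_ready("my-app", Some("demo"), Some(Duration::from_secs(60)))
        .await
    {
        debug!("rollout did not finish in time: {e}");
    }
    Ok(())
}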

View File

@@ -1,100 +0,0 @@
use std::time::Duration;
use k8s_openapi::{ClusterResourceScope, NamespaceResourceScope};
use kube::{Api, Client, Resource};
use serde::Serialize;
/// Which Kubernetes distribution is running. Detected once at runtime via
/// [`crate::discovery::K8sClient::get_k8s_distribution`].
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
pub enum KubernetesDistribution {
Default,
OpenshiftFamily,
K3sFamily,
}
/// A file to be written to a node's filesystem.
#[derive(Debug, Clone)]
pub struct NodeFile {
/// Absolute path on the host where the file should be written.
pub path: String,
/// Content of the file.
pub content: String,
/// UNIX permissions (e.g. `0o600`).
pub mode: u32,
}
/// Options controlling the behaviour of a [`crate::K8sClient::drain_node`] operation.
#[derive(Debug, Clone)]
pub struct DrainOptions {
/// Evict pods that use `emptyDir` volumes (ephemeral data is lost).
/// Equivalent to `kubectl drain --delete-emptydir-data`.
pub delete_emptydir_data: bool,
/// Silently skip DaemonSet-managed pods instead of blocking the drain.
/// Equivalent to `kubectl drain --ignore-daemonsets`.
pub ignore_daemonsets: bool,
/// Maximum wall-clock time to wait for all evictions to complete.
pub timeout: Duration,
}
impl Default for DrainOptions {
fn default() -> Self {
Self {
delete_emptydir_data: false,
ignore_daemonsets: true,
timeout: Duration::from_secs(1),
}
}
}
impl DrainOptions {
pub fn default_ignore_daemonset_delete_emptydir_data() -> Self {
Self {
delete_emptydir_data: true,
ignore_daemonsets: true,
..Self::default()
}
}
}
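// Illustrative construction of drain options (a sketch; the timeout is an
// example value, not a recommendation).
fn example_drain_options() -> DrainOptions {
    DrainOptions {
        // Evict pods that only hold ephemeral emptyDir data and skip
        // DaemonSet-managed pods, mirroring
        // `kubectl drain --delete-emptydir-data --ignore-daemonsets`.
        delete_emptydir_data: true,
        ignore_daemonsets: true,
        timeout: Duration::from_secs(300),
    }
}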
/// Controls how [`crate::K8sClient::apply_with_strategy`] behaves when the
/// resource already exists (or does not).
pub enum WriteMode {
/// Server-side apply; create if absent, update if present (default).
CreateOrUpdate,
/// POST only; return an error if the resource already exists.
Create,
/// Server-side apply only; return an error if the resource does not exist.
Update,
}
// ── Scope resolution trait ───────────────────────────────────────────────────
/// Resolves the correct [`kube::Api`] for a resource type based on its scope
/// (cluster-wide vs. namespace-scoped).
pub trait ScopeResolver<K: Resource> {
fn get_api(client: &Client, ns: Option<&str>) -> Api<K>;
}
impl<K> ScopeResolver<K> for ClusterResourceScope
where
K: Resource<Scope = ClusterResourceScope>,
<K as Resource>::DynamicType: Default,
{
fn get_api(client: &Client, _ns: Option<&str>) -> Api<K> {
Api::all(client.clone())
}
}
impl<K> ScopeResolver<K> for NamespaceResourceScope
where
K: Resource<Scope = NamespaceResourceScope>,
<K as Resource>::DynamicType: Default,
{
fn get_api(client: &Client, ns: Option<&str>) -> Api<K> {
match ns {
Some(ns) => Api::namespaced(client.clone(), ns),
None => Api::default_namespaced(client.clone()),
}
}
}

View File

@@ -21,8 +21,6 @@ semver = "1.0.23"
serde.workspace = true
serde_json.workspace = true
tokio.workspace = true
tokio-retry.workspace = true
tokio-util.workspace = true
derive-new.workspace = true
log.workspace = true
env_logger.workspace = true
@@ -33,7 +31,6 @@ opnsense-config-xml = { path = "../opnsense-config-xml" }
harmony_macros = { path = "../harmony_macros" }
harmony_types = { path = "../harmony_types" }
harmony_execution = { path = "../harmony_execution" }
harmony-k8s = { path = "../harmony-k8s" }
uuid.workspace = true
url.workspace = true
kube = { workspace = true, features = ["derive"] }
@@ -63,6 +60,7 @@ temp-dir = "0.1.14"
dyn-clone = "1.0.19"
similar.workspace = true
futures-util = "0.3.31"
tokio-util = "0.7.15"
strum = { version = "0.27.1", features = ["derive"] }
tempfile.workspace = true
serde_with = "3.14.0"
@@ -82,7 +80,7 @@ sqlx.workspace = true
inquire.workspace = true
brocade = { path = "../brocade" }
option-ext = "0.2.0"
rand.workspace = true
tokio-retry = "0.3.0"
[dev-dependencies]
pretty_assertions.workspace = true

View File

@@ -4,6 +4,8 @@ use std::error::Error;
use async_trait::async_trait;
use derive_new::new;
use crate::inventory::HostRole;
use super::{
data::Version, executors::ExecutorError, inventory::Inventory, topology::PreparationError,
};

View File

@@ -1,5 +1,4 @@
use async_trait::async_trait;
use harmony_k8s::K8sClient;
use harmony_macros::ip;
use harmony_types::{
id::Id,
@@ -9,7 +8,7 @@ use harmony_types::{
use log::debug;
use log::info;
use crate::topology::{HelmCommand, PxeOptions};
use crate::topology::PxeOptions;
use crate::{data::FileContent, executors::ExecutorError, topology::node_exporter::NodeExporter};
use crate::{infra::network_manager::OpenShiftNmStateNetworkManager, topology::PortConfig};
@@ -17,12 +16,9 @@ use super::{
DHCPStaticEntry, DhcpServer, DnsRecord, DnsRecordType, DnsServer, Firewall, HostNetworkConfig,
HttpServer, IpAddress, K8sclient, LoadBalancer, LoadBalancerService, LogicalHost, NetworkError,
NetworkManager, PreparationError, PreparationOutcome, Router, Switch, SwitchClient,
SwitchError, TftpServer, Topology,
};
use std::{
process::Command,
sync::{Arc, OnceLock},
SwitchError, TftpServer, Topology, k8s::K8sClient,
};
use std::sync::{Arc, OnceLock};
#[derive(Debug, Clone)]
pub struct HAClusterTopology {
@@ -56,30 +52,6 @@ impl Topology for HAClusterTopology {
}
}
impl HelmCommand for HAClusterTopology {
fn get_helm_command(&self) -> Command {
let mut cmd = Command::new("helm");
if let Some(k) = &self.kubeconfig {
cmd.args(["--kubeconfig", k]);
}
// FIXME we should support context anywhere there is a k8sclient
// This likely belongs in the k8sclient itself and should be extracted to a separate
// crate
//
// I feel like helm could very well be a feature of this external k8s client.
//
// Same for kustomize
//
// if let Some(c) = &self.k8s_context {
// cmd.args(["--kube-context", c]);
// }
info!("Using helm command {cmd:?}");
cmd
}
}
#[async_trait]
impl K8sclient for HAClusterTopology {
async fn k8s_client(&self) -> Result<Arc<K8sClient>, String> {

View File

@@ -25,9 +25,9 @@
//!
//! ## Example
//!
//! ```
//! use harmony_k8s::{K8sClient, helper};
//! use harmony_k8s::KubernetesDistribution;
//! ```rust,no_run
//! use harmony::topology::k8s::{K8sClient, helper};
//! use harmony::topology::KubernetesDistribution;
//!
//! async fn write_network_config(client: &K8sClient, node: &str) {
//! // Create a bundle with platform-specific RBAC
@@ -56,7 +56,7 @@ use kube::{Error, Resource, ResourceExt, api::DynamicObject};
use serde::Serialize;
use serde_json;
use crate::K8sClient;
use crate::domain::topology::k8s::K8sClient;
/// A ResourceBundle represents a logical unit of work consisting of multiple
/// Kubernetes resources that should be applied or deleted together.

View File

@@ -1,7 +1,7 @@
use std::collections::BTreeMap;
use std::time::Duration;
use crate::KubernetesDistribution;
use crate::topology::KubernetesDistribution;
use super::bundle::ResourceBundle;
use super::config::PRIVILEGED_POD_IMAGE;
@@ -10,10 +10,8 @@ use k8s_openapi::api::core::v1::{
};
use k8s_openapi::api::rbac::v1::{ClusterRoleBinding, RoleRef, Subject};
use k8s_openapi::apimachinery::pkg::apis::meta::v1::ObjectMeta;
use kube::api::DynamicObject;
use kube::error::DiscoveryError;
use log::{debug, error, info, warn};
use serde::de::DeserializeOwned;
#[derive(Debug)]
pub struct PrivilegedPodConfig {
@@ -133,9 +131,9 @@ pub fn host_root_volume() -> (Volume, VolumeMount) {
///
/// # Example
///
/// ```
/// use harmony_k8s::helper::{build_privileged_bundle, PrivilegedPodConfig};
/// use harmony_k8s::KubernetesDistribution;
/// ```rust,no_run
/// # use harmony::topology::k8s::helper::{build_privileged_bundle, PrivilegedPodConfig};
/// # use harmony::topology::KubernetesDistribution;
/// let bundle = build_privileged_bundle(
/// PrivilegedPodConfig {
/// name: "network-setup".to_string(),
@@ -281,16 +279,6 @@ pub fn prompt_drain_timeout_action(
}
}
/// JSON round-trip: DynamicObject → K
///
/// Safe because the DynamicObject was produced by the apiserver from a
/// payload that was originally serialized from K, so the schema is identical.
pub(crate) fn dyn_to_typed<K: DeserializeOwned>(obj: DynamicObject) -> Result<K, kube::Error> {
serde_json::to_value(obj)
.and_then(serde_json::from_value)
.map_err(kube::Error::SerdeError)
}
#[cfg(test)]
mod tests {
use super::*;

File diff suppressed because it is too large

View File

@@ -1,14 +1,12 @@
use std::{collections::BTreeMap, process::Command, sync::Arc, time::Duration};
use std::{collections::BTreeMap, process::Command, sync::Arc};
use async_trait::async_trait;
use base64::{Engine, engine::general_purpose};
use harmony_k8s::{K8sClient, KubernetesDistribution};
use harmony_types::rfc1123::Rfc1123Name;
use k8s_openapi::api::{
core::v1::{Pod, Secret},
rbac::v1::{ClusterRoleBinding, RoleRef, Subject},
};
use kube::api::{DynamicObject, GroupVersionKind, ObjectMeta};
use kube::api::{GroupVersionKind, ObjectMeta};
use log::{debug, info, trace, warn};
use serde::Serialize;
use tokio::sync::OnceCell;
@@ -29,28 +27,7 @@ use crate::{
score_cert_management::CertificateManagementScore,
},
k3d::K3DInstallationScore,
k8s::ingress::{K8sIngressScore, PathType},
monitoring::{
grafana::{grafana::Grafana, helm::helm_grafana::grafana_helm_chart_score},
kube_prometheus::crd::{
crd_alertmanager_config::CRDPrometheus,
crd_grafana::{
Grafana as GrafanaCRD, GrafanaCom, GrafanaDashboard,
GrafanaDashboardDatasource, GrafanaDashboardSpec, GrafanaDatasource,
GrafanaDatasourceConfig, GrafanaDatasourceJsonData,
GrafanaDatasourceSecureJsonData, GrafanaDatasourceSpec, GrafanaSpec,
},
crd_prometheuses::LabelSelector,
prometheus_operator::prometheus_operator_helm_chart_score,
rhob_alertmanager_config::RHOBObservability,
service_monitor::ServiceMonitor,
},
},
okd::{crd::ingresses_config::Ingress as IngressResource, route::OKDTlsPassthroughScore},
prometheus::{
k8s_prometheus_alerting_score::K8sPrometheusCRDAlertingScore,
prometheus::PrometheusMonitoring, rhob_alerting_score::RHOBAlertingScore,
},
},
score::Score,
topology::{TlsRoute, TlsRouter, ingress::Ingress},
@@ -59,7 +36,7 @@ use crate::{
use super::super::{
DeploymentTarget, HelmCommand, K8sclient, MultiTargetTopology, PreparationError,
PreparationOutcome, Topology,
oberservability::monitoring::AlertReceiver,
k8s::K8sClient,
tenant::{
TenantConfig, TenantManager,
k8s::K8sTenantManager,
@@ -76,6 +53,13 @@ struct K8sState {
message: String,
}
#[derive(Debug, Clone, Serialize)]
pub enum KubernetesDistribution {
OpenshiftFamily,
K3sFamily,
Default,
}
#[derive(Debug, Clone)]
enum K8sSource {
LocalK3d,
@@ -166,216 +150,6 @@ impl TlsRouter for K8sAnywhereTopology {
}
}
#[async_trait]
impl Grafana for K8sAnywhereTopology {
async fn ensure_grafana_operator(
&self,
inventory: &Inventory,
) -> Result<PreparationOutcome, PreparationError> {
debug!("ensure grafana operator");
let client = self.k8s_client().await.unwrap();
let grafana_gvk = GroupVersionKind {
group: "grafana.integreatly.org".to_string(),
version: "v1beta1".to_string(),
kind: "Grafana".to_string(),
};
let name = "grafanas.grafana.integreatly.org";
let ns = "grafana";
let grafana_crd = client
.get_resource_json_value(name, Some(ns), &grafana_gvk)
.await;
match grafana_crd {
Ok(_) => {
return Ok(PreparationOutcome::Success {
details: "Found grafana CRDs in cluster".to_string(),
});
}
Err(_) => {
return self
.install_grafana_operator(inventory, Some("grafana"))
.await;
}
};
}
async fn install_grafana(&self) -> Result<PreparationOutcome, PreparationError> {
let ns = "grafana";
let mut label = BTreeMap::new();
label.insert("dashboards".to_string(), "grafana".to_string());
let label_selector = LabelSelector {
match_labels: label.clone(),
match_expressions: vec![],
};
let client = self.k8s_client().await?;
let grafana = self.build_grafana(ns, &label);
client.apply(&grafana, Some(ns)).await?;
//TODO change this to an ensure-ready check or something better than just a timeout
client
.wait_until_deployment_ready(
"grafana-grafana-deployment",
Some("grafana"),
Some(Duration::from_secs(30)),
)
.await?;
let sa_name = "grafana-grafana-sa";
let token_secret_name = "grafana-sa-token-secret";
let sa_token_secret = self.build_sa_token_secret(token_secret_name, sa_name, ns);
client.apply(&sa_token_secret, Some(ns)).await?;
let secret_gvk = GroupVersionKind {
group: "".to_string(),
version: "v1".to_string(),
kind: "Secret".to_string(),
};
let secret = client
.get_resource_json_value(token_secret_name, Some(ns), &secret_gvk)
.await?;
let token = format!(
"Bearer {}",
self.extract_and_normalize_token(&secret).unwrap()
);
debug!("creating grafana clusterrole binding");
let clusterrolebinding =
self.build_cluster_rolebinding(sa_name, "cluster-monitoring-view", ns);
client.apply(&clusterrolebinding, Some(ns)).await?;
debug!("creating grafana datasource crd");
let thanos_url = format!(
"https://{}",
self.get_domain("thanos-querier-openshift-monitoring")
.await
.unwrap()
);
let thanos_openshift_datasource = self.build_grafana_datasource(
"thanos-openshift-monitoring",
ns,
&label_selector,
&thanos_url,
&token,
);
client.apply(&thanos_openshift_datasource, Some(ns)).await?;
debug!("creating grafana dashboard crd");
let dashboard = self.build_grafana_dashboard(ns, &label_selector);
client.apply(&dashboard, Some(ns)).await?;
debug!("creating grafana ingress");
let grafana_ingress = self.build_grafana_ingress(ns).await;
grafana_ingress
.interpret(&Inventory::empty(), self)
.await
.map_err(|e| PreparationError::new(e.to_string()))?;
Ok(PreparationOutcome::Success {
details: "Installed grafana composants".to_string(),
})
}
}
#[async_trait]
impl PrometheusMonitoring<CRDPrometheus> for K8sAnywhereTopology {
async fn install_prometheus(
&self,
sender: &CRDPrometheus,
_inventory: &Inventory,
_receivers: Option<Vec<Box<dyn AlertReceiver<CRDPrometheus>>>>,
) -> Result<PreparationOutcome, PreparationError> {
let client = self.k8s_client().await?;
for monitor in sender.service_monitor.iter() {
client
.apply(monitor, Some(&sender.namespace))
.await
.map_err(|e| PreparationError::new(e.to_string()))?;
}
Ok(PreparationOutcome::Success {
details: "successfuly installed prometheus components".to_string(),
})
}
async fn ensure_prometheus_operator(
&self,
sender: &CRDPrometheus,
_inventory: &Inventory,
) -> Result<PreparationOutcome, PreparationError> {
let po_result = self.ensure_prometheus_operator(sender).await?;
match po_result {
PreparationOutcome::Success { details: _ } => {
debug!("Detected prometheus crds operator present in cluster.");
return Ok(po_result);
}
PreparationOutcome::Noop => {
debug!("Skipping Prometheus CR installation due to missing operator.");
return Ok(po_result);
}
}
}
}
#[async_trait]
impl PrometheusMonitoring<RHOBObservability> for K8sAnywhereTopology {
async fn install_prometheus(
&self,
sender: &RHOBObservability,
inventory: &Inventory,
receivers: Option<Vec<Box<dyn AlertReceiver<RHOBObservability>>>>,
) -> Result<PreparationOutcome, PreparationError> {
let po_result = self.ensure_cluster_observability_operator(sender).await?;
if po_result == PreparationOutcome::Noop {
debug!("Skipping Prometheus CR installation due to missing operator.");
return Ok(po_result);
}
let result = self
.get_cluster_observability_operator_prometheus_application_score(
sender.clone(),
receivers,
)
.await
.interpret(inventory, self)
.await;
match result {
Ok(outcome) => match outcome.status {
InterpretStatus::SUCCESS => Ok(PreparationOutcome::Success {
details: outcome.message,
}),
InterpretStatus::NOOP => Ok(PreparationOutcome::Noop),
_ => Err(PreparationError::new(outcome.message)),
},
Err(err) => Err(PreparationError::new(err.to_string())),
}
}
async fn ensure_prometheus_operator(
&self,
sender: &RHOBObservability,
inventory: &Inventory,
) -> Result<PreparationOutcome, PreparationError> {
todo!()
}
}
impl Serialize for K8sAnywhereTopology {
fn serialize<S>(&self, _serializer: S) -> Result<S::Ok, S::Error>
where
@@ -580,23 +354,6 @@ impl K8sAnywhereTopology {
}
}
fn extract_and_normalize_token(&self, secret: &DynamicObject) -> Option<String> {
let token_b64 = secret
.data
.get("token")
.or_else(|| secret.data.get("data").and_then(|d| d.get("token")))
.and_then(|v| v.as_str())?;
let bytes = general_purpose::STANDARD.decode(token_b64).ok()?;
let s = String::from_utf8(bytes).ok()?;
let cleaned = s
.trim_matches(|c: char| c.is_whitespace() || c == '\0')
.to_string();
Some(cleaned)
}
pub async fn get_k8s_distribution(&self) -> Result<KubernetesDistribution, PreparationError> {
self.k8s_client()
.await?
@@ -656,141 +413,6 @@ impl K8sAnywhereTopology {
}
}
fn build_grafana_datasource(
&self,
name: &str,
ns: &str,
label_selector: &LabelSelector,
url: &str,
token: &str,
) -> GrafanaDatasource {
let mut json_data = BTreeMap::new();
json_data.insert("timeInterval".to_string(), "5s".to_string());
GrafanaDatasource {
metadata: ObjectMeta {
name: Some(name.to_string()),
namespace: Some(ns.to_string()),
..Default::default()
},
spec: GrafanaDatasourceSpec {
instance_selector: label_selector.clone(),
allow_cross_namespace_import: Some(true),
values_from: None,
datasource: GrafanaDatasourceConfig {
access: "proxy".to_string(),
name: name.to_string(),
r#type: "prometheus".to_string(),
url: url.to_string(),
database: None,
json_data: Some(GrafanaDatasourceJsonData {
time_interval: Some("60s".to_string()),
http_header_name1: Some("Authorization".to_string()),
tls_skip_verify: Some(true),
oauth_pass_thru: Some(true),
}),
secure_json_data: Some(GrafanaDatasourceSecureJsonData {
http_header_value1: Some(format!("Bearer {token}")),
}),
is_default: Some(false),
editable: Some(true),
},
},
}
}
fn build_grafana_dashboard(
&self,
ns: &str,
label_selector: &LabelSelector,
) -> GrafanaDashboard {
let graf_dashboard = GrafanaDashboard {
metadata: ObjectMeta {
name: Some(format!("grafana-dashboard-{}", ns)),
namespace: Some(ns.to_string()),
..Default::default()
},
spec: GrafanaDashboardSpec {
resync_period: Some("30s".to_string()),
instance_selector: label_selector.clone(),
datasources: Some(vec![GrafanaDashboardDatasource {
input_name: "DS_PROMETHEUS".to_string(),
datasource_name: "thanos-openshift-monitoring".to_string(),
}]),
json: None,
grafana_com: Some(GrafanaCom {
id: 17406,
revision: None,
}),
},
};
graf_dashboard
}
fn build_grafana(&self, ns: &str, labels: &BTreeMap<String, String>) -> GrafanaCRD {
let grafana = GrafanaCRD {
metadata: ObjectMeta {
name: Some(format!("grafana-{}", ns)),
namespace: Some(ns.to_string()),
labels: Some(labels.clone()),
..Default::default()
},
spec: GrafanaSpec {
config: None,
admin_user: None,
admin_password: None,
ingress: None,
persistence: None,
resources: None,
},
};
grafana
}
async fn build_grafana_ingress(&self, ns: &str) -> K8sIngressScore {
let domain = self.get_domain(&format!("grafana-{}", ns)).await.unwrap();
let name = format!("{}-grafana", ns);
let backend_service = format!("grafana-{}-service", ns);
K8sIngressScore {
name: fqdn::fqdn!(&name),
host: fqdn::fqdn!(&domain),
backend_service: fqdn::fqdn!(&backend_service),
port: 3000,
path: Some("/".to_string()),
path_type: Some(PathType::Prefix),
namespace: Some(fqdn::fqdn!(&ns)),
ingress_class_name: Some("openshift-default".to_string()),
}
}
async fn get_cluster_observability_operator_prometheus_application_score(
&self,
sender: RHOBObservability,
receivers: Option<Vec<Box<dyn AlertReceiver<RHOBObservability>>>>,
) -> RHOBAlertingScore {
RHOBAlertingScore {
sender,
receivers: receivers.unwrap_or_default(),
service_monitors: vec![],
prometheus_rules: vec![],
}
}
async fn get_k8s_prometheus_application_score(
&self,
sender: CRDPrometheus,
receivers: Option<Vec<Box<dyn AlertReceiver<CRDPrometheus>>>>,
service_monitors: Option<Vec<ServiceMonitor>>,
) -> K8sPrometheusCRDAlertingScore {
return K8sPrometheusCRDAlertingScore {
sender,
receivers: receivers.unwrap_or_default(),
service_monitors: service_monitors.unwrap_or_default(),
prometheus_rules: vec![],
};
}
async fn openshift_ingress_operator_available(&self) -> Result<(), PreparationError> {
let client = self.k8s_client().await?;
let gvk = GroupVersionKind {
@@ -956,137 +578,6 @@ impl K8sAnywhereTopology {
)),
}
}
async fn ensure_cluster_observability_operator(
&self,
sender: &RHOBObservability,
) -> Result<PreparationOutcome, PreparationError> {
let status = Command::new("sh")
.args(["-c", "kubectl get crd -A | grep -i rhobs"])
.status()
.map_err(|e| PreparationError::new(format!("could not connect to cluster: {}", e)))?;
if !status.success() {
if let Some(Some(k8s_state)) = self.k8s_state.get() {
match k8s_state.source {
K8sSource::LocalK3d => {
warn!(
"Installing observability operator is not supported on LocalK3d source"
);
return Ok(PreparationOutcome::Noop);
debug!("installing cluster observability operator");
todo!();
let op_score =
prometheus_operator_helm_chart_score(sender.namespace.clone());
let result = op_score.interpret(&Inventory::empty(), self).await;
return match result {
Ok(outcome) => match outcome.status {
InterpretStatus::SUCCESS => Ok(PreparationOutcome::Success {
details: "installed cluster observability operator".into(),
}),
InterpretStatus::NOOP => Ok(PreparationOutcome::Noop),
_ => Err(PreparationError::new(
"failed to install cluster observability operator (unknown error)".into(),
)),
},
Err(err) => Err(PreparationError::new(err.to_string())),
};
}
K8sSource::Kubeconfig => {
debug!(
"unable to install cluster observability operator, contact cluster admin"
);
return Ok(PreparationOutcome::Noop);
}
}
} else {
warn!(
"Unable to detect k8s_state. Skipping Cluster Observability Operator install."
);
return Ok(PreparationOutcome::Noop);
}
}
debug!("Cluster Observability Operator is already present, skipping install");
Ok(PreparationOutcome::Success {
details: "cluster observability operator present in cluster".into(),
})
}
async fn ensure_prometheus_operator(
&self,
sender: &CRDPrometheus,
) -> Result<PreparationOutcome, PreparationError> {
let status = Command::new("sh")
.args(["-c", "kubectl get crd -A | grep -i prometheuses"])
.status()
.map_err(|e| PreparationError::new(format!("could not connect to cluster: {}", e)))?;
if !status.success() {
if let Some(Some(k8s_state)) = self.k8s_state.get() {
match k8s_state.source {
K8sSource::LocalK3d => {
debug!("installing prometheus operator");
let op_score =
prometheus_operator_helm_chart_score(sender.namespace.clone());
let result = op_score.interpret(&Inventory::empty(), self).await;
return match result {
Ok(outcome) => match outcome.status {
InterpretStatus::SUCCESS => Ok(PreparationOutcome::Success {
details: "installed prometheus operator".into(),
}),
InterpretStatus::NOOP => Ok(PreparationOutcome::Noop),
_ => Err(PreparationError::new(
"failed to install prometheus operator (unknown error)".into(),
)),
},
Err(err) => Err(PreparationError::new(err.to_string())),
};
}
K8sSource::Kubeconfig => {
debug!("unable to install prometheus operator, contact cluster admin");
return Ok(PreparationOutcome::Noop);
}
}
} else {
warn!("Unable to detect k8s_state. Skipping Prometheus Operator install.");
return Ok(PreparationOutcome::Noop);
}
}
debug!("Prometheus operator is already present, skipping install");
Ok(PreparationOutcome::Success {
details: "prometheus operator present in cluster".into(),
})
}
async fn install_grafana_operator(
&self,
inventory: &Inventory,
ns: Option<&str>,
) -> Result<PreparationOutcome, PreparationError> {
let namespace = ns.unwrap_or("grafana");
info!("installing grafana operator in ns {namespace}");
let tenant = self.get_k8s_tenant_manager()?.get_tenant_config().await;
let mut namespace_scope = false;
if tenant.is_some() {
namespace_scope = true;
}
let _grafana_operator_score = grafana_helm_chart_score(namespace, namespace_scope)
.interpret(inventory, self)
.await
.map_err(|e| PreparationError::new(e.to_string()));
Ok(PreparationOutcome::Success {
details: format!(
"Successfully installed grafana operator in ns {}",
ns.unwrap()
),
})
}
}
#[derive(Clone, Debug)]

View File

@@ -1,4 +1,5 @@
mod k8s_anywhere;
pub mod nats;
pub mod observability;
mod postgres;
pub use k8s_anywhere::*;

View File

@@ -0,0 +1,147 @@
use async_trait::async_trait;
use crate::{
inventory::Inventory,
modules::monitoring::grafana::{
grafana::Grafana,
k8s::{
score_ensure_grafana_ready::GrafanaK8sEnsureReadyScore,
score_grafana_alert_receiver::GrafanaK8sReceiverScore,
score_grafana_datasource::GrafanaK8sDatasourceScore,
score_grafana_rule::GrafanaK8sRuleScore, score_install_grafana::GrafanaK8sInstallScore,
},
},
score::Score,
topology::{
K8sAnywhereTopology, PreparationError, PreparationOutcome,
monitoring::{AlertReceiver, AlertRule, Observability, ScrapeTarget},
},
};
#[async_trait]
impl Observability<Grafana> for K8sAnywhereTopology {
async fn install_alert_sender(
&self,
sender: &Grafana,
inventory: &Inventory,
) -> Result<PreparationOutcome, PreparationError> {
let score = GrafanaK8sInstallScore {
sender: sender.clone(),
};
score
.create_interpret()
.execute(inventory, self)
.await
.map_err(|e| PreparationError::new(format!("Grafana not installed: {}", e)))?;
Ok(PreparationOutcome::Success {
details: "Successfully installed grafana alert sender".to_string(),
})
}
async fn install_receivers(
&self,
sender: &Grafana,
inventory: &Inventory,
receivers: Option<Vec<Box<dyn AlertReceiver<Grafana>>>>,
) -> Result<PreparationOutcome, PreparationError> {
let receivers = match receivers {
Some(r) if !r.is_empty() => r,
_ => return Ok(PreparationOutcome::Noop),
};
for receiver in receivers {
let score = GrafanaK8sReceiverScore {
receiver,
sender: sender.clone(),
};
score
.create_interpret()
.execute(inventory, self)
.await
.map_err(|e| PreparationError::new(format!("Failed to install receiver: {}", e)))?;
}
Ok(PreparationOutcome::Success {
details: "All alert receivers installed successfully".to_string(),
})
}
async fn install_rules(
&self,
sender: &Grafana,
inventory: &Inventory,
rules: Option<Vec<Box<dyn AlertRule<Grafana>>>>,
) -> Result<PreparationOutcome, PreparationError> {
let rules = match rules {
Some(r) if !r.is_empty() => r,
_ => return Ok(PreparationOutcome::Noop),
};
for rule in rules {
let score = GrafanaK8sRuleScore {
sender: sender.clone(),
rule,
};
score
.create_interpret()
.execute(inventory, self)
.await
.map_err(|e| PreparationError::new(format!("Failed to install rule: {}", e)))?;
}
Ok(PreparationOutcome::Success {
details: "All alert rules installed successfully".to_string(),
})
}
async fn add_scrape_targets(
&self,
sender: &Grafana,
inventory: &Inventory,
scrape_targets: Option<Vec<Box<dyn ScrapeTarget<Grafana>>>>,
) -> Result<PreparationOutcome, PreparationError> {
let scrape_targets = match scrape_targets {
Some(r) if !r.is_empty() => r,
_ => return Ok(PreparationOutcome::Noop),
};
for scrape_target in scrape_targets {
let score = GrafanaK8sDatasourceScore {
scrape_target,
sender: sender.clone(),
};
score
.create_interpret()
.execute(inventory, self)
.await
.map_err(|e| PreparationError::new(format!("Failed to add DataSource: {}", e)))?;
}
Ok(PreparationOutcome::Success {
details: "All datasources installed successfully".to_string(),
})
}
async fn ensure_monitoring_installed(
&self,
sender: &Grafana,
inventory: &Inventory,
) -> Result<PreparationOutcome, PreparationError> {
let score = GrafanaK8sEnsureReadyScore {
sender: sender.clone(),
};
score
.create_interpret()
.execute(inventory, self)
.await
.map_err(|e| PreparationError::new(format!("Grafana not ready: {}", e)))?;
Ok(PreparationOutcome::Success {
details: "Grafana Ready".to_string(),
})
}
}
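// Illustrative sketch of how an `Observability` implementation such as the one
// above is typically driven: sender first, then receivers, rules and scrape
// targets, then a readiness check. The generic bounds and ordering here are
// assumptions for illustration; the concrete sender, receivers, rules and
// targets are built elsewhere.
async fn apply_observability<S, T: Observability<S>>(
    topology: &T,
    sender: &S,
    inventory: &Inventory,
    receivers: Option<Vec<Box<dyn AlertReceiver<S>>>>,
    rules: Option<Vec<Box<dyn AlertRule<S>>>>,
    targets: Option<Vec<Box<dyn ScrapeTarget<S>>>>,
) -> Result<PreparationOutcome, PreparationError> {
    topology.install_alert_sender(sender, inventory).await?;
    topology.install_receivers(sender, inventory, receivers).await?;
    topology.install_rules(sender, inventory, rules).await?;
    topology.add_scrape_targets(sender, inventory, targets).await?;
    topology.ensure_monitoring_installed(sender, inventory).await
}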

View File

@@ -0,0 +1,142 @@
use async_trait::async_trait;
use crate::{
inventory::Inventory,
modules::monitoring::kube_prometheus::{
KubePrometheus, helm::kube_prometheus_helm_chart::kube_prometheus_helm_chart_score,
score_kube_prometheus_alert_receivers::KubePrometheusReceiverScore,
score_kube_prometheus_ensure_ready::KubePrometheusEnsureReadyScore,
score_kube_prometheus_rule::KubePrometheusRuleScore,
score_kube_prometheus_scrape_target::KubePrometheusScrapeTargetScore,
},
score::Score,
topology::{
K8sAnywhereTopology, PreparationError, PreparationOutcome,
monitoring::{AlertReceiver, AlertRule, Observability, ScrapeTarget},
},
};
#[async_trait]
impl Observability<KubePrometheus> for K8sAnywhereTopology {
async fn install_alert_sender(
&self,
sender: &KubePrometheus,
inventory: &Inventory,
) -> Result<PreparationOutcome, PreparationError> {
kube_prometheus_helm_chart_score(sender.config.clone())
.create_interpret()
.execute(inventory, self)
.await
.map_err(|e| PreparationError::new(e.to_string()))?;
Ok(PreparationOutcome::Success {
details: "Successfully installed kubeprometheus alert sender".to_string(),
})
}
async fn install_receivers(
&self,
sender: &KubePrometheus,
inventory: &Inventory,
receivers: Option<Vec<Box<dyn AlertReceiver<KubePrometheus>>>>,
) -> Result<PreparationOutcome, PreparationError> {
let receivers = match receivers {
Some(r) if !r.is_empty() => r,
_ => return Ok(PreparationOutcome::Noop),
};
for receiver in receivers {
let score = KubePrometheusReceiverScore {
receiver,
sender: sender.clone(),
};
score
.create_interpret()
.execute(inventory, self)
.await
.map_err(|e| PreparationError::new(format!("Failed to install receiver: {}", e)))?;
}
Ok(PreparationOutcome::Success {
details: "All alert receivers installed successfully".to_string(),
})
}
async fn install_rules(
&self,
sender: &KubePrometheus,
inventory: &Inventory,
rules: Option<Vec<Box<dyn AlertRule<KubePrometheus>>>>,
) -> Result<PreparationOutcome, PreparationError> {
let rules = match rules {
Some(r) if !r.is_empty() => r,
_ => return Ok(PreparationOutcome::Noop),
};
for rule in rules {
let score = KubePrometheusRuleScore {
sender: sender.clone(),
rule,
};
score
.create_interpret()
.execute(inventory, self)
.await
.map_err(|e| PreparationError::new(format!("Failed to install rule: {}", e)))?;
}
Ok(PreparationOutcome::Success {
details: "All alert rules installed successfully".to_string(),
})
}
async fn add_scrape_targets(
&self,
sender: &KubePrometheus,
inventory: &Inventory,
scrape_targets: Option<Vec<Box<dyn ScrapeTarget<KubePrometheus>>>>,
) -> Result<PreparationOutcome, PreparationError> {
let scrape_targets = match scrape_targets {
Some(r) if !r.is_empty() => r,
_ => return Ok(PreparationOutcome::Noop),
};
for scrape_target in scrape_targets {
let score = KubePrometheusScrapeTargetScore {
scrape_target,
sender: sender.clone(),
};
score
.create_interpret()
.execute(inventory, self)
.await
.map_err(|e| PreparationError::new(format!("Failed to add scrape target: {}", e)))?;
}
Ok(PreparationOutcome::Success {
details: "All scrap targets installed successfully".to_string(),
})
}
async fn ensure_monitoring_installed(
&self,
sender: &KubePrometheus,
inventory: &Inventory,
) -> Result<PreparationOutcome, PreparationError> {
let score = KubePrometheusEnsureReadyScore {
sender: sender.clone(),
};
score
.create_interpret()
.execute(inventory, self)
.await
.map_err(|e| PreparationError::new(format!("KubePrometheus not ready: {}", e)))?;
Ok(PreparationOutcome::Success {
details: "KubePrometheus Ready".to_string(),
})
}
}

View File

@@ -0,0 +1,5 @@
pub mod grafana;
pub mod kube_prometheus;
pub mod openshift_monitoring;
pub mod prometheus;
pub mod redhat_cluster_observability;

View File

@@ -0,0 +1,142 @@
use async_trait::async_trait;
use log::info;
use crate::score::Score;
use crate::{
inventory::Inventory,
modules::monitoring::okd::{
OpenshiftClusterAlertSender,
score_enable_cluster_monitoring::OpenshiftEnableClusterMonitoringScore,
score_openshift_alert_rule::OpenshiftAlertRuleScore,
score_openshift_receiver::OpenshiftReceiverScore,
score_openshift_scrape_target::OpenshiftScrapeTargetScore,
score_user_workload::OpenshiftUserWorkloadMonitoring,
score_verify_user_workload_monitoring::VerifyUserWorkload,
},
topology::{
K8sAnywhereTopology, PreparationError, PreparationOutcome,
monitoring::{AlertReceiver, AlertRule, Observability, ScrapeTarget},
},
};
#[async_trait]
impl Observability<OpenshiftClusterAlertSender> for K8sAnywhereTopology {
async fn install_alert_sender(
&self,
_sender: &OpenshiftClusterAlertSender,
inventory: &Inventory,
) -> Result<PreparationOutcome, PreparationError> {
info!("enabling cluster monitoring");
let cluster_monitoring_score = OpenshiftEnableClusterMonitoringScore {};
cluster_monitoring_score
.create_interpret()
.execute(inventory, self)
.await
.map_err(|e| PreparationError { msg: e.to_string() })?;
info!("enabling user workload monitoring");
let user_workload_score = OpenshiftUserWorkloadMonitoring {};
user_workload_score
.create_interpret()
.execute(inventory, self)
.await
.map_err(|e| PreparationError { msg: e.to_string() })?;
Ok(PreparationOutcome::Success {
details: "Successfully configured cluster monitoring".to_string(),
})
}
async fn install_receivers(
&self,
_sender: &OpenshiftClusterAlertSender,
inventory: &Inventory,
receivers: Option<Vec<Box<dyn AlertReceiver<OpenshiftClusterAlertSender>>>>,
) -> Result<PreparationOutcome, PreparationError> {
if let Some(receivers) = receivers {
for receiver in receivers {
info!("Installing receiver {}", receiver.name());
let receiver_score = OpenshiftReceiverScore { receiver };
receiver_score
.create_interpret()
.execute(inventory, self)
.await
.map_err(|e| PreparationError { msg: e.to_string() })?;
}
Ok(PreparationOutcome::Success {
details: "Successfully installed receivers for OpenshiftClusterMonitoring"
.to_string(),
})
} else {
Ok(PreparationOutcome::Noop)
}
}
async fn install_rules(
&self,
_sender: &OpenshiftClusterAlertSender,
inventory: &Inventory,
rules: Option<Vec<Box<dyn AlertRule<OpenshiftClusterAlertSender>>>>,
) -> Result<PreparationOutcome, PreparationError> {
if let Some(rules) = rules {
for rule in rules {
info!("Installing rule ");
let rule_score = OpenshiftAlertRuleScore { rule: rule };
rule_score
.create_interpret()
.execute(inventory, self)
.await
.map_err(|e| PreparationError { msg: e.to_string() })?;
}
Ok(PreparationOutcome::Success {
details: "Successfully installed rules for OpenshiftClusterMonitoring".to_string(),
})
} else {
Ok(PreparationOutcome::Noop)
}
}
async fn add_scrape_targets(
&self,
_sender: &OpenshiftClusterAlertSender,
inventory: &Inventory,
scrape_targets: Option<Vec<Box<dyn ScrapeTarget<OpenshiftClusterAlertSender>>>>,
) -> Result<PreparationOutcome, PreparationError> {
if let Some(scrape_targets) = scrape_targets {
for scrape_target in scrape_targets {
info!("Installing scrape target");
let scrape_target_score = OpenshiftScrapeTargetScore { scrape_target };
scrape_target_score
.create_interpret()
.execute(inventory, self)
.await
.map_err(|e| PreparationError { msg: e.to_string() })?;
}
Ok(PreparationOutcome::Success {
details: "Successfully added scrape targets for OpenshiftClusterMonitoring"
.to_string(),
})
} else {
Ok(PreparationOutcome::Noop)
}
}
async fn ensure_monitoring_installed(
&self,
_sender: &OpenshiftClusterAlertSender,
inventory: &Inventory,
) -> Result<PreparationOutcome, PreparationError> {
let verify_monitoring_score = VerifyUserWorkload {};
info!("Verifying user workload and cluster monitoring installed");
verify_monitoring_score
.create_interpret()
.execute(inventory, self)
.await
.map_err(|e| PreparationError { msg: e.to_string() })?;
Ok(PreparationOutcome::Success {
details: "OpenshiftClusterMonitoring ready".to_string(),
})
}
}

View File

@@ -0,0 +1,147 @@
use async_trait::async_trait;
use crate::{
inventory::Inventory,
modules::monitoring::prometheus::{
Prometheus, score_prometheus_alert_receivers::PrometheusReceiverScore,
score_prometheus_ensure_ready::PrometheusEnsureReadyScore,
score_prometheus_install::PrometheusInstallScore,
score_prometheus_rule::PrometheusRuleScore,
score_prometheus_scrape_target::PrometheusScrapeTargetScore,
},
score::Score,
topology::{
K8sAnywhereTopology, PreparationError, PreparationOutcome,
monitoring::{AlertReceiver, AlertRule, Observability, ScrapeTarget},
},
};
#[async_trait]
impl Observability<Prometheus> for K8sAnywhereTopology {
async fn install_alert_sender(
&self,
sender: &Prometheus,
inventory: &Inventory,
) -> Result<PreparationOutcome, PreparationError> {
let score = PrometheusInstallScore {
sender: sender.clone(),
};
score
.create_interpret()
.execute(inventory, self)
.await
.map_err(|e| PreparationError::new(format!("Prometheus not installed: {}", e)))?;
Ok(PreparationOutcome::Success {
details: "Successfully installed kubeprometheus alert sender".to_string(),
})
}
async fn install_receivers(
&self,
sender: &Prometheus,
inventory: &Inventory,
receivers: Option<Vec<Box<dyn AlertReceiver<Prometheus>>>>,
) -> Result<PreparationOutcome, PreparationError> {
let receivers = match receivers {
Some(r) if !r.is_empty() => r,
_ => return Ok(PreparationOutcome::Noop),
};
for receiver in receivers {
let score = PrometheusReceiverScore {
receiver,
sender: sender.clone(),
};
score
.create_interpret()
.execute(inventory, self)
.await
.map_err(|e| PreparationError::new(format!("Failed to install receiver: {}", e)))?;
}
Ok(PreparationOutcome::Success {
details: "All alert receivers installed successfully".to_string(),
})
}
async fn install_rules(
&self,
sender: &Prometheus,
inventory: &Inventory,
rules: Option<Vec<Box<dyn AlertRule<Prometheus>>>>,
) -> Result<PreparationOutcome, PreparationError> {
let rules = match rules {
Some(r) if !r.is_empty() => r,
_ => return Ok(PreparationOutcome::Noop),
};
for rule in rules {
let score = PrometheusRuleScore {
sender: sender.clone(),
rule,
};
score
.create_interpret()
.execute(inventory, self)
.await
.map_err(|e| PreparationError::new(format!("Failed to install rule: {}", e)))?;
}
Ok(PreparationOutcome::Success {
details: "All alert rules installed successfully".to_string(),
})
}
async fn add_scrape_targets(
&self,
sender: &Prometheus,
inventory: &Inventory,
scrape_targets: Option<Vec<Box<dyn ScrapeTarget<Prometheus>>>>,
) -> Result<PreparationOutcome, PreparationError> {
let scrape_targets = match scrape_targets {
Some(r) if !r.is_empty() => r,
_ => return Ok(PreparationOutcome::Noop),
};
for scrape_target in scrape_targets {
let score = PrometheusScrapeTargetScore {
scrape_target,
sender: sender.clone(),
};
score
.create_interpret()
.execute(inventory, self)
.await
.map_err(|e| PreparationError::new(format!("Failed to add scrape target: {}", e)))?;
}
Ok(PreparationOutcome::Success {
details: "All scrap targets installed successfully".to_string(),
})
}
async fn ensure_monitoring_installed(
&self,
sender: &Prometheus,
inventory: &Inventory,
) -> Result<PreparationOutcome, PreparationError> {
let score = PrometheusEnsureReadyScore {
sender: sender.clone(),
};
score
.create_interpret()
.execute(inventory, self)
.await
.map_err(|e| PreparationError::new(format!("Prometheus not ready: {}", e)))?;
Ok(PreparationOutcome::Success {
details: "Prometheus Ready".to_string(),
})
}
}
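
For reference, a hedged sketch of how a caller could drive this implementation directly; `topology`, `inventory`, and `receiver` are assumed to be in scope, and the sender construction mirrors the `Prometheus { config }` shape used by the application Monitoring feature elsewhere in this change:

// Sketch only: not part of this change.
let sender = Prometheus {
    config: Arc::new(Mutex::new(PrometheusConfig::new())),
};
topology.install_alert_sender(&sender, &inventory).await?;
topology
    .install_receivers(&sender, &inventory, Some(vec![receiver]))
    .await?;
topology
    .ensure_monitoring_installed(&sender, &inventory)
    .await?;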

View File

@@ -0,0 +1,114 @@
use crate::{
modules::monitoring::red_hat_cluster_observability::{
score_alert_receiver::RedHatClusterObservabilityReceiverScore,
score_coo_monitoring_stack::RedHatClusterObservabilityMonitoringStackScore,
},
score::Score,
};
use async_trait::async_trait;
use log::info;
use crate::{
inventory::Inventory,
modules::monitoring::red_hat_cluster_observability::{
RedHatClusterObservability,
score_redhat_cluster_observability_operator::RedHatClusterObservabilityOperatorScore,
},
topology::{
K8sAnywhereTopology, PreparationError, PreparationOutcome,
monitoring::{AlertReceiver, AlertRule, Observability, ScrapeTarget},
},
};
#[async_trait]
impl Observability<RedHatClusterObservability> for K8sAnywhereTopology {
async fn install_alert_sender(
&self,
sender: &RedHatClusterObservability,
inventory: &Inventory,
) -> Result<PreparationOutcome, PreparationError> {
info!("Verifying Redhat Cluster Observability Operator");
let coo_score = RedHatClusterObservabilityOperatorScore::default();
coo_score
.create_interpret()
.execute(inventory, self)
.await
.map_err(|e| PreparationError::new(e.to_string()))?;
info!(
"Installing Cluster Observability Operator Monitoring Stack in ns {}",
sender.namespace.clone()
);
let coo_monitoring_stack_score = RedHatClusterObservabilityMonitoringStackScore {
namespace: sender.namespace.clone(),
resource_selector: sender.resource_selector.clone(),
};
coo_monitoring_stack_score
.create_interpret()
.execute(inventory, self)
.await
.map_err(|e| PreparationError::new(e.to_string()))?;
Ok(PreparationOutcome::Success {
details: "Successfully installed RedHatClusterObservability Operator".to_string(),
})
}
async fn install_receivers(
&self,
sender: &RedHatClusterObservability,
inventory: &Inventory,
receivers: Option<Vec<Box<dyn AlertReceiver<RedHatClusterObservability>>>>,
) -> Result<PreparationOutcome, PreparationError> {
if let Some(receivers) = receivers {
for receiver in receivers {
info!("Installing receiver {}", receiver.name());
let receiver_score = RedHatClusterObservabilityReceiverScore {
receiver,
sender: sender.clone(),
};
receiver_score
.create_interpret()
.execute(inventory, self)
.await
.map_err(|e| PreparationError::new(e.to_string()))?;
}
Ok(PreparationOutcome::Success {
details: "Successfully installed receivers for OpenshiftClusterMonitoring"
.to_string(),
})
} else {
Ok(PreparationOutcome::Noop)
}
}
async fn install_rules(
&self,
_sender: &RedHatClusterObservability,
_inventory: &Inventory,
_rules: Option<Vec<Box<dyn AlertRule<RedHatClusterObservability>>>>,
) -> Result<PreparationOutcome, PreparationError> {
todo!()
}
async fn add_scrape_targets(
&self,
_sender: &RedHatClusterObservability,
_inventory: &Inventory,
_scrape_targets: Option<Vec<Box<dyn ScrapeTarget<RedHatClusterObservability>>>>,
) -> Result<PreparationOutcome, PreparationError> {
todo!()
}
async fn ensure_monitoring_installed(
&self,
_sender: &RedHatClusterObservability,
_inventory: &Inventory,
) -> Result<PreparationOutcome, PreparationError> {
todo!()
}
}

View File

@@ -1,6 +1,7 @@
use async_trait::async_trait;
use crate::{
interpret::Outcome,
inventory::Inventory,
modules::postgresql::{
K8sPostgreSQLScore,

View File

@@ -106,7 +106,6 @@ pub enum SSL {
#[derive(Debug, Clone, PartialEq, Serialize)]
pub enum HealthCheck {
/// HTTP(None, "/healthz/ready", HttpMethod::GET, HttpStatusCode::Success2xx, SSL::Disabled)
HTTP(Option<u16>, String, HttpMethod, HttpStatusCode, SSL),
HTTP(String, HttpMethod, HttpStatusCode, SSL),
TCP(Option<u16>),
}

View File

@@ -2,6 +2,7 @@ pub mod decentralized;
mod failover;
mod ha_cluster;
pub mod ingress;
pub mod monitoring;
pub mod node_exporter;
pub mod opnsense;
pub use failover::*;
@@ -11,11 +12,11 @@ mod http;
pub mod installable;
mod k8s_anywhere;
mod localhost;
pub mod oberservability;
pub mod tenant;
use derive_new::new;
pub use k8s_anywhere::*;
pub use localhost::*;
pub mod k8s;
mod load_balancer;
pub mod router;
mod tftp;

View File

@@ -0,0 +1,234 @@
use std::{
any::Any,
collections::{BTreeMap, HashMap},
net::IpAddr,
};
use async_trait::async_trait;
use kube::api::DynamicObject;
use log::{debug, info};
use serde::{Deserialize, Serialize};
use crate::{
data::Version,
interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome},
inventory::Inventory,
topology::{PreparationError, PreparationOutcome, Topology, installable::Installable},
};
use harmony_types::id::Id;
/// Defines the application that sends alerts to receivers,
/// for example Prometheus.
#[async_trait]
pub trait AlertSender: Send + Sync + std::fmt::Debug {
fn name(&self) -> String;
}
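// Illustrative only (not part of this change): a backend implements `AlertSender`
// by exposing a stable name, which the alerting interpret below uses for logging:
//
//     #[derive(Debug)]
//     struct MySender;
//
//     #[async_trait]
//     impl AlertSender for MySender {
//         fn name(&self) -> String {
//             "my-sender".to_string()
//         }
//     }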
/// Trait which defines how an alert sender is implemented for a specific topology
#[async_trait]
pub trait Observability<S: AlertSender> {
async fn install_alert_sender(
&self,
sender: &S,
inventory: &Inventory,
) -> Result<PreparationOutcome, PreparationError>;
async fn install_receivers(
&self,
sender: &S,
inventory: &Inventory,
receivers: Option<Vec<Box<dyn AlertReceiver<S>>>>,
) -> Result<PreparationOutcome, PreparationError>;
async fn install_rules(
&self,
sender: &S,
inventory: &Inventory,
rules: Option<Vec<Box<dyn AlertRule<S>>>>,
) -> Result<PreparationOutcome, PreparationError>;
async fn add_scrape_targets(
&self,
sender: &S,
inventory: &Inventory,
scrape_targets: Option<Vec<Box<dyn ScrapeTarget<S>>>>,
) -> Result<PreparationOutcome, PreparationError>;
async fn ensure_monitoring_installed(
&self,
sender: &S,
inventory: &Inventory,
) -> Result<PreparationOutcome, PreparationError>;
}
/// Defines the entity that receives the alerts from a sender. For example Discord, Slack, etc
///
pub trait AlertReceiver<S: AlertSender>: std::fmt::Debug + Send + Sync {
fn build_route(&self) -> Result<serde_yaml::Value, InterpretError>;
fn build_receiver(&self) -> Result<serde_yaml::Value, InterpretError>;
fn name(&self) -> String;
fn clone_box(&self) -> Box<dyn AlertReceiver<S>>;
}
/// Defines a generic rule that can be applied to a sender, such as a Prometheus alert rule
pub trait AlertRule<S: AlertSender>: std::fmt::Debug + Send + Sync {
fn build_rule(&self) -> Result<serde_json::Value, InterpretError>;
fn name(&self) -> String;
fn clone_box(&self) -> Box<dyn AlertRule<S>>;
}
/// A generic scrape target that can be added to a sender to scrape metrics from, for example a
/// server outside of the cluster
pub trait ScrapeTarget<S: AlertSender>: std::fmt::Debug + Send + Sync {
fn build_scrape_target(&self) -> Result<ExternalScrapeTarget, InterpretError>;
fn name(&self) -> String;
fn clone_box(&self) -> Box<dyn ScrapeTarget<S>>;
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExternalScrapeTarget {
pub ip: IpAddr,
pub port: i32,
pub interval: Option<String>,
pub path: Option<String>,
pub labels: Option<BTreeMap<String, String>>,
}
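// Example shape of an external target (values are illustrative, not part of this change):
//
//     let target = ExternalScrapeTarget {
//         ip: "192.168.1.10".parse().unwrap(),
//         port: 9100,
//         interval: Some("30s".to_string()),
//         path: Some("/metrics".to_string()),
//         labels: Some(BTreeMap::from([("env".to_string(), "lab".to_string())])),
//     };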
/// Alerting interpret to install an alert sender on a given topology
#[derive(Debug)]
pub struct AlertingInterpret<S: AlertSender> {
pub sender: S,
pub receivers: Vec<Box<dyn AlertReceiver<S>>>,
pub rules: Vec<Box<dyn AlertRule<S>>>,
pub scrape_targets: Option<Vec<Box<dyn ScrapeTarget<S>>>>,
}
#[async_trait]
impl<S: AlertSender, T: Topology + Observability<S>> Interpret<T> for AlertingInterpret<S> {
async fn execute(
&self,
inventory: &Inventory,
topology: &T,
) -> Result<Outcome, InterpretError> {
info!("Configuring alert sender {}", self.sender.name());
topology
.install_alert_sender(&self.sender, inventory)
.await?;
info!("Installing receivers");
topology
.install_receivers(&self.sender, inventory, Some(self.receivers.clone()))
.await?;
info!("Installing rules");
topology
.install_rules(&self.sender, inventory, Some(self.rules.clone()))
.await?;
info!("Adding extra scrape targets");
topology
.add_scrape_targets(&self.sender, inventory, self.scrape_targets.clone())
.await?;
info!("Ensuring alert sender {} is ready", self.sender.name());
topology
.ensure_monitoring_installed(&self.sender, inventory)
.await?;
Ok(Outcome::success(format!(
"successfully installed alert sender {}",
self.sender.name()
)))
}
fn get_name(&self) -> InterpretName {
InterpretName::Alerting
}
fn get_version(&self) -> Version {
todo!()
}
fn get_status(&self) -> InterpretStatus {
todo!()
}
fn get_children(&self) -> Vec<Id> {
todo!()
}
}
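// Construction sketch (placeholders only, not part of this change): any topology
// implementing both `Topology` and `Observability<S>` can run this interpret.
//
//     let interpret = AlertingInterpret {
//         sender: my_sender,
//         receivers: vec![],
//         rules: vec![],
//         scrape_targets: None,
//     };
//     interpret.execute(&inventory, &topology).await?;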
impl<S: AlertSender> Clone for Box<dyn AlertReceiver<S>> {
fn clone(&self) -> Self {
self.clone_box()
}
}
impl<S: AlertSender> Clone for Box<dyn AlertRule<S>> {
fn clone(&self) -> Self {
self.clone_box()
}
}
impl<S: AlertSender> Clone for Box<dyn ScrapeTarget<S>> {
fn clone(&self) -> Self {
self.clone_box()
}
}
/// Generic routing that can map to various alert sender backends
#[derive(Debug, Clone, Serialize)]
pub struct AlertRoute {
pub receiver: String,
#[serde(skip_serializing_if = "Vec::is_empty")]
pub matchers: Vec<AlertMatcher>,
#[serde(skip_serializing_if = "Vec::is_empty")]
pub group_by: Vec<String>,
#[serde(skip_serializing_if = "Option::is_none")]
pub repeat_interval: Option<String>,
#[serde(rename = "continue")]
pub continue_matching: bool,
#[serde(skip_serializing_if = "Vec::is_empty")]
pub children: Vec<AlertRoute>,
}
impl AlertRoute {
pub fn default(name: String) -> Self {
Self {
receiver: name,
matchers: vec![],
group_by: vec![],
repeat_interval: Some("30s".to_string()),
continue_matching: true,
children: vec![],
}
}
}
#[derive(Debug, Clone, Serialize)]
pub struct AlertMatcher {
pub label: String,
pub operator: MatchOp,
pub value: String,
}
#[derive(Debug, Clone)]
pub enum MatchOp {
Eq,
NotEq,
Regex,
}
impl Serialize for MatchOp {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
let op = match self {
MatchOp::Eq => "=",
MatchOp::NotEq => "!=",
MatchOp::Regex => "=~",
};
serializer.serialize_str(op)
}
}
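
As a quick reference for the custom `Serialize` impl above, a matcher built with `MatchOp::NotEq` serializes to the Alertmanager-style operator string (values are illustrative):

let matcher = AlertMatcher {
    label: "alertname".to_string(),
    operator: MatchOp::NotEq,
    value: "Watchdog".to_string(),
};
// serde_yaml::to_string(&matcher) yields approximately:
//   label: alertname
//   operator: '!='
//   value: Watchdog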

View File

@@ -9,7 +9,6 @@ use std::{
use async_trait::async_trait;
use brocade::PortOperatingMode;
use derive_new::new;
use harmony_k8s::K8sClient;
use harmony_types::{
id::Id,
net::{IpAddress, MacAddress},
@@ -19,7 +18,7 @@ use serde::Serialize;
use crate::executors::ExecutorError;
use super::LogicalHost;
use super::{LogicalHost, k8s::K8sClient};
#[derive(Debug)]
pub struct DHCPStaticEntry {

View File

@@ -1 +0,0 @@
pub mod monitoring;

View File

@@ -1,101 +0,0 @@
use std::{any::Any, collections::HashMap};
use async_trait::async_trait;
use kube::api::DynamicObject;
use log::debug;
use crate::{
data::Version,
interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome},
inventory::Inventory,
topology::{Topology, installable::Installable},
};
use harmony_types::id::Id;
#[async_trait]
pub trait AlertSender: Send + Sync + std::fmt::Debug {
fn name(&self) -> String;
}
#[derive(Debug)]
pub struct AlertingInterpret<S: AlertSender> {
pub sender: S,
pub receivers: Vec<Box<dyn AlertReceiver<S>>>,
pub rules: Vec<Box<dyn AlertRule<S>>>,
pub scrape_targets: Option<Vec<Box<dyn ScrapeTarget<S>>>>,
}
#[async_trait]
impl<S: AlertSender + Installable<T>, T: Topology> Interpret<T> for AlertingInterpret<S> {
async fn execute(
&self,
inventory: &Inventory,
topology: &T,
) -> Result<Outcome, InterpretError> {
debug!("hit sender configure for AlertingInterpret");
self.sender.configure(inventory, topology).await?;
for receiver in self.receivers.iter() {
receiver.install(&self.sender).await?;
}
for rule in self.rules.iter() {
debug!("installing rule: {:#?}", rule);
rule.install(&self.sender).await?;
}
if let Some(targets) = &self.scrape_targets {
for target in targets.iter() {
debug!("installing scrape_target: {:#?}", target);
target.install(&self.sender).await?;
}
}
self.sender.ensure_installed(inventory, topology).await?;
Ok(Outcome::success(format!(
"successfully installed alert sender {}",
self.sender.name()
)))
}
fn get_name(&self) -> InterpretName {
InterpretName::Alerting
}
fn get_version(&self) -> Version {
todo!()
}
fn get_status(&self) -> InterpretStatus {
todo!()
}
fn get_children(&self) -> Vec<Id> {
todo!()
}
}
#[async_trait]
pub trait AlertReceiver<S: AlertSender>: std::fmt::Debug + Send + Sync {
async fn install(&self, sender: &S) -> Result<Outcome, InterpretError>;
fn name(&self) -> String;
fn clone_box(&self) -> Box<dyn AlertReceiver<S>>;
fn as_any(&self) -> &dyn Any;
fn as_alertmanager_receiver(&self) -> Result<AlertManagerReceiver, String>;
}
#[derive(Debug)]
pub struct AlertManagerReceiver {
pub receiver_config: serde_json::Value,
// FIXME we should not leak k8s here. DynamicObject is k8s specific
pub additional_ressources: Vec<DynamicObject>,
pub route_config: serde_json::Value,
}
#[async_trait]
pub trait AlertRule<S: AlertSender>: std::fmt::Debug + Send + Sync {
async fn install(&self, sender: &S) -> Result<Outcome, InterpretError>;
fn clone_box(&self) -> Box<dyn AlertRule<S>>;
}
#[async_trait]
pub trait ScrapeTarget<S: AlertSender>: std::fmt::Debug + Send + Sync {
async fn install(&self, sender: &S) -> Result<Outcome, InterpretError>;
fn clone_box(&self) -> Box<dyn ScrapeTarget<S>>;
}

View File

@@ -1,8 +1,10 @@
use std::sync::Arc;
use crate::executors::ExecutorError;
use crate::{
executors::ExecutorError,
topology::k8s::{ApplyStrategy, K8sClient},
};
use async_trait::async_trait;
use harmony_k8s::K8sClient;
use k8s_openapi::{
api::{
core::v1::{LimitRange, Namespace, ResourceQuota},
@@ -12,7 +14,7 @@ use k8s_openapi::{
},
apimachinery::pkg::util::intstr::IntOrString,
};
use kube::Resource;
use kube::{Resource, api::DynamicObject};
use log::debug;
use serde::de::DeserializeOwned;
use serde_json::json;
@@ -57,6 +59,7 @@ impl K8sTenantManager {
) -> Result<K, ExecutorError>
where
<K as kube::Resource>::DynamicType: Default,
<K as kube::Resource>::Scope: ApplyStrategy<K>,
{
self.apply_labels(&mut resource, config);
self.k8s_client

View File

@@ -5,7 +5,6 @@ use std::{
use askama::Template;
use async_trait::async_trait;
use harmony_k8s::{DrainOptions, K8sClient, NodeFile};
use harmony_types::id::Id;
use k8s_openapi::api::core::v1::Node;
use kube::{
@@ -16,7 +15,10 @@ use log::{debug, info, warn};
use crate::{
modules::okd::crd::nmstate,
topology::{HostNetworkConfig, NetworkError, NetworkManager},
topology::{
HostNetworkConfig, NetworkError, NetworkManager,
k8s::{DrainOptions, K8sClient, NodeFile},
},
};
/// NetworkManager bond configuration template

View File

@@ -216,15 +216,7 @@ pub(crate) fn get_health_check_for_backend(
SSL::Other(other.to_string())
}
};
let port = haproxy_health_check
.checkport
.content_string()
.parse::<u16>()
.ok();
debug!("Found haproxy healthcheck port {port:?}");
Some(HealthCheck::HTTP(port, path, method, status_code, ssl))
Some(HealthCheck::HTTP(path, method, status_code, ssl))
}
_ => panic!("Received unsupported health check type {}", uppercase),
}
@@ -259,7 +251,7 @@ pub(crate) fn harmony_load_balancer_service_to_haproxy_xml(
// frontend points to backend
let healthcheck = if let Some(health_check) = &service.health_check {
match health_check {
HealthCheck::HTTP(port, path, http_method, _http_status_code, ssl) => {
HealthCheck::HTTP(path, http_method, _http_status_code, ssl) => {
let ssl: MaybeString = match ssl {
SSL::SSL => "ssl".into(),
SSL::SNI => "sslni".into(),
@@ -275,7 +267,6 @@ pub(crate) fn harmony_load_balancer_service_to_haproxy_xml(
http_uri: path.clone().into(),
interval: "2s".to_string(),
ssl,
checkport: MaybeString::from(port.map(|p| p.to_string())),
..Default::default()
};

View File

@@ -1,5 +1,5 @@
use async_trait::async_trait;
use log::{debug, info};
use log::{debug, info, trace};
use serde::Serialize;
use std::path::PathBuf;

View File

@@ -1,5 +1,4 @@
use async_trait::async_trait;
use harmony_k8s::K8sClient;
use harmony_macros::hurl;
use log::{debug, info, trace, warn};
use non_blank_string_rs::NonBlankString;
@@ -15,7 +14,7 @@ use crate::{
helm::chart::{HelmChartScore, HelmRepository},
},
score::Score,
topology::{HelmCommand, K8sclient, Topology, ingress::Ingress},
topology::{HelmCommand, K8sclient, Topology, ingress::Ingress, k8s::K8sClient},
};
use harmony_types::id::Id;

View File

@@ -2,13 +2,15 @@ use crate::modules::application::{
Application, ApplicationFeature, InstallationError, InstallationOutcome,
};
use crate::modules::monitoring::application_monitoring::application_monitoring_score::ApplicationMonitoringScore;
use crate::modules::monitoring::grafana::grafana::Grafana;
use crate::modules::monitoring::kube_prometheus::crd::crd_alertmanager_config::CRDPrometheus;
use crate::modules::monitoring::kube_prometheus::crd::service_monitor::{
ServiceMonitor, ServiceMonitorSpec,
};
use crate::modules::monitoring::prometheus::Prometheus;
use crate::modules::monitoring::prometheus::helm::prometheus_config::PrometheusConfig;
use crate::topology::MultiTargetTopology;
use crate::topology::ingress::Ingress;
use crate::topology::monitoring::AlertReceiver;
use crate::topology::monitoring::Observability;
use crate::{
inventory::Inventory,
modules::monitoring::{
@@ -17,10 +19,6 @@ use crate::{
score::Score,
topology::{HelmCommand, K8sclient, Topology, tenant::TenantManager},
};
use crate::{
modules::prometheus::prometheus::PrometheusMonitoring,
topology::oberservability::monitoring::AlertReceiver,
};
use async_trait::async_trait;
use base64::{Engine as _, engine::general_purpose};
use harmony_secret::SecretManager;
@@ -30,12 +28,12 @@ use kube::api::ObjectMeta;
use log::{debug, info};
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use std::sync::Arc;
use std::sync::{Arc, Mutex};
#[derive(Debug, Clone)]
pub struct Monitoring {
pub application: Arc<dyn Application>,
pub alert_receiver: Vec<Box<dyn AlertReceiver<CRDPrometheus>>>,
pub alert_receiver: Vec<Box<dyn AlertReceiver<Prometheus>>>,
}
#[async_trait]
@@ -46,8 +44,7 @@ impl<
+ TenantManager
+ K8sclient
+ MultiTargetTopology
+ PrometheusMonitoring<CRDPrometheus>
+ Grafana
+ Observability<Prometheus>
+ Ingress
+ std::fmt::Debug,
> ApplicationFeature<T> for Monitoring
@@ -74,10 +71,8 @@ impl<
};
let mut alerting_score = ApplicationMonitoringScore {
sender: CRDPrometheus {
namespace: namespace.clone(),
client: topology.k8s_client().await.unwrap(),
service_monitor: vec![app_service_monitor],
sender: Prometheus {
config: Arc::new(Mutex::new(PrometheusConfig::new())),
},
application: self.application.clone(),
receivers: self.alert_receiver.clone(),
@@ -119,11 +114,12 @@ impl<
),
};
alerting_score.receivers.push(Box::new(ntfy_receiver));
alerting_score
.interpret(&Inventory::empty(), topology)
.await
.map_err(|e| e.to_string())?;
todo!();
// alerting_score.receivers.push(Box::new(ntfy_receiver));
// alerting_score
// .interpret(&Inventory::empty(), topology)
// .await
// .map_err(|e| e.to_string())?;
Ok(InstallationOutcome::success())
}

View File

@@ -3,11 +3,13 @@ use std::sync::Arc;
use crate::modules::application::{
Application, ApplicationFeature, InstallationError, InstallationOutcome,
};
use crate::modules::monitoring::application_monitoring::rhobs_application_monitoring_score::ApplicationRHOBMonitoringScore;
use crate::modules::monitoring::kube_prometheus::crd::rhob_alertmanager_config::RHOBObservability;
use crate::modules::monitoring::red_hat_cluster_observability::RedHatClusterObservability;
use crate::modules::monitoring::red_hat_cluster_observability::redhat_cluster_observability::RedHatClusterObservabilityScore;
use crate::topology::MultiTargetTopology;
use crate::topology::ingress::Ingress;
use crate::topology::monitoring::AlertReceiver;
use crate::topology::monitoring::Observability;
use crate::{
inventory::Inventory,
modules::monitoring::{
@@ -16,10 +18,6 @@ use crate::{
score::Score,
topology::{HelmCommand, K8sclient, Topology, tenant::TenantManager},
};
use crate::{
modules::prometheus::prometheus::PrometheusMonitoring,
topology::oberservability::monitoring::AlertReceiver,
};
use async_trait::async_trait;
use base64::{Engine as _, engine::general_purpose};
use harmony_types::net::Url;
@@ -28,9 +26,10 @@ use log::{debug, info};
#[derive(Debug, Clone)]
pub struct Monitoring {
pub application: Arc<dyn Application>,
pub alert_receiver: Vec<Box<dyn AlertReceiver<RHOBObservability>>>,
pub alert_receiver: Vec<Box<dyn AlertReceiver<RedHatClusterObservability>>>,
}
/// TODO: test this
#[async_trait]
impl<
T: Topology
@@ -41,7 +40,7 @@ impl<
+ MultiTargetTopology
+ Ingress
+ std::fmt::Debug
+ PrometheusMonitoring<RHOBObservability>,
+ Observability<RedHatClusterObservability>,
> ApplicationFeature<T> for Monitoring
{
async fn ensure_installed(
@@ -55,13 +54,14 @@ impl<
.map(|ns| ns.name.clone())
.unwrap_or_else(|| self.application.name());
let mut alerting_score = ApplicationRHOBMonitoringScore {
sender: RHOBObservability {
let mut alerting_score = RedHatClusterObservabilityScore {
sender: RedHatClusterObservability {
namespace: namespace.clone(),
client: topology.k8s_client().await.unwrap(),
resource_selector: todo!(),
},
application: self.application.clone(),
receivers: self.alert_receiver.clone(),
rules: vec![],
scrape_targets: None,
};
let domain = topology
.get_domain("ntfy")

View File

@@ -1,9 +1,8 @@
use std::sync::Arc;
use harmony_k8s::K8sClient;
use log::{debug, info};
use crate::interpret::InterpretError;
use crate::{interpret::InterpretError, topology::k8s::K8sClient};
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum ArgoScope {

View File

@@ -44,12 +44,6 @@ pub struct BrocadeSwitchAuth {
pub password: String,
}
impl BrocadeSwitchAuth {
pub fn user_pass(username: String, password: String) -> Self {
Self { username, password }
}
}
#[derive(Secret, Clone, Debug, JsonSchema, Serialize, Deserialize)]
pub struct BrocadeSnmpAuth {
pub username: String,

View File

@@ -1,4 +1,3 @@
use harmony_k8s::K8sClient;
use std::sync::Arc;
use async_trait::async_trait;
@@ -12,7 +11,7 @@ use crate::{
interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome},
inventory::Inventory,
score::Score,
topology::{K8sclient, Topology},
topology::{K8sclient, Topology, k8s::K8sClient},
};
#[derive(Clone, Debug, Serialize)]

View File

@@ -54,12 +54,6 @@ pub enum HarmonyDiscoveryStrategy {
SUBNET { cidr: cidr::Ipv4Cidr, port: u16 },
}
impl Default for HarmonyDiscoveryStrategy {
fn default() -> Self {
HarmonyDiscoveryStrategy::MDNS
}
}
#[async_trait]
impl<T: Topology> Interpret<T> for DiscoverInventoryAgentInterpret {
async fn execute(

View File

@@ -3,8 +3,7 @@ use std::sync::Arc;
use async_trait::async_trait;
use log::warn;
use crate::topology::{FailoverTopology, K8sclient};
use harmony_k8s::K8sClient;
use crate::topology::{FailoverTopology, K8sclient, k8s::K8sClient};
#[async_trait]
impl<T: K8sclient> K8sclient for FailoverTopology<T> {

View File

@@ -1,5 +1,5 @@
use async_trait::async_trait;
use k8s_openapi::ResourceScope;
use k8s_openapi::NamespaceResourceScope;
use kube::Resource;
use log::info;
use serde::{Serialize, de::DeserializeOwned};
@@ -9,7 +9,7 @@ use crate::{
interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome},
inventory::Inventory,
score::Score,
topology::{K8sclient, Topology},
topology::{K8sclient, Topology, k8s::ApplyStrategy},
};
use harmony_types::id::Id;
@@ -29,7 +29,7 @@ impl<K: Resource + std::fmt::Debug> K8sResourceScore<K> {
}
impl<
K: Resource<Scope: ResourceScope>
K: Resource
+ std::fmt::Debug
+ Sync
+ DeserializeOwned
@@ -42,6 +42,7 @@ impl<
> Score<T> for K8sResourceScore<K>
where
<K as kube::Resource>::DynamicType: Default,
<K as kube::Resource>::Scope: ApplyStrategy<K>,
{
fn create_interpret(&self) -> Box<dyn Interpret<T>> {
Box::new(K8sResourceInterpret {
@@ -61,7 +62,7 @@ pub struct K8sResourceInterpret<K: Resource + std::fmt::Debug + Sync + Send> {
#[async_trait]
impl<
K: Resource<Scope: ResourceScope>
K: Resource
+ Clone
+ std::fmt::Debug
+ DeserializeOwned
@@ -73,6 +74,7 @@ impl<
> Interpret<T> for K8sResourceInterpret<K>
where
<K as kube::Resource>::DynamicType: Default,
<K as kube::Resource>::Scope: ApplyStrategy<K>,
{
async fn execute(
&self,
@@ -109,7 +111,7 @@ where
topology
.k8s_client()
.await
.map_err(|e| InterpretError::new(format!("Failed to get k8s client : {e}")))?
.expect("Environment should provide enough information to instanciate a client")
.apply_many(&self.score.resource, self.score.namespace.as_deref())
.await?;

View File

@@ -15,13 +15,9 @@ pub mod load_balancer;
pub mod monitoring;
pub mod nats;
pub mod network;
pub mod node_health;
pub mod okd;
pub mod openbao;
pub mod opnsense;
pub mod postgresql;
pub mod prometheus;
pub mod storage;
pub mod tenant;
pub mod tftp;
pub mod zitadel;

View File

@@ -1,98 +1,54 @@
use std::any::Any;
use std::collections::{BTreeMap, HashMap};
use std::collections::BTreeMap;
use async_trait::async_trait;
use harmony_types::k8s_name::K8sName;
use crate::modules::monitoring::kube_prometheus::KubePrometheus;
use crate::modules::monitoring::okd::OpenshiftClusterAlertSender;
use crate::modules::monitoring::red_hat_cluster_observability::RedHatClusterObservability;
use crate::topology::monitoring::{AlertRoute, MatchOp};
use crate::{interpret::InterpretError, topology::monitoring::AlertReceiver};
use harmony_types::net::Url;
use k8s_openapi::api::core::v1::Secret;
use kube::Resource;
use kube::api::{DynamicObject, ObjectMeta};
use log::{debug, trace};
use serde::Serialize;
use serde_json::json;
use serde_yaml::{Mapping, Value};
use crate::infra::kube::kube_resource_to_dynamic;
use crate::modules::monitoring::kube_prometheus::crd::crd_alertmanager_config::{
AlertmanagerConfig, AlertmanagerConfigSpec, CRDPrometheus,
};
use crate::modules::monitoring::kube_prometheus::crd::rhob_alertmanager_config::RHOBObservability;
use crate::modules::monitoring::okd::OpenshiftClusterAlertSender;
use crate::topology::oberservability::monitoring::AlertManagerReceiver;
use crate::{
interpret::{InterpretError, Outcome},
modules::monitoring::{
kube_prometheus::{
prometheus::{KubePrometheus, KubePrometheusReceiver},
types::{AlertChannelConfig, AlertManagerChannelConfig},
},
prometheus::prometheus::{Prometheus, PrometheusReceiver},
},
topology::oberservability::monitoring::AlertReceiver,
};
use harmony_types::net::Url;
#[derive(Debug, Clone, Serialize)]
pub struct DiscordWebhook {
pub name: K8sName,
pub struct DiscordReceiver {
pub name: String,
pub url: Url,
pub selectors: Vec<HashMap<String, String>>,
pub route: AlertRoute,
}
impl DiscordWebhook {
fn get_receiver_config(&self) -> Result<AlertManagerReceiver, String> {
let secret_name = format!("{}-secret", self.name.clone());
let webhook_key = format!("{}", self.url.clone());
impl AlertReceiver<OpenshiftClusterAlertSender> for DiscordReceiver {
fn build_route(&self) -> Result<serde_yaml::Value, InterpretError> {
let matchers: Vec<String> = self
.route
.matchers
.iter()
.map(|m| match m.operator {
MatchOp::Eq => format!("{} = {}", m.label, m.value),
MatchOp::NotEq => format!("{} != {}", m.label, m.value),
MatchOp::Regex => format!("{} =~ {}", m.label, m.value),
})
.collect();
let mut string_data = BTreeMap::new();
string_data.insert("webhook-url".to_string(), webhook_key.clone());
let secret = Secret {
metadata: kube::core::ObjectMeta {
name: Some(secret_name.clone()),
..Default::default()
},
string_data: Some(string_data),
type_: Some("Opaque".to_string()),
..Default::default()
};
let mut matchers: Vec<String> = Vec::new();
for selector in &self.selectors {
trace!("selector: {:#?}", selector);
for (k, v) in selector {
matchers.push(format!("{} = {}", k, v));
}
}
Ok(AlertManagerReceiver {
additional_ressources: vec![kube_resource_to_dynamic(&secret)?],
receiver_config: json!({
"name": self.name,
"discord_configs": [
{
"webhook_url": self.url.clone(),
"title": "{{ template \"discord.default.title\" . }}",
"message": "{{ template \"discord.default.message\" . }}"
}
]
}),
route_config: json!({
"receiver": self.name,
"matchers": matchers,
}),
})
let route_block = serde_yaml::to_value(json!({
"receiver": self.name,
"matchers": matchers,
}))
.map_err(|e| InterpretError::new(e.to_string()))?;
Ok(route_block)
}
}
#[async_trait]
impl AlertReceiver<OpenshiftClusterAlertSender> for DiscordWebhook {
async fn install(
&self,
sender: &OpenshiftClusterAlertSender,
) -> Result<Outcome, InterpretError> {
todo!()
fn build_receiver(&self) -> Result<serde_yaml::Value, InterpretError> {
let receiver_block = serde_yaml::to_value(json!({
"name": self.name,
"discord_configs": [{
"webhook_url": format!("{}", self.url),
"title": "{{ template \"discord.default.title\" . }}",
"message": "{{ template \"discord.default.message\" . }}"
}]
}))
.map_err(|e| InterpretError::new(e.to_string()))?;
Ok(receiver_block)
}
fn name(&self) -> String {
@@ -102,93 +58,16 @@ impl AlertReceiver<OpenshiftClusterAlertSender> for DiscordWebhook {
fn clone_box(&self) -> Box<dyn AlertReceiver<OpenshiftClusterAlertSender>> {
Box::new(self.clone())
}
fn as_any(&self) -> &dyn Any {
todo!()
}
fn as_alertmanager_receiver(&self) -> Result<AlertManagerReceiver, String> {
self.get_receiver_config()
}
}
#[async_trait]
impl AlertReceiver<RHOBObservability> for DiscordWebhook {
fn as_alertmanager_receiver(&self) -> Result<AlertManagerReceiver, String> {
todo!()
impl AlertReceiver<RedHatClusterObservability> for DiscordReceiver {
fn build_route(&self) -> Result<serde_yaml::Value, InterpretError> {
serde_yaml::to_value(&self.route).map_err(|e| InterpretError::new(e.to_string()))
}
async fn install(&self, sender: &RHOBObservability) -> Result<Outcome, InterpretError> {
let ns = sender.namespace.clone();
let config = self.get_receiver_config()?;
for resource in config.additional_ressources.iter() {
todo!("can I apply a dynamicresource");
// sender.client.apply(resource, Some(&ns)).await;
}
let spec = crate::modules::monitoring::kube_prometheus::crd::rhob_alertmanager_config::AlertmanagerConfigSpec {
data: json!({
"route": {
"receiver": self.name,
},
"receivers": [
config.receiver_config
]
}),
};
let alertmanager_configs = crate::modules::monitoring::kube_prometheus::crd::rhob_alertmanager_config::AlertmanagerConfig {
metadata: ObjectMeta {
name: Some(self.name.clone().to_string()),
labels: Some(std::collections::BTreeMap::from([(
"alertmanagerConfig".to_string(),
"enabled".to_string(),
)])),
namespace: Some(sender.namespace.clone()),
..Default::default()
},
spec,
};
debug!(
"alertmanager_configs yaml:\n{:#?}",
serde_yaml::to_string(&alertmanager_configs)
);
debug!(
"alert manager configs: \n{:#?}",
alertmanager_configs.clone()
);
sender
.client
.apply(&alertmanager_configs, Some(&sender.namespace))
.await?;
Ok(Outcome::success(format!(
"installed rhob-alertmanagerconfigs for {}",
self.name
)))
}
fn name(&self) -> String {
"webhook-receiver".to_string()
}
fn clone_box(&self) -> Box<dyn AlertReceiver<RHOBObservability>> {
Box::new(self.clone())
}
fn as_any(&self) -> &dyn Any {
self
}
}
#[async_trait]
impl AlertReceiver<CRDPrometheus> for DiscordWebhook {
fn as_alertmanager_receiver(&self) -> Result<AlertManagerReceiver, String> {
todo!()
}
async fn install(&self, sender: &CRDPrometheus) -> Result<Outcome, InterpretError> {
let ns = sender.namespace.clone();
fn build_receiver(&self) -> Result<serde_yaml::Value, InterpretError> {
// FIXME: this secret needs to be applied so that the Discord configs for the
// RedHatCO CRD AlertmanagerConfigs can access the URL
let secret_name = format!("{}-secret", self.name.clone());
let webhook_key = format!("{}", self.url.clone());
@@ -205,206 +84,54 @@ impl AlertReceiver<CRDPrometheus> for DiscordWebhook {
..Default::default()
};
let _ = sender.client.apply(&secret, Some(&ns)).await;
let spec = AlertmanagerConfigSpec {
data: json!({
"route": {
"receiver": self.name,
},
"receivers": [
{
"name": self.name,
"discordConfigs": [
{
"apiURL": {
"name": secret_name,
"key": "webhook-url",
},
"title": "{{ template \"discord.default.title\" . }}",
"message": "{{ template \"discord.default.message\" . }}"
}
]
}
]
}),
};
let alertmanager_configs = AlertmanagerConfig {
metadata: ObjectMeta {
name: Some(self.name.clone().to_string()),
labels: Some(std::collections::BTreeMap::from([(
"alertmanagerConfig".to_string(),
"enabled".to_string(),
)])),
namespace: Some(ns),
..Default::default()
},
spec,
};
sender
.client
.apply(&alertmanager_configs, Some(&sender.namespace))
.await?;
Ok(Outcome::success(format!(
"installed crd-alertmanagerconfigs for {}",
self.name
)))
let receiver_config = json!({
"name": self.name,
"discordConfigs": [
{
"apiURL": {
"key": "webhook-url",
"name": format!("{}-secret", self.name)
},
"title": "{{ template \"discord.default.title\" . }}",
"message": "{{ template \"discord.default.message\" . }}"
}
]
});
serde_yaml::to_value(receiver_config).map_err(|e| InterpretError::new(e.to_string()))
}
fn name(&self) -> String {
"discord-webhook".to_string()
self.name.clone()
}
fn clone_box(&self) -> Box<dyn AlertReceiver<CRDPrometheus>> {
fn clone_box(&self) -> Box<dyn AlertReceiver<RedHatClusterObservability>> {
Box::new(self.clone())
}
fn as_any(&self) -> &dyn Any {
self
}
}
#[async_trait]
impl AlertReceiver<Prometheus> for DiscordWebhook {
fn as_alertmanager_receiver(&self) -> Result<AlertManagerReceiver, String> {
todo!()
impl AlertReceiver<KubePrometheus> for DiscordReceiver {
fn build_route(&self) -> Result<serde_yaml::Value, InterpretError> {
serde_yaml::to_value(self.route.clone()).map_err(|e| InterpretError::new(e.to_string()))
}
async fn install(&self, sender: &Prometheus) -> Result<Outcome, InterpretError> {
sender.install_receiver(self).await
fn build_receiver(&self) -> Result<serde_yaml::Value, InterpretError> {
let receiver_block = serde_yaml::to_value(json!({
"name": self.name,
"discord_configs": [{
"webhook_url": format!("{}", self.url),
"title": "{{ template \"discord.default.title\" . }}",
"message": "{{ template \"discord.default.message\" . }}"
}]
}))
.map_err(|e| InterpretError::new(e.to_string()))?;
Ok(receiver_block)
}
fn name(&self) -> String {
"discord-webhook".to_string()
self.name.clone()
}
fn clone_box(&self) -> Box<dyn AlertReceiver<Prometheus>> {
Box::new(self.clone())
}
fn as_any(&self) -> &dyn Any {
self
}
}
#[async_trait]
impl PrometheusReceiver for DiscordWebhook {
fn name(&self) -> String {
self.name.clone().to_string()
}
async fn configure_receiver(&self) -> AlertManagerChannelConfig {
self.get_config().await
}
}
#[async_trait]
impl AlertReceiver<KubePrometheus> for DiscordWebhook {
fn as_alertmanager_receiver(&self) -> Result<AlertManagerReceiver, String> {
todo!()
}
async fn install(&self, sender: &KubePrometheus) -> Result<Outcome, InterpretError> {
sender.install_receiver(self).await
}
fn clone_box(&self) -> Box<dyn AlertReceiver<KubePrometheus>> {
Box::new(self.clone())
}
fn name(&self) -> String {
"discord-webhook".to_string()
}
fn as_any(&self) -> &dyn Any {
self
}
}
#[async_trait]
impl KubePrometheusReceiver for DiscordWebhook {
fn name(&self) -> String {
self.name.clone().to_string()
}
async fn configure_receiver(&self) -> AlertManagerChannelConfig {
self.get_config().await
}
}
#[async_trait]
impl AlertChannelConfig for DiscordWebhook {
async fn get_config(&self) -> AlertManagerChannelConfig {
let channel_global_config = None;
let channel_receiver = self.alert_channel_receiver().await;
let channel_route = self.alert_channel_route().await;
AlertManagerChannelConfig {
channel_global_config,
channel_receiver,
channel_route,
}
}
}
impl DiscordWebhook {
async fn alert_channel_route(&self) -> serde_yaml::Value {
let mut route = Mapping::new();
route.insert(
Value::String("receiver".to_string()),
Value::String(self.name.clone().to_string()),
);
route.insert(
Value::String("matchers".to_string()),
Value::Sequence(vec![Value::String("alertname!=Watchdog".to_string())]),
);
route.insert(Value::String("continue".to_string()), Value::Bool(true));
Value::Mapping(route)
}
async fn alert_channel_receiver(&self) -> serde_yaml::Value {
let mut receiver = Mapping::new();
receiver.insert(
Value::String("name".to_string()),
Value::String(self.name.clone().to_string()),
);
let mut discord_config = Mapping::new();
discord_config.insert(
Value::String("webhook_url".to_string()),
Value::String(self.url.to_string()),
);
receiver.insert(
Value::String("discord_configs".to_string()),
Value::Sequence(vec![Value::Mapping(discord_config)]),
);
Value::Mapping(receiver)
}
}
#[cfg(test)]
mod tests {
use super::*;
#[tokio::test]
async fn discord_serialize_should_match() {
let discord_receiver = DiscordWebhook {
name: K8sName("test-discord".to_string()),
url: Url::Url(url::Url::parse("https://discord.i.dont.exist.com").unwrap()),
selectors: vec![],
};
let discord_receiver_receiver =
serde_yaml::to_string(&discord_receiver.alert_channel_receiver().await).unwrap();
println!("receiver \n{:#}", discord_receiver_receiver);
let discord_receiver_receiver_yaml = r#"name: test-discord
discord_configs:
- webhook_url: https://discord.i.dont.exist.com/
"#
.to_string();
let discord_receiver_route =
serde_yaml::to_string(&discord_receiver.alert_channel_route().await).unwrap();
println!("route \n{:#}", discord_receiver_route);
let discord_receiver_route_yaml = r#"receiver: test-discord
matchers:
- alertname!=Watchdog
continue: true
"#
.to_string();
assert_eq!(discord_receiver_receiver, discord_receiver_receiver_yaml);
assert_eq!(discord_receiver_route, discord_receiver_route_yaml);
}
}

View File

@@ -1,25 +1,13 @@
use std::any::Any;
use async_trait::async_trait;
use kube::api::ObjectMeta;
use log::debug;
use serde::Serialize;
use serde_json::json;
use serde_yaml::{Mapping, Value};
use crate::{
interpret::{InterpretError, Outcome},
interpret::InterpretError,
modules::monitoring::{
kube_prometheus::{
crd::{
crd_alertmanager_config::CRDPrometheus, rhob_alertmanager_config::RHOBObservability,
},
prometheus::{KubePrometheus, KubePrometheusReceiver},
types::{AlertChannelConfig, AlertManagerChannelConfig},
},
prometheus::prometheus::{Prometheus, PrometheusReceiver},
kube_prometheus::KubePrometheus, okd::OpenshiftClusterAlertSender, prometheus::Prometheus,
red_hat_cluster_observability::RedHatClusterObservability,
},
topology::oberservability::monitoring::{AlertManagerReceiver, AlertReceiver},
topology::monitoring::AlertReceiver,
};
use harmony_types::net::Url;
@@ -29,279 +17,104 @@ pub struct WebhookReceiver {
pub url: Url,
}
#[async_trait]
impl AlertReceiver<RHOBObservability> for WebhookReceiver {
fn as_alertmanager_receiver(&self) -> Result<AlertManagerReceiver, String> {
todo!()
}
async fn install(&self, sender: &RHOBObservability) -> Result<Outcome, InterpretError> {
let spec = crate::modules::monitoring::kube_prometheus::crd::rhob_alertmanager_config::AlertmanagerConfigSpec {
data: json!({
"route": {
"receiver": self.name,
},
"receivers": [
{
"name": self.name,
"webhookConfigs": [
{
"url": self.url,
"httpConfig": {
"tlsConfig": {
"insecureSkipVerify": true
}
}
}
]
impl WebhookReceiver {
fn build_receiver(&self) -> serde_json::Value {
json!({
"name": self.name,
"webhookConfigs": [
{
"url": self.url,
"httpConfig": {
"tlsConfig": {
"insecureSkipVerify": true
}
]
}),
};
let alertmanager_configs = crate::modules::monitoring::kube_prometheus::crd::rhob_alertmanager_config::AlertmanagerConfig {
metadata: ObjectMeta {
name: Some(self.name.clone()),
labels: Some(std::collections::BTreeMap::from([(
"alertmanagerConfig".to_string(),
"enabled".to_string(),
)])),
namespace: Some(sender.namespace.clone()),
..Default::default()
},
spec,
};
debug!(
"alert manager configs: \n{:#?}",
alertmanager_configs.clone()
);
sender
.client
.apply(&alertmanager_configs, Some(&sender.namespace))
.await?;
Ok(Outcome::success(format!(
"installed rhob-alertmanagerconfigs for {}",
self.name
)))
}
}
]})
}
fn name(&self) -> String {
"webhook-receiver".to_string()
}
fn clone_box(&self) -> Box<dyn AlertReceiver<RHOBObservability>> {
Box::new(self.clone())
}
fn as_any(&self) -> &dyn Any {
self
fn build_route(&self) -> serde_json::Value {
json!({
"name": self.name})
}
}
#[async_trait]
impl AlertReceiver<CRDPrometheus> for WebhookReceiver {
fn as_alertmanager_receiver(&self) -> Result<AlertManagerReceiver, String> {
todo!()
}
async fn install(&self, sender: &CRDPrometheus) -> Result<Outcome, InterpretError> {
let spec = crate::modules::monitoring::kube_prometheus::crd::crd_alertmanager_config::AlertmanagerConfigSpec {
data: json!({
"route": {
"receiver": self.name,
},
"receivers": [
{
"name": self.name,
"webhookConfigs": [
{
"url": self.url,
}
]
}
]
}),
};
let alertmanager_configs = crate::modules::monitoring::kube_prometheus::crd::crd_alertmanager_config::AlertmanagerConfig {
metadata: ObjectMeta {
name: Some(self.name.clone()),
labels: Some(std::collections::BTreeMap::from([(
"alertmanagerConfig".to_string(),
"enabled".to_string(),
)])),
namespace: Some(sender.namespace.clone()),
..Default::default()
},
spec,
};
debug!(
"alert manager configs: \n{:#?}",
alertmanager_configs.clone()
);
sender
.client
.apply(&alertmanager_configs, Some(&sender.namespace))
.await?;
Ok(Outcome::success(format!(
"installed crd-alertmanagerconfigs for {}",
self.name
)))
impl AlertReceiver<OpenshiftClusterAlertSender> for WebhookReceiver {
fn build_receiver(&self) -> Result<serde_yaml::Value, InterpretError> {
let receiver = self.build_receiver();
serde_yaml::to_value(receiver).map_err(|e| InterpretError::new(e.to_string()))
}
fn name(&self) -> String {
"webhook-receiver".to_string()
fn build_route(&self) -> Result<serde_yaml::Value, InterpretError> {
let route = self.build_route();
serde_yaml::to_value(route).map_err(|e| InterpretError::new(e.to_string()))
}
fn clone_box(&self) -> Box<dyn AlertReceiver<CRDPrometheus>> {
Box::new(self.clone())
}
fn as_any(&self) -> &dyn Any {
self
}
}
#[async_trait]
impl AlertReceiver<Prometheus> for WebhookReceiver {
fn as_alertmanager_receiver(&self) -> Result<AlertManagerReceiver, String> {
todo!()
}
async fn install(&self, sender: &Prometheus) -> Result<Outcome, InterpretError> {
sender.install_receiver(self).await
}
fn name(&self) -> String {
"webhook-receiver".to_string()
}
fn clone_box(&self) -> Box<dyn AlertReceiver<Prometheus>> {
Box::new(self.clone())
}
fn as_any(&self) -> &dyn Any {
self
}
}
#[async_trait]
impl PrometheusReceiver for WebhookReceiver {
fn name(&self) -> String {
self.name.clone()
}
async fn configure_receiver(&self) -> AlertManagerChannelConfig {
self.get_config().await
fn clone_box(&self) -> Box<dyn AlertReceiver<OpenshiftClusterAlertSender>> {
Box::new(self.clone())
}
}
#[async_trait]
impl AlertReceiver<KubePrometheus> for WebhookReceiver {
fn as_alertmanager_receiver(&self) -> Result<AlertManagerReceiver, String> {
todo!()
impl AlertReceiver<RedHatClusterObservability> for WebhookReceiver {
fn build_receiver(&self) -> Result<serde_yaml::Value, InterpretError> {
let receiver = self.build_receiver();
serde_yaml::to_value(receiver).map_err(|e| InterpretError::new(e.to_string()))
}
async fn install(&self, sender: &KubePrometheus) -> Result<Outcome, InterpretError> {
sender.install_receiver(self).await
fn build_route(&self) -> Result<serde_yaml::Value, InterpretError> {
let route = self.build_route();
serde_yaml::to_value(route).map_err(|e| InterpretError::new(e.to_string()))
}
fn name(&self) -> String {
"webhook-receiver".to_string()
self.name.clone()
}
fn clone_box(&self) -> Box<dyn AlertReceiver<RedHatClusterObservability>> {
Box::new(self.clone())
}
}
impl AlertReceiver<KubePrometheus> for WebhookReceiver {
fn build_receiver(&self) -> Result<serde_yaml::Value, InterpretError> {
let receiver = self.build_receiver();
serde_yaml::to_value(receiver).map_err(|e| InterpretError::new(e.to_string()))
}
fn build_route(&self) -> Result<serde_yaml::Value, InterpretError> {
let route = self.build_route();
serde_yaml::to_value(route).map_err(|e| InterpretError::new(e.to_string()))
}
fn name(&self) -> String {
self.name.clone()
}
fn clone_box(&self) -> Box<dyn AlertReceiver<KubePrometheus>> {
Box::new(self.clone())
}
fn as_any(&self) -> &dyn Any {
self
}
}
#[async_trait]
impl KubePrometheusReceiver for WebhookReceiver {
impl AlertReceiver<Prometheus> for WebhookReceiver {
fn build_receiver(&self) -> Result<serde_yaml::Value, InterpretError> {
let receiver = self.build_receiver();
serde_yaml::to_value(receiver).map_err(|e| InterpretError::new(e.to_string()))
}
fn build_route(&self) -> Result<serde_yaml::Value, InterpretError> {
let route = self.build_route();
serde_yaml::to_value(route).map_err(|e| InterpretError::new(e.to_string()))
}
fn name(&self) -> String {
self.name.clone()
}
async fn configure_receiver(&self) -> AlertManagerChannelConfig {
self.get_config().await
}
}
#[async_trait]
impl AlertChannelConfig for WebhookReceiver {
async fn get_config(&self) -> AlertManagerChannelConfig {
let channel_global_config = None;
let channel_receiver = self.alert_channel_receiver().await;
let channel_route = self.alert_channel_route().await;
AlertManagerChannelConfig {
channel_global_config,
channel_receiver,
channel_route,
}
}
}
impl WebhookReceiver {
async fn alert_channel_route(&self) -> serde_yaml::Value {
let mut route = Mapping::new();
route.insert(
Value::String("receiver".to_string()),
Value::String(self.name.clone()),
);
route.insert(
Value::String("matchers".to_string()),
Value::Sequence(vec![Value::String("alertname!=Watchdog".to_string())]),
);
route.insert(Value::String("continue".to_string()), Value::Bool(true));
Value::Mapping(route)
}
async fn alert_channel_receiver(&self) -> serde_yaml::Value {
let mut receiver = Mapping::new();
receiver.insert(
Value::String("name".to_string()),
Value::String(self.name.clone()),
);
let mut webhook_config = Mapping::new();
webhook_config.insert(
Value::String("url".to_string()),
Value::String(self.url.to_string()),
);
receiver.insert(
Value::String("webhook_configs".to_string()),
Value::Sequence(vec![Value::Mapping(webhook_config)]),
);
Value::Mapping(receiver)
}
}
#[cfg(test)]
mod tests {
use super::*;
#[tokio::test]
async fn webhook_serialize_should_match() {
let webhook_receiver = WebhookReceiver {
name: "test-webhook".to_string(),
url: Url::Url(url::Url::parse("https://webhook.i.dont.exist.com").unwrap()),
};
let webhook_receiver_receiver =
serde_yaml::to_string(&webhook_receiver.alert_channel_receiver().await).unwrap();
println!("receiver \n{:#}", webhook_receiver_receiver);
let webhook_receiver_receiver_yaml = r#"name: test-webhook
webhook_configs:
- url: https://webhook.i.dont.exist.com/
"#
.to_string();
let webhook_receiver_route =
serde_yaml::to_string(&webhook_receiver.alert_channel_route().await).unwrap();
println!("route \n{:#}", webhook_receiver_route);
let webhook_receiver_route_yaml = r#"receiver: test-webhook
matchers:
- alertname!=Watchdog
continue: true
"#
.to_string();
assert_eq!(webhook_receiver_receiver, webhook_receiver_receiver_yaml);
assert_eq!(webhook_receiver_route, webhook_receiver_route_yaml);
fn clone_box(&self) -> Box<dyn AlertReceiver<Prometheus>> {
Box::new(self.clone())
}
}

View File

@@ -0,0 +1,15 @@
use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;
pub fn high_http_error_rate() -> PrometheusAlertRule {
let expression = r#"(
sum(rate(http_requests_total{status=~"5.."}[5m])) by (job, route, service)
/
sum(rate(http_requests_total[5m])) by (job, route, service)
) > 0.05 and sum(rate(http_requests_total[5m])) by (job, route, service) > 10"#;
PrometheusAlertRule::new("HighApplicationErrorRate", expression)
.for_duration("10m")
.label("severity", "warning")
.annotation("summary", "High HTTP error rate on {{ $labels.job }}")
.annotation("description", "Job {{ $labels.job }} (route {{ $labels.route }}) has an error rate > 5% over the last 10m.")
}
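
A short usage sketch, assuming `AlertManagerRuleGroup` lives alongside `PrometheusAlertRule` in this module tree (as the rule-group code elsewhere in this change suggests); names are illustrative:

// Group the rule so it can be boxed as a dyn AlertRule and handed to an
// AlertingInterpret together with other application alerts.
let group = AlertManagerRuleGroup::new("application-alerts", vec![high_http_error_rate()]);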

View File

@@ -1 +1,2 @@
pub mod alerts;
pub mod prometheus_alert_rule;

View File

@@ -1,79 +1,13 @@
use std::collections::{BTreeMap, HashMap};
use std::collections::HashMap;
use async_trait::async_trait;
use serde::Serialize;
use crate::{
interpret::{InterpretError, Outcome},
modules::monitoring::{
kube_prometheus::{
prometheus::{KubePrometheus, KubePrometheusRule},
types::{AlertGroup, AlertManagerAdditionalPromRules},
},
prometheus::prometheus::{Prometheus, PrometheusRule},
},
topology::oberservability::monitoring::AlertRule,
interpret::InterpretError,
modules::monitoring::{kube_prometheus::KubePrometheus, okd::OpenshiftClusterAlertSender},
topology::monitoring::AlertRule,
};
#[async_trait]
impl AlertRule<KubePrometheus> for AlertManagerRuleGroup {
async fn install(&self, sender: &KubePrometheus) -> Result<Outcome, InterpretError> {
sender.install_rule(self).await
}
fn clone_box(&self) -> Box<dyn AlertRule<KubePrometheus>> {
Box::new(self.clone())
}
}
#[async_trait]
impl AlertRule<Prometheus> for AlertManagerRuleGroup {
async fn install(&self, sender: &Prometheus) -> Result<Outcome, InterpretError> {
sender.install_rule(self).await
}
fn clone_box(&self) -> Box<dyn AlertRule<Prometheus>> {
Box::new(self.clone())
}
}
#[async_trait]
impl PrometheusRule for AlertManagerRuleGroup {
fn name(&self) -> String {
self.name.clone()
}
async fn configure_rule(&self) -> AlertManagerAdditionalPromRules {
let mut additional_prom_rules = BTreeMap::new();
additional_prom_rules.insert(
self.name.clone(),
AlertGroup {
groups: vec![self.clone()],
},
);
AlertManagerAdditionalPromRules {
rules: additional_prom_rules,
}
}
}
#[async_trait]
impl KubePrometheusRule for AlertManagerRuleGroup {
fn name(&self) -> String {
self.name.clone()
}
async fn configure_rule(&self) -> AlertManagerAdditionalPromRules {
let mut additional_prom_rules = BTreeMap::new();
additional_prom_rules.insert(
self.name.clone(),
AlertGroup {
groups: vec![self.clone()],
},
);
AlertManagerAdditionalPromRules {
rules: additional_prom_rules,
}
}
}
impl AlertManagerRuleGroup {
pub fn new(name: &str, rules: Vec<PrometheusAlertRule>) -> AlertManagerRuleGroup {
AlertManagerRuleGroup {
@@ -129,3 +63,55 @@ impl PrometheusAlertRule {
self
}
}
impl AlertRule<OpenshiftClusterAlertSender> for AlertManagerRuleGroup {
fn build_rule(&self) -> Result<serde_json::Value, InterpretError> {
let name = self.name.clone();
let mut rules: Vec<crate::modules::monitoring::okd::crd::alerting_rules::Rule> = vec![];
for rule in self.rules.clone() {
rules.push(rule.into())
}
let rule_groups =
vec![crate::modules::monitoring::okd::crd::alerting_rules::RuleGroup { name, rules }];
Ok(serde_json::to_value(rule_groups).map_err(|e| InterpretError::new(e.to_string()))?)
}
fn name(&self) -> String {
self.name.clone()
}
fn clone_box(&self) -> Box<dyn AlertRule<OpenshiftClusterAlertSender>> {
Box::new(self.clone())
}
}
impl AlertRule<KubePrometheus> for AlertManagerRuleGroup {
fn build_rule(&self) -> Result<serde_json::Value, InterpretError> {
let name = self.name.clone();
let mut rules: Vec<
crate::modules::monitoring::kube_prometheus::crd::crd_prometheus_rules::Rule,
> = vec![];
for rule in self.rules.clone() {
rules.push(rule.into())
}
let rule_groups = vec![
crate::modules::monitoring::kube_prometheus::crd::crd_prometheus_rules::RuleGroup {
name,
rules,
},
];
Ok(serde_json::to_value(rule_groups).map_err(|e| InterpretError::new(e.to_string()))?)
}
fn name(&self) -> String {
self.name.clone()
}
fn clone_box(&self) -> Box<dyn AlertRule<KubePrometheus>> {
Box::new(self.clone())
}
}

View File

@@ -5,32 +5,26 @@ use serde::Serialize;
use crate::{
interpret::Interpret,
modules::{
application::Application,
monitoring::{
grafana::grafana::Grafana, kube_prometheus::crd::crd_alertmanager_config::CRDPrometheus,
},
prometheus::prometheus::PrometheusMonitoring,
},
modules::{application::Application, monitoring::prometheus::Prometheus},
score::Score,
topology::{
K8sclient, Topology,
oberservability::monitoring::{AlertReceiver, AlertingInterpret, ScrapeTarget},
monitoring::{AlertReceiver, AlertingInterpret, Observability, ScrapeTarget},
},
};
#[derive(Debug, Clone, Serialize)]
pub struct ApplicationMonitoringScore {
pub sender: CRDPrometheus,
pub sender: Prometheus,
pub application: Arc<dyn Application>,
pub receivers: Vec<Box<dyn AlertReceiver<CRDPrometheus>>>,
pub receivers: Vec<Box<dyn AlertReceiver<Prometheus>>>,
}
impl<T: Topology + PrometheusMonitoring<CRDPrometheus> + K8sclient + Grafana> Score<T>
for ApplicationMonitoringScore
{
impl<T: Topology + Observability<Prometheus> + K8sclient> Score<T> for ApplicationMonitoringScore {
fn create_interpret(&self) -> Box<dyn Interpret<T>> {
debug!("creating alerting interpret");
// TODO: will need to use the k8s client to apply service monitors, or find a way to pass
// them to the AlertingInterpret, potentially via the Prometheus sender
Box::new(AlertingInterpret {
sender: self.sender.clone(),
receivers: self.receivers.clone(),

View File

@@ -9,28 +9,27 @@ use crate::{
inventory::Inventory,
modules::{
application::Application,
monitoring::kube_prometheus::crd::{
crd_alertmanager_config::CRDPrometheus, rhob_alertmanager_config::RHOBObservability,
},
prometheus::prometheus::PrometheusMonitoring,
monitoring::red_hat_cluster_observability::RedHatClusterObservability,
},
score::Score,
topology::{PreparationOutcome, Topology, oberservability::monitoring::AlertReceiver},
topology::{
Topology,
monitoring::{AlertReceiver, AlertingInterpret, Observability},
},
};
use harmony_types::id::Id;
#[derive(Debug, Clone, Serialize)]
pub struct ApplicationRHOBMonitoringScore {
pub sender: RHOBObservability,
pub struct ApplicationRedHatClusterMonitoringScore {
pub sender: RedHatClusterObservability,
pub application: Arc<dyn Application>,
pub receivers: Vec<Box<dyn AlertReceiver<RHOBObservability>>>,
pub receivers: Vec<Box<dyn AlertReceiver<RedHatClusterObservability>>>,
}
impl<T: Topology + PrometheusMonitoring<RHOBObservability>> Score<T>
for ApplicationRHOBMonitoringScore
impl<T: Topology + Observability<RedHatClusterObservability>> Score<T>
for ApplicationRedHatClusterMonitoringScore
{
fn create_interpret(&self) -> Box<dyn Interpret<T>> {
Box::new(ApplicationRHOBMonitoringInterpret {
Box::new(ApplicationRedHatClusterMonitoringInterpret {
score: self.clone(),
})
}
@@ -44,38 +43,28 @@ impl<T: Topology + PrometheusMonitoring<RHOBObservability>> Score<T>
}
#[derive(Debug)]
pub struct ApplicationRHOBMonitoringInterpret {
score: ApplicationRHOBMonitoringScore,
pub struct ApplicationRedHatClusterMonitoringInterpret {
score: ApplicationRedHatClusterMonitoringScore,
}
#[async_trait]
impl<T: Topology + PrometheusMonitoring<RHOBObservability>> Interpret<T>
for ApplicationRHOBMonitoringInterpret
impl<T: Topology + Observability<RedHatClusterObservability>> Interpret<T>
for ApplicationRedHatClusterMonitoringInterpret
{
async fn execute(
&self,
inventory: &Inventory,
topology: &T,
) -> Result<Outcome, InterpretError> {
let result = topology
.install_prometheus(
&self.score.sender,
inventory,
Some(self.score.receivers.clone()),
)
.await;
match result {
Ok(outcome) => match outcome {
PreparationOutcome::Success { details: _ } => {
Ok(Outcome::success("Prometheus installed".into()))
}
PreparationOutcome::Noop => {
Ok(Outcome::noop("Prometheus installation skipped".into()))
}
},
Err(err) => Err(InterpretError::from(err)),
}
// TODO: will need to use the k8s client to apply the ServiceMonitor CRD, or find a way to pass
// them to the AlertingInterpret, potentially via the RedHatClusterObservability sender
let alerting_interpret = AlertingInterpret {
sender: self.score.sender.clone(),
receivers: self.score.receivers.clone(),
rules: vec![],
scrape_targets: None,
};
alerting_interpret.execute(inventory, topology).await
}
fn get_name(&self) -> InterpretName {

View File

@@ -1,6 +0,0 @@
apiVersion: v1
kind: Namespace
metadata:
name: observability
labels:
openshift.io/cluster-monitoring: "true"

View File

@@ -1,43 +0,0 @@
apiVersion: v1
kind: ServiceAccount
metadata:
name: cluster-grafana-sa
namespace: observability
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: grafana-prometheus-api-access
rules:
- apiGroups:
- monitoring.coreos.com
resources:
- prometheuses/api
verbs:
- get
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: grafana-prometheus-api-access-binding
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: grafana-prometheus-api-access
subjects:
- kind: ServiceAccount
name: cluster-grafana-sa
namespace: observability
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: grafana-cluster-monitoring-view
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: cluster-monitoring-view
subjects:
- kind: ServiceAccount
name: cluster-grafana-sa
namespace: observability

View File

@@ -1,43 +0,0 @@
apiVersion: grafana.integreatly.org/v1beta1
kind: Grafana
metadata:
name: cluster-grafana
namespace: observability
labels:
dashboards: "grafana"
spec:
serviceAccountName: cluster-grafana-sa
automountServiceAccountToken: true
config:
log:
mode: console
security:
admin_user: admin
admin_password: paul
users:
viewers_can_edit: "false"
auth:
disable_login_form: "false"
auth.anonymous:
enabled: "true"
org_role: Viewer
deployment:
spec:
replicas: 1
template:
spec:
containers:
- name: grafana
resources:
requests:
cpu: 500m
memory: 1Gi
limits:
cpu: 1
memory: 2Gi

View File

@@ -1,8 +0,0 @@
apiVersion: v1
kind: Secret
metadata:
name: grafana-prometheus-token
namespace: observability
annotations:
kubernetes.io/service-account.name: cluster-grafana-sa
type: kubernetes.io/service-account-token

View File

@@ -1,27 +0,0 @@
apiVersion: grafana.integreatly.org/v1beta1
kind: GrafanaDatasource
metadata:
name: prometheus-cluster
namespace: observability
spec:
instanceSelector:
matchLabels:
dashboards: "grafana"
valuesFrom:
- targetPath: "secureJsonData.httpHeaderValue1"
valueFrom:
secretKeyRef:
name: grafana-prometheus-token
key: token
datasource:
name: Prometheus-Cluster
type: prometheus
access: proxy
url: https://prometheus-k8s.openshift-monitoring.svc:9091
isDefault: true
jsonData:
httpHeaderName1: "Authorization"
tlsSkipVerify: true
timeInterval: "30s"
secureJsonData:
httpHeaderValue1: "Bearer ${token}"

View File
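
The deleted GrafanaDatasource above wires Grafana to the in-cluster Prometheus by pulling the service-account token out of the grafana-prometheus-token Secret and injecting it as an Authorization: Bearer header via secureJsonData.httpHeaderValue1, with TLS verification skipped. The same request can be reproduced outside Grafana for debugging; a minimal sketch, assuming the reqwest crate and a token obtained separately (for example from that Secret), neither of which this diff provides.

use std::error::Error;

// Minimal sketch, not part of the repository: queries the cluster Prometheus
// the same way the deleted datasource does. `token` must be a valid token for
// cluster-grafana-sa; obtaining it is out of scope here.
async fn query_cluster_prometheus(token: &str) -> Result<String, Box<dyn Error>> {
    let client = reqwest::Client::builder()
        // Mirrors `tlsSkipVerify: true` from the datasource spec; acceptable
        // for a lab cluster, not for production.
        .danger_accept_invalid_certs(true)
        .build()?;

    let body = client
        .get("https://prometheus-k8s.openshift-monitoring.svc:9091/api/v1/query")
        .query(&[("query", "up")])
        // Mirrors the injected `Authorization: Bearer ${token}` header.
        .bearer_auth(token)
        .send()
        .await?
        .text()
        .await?;

    Ok(body)
}

The cluster-monitoring-view binding on cluster-grafana-sa (see the RBAC manifest above) is what makes this token sufficient for read access to the openshift-monitoring Prometheus.
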

@@ -1,14 +0,0 @@
apiVersion: route.openshift.io/v1
kind: Route
metadata:
name: grafana
namespace: observability
spec:
to:
kind: Service
name: cluster-grafana-service
port:
targetPort: 3000
tls:
termination: edge
insecureEdgeTerminationPolicy: Redirect

View File

@@ -1,97 +0,0 @@
apiVersion: grafana.integreatly.org/v1beta1
kind: GrafanaDashboard
metadata:
name: cluster-overview
namespace: observability
spec:
instanceSelector:
matchLabels:
dashboards: "grafana"
json: |
{
"title": "Cluster Overview",
"schemaVersion": 36,
"version": 1,
"refresh": "30s",
"time": {
"from": "now-1h",
"to": "now"
},
"panels": [
{
"type": "stat",
"title": "Ready Nodes",
"datasource": {
"type": "prometheus",
"uid": "Prometheus-Cluster"
},
"targets": [
{
"expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"true\"})",
"refId": "A"
}
],
"gridPos": { "h": 6, "w": 6, "x": 0, "y": 0 }
},
{
"type": "stat",
"title": "Running Pods",
"datasource": {
"type": "prometheus",
"uid": "Prometheus-Cluster"
},
"targets": [
{
"expr": "count(kube_pod_status_phase{phase=\"Running\"})",
"refId": "A"
}
],
"gridPos": { "h": 6, "w": 6, "x": 6, "y": 0 }
},
{
"type": "timeseries",
"title": "Cluster CPU Usage (%)",
"datasource": {
"type": "prometheus",
"uid": "Prometheus-Cluster"
},
"targets": [
{
"expr": "100 * (1 - avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"min": 0,
"max": 100
}
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 }
},
{
"type": "timeseries",
"title": "Cluster Memory Usage (%)",
"datasource": {
"type": "prometheus",
"uid": "Prometheus-Cluster"
},
"targets": [
{
"expr": "100 * (1 - (sum(node_memory_MemAvailable_bytes) / sum(node_memory_MemTotal_bytes)))",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"min": 0,
"max": 100
}
},
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 }
}
]
}

View File

@@ -1,769 +0,0 @@
apiVersion: grafana.integreatly.org/v1beta1
kind: GrafanaDashboard
metadata:
name: okd-cluster-overview
namespace: observability
spec:
instanceSelector:
matchLabels:
dashboards: "grafana"
json: |
{
"title": "Cluster Overview",
"uid": "okd-cluster-overview",
"schemaVersion": 36,
"version": 2,
"refresh": "30s",
"time": { "from": "now-1h", "to": "now" },
"tags": ["okd", "cluster", "overview"],
"panels": [
{
"id": 1,
"type": "stat",
"title": "Ready Nodes",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [
{
"expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"true\"} == 1)",
"refId": "A",
"legendFormat": ""
}
],
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "red", "value": null },
{ "color": "green", "value": 1 }
]
},
"unit": "short",
"noValue": "0"
}
},
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "center",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"textMode": "auto"
},
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
},
{
"id": 2,
"type": "stat",
"title": "Not Ready Nodes",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [
{
"expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"false\"} == 1) or vector(0)",
"refId": "A",
"legendFormat": ""
}
],
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "red", "value": 1 }
]
},
"unit": "short",
"noValue": "0"
}
},
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "center",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"textMode": "auto"
},
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
},
{
"id": 3,
"type": "stat",
"title": "Running Pods",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [
{
"expr": "count(kube_pod_status_phase{phase=\"Running\"} == 1)",
"refId": "A",
"legendFormat": ""
}
],
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "red", "value": null },
{ "color": "green", "value": 1 }
]
},
"unit": "short",
"noValue": "0"
}
},
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "center",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"textMode": "auto"
},
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
},
{
"id": 4,
"type": "stat",
"title": "Pending Pods",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [
{
"expr": "count(kube_pod_status_phase{phase=\"Pending\"} == 1) or vector(0)",
"refId": "A",
"legendFormat": ""
}
],
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 1 },
{ "color": "red", "value": 5 }
]
},
"unit": "short",
"noValue": "0"
}
},
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "center",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"textMode": "auto"
},
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
},
{
"id": 5,
"type": "stat",
"title": "Failed Pods",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [
{
"expr": "count(kube_pod_status_phase{phase=\"Failed\"} == 1) or vector(0)",
"refId": "A",
"legendFormat": ""
}
],
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "red", "value": 1 }
]
},
"unit": "short",
"noValue": "0"
}
},
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "center",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"textMode": "auto"
},
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
},
{
"id": 6,
"type": "stat",
"title": "CrashLoopBackOff",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [
{
"expr": "count(kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\"} == 1) or vector(0)",
"refId": "A",
"legendFormat": ""
}
],
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "red", "value": 1 }
]
},
"unit": "short",
"noValue": "0"
}
},
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "center",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"textMode": "auto"
},
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
},
{
"id": 7,
"type": "stat",
"title": "Critical Alerts",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [
{
"expr": "count(ALERTS{alertstate=\"firing\",severity=\"critical\"}) or vector(0)",
"refId": "A",
"legendFormat": ""
}
],
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "red", "value": 1 }
]
},
"unit": "short",
"noValue": "0"
}
},
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "center",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"textMode": "auto"
},
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
},
{
"id": 8,
"type": "stat",
"title": "Warning Alerts",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [
{
"expr": "count(ALERTS{alertstate=\"firing\",severity=\"warning\"}) or vector(0)",
"refId": "A",
"legendFormat": ""
}
],
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 1 },
{ "color": "red", "value": 10 }
]
},
"unit": "short",
"noValue": "0"
}
},
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "center",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"textMode": "auto"
},
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
},
{
"id": 9,
"type": "gauge",
"title": "CPU Usage",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [
{
"expr": "100 * (1 - avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))",
"refId": "A",
"legendFormat": "CPU"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"min": 0,
"max": 100,
"color": { "mode": "thresholds" },
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 70 },
{ "color": "red", "value": 85 }
]
}
}
},
"options": {
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"showThresholdLabels": false,
"showThresholdMarkers": true,
"orientation": "auto"
},
"gridPos": { "h": 6, "w": 5, "x": 0, "y": 4 }
},
{
"id": 10,
"type": "gauge",
"title": "Memory Usage",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [
{
"expr": "100 * (1 - (sum(node_memory_MemAvailable_bytes) / sum(node_memory_MemTotal_bytes)))",
"refId": "A",
"legendFormat": "Memory"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"min": 0,
"max": 100,
"color": { "mode": "thresholds" },
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 75 },
{ "color": "red", "value": 90 }
]
}
}
},
"options": {
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"showThresholdLabels": false,
"showThresholdMarkers": true,
"orientation": "auto"
},
"gridPos": { "h": 6, "w": 5, "x": 5, "y": 4 }
},
{
"id": 11,
"type": "gauge",
"title": "Root Disk Usage",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [
{
"expr": "100 * (1 - (sum(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"})))",
"refId": "A",
"legendFormat": "Disk"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"min": 0,
"max": 100,
"color": { "mode": "thresholds" },
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 70 },
{ "color": "red", "value": 85 }
]
}
}
},
"options": {
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"showThresholdLabels": false,
"showThresholdMarkers": true,
"orientation": "auto"
},
"gridPos": { "h": 6, "w": 4, "x": 10, "y": 4 }
},
{
"id": 12,
"type": "stat",
"title": "etcd Has Leader",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [
{
"expr": "min(etcd_server_has_leader)",
"refId": "A",
"legendFormat": ""
}
],
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "red", "value": null },
{ "color": "green", "value": 1 }
]
},
"mappings": [
{
"type": "value",
"options": {
"0": { "text": "NO LEADER", "color": "red" },
"1": { "text": "LEADER OK", "color": "green" }
}
}
],
"unit": "short",
"noValue": "?"
}
},
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "center",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"textMode": "auto"
},
"gridPos": { "h": 3, "w": 5, "x": 14, "y": 4 }
},
{
"id": 13,
"type": "stat",
"title": "API Servers Up",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [
{
"expr": "sum(up{job=\"apiserver\"})",
"refId": "A",
"legendFormat": ""
}
],
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "red", "value": null },
{ "color": "yellow", "value": 1 },
{ "color": "green", "value": 2 }
]
},
"unit": "short",
"noValue": "0"
}
},
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "center",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"textMode": "auto"
},
"gridPos": { "h": 3, "w": 5, "x": 19, "y": 4 }
},
{
"id": 14,
"type": "stat",
"title": "etcd Members Up",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [
{
"expr": "sum(up{job=\"etcd\"})",
"refId": "A",
"legendFormat": ""
}
],
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "red", "value": null },
{ "color": "yellow", "value": 2 },
{ "color": "green", "value": 3 }
]
},
"unit": "short",
"noValue": "0"
}
},
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "center",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"textMode": "auto"
},
"gridPos": { "h": 3, "w": 5, "x": 14, "y": 7 }
},
{
"id": 15,
"type": "stat",
"title": "Operators Degraded",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [
{
"expr": "count(cluster_operator_conditions{condition=\"Degraded\",status=\"True\"} == 1) or vector(0)",
"refId": "A",
"legendFormat": ""
}
],
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "red", "value": 1 }
]
},
"unit": "short",
"noValue": "0"
}
},
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "center",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"textMode": "auto"
},
"gridPos": { "h": 3, "w": 5, "x": 19, "y": 7 }
},
{
"id": 16,
"type": "timeseries",
"title": "CPU Usage per Node (%)",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [
{
"expr": "100 * (1 - avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))",
"refId": "A",
"legendFormat": "{{instance}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"min": 0,
"max": 100,
"color": { "mode": "palette-classic" },
"custom": {
"lineWidth": 2,
"fillOpacity": 10,
"spanNulls": false,
"showPoints": "never"
}
}
},
"options": {
"tooltip": { "mode": "multi", "sort": "desc" },
"legend": {
"displayMode": "list",
"placement": "bottom",
"calcs": ["mean", "max"]
}
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 10 }
},
{
"id": 17,
"type": "timeseries",
"title": "Memory Usage per Node (%)",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [
{
"expr": "100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))",
"refId": "A",
"legendFormat": "{{instance}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"min": 0,
"max": 100,
"color": { "mode": "palette-classic" },
"custom": {
"lineWidth": 2,
"fillOpacity": 10,
"spanNulls": false,
"showPoints": "never"
}
}
},
"options": {
"tooltip": { "mode": "multi", "sort": "desc" },
"legend": {
"displayMode": "list",
"placement": "bottom",
"calcs": ["mean", "max"]
}
},
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 10 }
},
{
"id": 18,
"type": "timeseries",
"title": "Network Traffic — Cluster Total",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [
{
"expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo|veth.*|tun.*|ovn.*|br-int|br-ex\"}[5m]))",
"refId": "A",
"legendFormat": "Receive"
},
{
"expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo|veth.*|tun.*|ovn.*|br-int|br-ex\"}[5m]))",
"refId": "B",
"legendFormat": "Transmit"
}
],
"fieldConfig": {
"defaults": {
"unit": "Bps",
"color": { "mode": "palette-classic" },
"custom": {
"lineWidth": 2,
"fillOpacity": 10,
"spanNulls": false,
"showPoints": "never"
}
},
"overrides": [
{
"matcher": { "id": "byName", "options": "Receive" },
"properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }]
},
{
"matcher": { "id": "byName", "options": "Transmit" },
"properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }]
}
]
},
"options": {
"tooltip": { "mode": "multi", "sort": "none" },
"legend": {
"displayMode": "list",
"placement": "bottom",
"calcs": ["mean", "max"]
}
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 18 }
},
{
"id": 19,
"type": "timeseries",
"title": "Pod Phases Over Time",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [
{
"expr": "count(kube_pod_status_phase{phase=\"Running\"} == 1)",
"refId": "A",
"legendFormat": "Running"
},
{
"expr": "count(kube_pod_status_phase{phase=\"Pending\"} == 1) or vector(0)",
"refId": "B",
"legendFormat": "Pending"
},
{
"expr": "count(kube_pod_status_phase{phase=\"Failed\"} == 1) or vector(0)",
"refId": "C",
"legendFormat": "Failed"
},
{
"expr": "count(kube_pod_status_phase{phase=\"Unknown\"} == 1) or vector(0)",
"refId": "D",
"legendFormat": "Unknown"
}
],
"fieldConfig": {
"defaults": {
"unit": "short",
"custom": {
"lineWidth": 2,
"fillOpacity": 15,
"spanNulls": false,
"showPoints": "never"
}
},
"overrides": [
{
"matcher": { "id": "byName", "options": "Running" },
"properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }]
},
{
"matcher": { "id": "byName", "options": "Pending" },
"properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }]
},
{
"matcher": { "id": "byName", "options": "Failed" },
"properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }]
},
{
"matcher": { "id": "byName", "options": "Unknown" },
"properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }]
}
]
},
"options": {
"tooltip": { "mode": "multi", "sort": "none" },
"legend": {
"displayMode": "list",
"placement": "bottom",
"calcs": ["lastNotNull"]
}
},
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 18 }
}
]
}

View File

@@ -1,637 +0,0 @@
apiVersion: grafana.integreatly.org/v1beta1
kind: GrafanaDashboard
metadata:
name: okd-node-health
namespace: observability
spec:
instanceSelector:
matchLabels:
dashboards: "grafana"
json: |
{
"title": "Node Health",
"uid": "okd-node-health",
"schemaVersion": 36,
"version": 2,
"refresh": "30s",
"time": { "from": "now-1h", "to": "now" },
"tags": ["okd", "node", "health"],
"templating": {
"list": [
{
"name": "node",
"type": "query",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"query": { "query": "label_values(kube_node_info, node)", "refId": "A" },
"refresh": 2,
"includeAll": true,
"multi": true,
"allValue": ".*",
"label": "Node",
"sort": 1,
"current": {},
"options": []
}
]
},
"panels": [
{
"id": 1,
"type": "stat",
"title": "Total Nodes",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [{ "expr": "count(kube_node_info{node=~\"$node\"})", "refId": "A", "legendFormat": "" }],
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] },
"unit": "short", "noValue": "0"
}
},
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
"gridPos": { "h": 4, "w": 3, "x": 0, "y": 0 }
},
{
"id": 2,
"type": "stat",
"title": "Ready Nodes",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"$node\"} == 1)", "refId": "A", "legendFormat": "" }],
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] },
"unit": "short", "noValue": "0"
}
},
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
"gridPos": { "h": 4, "w": 3, "x": 3, "y": 0 }
},
{
"id": 3,
"type": "stat",
"title": "Not Ready Nodes",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"false\",node=~\"$node\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
"unit": "short", "noValue": "0"
}
},
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
"gridPos": { "h": 4, "w": 3, "x": 6, "y": 0 }
},
{
"id": 4,
"type": "stat",
"title": "Memory Pressure",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"MemoryPressure\",status=\"true\",node=~\"$node\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
"unit": "short", "noValue": "0"
}
},
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
"gridPos": { "h": 4, "w": 3, "x": 9, "y": 0 }
},
{
"id": 5,
"type": "stat",
"title": "Disk Pressure",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"DiskPressure\",status=\"true\",node=~\"$node\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] },
"unit": "short", "noValue": "0"
}
},
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
"gridPos": { "h": 4, "w": 3, "x": 12, "y": 0 }
},
{
"id": 6,
"type": "stat",
"title": "PID Pressure",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [{ "expr": "count(kube_node_status_condition{condition=\"PIDPressure\",status=\"true\",node=~\"$node\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "orange", "value": 1 }] },
"unit": "short", "noValue": "0"
}
},
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
"gridPos": { "h": 4, "w": 3, "x": 15, "y": 0 }
},
{
"id": 7,
"type": "stat",
"title": "Unschedulable",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [{ "expr": "count(kube_node_spec_unschedulable{node=~\"$node\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 1 }] },
"unit": "short", "noValue": "0"
}
},
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
"gridPos": { "h": 4, "w": 3, "x": 18, "y": 0 }
},
{
"id": 8,
"type": "stat",
"title": "Kubelet Up",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [{ "expr": "count(up{job=\"kubelet\",metrics_path=\"/metrics\"} == 1) or vector(0)", "refId": "A", "legendFormat": "" }],
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] },
"unit": "short", "noValue": "0"
}
},
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "reduceOptions": { "calcs": ["lastNotNull"] }, "textMode": "auto" },
"gridPos": { "h": 4, "w": 3, "x": 21, "y": 0 }
},
{
"id": 9,
"type": "table",
"title": "Node Conditions",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [
{
"expr": "sum by(node) (kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"$node\"})",
"refId": "A",
"legendFormat": "{{node}}",
"instant": true
},
{
"expr": "sum by(node) (kube_node_status_condition{condition=\"MemoryPressure\",status=\"true\",node=~\"$node\"})",
"refId": "B",
"legendFormat": "{{node}}",
"instant": true
},
{
"expr": "sum by(node) (kube_node_status_condition{condition=\"DiskPressure\",status=\"true\",node=~\"$node\"})",
"refId": "C",
"legendFormat": "{{node}}",
"instant": true
},
{
"expr": "sum by(node) (kube_node_status_condition{condition=\"PIDPressure\",status=\"true\",node=~\"$node\"})",
"refId": "D",
"legendFormat": "{{node}}",
"instant": true
},
{
"expr": "sum by(node) (kube_node_spec_unschedulable{node=~\"$node\"})",
"refId": "E",
"legendFormat": "{{node}}",
"instant": true
}
],
"transformations": [
{
"id": "labelsToFields",
"options": { "mode": "columns" }
},
{
"id": "joinByField",
"options": { "byField": "node", "mode": "outer" }
},
{
"id": "organize",
"options": {
"excludeByName": {
"Time": true,
"Time 1": true,
"Time 2": true,
"Time 3": true,
"Time 4": true,
"Time 5": true
},
"renameByName": {
"node": "Node",
"Value #A": "Ready",
"Value #B": "Mem Pressure",
"Value #C": "Disk Pressure",
"Value #D": "PID Pressure",
"Value #E": "Unschedulable"
},
"indexByName": {
"node": 0,
"Value #A": 1,
"Value #B": 2,
"Value #C": 3,
"Value #D": 4,
"Value #E": 5
}
}
}
],
"fieldConfig": {
"defaults": {
"custom": { "displayMode": "color-background", "align": "center" }
},
"overrides": [
{
"matcher": { "id": "byName", "options": "Node" },
"properties": [
{ "id": "custom.displayMode", "value": "auto" },
{ "id": "custom.align", "value": "left" },
{ "id": "custom.width", "value": 200 }
]
},
{
"matcher": { "id": "byName", "options": "Ready" },
"properties": [
{
"id": "thresholds",
"value": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] }
},
{ "id": "custom.displayMode", "value": "color-background" },
{
"id": "mappings",
"value": [
{
"type": "value",
"options": {
"0": { "text": "✗ Not Ready", "color": "red", "index": 0 },
"1": { "text": "✓ Ready", "color": "green", "index": 1 }
}
}
]
}
]
},
{
"matcher": { "id": "byRegexp", "options": ".*Pressure" },
"properties": [
{
"id": "thresholds",
"value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] }
},
{ "id": "custom.displayMode", "value": "color-background" },
{
"id": "mappings",
"value": [
{
"type": "value",
"options": {
"0": { "text": "✓ OK", "color": "green", "index": 0 },
"1": { "text": "⚠ Active", "color": "red", "index": 1 }
}
}
]
}
]
},
{
"matcher": { "id": "byName", "options": "Unschedulable" },
"properties": [
{
"id": "thresholds",
"value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 1 }] }
},
{ "id": "custom.displayMode", "value": "color-background" },
{
"id": "mappings",
"value": [
{
"type": "value",
"options": {
"0": { "text": "✓ Schedulable", "color": "green", "index": 0 },
"1": { "text": "⚠ Cordoned", "color": "yellow", "index": 1 }
}
}
]
}
]
}
]
},
"options": { "sortBy": [{ "displayName": "Node", "desc": false }] },
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 4 }
},
{
"id": 10,
"type": "timeseries",
"title": "CPU Usage per Node (%)",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [
{
"expr": "100 * (1 - avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))",
"refId": "A",
"legendFormat": "{{instance}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent", "min": 0, "max": 100,
"color": { "mode": "palette-classic" },
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false },
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 85 }] }
}
},
"options": {
"tooltip": { "mode": "multi", "sort": "desc" },
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
},
"gridPos": { "h": 8, "w": 16, "x": 0, "y": 12 }
},
{
"id": 11,
"type": "bargauge",
"title": "CPU Usage \u2014 Current",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [
{
"expr": "100 * (1 - avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))",
"refId": "A",
"legendFormat": "{{instance}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent", "min": 0, "max": 100,
"color": { "mode": "thresholds" },
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 85 }] }
}
},
"options": {
"orientation": "horizontal",
"displayMode": "gradient",
"showUnfilled": true,
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
},
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 12 }
},
{
"id": 12,
"type": "timeseries",
"title": "Memory Usage per Node (%)",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [
{
"expr": "100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))",
"refId": "A",
"legendFormat": "{{instance}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent", "min": 0, "max": 100,
"color": { "mode": "palette-classic" },
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false },
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 75 }, { "color": "red", "value": 90 }] }
}
},
"options": {
"tooltip": { "mode": "multi", "sort": "desc" },
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
},
"gridPos": { "h": 8, "w": 16, "x": 0, "y": 20 }
},
{
"id": 13,
"type": "bargauge",
"title": "Memory Usage \u2014 Current",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [
{
"expr": "100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))",
"refId": "A",
"legendFormat": "{{instance}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent", "min": 0, "max": 100,
"color": { "mode": "thresholds" },
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 75 }, { "color": "red", "value": 90 }] }
}
},
"options": {
"orientation": "horizontal",
"displayMode": "gradient",
"showUnfilled": true,
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
},
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 20 }
},
{
"id": 14,
"type": "timeseries",
"title": "Root Disk Usage per Node (%)",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [
{
"expr": "100 * (1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"}))",
"refId": "A",
"legendFormat": "{{instance}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent", "min": 0, "max": 100,
"color": { "mode": "palette-classic" },
"custom": { "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": false },
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 85 }] }
}
},
"options": {
"tooltip": { "mode": "multi", "sort": "desc" },
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
},
"gridPos": { "h": 8, "w": 16, "x": 0, "y": 28 }
},
{
"id": 15,
"type": "bargauge",
"title": "Root Disk Usage \u2014 Current",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [
{
"expr": "100 * (1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"tmpfs\"}))",
"refId": "A",
"legendFormat": "{{instance}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent", "min": 0, "max": 100,
"color": { "mode": "thresholds" },
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 85 }] }
}
},
"options": {
"orientation": "horizontal",
"displayMode": "gradient",
"showUnfilled": true,
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
},
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 28 }
},
{
"id": 16,
"type": "timeseries",
"title": "Network Traffic per Node",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [
{
"expr": "sum by(instance) (rate(node_network_receive_bytes_total{device!~\"lo|veth.*|tun.*|ovn.*|br.*\"}[5m]))",
"refId": "A",
"legendFormat": "rx {{instance}}"
},
{
"expr": "sum by(instance) (rate(node_network_transmit_bytes_total{device!~\"lo|veth.*|tun.*|ovn.*|br.*\"}[5m]))",
"refId": "B",
"legendFormat": "tx {{instance}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "Bps",
"color": { "mode": "palette-classic" },
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
}
},
"options": {
"tooltip": { "mode": "multi", "sort": "desc" },
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 36 }
},
{
"id": 17,
"type": "bargauge",
"title": "Pods per Node",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [
{
"expr": "count by(node) (kube_pod_info{node=~\"$node\"})",
"refId": "A",
"legendFormat": "{{node}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "short",
"min": 0,
"color": { "mode": "thresholds" },
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 100 },
{ "color": "red", "value": 200 }
]
}
}
},
"options": {
"orientation": "horizontal",
"displayMode": "gradient",
"showUnfilled": true,
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
},
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 36 }
},
{
"id": 18,
"type": "timeseries",
"title": "System Load Average (1m) per Node",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [
{
"expr": "node_load1",
"refId": "A",
"legendFormat": "1m \u2014 {{instance}}"
},
{
"expr": "node_load5",
"refId": "B",
"legendFormat": "5m \u2014 {{instance}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "short",
"min": 0,
"color": { "mode": "palette-classic" },
"custom": { "lineWidth": 2, "fillOpacity": 5, "showPoints": "never", "spanNulls": false }
}
},
"options": {
"tooltip": { "mode": "multi", "sort": "desc" },
"legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] }
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 44 }
},
{
"id": 19,
"type": "bargauge",
"title": "Node Uptime",
"datasource": { "type": "prometheus", "uid": "Prometheus-Cluster" },
"targets": [
{
"expr": "time() - node_boot_time_seconds",
"refId": "A",
"legendFormat": "{{instance}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "s",
"min": 0,
"color": { "mode": "thresholds" },
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "red", "value": null },
{ "color": "yellow", "value": 300 },
{ "color": "green", "value": 3600 }
]
}
}
},
"options": {
"orientation": "horizontal",
"displayMode": "gradient",
"showUnfilled": false,
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
},
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 44 }
}
]
}

Some files were not shown because too many files have changed in this diff.