Monitor an application within a tenant (#86)
All checks were successful
Run Check Script / check (push) Successful in -45s
Compile and package harmony_composer / package_harmony_composer (push) Successful in 4m35s

WIP: added implementation to deploy crd-alertmanagerconfigs
Co-authored-by: Ian Letourneau <letourneau.ian@gmail.com>
Reviewed-on: https://git.nationtech.io/NationTech/harmony/pulls/86
Co-authored-by: Willem <wrolleman@nationtech.io>
Co-committed-by: Willem <wrolleman@nationtech.io>
This commit is contained in:
Willem 2025-08-04 21:42:01 +00:00 committed by Ian Letourneau
parent 54990cd1a5
commit 024084859e
43 changed files with 2292 additions and 177 deletions

86
Cargo.lock generated
View File

@ -96,6 +96,12 @@ dependencies = [
"libc", "libc",
] ]
[[package]]
name = "ansi_term"
version = "0.10.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6b3568b48b7cefa6b8ce125f9bb4989e52fbcc29ebea88df04cc7c5f12f70455"
[[package]] [[package]]
name = "anstream" name = "anstream"
version = "0.6.19" version = "0.6.19"
@ -1259,6 +1265,18 @@ dependencies = [
name = "example" name = "example"
version = "0.0.0" version = "0.0.0"
[[package]]
name = "example-application-monitoring-with-tenant"
version = "0.1.0"
dependencies = [
"env_logger",
"harmony",
"harmony_cli",
"logging",
"tokio",
"url",
]
[[package]] [[package]]
name = "example-cli" name = "example-cli"
version = "0.1.0" version = "0.1.0"
@ -1779,6 +1797,7 @@ dependencies = [
"k3d-rs", "k3d-rs",
"k8s-openapi", "k8s-openapi",
"kube", "kube",
"kube-derive",
"lazy_static", "lazy_static",
"libredfish", "libredfish",
"log", "log",
@ -1791,6 +1810,7 @@ dependencies = [
"reqwest 0.11.27", "reqwest 0.11.27",
"russh", "russh",
"rust-ipmi", "rust-ipmi",
"schemars 0.8.22",
"semver", "semver",
"serde", "serde",
"serde-value", "serde-value",
@ -2669,6 +2689,7 @@ dependencies = [
"k8s-openapi", "k8s-openapi",
"kube-client", "kube-client",
"kube-core", "kube-core",
"kube-derive",
"kube-runtime", "kube-runtime",
] ]
@ -2722,12 +2743,27 @@ dependencies = [
"http 1.3.1", "http 1.3.1",
"json-patch", "json-patch",
"k8s-openapi", "k8s-openapi",
"schemars 0.8.22",
"serde", "serde",
"serde-value", "serde-value",
"serde_json", "serde_json",
"thiserror 2.0.12", "thiserror 2.0.12",
] ]
[[package]]
name = "kube-derive"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "079fc8c1c397538628309cfdee20696ebdcc26745f9fb17f89b78782205bd995"
dependencies = [
"darling",
"proc-macro2",
"quote",
"serde",
"serde_json",
"syn",
]
[[package]] [[package]]
name = "kube-runtime" name = "kube-runtime"
version = "1.1.0" version = "1.1.0"
@ -2843,6 +2879,15 @@ dependencies = [
"log", "log",
] ]
[[package]]
name = "logging"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "461a8beca676e8ab1bd468c92e9b4436d6368e11e96ae038209e520cfe665e46"
dependencies = [
"ansi_term",
]
[[package]] [[package]]
name = "lru" name = "lru"
version = "0.12.5" version = "0.12.5"
@ -4140,6 +4185,18 @@ dependencies = [
"windows-sys 0.59.0", "windows-sys 0.59.0",
] ]
[[package]]
name = "schemars"
version = "0.8.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3fbf2ae1b8bc8e02df939598064d22402220cd5bbcca1c76f7d6a310974d5615"
dependencies = [
"dyn-clone",
"schemars_derive",
"serde",
"serde_json",
]
[[package]] [[package]]
name = "schemars" name = "schemars"
version = "0.9.0" version = "0.9.0"
@ -4154,9 +4211,9 @@ dependencies = [
[[package]] [[package]]
name = "schemars" name = "schemars"
version = "1.0.3" version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1375ba8ef45a6f15d83fa8748f1079428295d403d6ea991d09ab100155fbc06d" checksum = "82d20c4491bc164fa2f6c5d44565947a52ad80b9505d8e36f8d54c27c739fcd0"
dependencies = [ dependencies = [
"dyn-clone", "dyn-clone",
"ref-cast", "ref-cast",
@ -4164,6 +4221,18 @@ dependencies = [
"serde_json", "serde_json",
] ]
[[package]]
name = "schemars_derive"
version = "0.8.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32e265784ad618884abaea0600a9adf15393368d840e0222d101a072f3f7534d"
dependencies = [
"proc-macro2",
"quote",
"serde_derive_internals",
"syn",
]
[[package]] [[package]]
name = "scopeguard" name = "scopeguard"
version = "1.2.0" version = "1.2.0"
@ -4296,6 +4365,17 @@ dependencies = [
"syn", "syn",
] ]
[[package]]
name = "serde_derive_internals"
version = "0.29.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]] [[package]]
name = "serde_json" name = "serde_json"
version = "1.0.140" version = "1.0.140"
@ -4374,7 +4454,7 @@ dependencies = [
"indexmap 1.9.3", "indexmap 1.9.3",
"indexmap 2.10.0", "indexmap 2.10.0",
"schemars 0.9.0", "schemars 0.9.0",
"schemars 1.0.3", "schemars 1.0.4",
"serde", "serde",
"serde_derive", "serde_derive",
"serde_json", "serde_json",

View File

@ -0,0 +1,14 @@
[package]
name = "example-application-monitoring-with-tenant"
edition = "2024"
version.workspace = true
readme.workspace = true
license.workspace = true
[dependencies]
env_logger.workspace = true
harmony = { version = "0.1.0", path = "../../harmony" }
harmony_cli = { version = "0.1.0", path = "../../harmony_cli" }
logging = "0.1.0"
tokio.workspace = true
url.workspace = true

View File

@ -0,0 +1,61 @@
use std::{path::PathBuf, sync::Arc};
use harmony::{
data::Id,
inventory::Inventory,
maestro::Maestro,
modules::{
application::{
ApplicationScore, RustWebFramework, RustWebapp,
features::{ContinuousDelivery, Monitoring},
},
monitoring::alert_channel::{
discord_alert_channel::DiscordWebhook, webhook_receiver::WebhookReceiver,
},
tenant::TenantScore,
},
topology::{K8sAnywhereTopology, Url, tenant::TenantConfig},
};
#[tokio::main]
async fn main() {
//TODO there is a bug where the application is deployed into the namespace matching the
//application name and the tenant is created in the namesapce matching the tenant name
//in order for the application to be deployed in the tenant namespace the application.name and
//the TenantConfig.name must match
let tenant = TenantScore {
config: TenantConfig {
id: Id::from_str("test-tenant-id"),
name: "example-monitoring".to_string(),
..Default::default()
},
};
let application = Arc::new(RustWebapp {
name: "example-monitoring".to_string(),
domain: Url::Url(url::Url::parse("https://rustapp.harmony.example.com").unwrap()),
project_root: PathBuf::from("./examples/rust/webapp"),
framework: Some(RustWebFramework::Leptos),
});
let webhook_receiver = WebhookReceiver {
name: "sample-webhook-receiver".to_string(),
url: Url::Url(url::Url::parse("https://webhook-doesnt-exist.com").unwrap()),
};
let app = ApplicationScore {
features: vec![Box::new(Monitoring {
alert_receiver: vec![Box::new(webhook_receiver)],
application: application.clone(),
})],
application,
};
harmony_cli::run(
Inventory::autoload(),
K8sAnywhereTopology::from_env(),
vec![Box::new(tenant), Box::new(app)],
None,
)
.await
.unwrap();
}

View File

@ -50,8 +50,8 @@ async fn main() {
let service_monitor_endpoint = ServiceMonitorEndpoint { let service_monitor_endpoint = ServiceMonitorEndpoint {
port: Some("80".to_string()), port: Some("80".to_string()),
path: "/metrics".to_string(), path: Some("/metrics".to_string()),
scheme: HTTPScheme::HTTP, scheme: Some(HTTPScheme::HTTP),
..Default::default() ..Default::default()
}; };

View File

@ -53,8 +53,8 @@ async fn main() {
let service_monitor_endpoint = ServiceMonitorEndpoint { let service_monitor_endpoint = ServiceMonitorEndpoint {
port: Some("80".to_string()), port: Some("80".to_string()),
path: "/metrics".to_string(), path: Some("/metrics".to_string()),
scheme: HTTPScheme::HTTP, scheme: Some(HTTPScheme::HTTP),
..Default::default() ..Default::default()
}; };

View File

@ -2,9 +2,14 @@ use std::{path::PathBuf, sync::Arc};
use harmony::{ use harmony::{
inventory::Inventory, inventory::Inventory,
modules::application::{ modules::{
ApplicationScore, RustWebFramework, RustWebapp, application::{
features::{ContinuousDelivery, Monitoring}, ApplicationScore, RustWebFramework, RustWebapp,
features::{ContinuousDelivery, Monitoring},
},
monitoring::alert_channel::{
discord_alert_channel::DiscordWebhook, webhook_receiver::WebhookReceiver,
},
}, },
topology::{K8sAnywhereTopology, Url}, topology::{K8sAnywhereTopology, Url},
}; };
@ -18,6 +23,16 @@ async fn main() {
framework: Some(RustWebFramework::Leptos), framework: Some(RustWebFramework::Leptos),
}); });
let discord_receiver = DiscordWebhook {
name: "test-discord".to_string(),
url: Url::Url(url::Url::parse("https://discord.doesnt.exist.com").unwrap()),
};
let webhook_receiver = WebhookReceiver {
name: "sample-webhook-receiver".to_string(),
url: Url::Url(url::Url::parse("https://webhook-doesnt-exist.com").unwrap()),
};
let app = ApplicationScore { let app = ApplicationScore {
features: vec![ features: vec![
Box::new(ContinuousDelivery { Box::new(ContinuousDelivery {
@ -25,7 +40,9 @@ async fn main() {
}), }),
Box::new(Monitoring { Box::new(Monitoring {
application: application.clone(), application: application.clone(),
}), // TODO: add backups, multisite ha, etc. alert_receiver: vec![Box::new(discord_receiver), Box::new(webhook_receiver)],
}),
// TODO add backups, multisite ha, etc
], ],
application, application,
}; };

View File

@ -27,7 +27,7 @@ harmony_macros = { path = "../harmony_macros" }
harmony_types = { path = "../harmony_types" } harmony_types = { path = "../harmony_types" }
uuid.workspace = true uuid.workspace = true
url.workspace = true url.workspace = true
kube.workspace = true kube = { workspace = true, features = ["derive"] }
k8s-openapi.workspace = true k8s-openapi.workspace = true
serde_yaml.workspace = true serde_yaml.workspace = true
http.workspace = true http.workspace = true
@ -58,6 +58,8 @@ tokio-util = "0.7.15"
strum = { version = "0.27.1", features = ["derive"] } strum = { version = "0.27.1", features = ["derive"] }
tempfile = "3.20.0" tempfile = "3.20.0"
serde_with = "3.14.0" serde_with = "3.14.0"
schemars = "0.8.22"
kube-derive = "1.1.0"
bollard.workspace = true bollard.workspace = true
tar.workspace = true tar.workspace = true
base64.workspace = true base64.workspace = true

View File

@ -17,7 +17,7 @@ use kube::{
runtime::wait::await_condition, runtime::wait::await_condition,
}; };
use log::{debug, error, trace}; use log::{debug, error, trace};
use serde::de::DeserializeOwned; use serde::{Serialize, de::DeserializeOwned};
use similar::{DiffableStr, TextDiff}; use similar::{DiffableStr, TextDiff};
#[derive(new, Clone)] #[derive(new, Clone)]
@ -25,6 +25,15 @@ pub struct K8sClient {
client: Client, client: Client,
} }
impl Serialize for K8sClient {
fn serialize<S>(&self, _serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
todo!()
}
}
impl std::fmt::Debug for K8sClient { impl std::fmt::Debug for K8sClient {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
// This is a poor man's debug implementation for now as kube::Client does not provide much // This is a poor man's debug implementation for now as kube::Client does not provide much

View File

@ -7,22 +7,33 @@ use tokio::sync::OnceCell;
use crate::{ use crate::{
executors::ExecutorError, executors::ExecutorError,
interpret::{InterpretError, Outcome}, interpret::{InterpretError, InterpretStatus, Outcome},
inventory::Inventory, inventory::Inventory,
modules::k3d::K3DInstallationScore, modules::{
k3d::K3DInstallationScore,
monitoring::kube_prometheus::crd::{
crd_alertmanager_config::CRDPrometheus,
prometheus_operator::prometheus_operator_helm_chart_score,
},
prometheus::{
k8s_prometheus_alerting_score::K8sPrometheusCRDAlertingScore,
prometheus::PrometheusApplicationMonitoring,
},
},
score::Score, score::Score,
}; };
use super::{ use super::{
DeploymentTarget, HelmCommand, K8sclient, MultiTargetTopology, Topology, DeploymentTarget, HelmCommand, K8sclient, MultiTargetTopology, Topology,
k8s::K8sClient, k8s::K8sClient,
oberservability::monitoring::AlertReceiver,
tenant::{TenantConfig, TenantManager, k8s::K8sTenantManager}, tenant::{TenantConfig, TenantManager, k8s::K8sTenantManager},
}; };
#[derive(Clone, Debug)] #[derive(Clone, Debug)]
struct K8sState { struct K8sState {
client: Arc<K8sClient>, client: Arc<K8sClient>,
_source: K8sSource, source: K8sSource,
message: String, message: String,
} }
@ -56,8 +67,32 @@ impl K8sclient for K8sAnywhereTopology {
} }
} }
#[async_trait]
impl PrometheusApplicationMonitoring<CRDPrometheus> for K8sAnywhereTopology {
async fn install_prometheus(
&self,
sender: &CRDPrometheus,
inventory: &Inventory,
receivers: Option<Vec<Box<dyn AlertReceiver<CRDPrometheus>>>>,
) -> Result<Outcome, InterpretError> {
let po_result = self.ensure_prometheus_operator(sender).await?;
if po_result.status == InterpretStatus::NOOP {
debug!("Skipping Prometheus CR installation due to missing operator.");
return Ok(Outcome::noop());
}
self.get_k8s_prometheus_application_score(sender.clone(), receivers)
.await
.create_interpret()
.execute(inventory, self)
.await?;
Ok(Outcome::success(format!("No action, working on cluster ")))
}
}
impl Serialize for K8sAnywhereTopology { impl Serialize for K8sAnywhereTopology {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error> fn serialize<S>(&self, _serializer: S) -> Result<S::Ok, S::Error>
where where
S: serde::Serializer, S: serde::Serializer,
{ {
@ -82,6 +117,19 @@ impl K8sAnywhereTopology {
} }
} }
async fn get_k8s_prometheus_application_score(
&self,
sender: CRDPrometheus,
receivers: Option<Vec<Box<dyn AlertReceiver<CRDPrometheus>>>>,
) -> K8sPrometheusCRDAlertingScore {
K8sPrometheusCRDAlertingScore {
sender,
receivers: receivers.unwrap_or_else(Vec::new),
service_monitors: vec![],
prometheus_rules: vec![],
}
}
fn is_helm_available(&self) -> Result<(), String> { fn is_helm_available(&self) -> Result<(), String> {
let version_result = Command::new("helm") let version_result = Command::new("helm")
.arg("version") .arg("version")
@ -132,7 +180,7 @@ impl K8sAnywhereTopology {
Some(client) => { Some(client) => {
return Ok(Some(K8sState { return Ok(Some(K8sState {
client: Arc::new(client), client: Arc::new(client),
_source: K8sSource::Kubeconfig, source: K8sSource::Kubeconfig,
message: format!("Loaded k8s client from kubeconfig {kubeconfig}"), message: format!("Loaded k8s client from kubeconfig {kubeconfig}"),
})); }));
} }
@ -174,7 +222,7 @@ impl K8sAnywhereTopology {
let state = match k3d.get_client().await { let state = match k3d.get_client().await {
Ok(client) => K8sState { Ok(client) => K8sState {
client: Arc::new(K8sClient::new(client)), client: Arc::new(K8sClient::new(client)),
_source: K8sSource::LocalK3d, source: K8sSource::LocalK3d,
message: "K8s client ready".to_string(), message: "K8s client ready".to_string(),
}, },
Err(_) => todo!(), Err(_) => todo!(),
@ -190,6 +238,7 @@ impl K8sAnywhereTopology {
self.tenant_manager self.tenant_manager
.get_or_try_init(async || -> Result<K8sTenantManager, String> { .get_or_try_init(async || -> Result<K8sTenantManager, String> {
// TOOD: checker si K8s ou K3d/s tenant manager (ref. issue https://git.nationtech.io/NationTech/harmony/issues/94)
let k8s_client = self.k8s_client().await?; let k8s_client = self.k8s_client().await?;
Ok(K8sTenantManager::new(k8s_client)) Ok(K8sTenantManager::new(k8s_client))
}) })
@ -206,6 +255,48 @@ impl K8sAnywhereTopology {
)), )),
} }
} }
async fn ensure_prometheus_operator(
&self,
sender: &CRDPrometheus,
) -> Result<Outcome, InterpretError> {
let status = Command::new("sh")
.args(["-c", "kubectl get crd -A | grep -i prometheuses"])
.status()
.map_err(|e| InterpretError::new(format!("could not connect to cluster: {}", e)))?;
if !status.success() {
if let Some(Some(k8s_state)) = self.k8s_state.get() {
match k8s_state.source {
K8sSource::LocalK3d => {
debug!("installing prometheus operator");
let op_score =
prometheus_operator_helm_chart_score(sender.namespace.clone());
op_score
.create_interpret()
.execute(&Inventory::empty(), self)
.await?;
return Ok(Outcome::success(
"installed prometheus operator".to_string(),
));
}
K8sSource::Kubeconfig => {
debug!("unable to install prometheus operator, contact cluster admin");
return Ok(Outcome::noop());
}
}
} else {
warn!("Unable to detect k8s_state. Skipping Prometheus Operator install.");
return Ok(Outcome::noop());
}
}
debug!("Prometheus operator is already present, skipping install");
Ok(Outcome::success(
"prometheus operator present in cluster".to_string(),
))
}
} }
#[derive(Clone, Debug)] #[derive(Clone, Debug)]

View File

@ -1,3 +1,5 @@
use std::any::Any;
use async_trait::async_trait; use async_trait::async_trait;
use log::debug; use log::debug;
@ -62,7 +64,9 @@ impl<S: AlertSender + Installable<T>, T: Topology> Interpret<T> for AlertingInte
#[async_trait] #[async_trait]
pub trait AlertReceiver<S: AlertSender>: std::fmt::Debug + Send + Sync { pub trait AlertReceiver<S: AlertSender>: std::fmt::Debug + Send + Sync {
async fn install(&self, sender: &S) -> Result<Outcome, InterpretError>; async fn install(&self, sender: &S) -> Result<Outcome, InterpretError>;
fn name(&self) -> String;
fn clone_box(&self) -> Box<dyn AlertReceiver<S>>; fn clone_box(&self) -> Box<dyn AlertReceiver<S>>;
fn as_any(&self) -> &dyn Any;
} }
#[async_trait] #[async_trait]
@ -72,6 +76,6 @@ pub trait AlertRule<S: AlertSender>: std::fmt::Debug + Send + Sync {
} }
#[async_trait] #[async_trait]
pub trait ScrapeTarger<S: AlertSender> { pub trait ScrapeTarget<S: AlertSender> {
async fn install(&self, sender: &S) -> Result<(), InterpretError>; async fn install(&self, sender: &S) -> Result<(), InterpretError>;
} }

View File

@ -231,8 +231,13 @@ impl K8sTenantManager {
{ {
"to": [ "to": [
{ {
//TODO this ip is from the docker network that k3d is running on
//since k3d does not deploy kube-api-server as a pod it needs to ahve the ip
//address opened up
//need to find a way to automatically detect the ip address from the docker
//network
"ipBlock": { "ipBlock": {
"cidr": "172.23.0.0/16", "cidr": "172.24.0.0/16",
} }
} }
] ]

View File

@ -1,45 +1,60 @@
use std::sync::Arc; use std::sync::Arc;
use async_trait::async_trait; use crate::modules::application::{Application, ApplicationFeature};
use base64::{Engine as _, engine::general_purpose}; use crate::modules::monitoring::application_monitoring::application_monitoring_score::ApplicationMonitoringScore;
use log::{debug, info}; use crate::modules::monitoring::kube_prometheus::crd::crd_alertmanager_config::CRDPrometheus;
use crate::{ use crate::{
inventory::Inventory, inventory::Inventory,
modules::{ modules::monitoring::{
application::{ApplicationFeature, OCICompliant}, alert_channel::webhook_receiver::WebhookReceiver, ntfy::ntfy::NtfyScore,
monitoring::{
alert_channel::webhook_receiver::WebhookReceiver,
kube_prometheus::{
helm_prometheus_alert_score::HelmPrometheusAlertingScore,
types::{NamespaceSelector, ServiceMonitor},
},
ntfy::ntfy::NtfyScore,
},
}, },
score::Score, score::Score,
topology::{HelmCommand, K8sclient, Topology, Url, tenant::TenantManager}, topology::{HelmCommand, K8sclient, Topology, Url, tenant::TenantManager},
}; };
use crate::{
modules::prometheus::prometheus::PrometheusApplicationMonitoring,
topology::oberservability::monitoring::AlertReceiver,
};
use async_trait::async_trait;
use base64::{Engine as _, engine::general_purpose};
use log::{debug, info};
#[derive(Debug, Clone)] #[derive(Debug, Clone)]
pub struct Monitoring { pub struct Monitoring {
pub application: Arc<dyn OCICompliant>, pub application: Arc<dyn Application>,
pub alert_receiver: Vec<Box<dyn AlertReceiver<CRDPrometheus>>>,
} }
#[async_trait] #[async_trait]
impl<T: Topology + HelmCommand + K8sclient + 'static + TenantManager> ApplicationFeature<T> impl<
for Monitoring T: Topology
+ HelmCommand
+ 'static
+ TenantManager
+ K8sclient
+ std::fmt::Debug
+ PrometheusApplicationMonitoring<CRDPrometheus>,
> ApplicationFeature<T> for Monitoring
{ {
async fn ensure_installed(&self, topology: &T) -> Result<(), String> { async fn ensure_installed(&self, topology: &T) -> Result<(), String> {
info!("Ensuring monitoring is available for application"); info!("Ensuring monitoring is available for application");
let namespace = topology
.get_tenant_config()
.await
.map(|ns| ns.name.clone())
.unwrap_or_else(|| self.application.name());
let mut alerting_score = ApplicationMonitoringScore {
sender: CRDPrometheus {
namespace: namespace.clone(),
client: topology.k8s_client().await.unwrap(),
},
application: self.application.clone(),
receivers: self.alert_receiver.clone(),
};
let ntfy = NtfyScore { let ntfy = NtfyScore {
// namespace: topology namespace: namespace.clone(),
// .get_tenant_config()
// .await
// .expect("couldn't get tenant config")
// .name,
namespace: self.application.name(),
host: "localhost".to_string(), host: "localhost".to_string(),
}; };
ntfy.create_interpret() ntfy.create_interpret()
@ -70,7 +85,7 @@ impl<T: Topology + HelmCommand + K8sclient + 'static + TenantManager> Applicatio
url::Url::parse( url::Url::parse(
format!( format!(
"http://ntfy.{}.svc.cluster.local/rust-web-app?auth={ntfy_default_auth_param}", "http://ntfy.{}.svc.cluster.local/rust-web-app?auth={ntfy_default_auth_param}",
self.application.name() namespace.clone()
) )
.as_str(), .as_str(),
) )
@ -78,26 +93,7 @@ impl<T: Topology + HelmCommand + K8sclient + 'static + TenantManager> Applicatio
), ),
}; };
let mut service_monitor = ServiceMonitor::default(); alerting_score.receivers.push(Box::new(ntfy_receiver));
service_monitor.namespace_selector = Some(NamespaceSelector {
any: true,
match_names: vec![],
});
service_monitor.name = "rust-webapp".to_string();
// let alerting_score = ApplicationPrometheusMonitoringScore {
// receivers: vec![Box::new(ntfy_receiver)],
// rules: vec![],
// service_monitors: vec![service_monitor],
// };
let alerting_score = HelmPrometheusAlertingScore {
receivers: vec![Box::new(ntfy_receiver)],
rules: vec![],
service_monitors: vec![service_monitor],
};
alerting_score alerting_score
.create_interpret() .create_interpret()
.execute(&Inventory::empty(), topology) .execute(&Inventory::empty(), topology)

View File

@ -10,6 +10,7 @@ pub use oci::*;
pub use rust::*; pub use rust::*;
use async_trait::async_trait; use async_trait::async_trait;
use serde::Serialize;
use crate::{ use crate::{
data::{Id, Version}, data::{Id, Version},
@ -78,3 +79,12 @@ impl<A: Application, T: Topology + std::fmt::Debug> Interpret<T> for Application
todo!() todo!()
} }
} }
impl Serialize for dyn Application {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
todo!()
}
}

View File

@ -1,7 +1,16 @@
use std::any::Any;
use std::collections::BTreeMap;
use async_trait::async_trait; use async_trait::async_trait;
use k8s_openapi::api::core::v1::Secret;
use kube::api::ObjectMeta;
use serde::Serialize; use serde::Serialize;
use serde_json::json;
use serde_yaml::{Mapping, Value}; use serde_yaml::{Mapping, Value};
use crate::modules::monitoring::kube_prometheus::crd::crd_alertmanager_config::{
AlertmanagerConfig, AlertmanagerConfigSpec, CRDPrometheus,
};
use crate::{ use crate::{
interpret::{InterpretError, Outcome}, interpret::{InterpretError, Outcome},
modules::monitoring::{ modules::monitoring::{
@ -20,14 +29,98 @@ pub struct DiscordWebhook {
pub url: Url, pub url: Url,
} }
#[async_trait]
impl AlertReceiver<CRDPrometheus> for DiscordWebhook {
async fn install(&self, sender: &CRDPrometheus) -> Result<Outcome, InterpretError> {
let ns = sender.namespace.clone();
let secret_name = format!("{}-secret", self.name.clone());
let webhook_key = format!("{}", self.url.clone());
let mut string_data = BTreeMap::new();
string_data.insert("webhook-url".to_string(), webhook_key.clone());
let secret = Secret {
metadata: kube::core::ObjectMeta {
name: Some(secret_name.clone()),
..Default::default()
},
string_data: Some(string_data),
type_: Some("Opaque".to_string()),
..Default::default()
};
let _ = sender.client.apply(&secret, Some(&ns)).await;
let spec = AlertmanagerConfigSpec {
data: json!({
"route": {
"receiver": self.name,
},
"receivers": [
{
"name": self.name,
"discordConfigs": [
{
"apiURL": {
"name": secret_name,
"key": "webhook-url",
},
"title": "{{ template \"discord.default.title\" . }}",
"message": "{{ template \"discord.default.message\" . }}"
}
]
}
]
}),
};
let alertmanager_configs = AlertmanagerConfig {
metadata: ObjectMeta {
name: Some(self.name.clone()),
labels: Some(std::collections::BTreeMap::from([(
"alertmanagerConfig".to_string(),
"enabled".to_string(),
)])),
namespace: Some(ns),
..Default::default()
},
spec,
};
sender
.client
.apply(&alertmanager_configs, Some(&sender.namespace))
.await?;
Ok(Outcome::success(format!(
"installed crd-alertmanagerconfigs for {}",
self.name
)))
}
fn name(&self) -> String {
"discord-webhook".to_string()
}
fn clone_box(&self) -> Box<dyn AlertReceiver<CRDPrometheus>> {
Box::new(self.clone())
}
fn as_any(&self) -> &dyn Any {
self
}
}
#[async_trait] #[async_trait]
impl AlertReceiver<Prometheus> for DiscordWebhook { impl AlertReceiver<Prometheus> for DiscordWebhook {
async fn install(&self, sender: &Prometheus) -> Result<Outcome, InterpretError> { async fn install(&self, sender: &Prometheus) -> Result<Outcome, InterpretError> {
sender.install_receiver(self).await sender.install_receiver(self).await
} }
fn name(&self) -> String {
"discord-webhook".to_string()
}
fn clone_box(&self) -> Box<dyn AlertReceiver<Prometheus>> { fn clone_box(&self) -> Box<dyn AlertReceiver<Prometheus>> {
Box::new(self.clone()) Box::new(self.clone())
} }
fn as_any(&self) -> &dyn Any {
self
}
} }
#[async_trait] #[async_trait]
@ -48,6 +141,12 @@ impl AlertReceiver<KubePrometheus> for DiscordWebhook {
fn clone_box(&self) -> Box<dyn AlertReceiver<KubePrometheus>> { fn clone_box(&self) -> Box<dyn AlertReceiver<KubePrometheus>> {
Box::new(self.clone()) Box::new(self.clone())
} }
fn name(&self) -> String {
"discord-webhook".to_string()
}
fn as_any(&self) -> &dyn Any {
self
}
} }
#[async_trait] #[async_trait]

View File

@ -1,11 +1,19 @@
use std::any::Any;
use async_trait::async_trait; use async_trait::async_trait;
use kube::api::ObjectMeta;
use log::debug;
use serde::Serialize; use serde::Serialize;
use serde_json::json;
use serde_yaml::{Mapping, Value}; use serde_yaml::{Mapping, Value};
use crate::{ use crate::{
interpret::{InterpretError, Outcome}, interpret::{InterpretError, Outcome},
modules::monitoring::{ modules::monitoring::{
kube_prometheus::{ kube_prometheus::{
crd::crd_alertmanager_config::{
AlertmanagerConfig, AlertmanagerConfigSpec, CRDPrometheus,
},
prometheus::{KubePrometheus, KubePrometheusReceiver}, prometheus::{KubePrometheus, KubePrometheusReceiver},
types::{AlertChannelConfig, AlertManagerChannelConfig}, types::{AlertChannelConfig, AlertManagerChannelConfig},
}, },
@ -20,14 +28,81 @@ pub struct WebhookReceiver {
pub url: Url, pub url: Url,
} }
#[async_trait]
impl AlertReceiver<CRDPrometheus> for WebhookReceiver {
async fn install(&self, sender: &CRDPrometheus) -> Result<Outcome, InterpretError> {
let spec = AlertmanagerConfigSpec {
data: json!({
"route": {
"receiver": self.name,
},
"receivers": [
{
"name": self.name,
"webhookConfigs": [
{
"url": self.url,
}
]
}
]
}),
};
let alertmanager_configs = AlertmanagerConfig {
metadata: ObjectMeta {
name: Some(self.name.clone()),
labels: Some(std::collections::BTreeMap::from([(
"alertmanagerConfig".to_string(),
"enabled".to_string(),
)])),
namespace: Some(sender.namespace.clone()),
..Default::default()
},
spec,
};
debug!(
"alert manager configs: \n{:#?}",
alertmanager_configs.clone()
);
sender
.client
.apply(&alertmanager_configs, Some(&sender.namespace))
.await?;
Ok(Outcome::success(format!(
"installed crd-alertmanagerconfigs for {}",
self.name
)))
}
fn name(&self) -> String {
"webhook-receiver".to_string()
}
fn clone_box(&self) -> Box<dyn AlertReceiver<CRDPrometheus>> {
Box::new(self.clone())
}
fn as_any(&self) -> &dyn Any {
self
}
}
#[async_trait] #[async_trait]
impl AlertReceiver<Prometheus> for WebhookReceiver { impl AlertReceiver<Prometheus> for WebhookReceiver {
async fn install(&self, sender: &Prometheus) -> Result<Outcome, InterpretError> { async fn install(&self, sender: &Prometheus) -> Result<Outcome, InterpretError> {
sender.install_receiver(self).await sender.install_receiver(self).await
} }
fn name(&self) -> String {
"webhook-receiver".to_string()
}
fn clone_box(&self) -> Box<dyn AlertReceiver<Prometheus>> { fn clone_box(&self) -> Box<dyn AlertReceiver<Prometheus>> {
Box::new(self.clone()) Box::new(self.clone())
} }
fn as_any(&self) -> &dyn Any {
self
}
} }
#[async_trait] #[async_trait]
@ -44,9 +119,15 @@ impl AlertReceiver<KubePrometheus> for WebhookReceiver {
async fn install(&self, sender: &KubePrometheus) -> Result<Outcome, InterpretError> { async fn install(&self, sender: &KubePrometheus) -> Result<Outcome, InterpretError> {
sender.install_receiver(self).await sender.install_receiver(self).await
} }
fn name(&self) -> String {
"webhook-receiver".to_string()
}
fn clone_box(&self) -> Box<dyn AlertReceiver<KubePrometheus>> { fn clone_box(&self) -> Box<dyn AlertReceiver<KubePrometheus>> {
Box::new(self.clone()) Box::new(self.clone())
} }
fn as_any(&self) -> &dyn Any {
self
}
} }
#[async_trait] #[async_trait]

View File

@ -0,0 +1,78 @@
use std::sync::Arc;
use async_trait::async_trait;
use serde::Serialize;
use crate::{
data::{Id, Version},
interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome},
inventory::Inventory,
modules::{
application::Application,
monitoring::kube_prometheus::crd::crd_alertmanager_config::CRDPrometheus,
prometheus::prometheus::PrometheusApplicationMonitoring,
},
score::Score,
topology::{Topology, oberservability::monitoring::AlertReceiver},
};
#[derive(Debug, Clone, Serialize)]
pub struct ApplicationMonitoringScore {
pub sender: CRDPrometheus,
pub application: Arc<dyn Application>,
pub receivers: Vec<Box<dyn AlertReceiver<CRDPrometheus>>>,
}
impl<T: Topology + PrometheusApplicationMonitoring<CRDPrometheus>> Score<T>
for ApplicationMonitoringScore
{
fn create_interpret(&self) -> Box<dyn Interpret<T>> {
Box::new(ApplicationMonitoringInterpret {
score: self.clone(),
})
}
fn name(&self) -> String {
"ApplicationMonitoringScore".to_string()
}
}
#[derive(Debug)]
pub struct ApplicationMonitoringInterpret {
score: ApplicationMonitoringScore,
}
#[async_trait]
impl<T: Topology + PrometheusApplicationMonitoring<CRDPrometheus>> Interpret<T>
for ApplicationMonitoringInterpret
{
async fn execute(
&self,
inventory: &Inventory,
topology: &T,
) -> Result<Outcome, InterpretError> {
topology
.install_prometheus(
&self.score.sender,
inventory,
Some(self.score.receivers.clone()),
)
.await
}
fn get_name(&self) -> InterpretName {
todo!()
}
fn get_version(&self) -> Version {
todo!()
}
fn get_status(&self) -> InterpretStatus {
todo!()
}
fn get_children(&self) -> Vec<Id> {
todo!()
}
}

View File

@ -1,44 +0,0 @@
use std::sync::{Arc, Mutex};
use serde::Serialize;
use crate::{
modules::monitoring::{
kube_prometheus::types::ServiceMonitor,
prometheus::{prometheus::Prometheus, prometheus_config::PrometheusConfig},
},
score::Score,
topology::{
HelmCommand, Topology,
oberservability::monitoring::{AlertReceiver, AlertRule, AlertingInterpret},
tenant::TenantManager,
},
};
#[derive(Clone, Debug, Serialize)]
pub struct ApplicationPrometheusMonitoringScore {
pub receivers: Vec<Box<dyn AlertReceiver<Prometheus>>>,
pub rules: Vec<Box<dyn AlertRule<Prometheus>>>,
pub service_monitors: Vec<ServiceMonitor>,
}
impl<T: Topology + HelmCommand + TenantManager> Score<T> for ApplicationPrometheusMonitoringScore {
fn create_interpret(&self) -> Box<dyn crate::interpret::Interpret<T>> {
let mut prom_config = PrometheusConfig::new();
prom_config.alert_manager = true;
let config = Arc::new(Mutex::new(prom_config));
config
.try_lock()
.expect("couldn't lock config")
.additional_service_monitors = self.service_monitors.clone();
Box::new(AlertingInterpret {
sender: Prometheus::new(),
receivers: self.receivers.clone(),
rules: self.rules.clone(),
})
}
fn name(&self) -> String {
"ApplicationPrometheusMonitoringScore".to_string()
}
}

View File

@ -1 +1 @@
pub mod k8s_application_monitoring_score; pub mod application_monitoring_score;

View File

@ -0,0 +1,50 @@
use std::sync::Arc;
use kube::CustomResource;
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use crate::topology::{
k8s::K8sClient,
oberservability::monitoring::{AlertReceiver, AlertSender},
};
#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)]
#[kube(
group = "monitoring.coreos.com",
version = "v1alpha1",
kind = "AlertmanagerConfig",
plural = "alertmanagerconfigs",
namespaced
)]
pub struct AlertmanagerConfigSpec {
#[serde(flatten)]
pub data: serde_json::Value,
}
#[derive(Debug, Clone, Serialize)]
pub struct CRDPrometheus {
pub namespace: String,
pub client: Arc<K8sClient>,
}
impl AlertSender for CRDPrometheus {
fn name(&self) -> String {
"CRDAlertManager".to_string()
}
}
impl Clone for Box<dyn AlertReceiver<CRDPrometheus>> {
fn clone(&self) -> Self {
self.clone_box()
}
}
impl Serialize for Box<dyn AlertReceiver<CRDPrometheus>> {
fn serialize<S>(&self, _serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
todo!()
}
}

View File

@ -0,0 +1,53 @@
use kube::CustomResource;
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use std::collections::BTreeMap;
use super::crd_prometheuses::LabelSelector;
/// Rust CRD for `Alertmanager` from Prometheus Operator
#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)]
#[kube(
group = "monitoring.coreos.com",
version = "v1",
kind = "Alertmanager",
plural = "alertmanagers",
namespaced
)]
#[serde(rename_all = "camelCase")]
pub struct AlertmanagerSpec {
/// Number of replicas for HA
pub replicas: i32,
/// Selectors for AlertmanagerConfig CRDs
#[serde(default, skip_serializing_if = "Option::is_none")]
pub alertmanager_config_selector: Option<LabelSelector>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub alertmanager_config_namespace_selector: Option<LabelSelector>,
/// Optional pod template metadata (annotations, labels)
#[serde(default, skip_serializing_if = "Option::is_none")]
pub pod_metadata: Option<LabelSelector>,
/// Optional topology spread settings
#[serde(default, skip_serializing_if = "Option::is_none")]
pub version: Option<String>,
}
impl Default for AlertmanagerSpec {
fn default() -> Self {
AlertmanagerSpec {
replicas: 1,
// Match all AlertmanagerConfigs in the same namespace
alertmanager_config_namespace_selector: None,
// Empty selector matches all AlertmanagerConfigs in that namespace
alertmanager_config_selector: Some(LabelSelector::default()),
pod_metadata: None,
version: None,
}
}
}

View File

@ -0,0 +1,30 @@
use std::collections::BTreeMap;
use crate::modules::{
monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule,
prometheus::alerts::k8s::{
deployment::alert_deployment_unavailable,
pod::{alert_container_restarting, alert_pod_not_ready, pod_failed},
pvc::high_pvc_fill_rate_over_two_days,
service::alert_service_down,
},
};
use super::crd_prometheus_rules::Rule;
pub fn build_default_application_rules() -> Vec<Rule> {
let pod_failed: Rule = pod_failed().into();
let container_restarting: Rule = alert_container_restarting().into();
let pod_not_ready: Rule = alert_pod_not_ready().into();
let service_down: Rule = alert_service_down().into();
let deployment_unavailable: Rule = alert_deployment_unavailable().into();
let high_pvc_fill_rate: Rule = high_pvc_fill_rate_over_two_days().into();
vec![
pod_failed,
container_restarting,
pod_not_ready,
service_down,
deployment_unavailable,
high_pvc_fill_rate,
]
}

View File

@ -0,0 +1,153 @@
use std::collections::BTreeMap;
use kube::CustomResource;
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use super::crd_prometheuses::LabelSelector;
#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)]
#[kube(
group = "grafana.integreatly.org",
version = "v1beta1",
kind = "Grafana",
plural = "grafanas",
namespaced
)]
#[serde(rename_all = "camelCase")]
pub struct GrafanaSpec {
#[serde(default, skip_serializing_if = "Option::is_none")]
pub config: Option<GrafanaConfig>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub admin_user: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub admin_password: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub ingress: Option<GrafanaIngress>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub persistence: Option<GrafanaPersistence>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub resources: Option<ResourceRequirements>,
}
#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)]
#[serde(rename_all = "camelCase")]
pub struct GrafanaConfig {
#[serde(default, skip_serializing_if = "Option::is_none")]
pub log: Option<GrafanaLogConfig>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub security: Option<GrafanaSecurityConfig>,
}
#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)]
#[serde(rename_all = "camelCase")]
pub struct GrafanaLogConfig {
#[serde(default, skip_serializing_if = "Option::is_none")]
pub mode: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub level: Option<String>,
}
#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)]
#[serde(rename_all = "camelCase")]
pub struct GrafanaSecurityConfig {
#[serde(default, skip_serializing_if = "Option::is_none")]
pub admin_user: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub admin_password: Option<String>,
}
#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)]
#[serde(rename_all = "camelCase")]
pub struct GrafanaIngress {
#[serde(default, skip_serializing_if = "Option::is_none")]
pub enabled: Option<bool>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub hosts: Option<Vec<String>>,
}
#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)]
#[serde(rename_all = "camelCase")]
pub struct GrafanaPersistence {
#[serde(default, skip_serializing_if = "Option::is_none")]
pub enabled: Option<bool>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub storage_class_name: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub size: Option<String>,
}
// ------------------------------------------------------------------------------------------------
#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)]
#[kube(
group = "grafana.integreatly.org",
version = "v1beta1",
kind = "GrafanaDashboard",
plural = "grafanadashboards",
namespaced
)]
#[serde(rename_all = "camelCase")]
pub struct GrafanaDashboardSpec {
#[serde(default, skip_serializing_if = "Option::is_none")]
pub resync_period: Option<String>,
pub instance_selector: LabelSelector,
pub json: String,
}
// ------------------------------------------------------------------------------------------------
#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)]
#[kube(
group = "grafana.integreatly.org",
version = "v1beta1",
kind = "GrafanaDatasource",
plural = "grafanadatasources",
namespaced
)]
#[serde(rename_all = "camelCase")]
pub struct GrafanaDatasourceSpec {
pub instance_selector: LabelSelector,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub allow_cross_namespace_import: Option<bool>,
pub datasource: GrafanaDatasourceConfig,
}
#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)]
#[serde(rename_all = "camelCase")]
pub struct GrafanaDatasourceConfig {
pub access: String,
pub database: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub json_data: Option<BTreeMap<String, String>>,
pub name: String,
pub r#type: String,
pub url: String,
}
// ------------------------------------------------------------------------------------------------
#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema, Default)]
#[serde(rename_all = "camelCase")]
pub struct ResourceRequirements {
#[serde(default, skip_serializing_if = "BTreeMap::is_empty")]
pub limits: BTreeMap<String, String>,
#[serde(default, skip_serializing_if = "BTreeMap::is_empty")]
pub requests: BTreeMap<String, String>,
}

View File

@ -0,0 +1,59 @@
use std::collections::BTreeMap;
use kube::CustomResource;
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;
use super::crd_default_rules::build_default_application_rules;
#[derive(CustomResource, Debug, Serialize, Deserialize, Clone, JsonSchema)]
#[kube(
group = "monitoring.coreos.com",
version = "v1",
kind = "PrometheusRule",
plural = "prometheusrules",
namespaced
)]
#[serde(rename_all = "camelCase")]
pub struct PrometheusRuleSpec {
pub groups: Vec<RuleGroup>,
}
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct RuleGroup {
pub name: String,
pub rules: Vec<Rule>,
}
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
#[serde(rename_all = "camelCase")]
pub struct Rule {
#[serde(default, skip_serializing_if = "Option::is_none")]
pub alert: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub expr: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub for_: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub labels: Option<std::collections::BTreeMap<String, String>>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub annotations: Option<std::collections::BTreeMap<String, String>>,
}
impl From<PrometheusAlertRule> for Rule {
fn from(value: PrometheusAlertRule) -> Self {
Rule {
alert: Some(value.alert),
expr: Some(value.expr),
for_: value.r#for,
labels: Some(value.labels.into_iter().collect::<BTreeMap<_, _>>()),
annotations: Some(value.annotations.into_iter().collect::<BTreeMap<_, _>>()),
}
}
}

View File

@ -0,0 +1,118 @@
use std::collections::BTreeMap;
use kube::CustomResource;
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use crate::modules::monitoring::kube_prometheus::types::Operator;
#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)]
#[kube(
group = "monitoring.coreos.com",
version = "v1",
kind = "Prometheus",
plural = "prometheuses",
namespaced
)]
#[serde(rename_all = "camelCase")]
pub struct PrometheusSpec {
#[serde(default, skip_serializing_if = "Option::is_none")]
pub alerting: Option<PrometheusSpecAlerting>,
pub service_account_name: String,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub service_monitor_namespace_selector: Option<LabelSelector>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub service_monitor_selector: Option<LabelSelector>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub service_discovery_role: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub pod_monitor_selector: Option<LabelSelector>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub rule_selector: Option<LabelSelector>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub rule_namespace_selector: Option<LabelSelector>,
}
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, Default)]
#[serde(rename_all = "camelCase")]
pub struct NamespaceSelector {
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub match_names: Vec<String>,
}
/// Contains alerting configuration, specifically Alertmanager endpoints.
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, Default)]
pub struct PrometheusSpecAlerting {
#[serde(skip_serializing_if = "Option::is_none")]
pub alertmanagers: Option<Vec<AlertmanagerEndpoints>>,
}
/// Represents an Alertmanager endpoint configuration used by Prometheus.
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, Default)]
pub struct AlertmanagerEndpoints {
/// Name of the Alertmanager Service.
#[serde(skip_serializing_if = "Option::is_none")]
pub name: Option<String>,
/// Namespace of the Alertmanager Service.
#[serde(skip_serializing_if = "Option::is_none")]
pub namespace: Option<String>,
/// Port to access on the Alertmanager Service (e.g. "web").
#[serde(skip_serializing_if = "Option::is_none")]
pub port: Option<String>,
/// Scheme to use for connecting (e.g. "http").
#[serde(skip_serializing_if = "Option::is_none")]
pub scheme: Option<String>,
// Other fields like `tls_config`, `path_prefix`, etc., can be added if needed.
}
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, Default)]
#[serde(rename_all = "camelCase")]
pub struct LabelSelector {
#[serde(default, skip_serializing_if = "BTreeMap::is_empty")]
pub match_labels: BTreeMap<String, String>,
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub match_expressions: Vec<LabelSelectorRequirement>,
}
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
#[serde(rename_all = "camelCase")]
pub struct LabelSelectorRequirement {
pub key: String,
pub operator: Operator,
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub values: Vec<String>,
}
impl Default for PrometheusSpec {
fn default() -> Self {
PrometheusSpec {
alerting: None,
service_account_name: "prometheus".into(),
// null means "only my namespace"
service_monitor_namespace_selector: None,
// empty selector means match all ServiceMonitors in that namespace
service_monitor_selector: Some(LabelSelector::default()),
service_discovery_role: Some("Endpoints".into()),
pod_monitor_selector: None,
rule_selector: None,
rule_namespace_selector: Some(LabelSelector::default()),
}
}
}

View File

@ -0,0 +1,203 @@
pub fn build_default_dashboard(namespace: &str) -> String {
let dashboard = format!(
r#"{{
"annotations": {{
"list": []
}},
"editable": true,
"gnetId": null,
"graphTooltip": 0,
"id": null,
"iteration": 171105,
"panels": [
{{
"datasource": "$datasource",
"fieldConfig": {{
"defaults": {{
"unit": "short"
}},
"overrides": []
}},
"gridPos": {{
"h": 6,
"w": 6,
"x": 0,
"y": 0
}},
"id": 1,
"options": {{
"reduceOptions": {{
"calcs": ["lastNotNull"],
"fields": "",
"values": false
}}
}},
"pluginVersion": "9.0.0",
"targets": [
{{
"expr": "sum(kube_pod_status_phase{{namespace=\"{namespace}\", phase=\"Running\"}})",
"legendFormat": "",
"refId": "A"
}}
],
"title": "Pods in Namespace",
"type": "stat"
}},
{{
"datasource": "$datasource",
"fieldConfig": {{
"defaults": {{
"unit": "short"
}},
"overrides": []
}},
"gridPos": {{
"h": 6,
"w": 6,
"x": 6,
"y": 0
}},
"id": 2,
"options": {{
"reduceOptions": {{
"calcs": ["lastNotNull"],
"fields": "",
"values": false
}}
}},
"pluginVersion": "9.0.0",
"targets": [
{{
"expr": "sum(kube_pod_status_phase{{phase=\"Failed\", namespace=\"{namespace}\"}})",
"legendFormat": "",
"refId": "A"
}}
],
"title": "Pods in Failed State",
"type": "stat"
}},
{{
"datasource": "$datasource",
"fieldConfig": {{
"defaults": {{
"unit": "percentunit"
}},
"overrides": []
}},
"gridPos": {{
"h": 6,
"w": 12,
"x": 0,
"y": 6
}},
"id": 3,
"options": {{
"reduceOptions": {{
"calcs": ["lastNotNull"],
"fields": "",
"values": false
}}
}},
"pluginVersion": "9.0.0",
"targets": [
{{
"expr": "sum(kube_deployment_status_replicas_available{{namespace=\"{namespace}\"}}) / sum(kube_deployment_spec_replicas{{namespace=\"{namespace}\"}})",
"legendFormat": "",
"refId": "A"
}}
],
"title": "Deployment Health (Available / Desired)",
"type": "stat"
}},
{{
"datasource": "$datasource",
"fieldConfig": {{
"defaults": {{
"unit": "short"
}},
"overrides": []
}},
"gridPos": {{
"h": 6,
"w": 12,
"x": 0,
"y": 12
}},
"id": 4,
"options": {{
"reduceOptions": {{
"calcs": ["lastNotNull"],
"fields": "",
"values": false
}}
}},
"pluginVersion": "9.0.0",
"targets": [
{{
"expr": "sum by(pod) (rate(kube_pod_container_status_restarts_total{{namespace=\"{namespace}\"}}[5m]))",
"legendFormat": "{{{{pod}}}}",
"refId": "A"
}}
],
"title": "Container Restarts (per pod)",
"type": "timeseries"
}},
{{
"datasource": "$datasource",
"fieldConfig": {{
"defaults": {{
"unit": "short"
}},
"overrides": []
}},
"gridPos": {{
"h": 6,
"w": 12,
"x": 0,
"y": 18
}},
"id": 5,
"options": {{
"reduceOptions": {{
"calcs": ["lastNotNull"],
"fields": "",
"values": false
}}
}},
"pluginVersion": "9.0.0",
"targets": [
{{
"expr": "sum(ALERTS{{alertstate=\"firing\", namespace=\"{namespace}\"}}) or vector(0)",
"legendFormat": "",
"refId": "A"
}}
],
"title": "Firing Alerts in Namespace",
"type": "stat"
}}
],
"schemaVersion": 36,
"templating": {{
"list": [
{{
"name": "datasource",
"type": "datasource",
"pluginId": "prometheus",
"label": "Prometheus",
"query": "prometheus",
"refresh": 1,
"hide": 0,
"current": {{
"selected": true,
"text": "Prometheus",
"value": "Prometheus"
}}
}}
]
}},
"title": "Tenant Namespace Overview",
"version": 1
}}"#
);
dashboard
}

View File

@ -0,0 +1,20 @@
use std::str::FromStr;
use non_blank_string_rs::NonBlankString;
use crate::modules::helm::chart::HelmChartScore;
pub fn grafana_operator_helm_chart_score(ns: String) -> HelmChartScore {
HelmChartScore {
namespace: Some(NonBlankString::from_str(&ns).unwrap()),
release_name: NonBlankString::from_str("grafana_operator").unwrap(),
chart_name: NonBlankString::from_str("oci://ghcr.io/grafana/helm-charts/grafana-operator")
.unwrap(),
chart_version: None,
values_overrides: None,
values_yaml: None,
create_namespace: true,
install_only: true,
repository: None,
}
}

View File

@ -0,0 +1,11 @@
pub mod crd_alertmanager_config;
pub mod crd_alertmanagers;
pub mod crd_default_rules;
pub mod crd_grafana;
pub mod crd_prometheus_rules;
pub mod crd_prometheuses;
pub mod grafana_default_dashboard;
pub mod grafana_operator;
pub mod prometheus_operator;
pub mod role;
pub mod service_monitor;

View File

@ -0,0 +1,22 @@
use std::str::FromStr;
use non_blank_string_rs::NonBlankString;
use crate::modules::helm::chart::HelmChartScore;
pub fn prometheus_operator_helm_chart_score(ns: String) -> HelmChartScore {
HelmChartScore {
namespace: Some(NonBlankString::from_str(&ns).unwrap()),
release_name: NonBlankString::from_str("prometheus-operator").unwrap(),
chart_name: NonBlankString::from_str(
"oci://hub.nationtech.io/harmony/nt-prometheus-operator",
)
.unwrap(),
chart_version: None,
values_overrides: None,
values_yaml: None,
create_namespace: true,
install_only: true,
repository: None,
}
}

View File

@ -0,0 +1,62 @@
use k8s_openapi::api::{
core::v1::ServiceAccount,
rbac::v1::{PolicyRule, Role, RoleBinding, RoleRef, Subject},
};
use kube::api::ObjectMeta;
pub fn build_prom_role(role_name: String, namespace: String) -> Role {
Role {
metadata: ObjectMeta {
name: Some(role_name),
namespace: Some(namespace),
..Default::default()
},
rules: Some(vec![PolicyRule {
api_groups: Some(vec!["".into()]), // core API group
resources: Some(vec!["services".into(), "endpoints".into(), "pods".into()]),
verbs: vec!["get".into(), "list".into(), "watch".into()],
..Default::default()
}]),
}
}
pub fn build_prom_rolebinding(
role_name: String,
namespace: String,
service_account_name: String,
) -> RoleBinding {
RoleBinding {
metadata: ObjectMeta {
name: Some(format!("{}-rolebinding", role_name)),
namespace: Some(namespace.clone()),
..Default::default()
},
role_ref: RoleRef {
api_group: "rbac.authorization.k8s.io".into(),
kind: "Role".into(),
name: role_name,
},
subjects: Some(vec![Subject {
kind: "ServiceAccount".into(),
name: service_account_name,
namespace: Some(namespace.clone()),
..Default::default()
}]),
}
}
pub fn build_prom_service_account(
service_account_name: String,
namespace: String,
) -> ServiceAccount {
ServiceAccount {
automount_service_account_token: None,
image_pull_secrets: None,
metadata: ObjectMeta {
name: Some(service_account_name),
namespace: Some(namespace),
..Default::default()
},
secrets: None,
}
}

View File

@ -0,0 +1,89 @@
use std::collections::{BTreeMap, HashMap};
use kube::{CustomResource, Resource, api::ObjectMeta};
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use crate::interpret::InterpretError;
use crate::modules::monitoring::kube_prometheus::types::{
HTTPScheme, MatchExpression, NamespaceSelector, Operator, Selector,
ServiceMonitor as KubeServiceMonitor, ServiceMonitorEndpoint,
};
/// This is the top-level struct for the ServiceMonitor Custom Resource.
/// The `#[derive(CustomResource)]` macro handles all the boilerplate for you,
/// including the `impl Resource`.
#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)]
#[kube(
group = "monitoring.coreos.com",
version = "v1",
kind = "ServiceMonitor",
plural = "servicemonitors",
namespaced
)]
#[serde(rename_all = "camelCase")]
pub struct ServiceMonitorSpec {
/// A label selector to select services to monitor.
pub selector: Selector,
/// A list of endpoints on the selected services to be monitored.
pub endpoints: Vec<ServiceMonitorEndpoint>,
/// Selector to select which namespaces the Kubernetes Endpoints objects
/// are discovered from.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub namespace_selector: Option<NamespaceSelector>,
/// The label to use to retrieve the job name from.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub job_label: Option<String>,
/// Pod-based target labels to transfer from the Kubernetes Pod onto the target.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub pod_target_labels: Vec<String>,
/// TargetLabels transfers labels on the Kubernetes Service object to the target.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub target_labels: Vec<String>,
}
impl Default for ServiceMonitorSpec {
fn default() -> Self {
let mut labels = HashMap::new();
Self {
selector: Selector {
match_labels: { labels },
match_expressions: vec![MatchExpression {
key: "app.kubernetes.io/name".into(),
operator: Operator::Exists,
values: vec![],
}],
},
endpoints: vec![ServiceMonitorEndpoint {
port: Some("http".to_string()),
path: Some("/metrics".into()),
interval: Some("30s".into()),
scheme: Some(HTTPScheme::HTTP),
..Default::default()
}],
namespace_selector: None, // only the same namespace
job_label: Some("app".into()),
pod_target_labels: vec![],
target_labels: vec![],
}
}
}
impl From<KubeServiceMonitor> for ServiceMonitorSpec {
fn from(value: KubeServiceMonitor) -> Self {
Self {
selector: value.selector,
endpoints: value.endpoints,
namespace_selector: value.namespace_selector,
job_label: value.job_label,
pod_target_labels: value.pod_target_labels,
target_labels: value.target_labels,
}
}
}

View File

@ -35,7 +35,7 @@ impl KubePrometheusConfig {
windows_monitoring: false, windows_monitoring: false,
alert_manager: true, alert_manager: true,
grafana: true, grafana: true,
node_exporter: false, node_exporter: true,
prometheus: true, prometheus: true,
kubernetes_service_monitors: true, kubernetes_service_monitors: true,
kubernetes_api_server: true, kubernetes_api_server: true,

View File

@ -12,8 +12,8 @@ use crate::modules::{
helm::chart::HelmChartScore, helm::chart::HelmChartScore,
monitoring::kube_prometheus::types::{ monitoring::kube_prometheus::types::{
AlertGroup, AlertManager, AlertManagerAdditionalPromRules, AlertManagerConfig, AlertGroup, AlertManager, AlertManagerAdditionalPromRules, AlertManagerConfig,
AlertManagerRoute, AlertManagerSpec, AlertManagerValues, ConfigReloader, Limits, AlertManagerConfigSelector, AlertManagerRoute, AlertManagerSpec, AlertManagerValues,
PrometheusConfig, Requests, Resources, ConfigReloader, Limits, PrometheusConfig, Requests, Resources,
}, },
}; };
@ -332,6 +332,11 @@ prometheusOperator:
.push(receiver.channel_receiver.clone()); .push(receiver.channel_receiver.clone());
} }
let mut labels = BTreeMap::new();
labels.insert("alertmanagerConfig".to_string(), "enabled".to_string());
let alert_manager_config_selector = AlertManagerConfigSelector {
match_labels: labels,
};
let alert_manager_values = AlertManagerValues { let alert_manager_values = AlertManagerValues {
alertmanager: AlertManager { alertmanager: AlertManager {
enabled: config.alert_manager, enabled: config.alert_manager,
@ -347,6 +352,8 @@ prometheusOperator:
cpu: "100m".to_string(), cpu: "100m".to_string(),
}, },
}, },
alert_manager_config_selector,
replicas: 2,
}, },
init_config_reloader: ConfigReloader { init_config_reloader: ConfigReloader {
resources: Resources { resources: Resources {

View File

@ -1,3 +1,4 @@
pub mod crd;
pub mod helm; pub mod helm;
pub mod helm_prometheus_alert_score; pub mod helm_prometheus_alert_score;
pub mod prometheus; pub mod prometheus;

View File

@ -1,7 +1,8 @@
use std::collections::{BTreeMap, HashMap}; use std::collections::{BTreeMap, HashMap};
use async_trait::async_trait; use async_trait::async_trait;
use serde::Serialize; use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use serde_yaml::{Mapping, Sequence, Value}; use serde_yaml::{Mapping, Sequence, Value};
use crate::modules::monitoring::alert_rule::prometheus_alert_rule::AlertManagerRuleGroup; use crate::modules::monitoring::alert_rule::prometheus_alert_rule::AlertManagerRuleGroup;
@ -55,6 +56,14 @@ pub struct AlertManagerChannelConfig {
#[serde(rename_all = "camelCase")] #[serde(rename_all = "camelCase")]
pub struct AlertManagerSpec { pub struct AlertManagerSpec {
pub(crate) resources: Resources, pub(crate) resources: Resources,
pub replicas: u32,
pub alert_manager_config_selector: AlertManagerConfigSelector,
}
#[derive(Debug, Clone, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct AlertManagerConfigSelector {
pub match_labels: BTreeMap<String, String>,
} }
#[derive(Debug, Clone, Serialize)] #[derive(Debug, Clone, Serialize)]
@ -86,7 +95,7 @@ pub struct AlertGroup {
pub groups: Vec<AlertManagerRuleGroup>, pub groups: Vec<AlertManagerRuleGroup>,
} }
#[derive(Debug, Clone, Serialize)] #[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub enum HTTPScheme { pub enum HTTPScheme {
#[serde(rename = "http")] #[serde(rename = "http")]
HTTP, HTTP,
@ -94,7 +103,7 @@ pub enum HTTPScheme {
HTTPS, HTTPS,
} }
#[derive(Debug, Clone, Serialize)] #[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub enum Operator { pub enum Operator {
In, In,
NotIn, NotIn,
@ -139,74 +148,83 @@ pub struct ServiceMonitorTLSConfig {
pub server_name: Option<String>, pub server_name: Option<String>,
} }
#[derive(Debug, Clone, Serialize)] #[derive(Serialize, Deserialize, Debug, Clone, JsonSchema, Default)]
#[serde(rename_all = "camelCase")] #[serde(rename_all = "camelCase")]
pub struct ServiceMonitorEndpoint { pub struct ServiceMonitorEndpoint {
// ## Name of the endpoint's service port /// Name of the service port this endpoint refers to.
// ## Mutually exclusive with targetPort
pub port: Option<String>, pub port: Option<String>,
// ## Name or number of the endpoint's target port /// Interval at which metrics should be scraped.
// ## Mutually exclusive with port #[serde(default, skip_serializing_if = "Option::is_none")]
pub target_port: Option<String>,
// ## File containing bearer token to be used when scraping targets
// ##
pub bearer_token_file: Option<String>,
// ## Interval at which metrics should be scraped
// ##
pub interval: Option<String>, pub interval: Option<String>,
// ## HTTP path to scrape for metrics /// The HTTP path to scrape for metrics.
// ## #[serde(default, skip_serializing_if = "Option::is_none")]
pub path: String, pub path: Option<String>,
// ## HTTP scheme to use for scraping /// HTTP scheme to use for scraping.
// ## #[serde(default, skip_serializing_if = "Option::is_none")]
pub scheme: HTTPScheme, pub scheme: Option<HTTPScheme>,
// ## TLS configuration to use when scraping the endpoint /// Relabelings to apply to samples before scraping.
// ## #[serde(default, skip_serializing_if = "Vec::is_empty")]
pub tls_config: Option<ServiceMonitorTLSConfig>, pub relabelings: Vec<RelabelConfig>,
// ## MetricRelabelConfigs to apply to samples after scraping, but before ingestion. /// MetricRelabelings to apply to samples after scraping, but before ingestion.
// ## ref: https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api-reference/api.md#relabelconfig #[serde(default, skip_serializing_if = "Vec::is_empty")]
// ## pub metric_relabelings: Vec<RelabelConfig>,
// # - action: keep
// # regex: 'kube_(daemonset|deployment|pod|namespace|node|statefulset).+'
// # sourceLabels: [__name__]
pub metric_relabelings: Vec<Mapping>,
// ## RelabelConfigs to apply to samples before scraping
// ## ref: https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api-reference/api.md#relabelconfig
// ##
// # - sourceLabels: [__meta_kubernetes_pod_node_name]
// # separator: ;
// # regex: ^(.*)$
// # targetLabel: nodename
// # replacement: $1
// # action: replace
pub relabelings: Vec<Mapping>,
} }
#[derive(Debug, Clone, Serialize)] #[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)]
#[serde(rename_all = "camelCase")]
pub struct RelabelConfig {
/// The action to perform based on the regex matching.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub action: Option<String>,
/// A list of labels from which to extract values.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub source_labels: Vec<String>,
/// Separator to be used when concatenating source_labels.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub separator: Option<String>,
/// The label to which the resulting value is written.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub target_label: Option<String>,
/// A regular expression to match against the concatenated source label values.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub regex: Option<String>,
/// The replacement value to use.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub replacement: Option<String>,
}
#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)]
#[serde(rename_all = "camelCase")] #[serde(rename_all = "camelCase")]
pub struct MatchExpression { pub struct MatchExpression {
pub key: String, pub key: String,
pub operator: Operator, pub operator: Operator, // "In", "NotIn", "Exists", "DoesNotExist"
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub values: Vec<String>, pub values: Vec<String>,
} }
#[derive(Debug, Clone, Serialize)] #[derive(Serialize, Deserialize, Debug, Clone, JsonSchema, Default)]
#[serde(rename_all = "camelCase")] #[serde(rename_all = "camelCase")]
pub struct Selector { pub struct Selector {
// # label selector for services /// A map of key-value pairs to match.
#[serde(default, skip_serializing_if = "HashMap::is_empty")]
pub match_labels: HashMap<String, String>, pub match_labels: HashMap<String, String>,
/// A list of label selector requirements.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub match_expressions: Vec<MatchExpression>, pub match_expressions: Vec<MatchExpression>,
} }
#[derive(Debug, Clone, Serialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")] #[serde(rename_all = "camelCase")]
pub struct ServiceMonitor { pub struct ServiceMonitor {
pub name: String, pub name: String,
@ -250,10 +268,15 @@ pub struct ServiceMonitor {
pub fallback_scrape_protocol: Option<String>, pub fallback_scrape_protocol: Option<String>,
} }
#[derive(Debug, Serialize, Clone)] #[derive(Serialize, Deserialize, Debug, Clone, JsonSchema, Default)]
#[serde(rename_all = "camelCase")] #[serde(rename_all = "camelCase")]
pub struct NamespaceSelector { pub struct NamespaceSelector {
/// Select all namespaces.
#[serde(default, skip_serializing_if = "std::ops::Not::not")]
pub any: bool, pub any: bool,
/// List of namespace names to select from.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub match_names: Vec<String>, pub match_names: Vec<String>,
} }
@ -275,19 +298,3 @@ impl Default for ServiceMonitor {
} }
} }
} }
impl Default for ServiceMonitorEndpoint {
fn default() -> Self {
Self {
port: Some("80".to_string()),
target_port: Default::default(),
bearer_token_file: Default::default(),
interval: Default::default(),
path: "/metrics".to_string(),
scheme: HTTPScheme::HTTP,
tls_config: Default::default(),
metric_relabelings: Default::default(),
relabelings: Default::default(),
}
}
}

View File

@ -58,6 +58,7 @@ config:
# web-root: "disable" # web-root: "disable"
enable-signup: false enable-signup: false
enable-login: "true" enable-login: "true"
enable-metrics: "true"
persistence: persistence:
enabled: true enabled: true

View File

@ -0,0 +1,23 @@
use std::collections::HashMap;
use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;
pub fn alert_deployment_unavailable() -> PrometheusAlertRule {
PrometheusAlertRule {
alert: "DeploymentUnavailable".into(),
expr: "kube_deployment_status_replicas_unavailable > 0".into(),
r#for: Some("2m".into()),
labels: HashMap::from([("severity".into(), "warning".into())]),
annotations: HashMap::from([
(
"summary".into(),
"Deployment has unavailable replicas".into(),
),
(
"description".into(),
"A deployment in this namespace has unavailable replicas for over 2 minutes."
.into(),
),
]),
}
}

View File

@ -0,0 +1,37 @@
use std::collections::HashMap;
use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;
pub fn alert_high_memory_usage() -> PrometheusAlertRule {
PrometheusAlertRule {
alert: "HighMemoryUsage".into(),
expr: "container_memory_working_set_bytes{container!=\"\",namespace!=\"\"} > 500000000"
.into(),
r#for: Some("2m".into()),
labels: HashMap::from([("severity".into(), "warning".into())]),
annotations: HashMap::from([
("summary".into(), "Pod is using high memory".into()),
(
"description".into(),
"A pod is consuming more than 500Mi of memory.".into(),
),
]),
}
}
pub fn alert_high_cpu_usage() -> PrometheusAlertRule {
PrometheusAlertRule {
alert: "HighCPUUsage".into(),
expr: "rate(container_cpu_usage_seconds_total{container!=\"\",namespace!=\"\"}[1m]) > 0.9"
.into(),
r#for: Some("1m".into()),
labels: HashMap::from([("severity".into(), "warning".into())]),
annotations: HashMap::from([
("summary".into(), "Pod is using high CPU".into()),
(
"description".into(),
"A pod is using more than 90% of a core over 1 minute.".into(),
),
]),
}
}

View File

@ -1 +1,5 @@
pub mod deployment;
pub mod memory_usage;
pub mod pod;
pub mod pvc; pub mod pvc;
pub mod service;

View File

@ -0,0 +1,55 @@
use std::collections::HashMap;
use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;
pub fn pod_failed() -> PrometheusAlertRule {
PrometheusAlertRule {
alert: "PodFailed".into(),
expr: "kube_pod_status_phase{phase=\"Failed\"} > 2".into(),
r#for: Some("2m".into()),
labels: HashMap::from([("severity".into(), "critical".into())]),
annotations: HashMap::from([
("summary".into(), "A pod has failed".into()),
(
"description".into(),
"One or more pods are in Failed phase.".into(),
),
]),
}
}
pub fn alert_container_restarting() -> PrometheusAlertRule {
PrometheusAlertRule {
alert: "ContainerRestarting".into(),
expr: "increase(kube_pod_container_status_restarts_total[5m]) > 3".into(),
r#for: Some("5m".into()),
labels: HashMap::from([("severity".into(), "warning".into())]),
annotations: HashMap::from([
(
"summary".into(),
"Container is restarting frequently".into(),
),
(
"description".into(),
"A container in this namespace has restarted more than 3 times in 5 minutes."
.into(),
),
]),
}
}
pub fn alert_pod_not_ready() -> PrometheusAlertRule {
PrometheusAlertRule {
alert: "PodNotReady".into(),
expr: "kube_pod_status_ready{condition=\"true\"} == 0".into(),
r#for: Some("2m".into()),
labels: HashMap::from([("severity".into(), "warning".into())]),
annotations: HashMap::from([
("summary".into(), "Pod is not ready".into()),
(
"description".into(),
"A pod in the namespace is not reporting Ready status.".into(),
),
]),
}
}

View File

@ -0,0 +1,19 @@
use std::collections::HashMap;
use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;
pub fn alert_service_down() -> PrometheusAlertRule {
PrometheusAlertRule {
alert: "ServiceDown".into(),
expr: "up == 0".into(),
r#for: Some("1m".into()),
labels: HashMap::from([("severity".into(), "critical".into())]),
annotations: HashMap::from([
("summary".into(), "Service is down".into()),
(
"description".into(),
"A target service in the namespace is not responding to Prometheus scrapes.".into(),
),
]),
}
}

View File

@ -0,0 +1,569 @@
use std::fs;
use std::{collections::BTreeMap, sync::Arc};
use tempfile::tempdir;
use async_trait::async_trait;
use kube::api::ObjectMeta;
use log::{debug, info};
use serde::Serialize;
use std::process::Command;
use crate::modules::monitoring::kube_prometheus::crd::crd_alertmanager_config::CRDPrometheus;
use crate::modules::monitoring::kube_prometheus::crd::crd_default_rules::build_default_application_rules;
use crate::modules::monitoring::kube_prometheus::crd::crd_grafana::{
Grafana, GrafanaDashboard, GrafanaDashboardSpec, GrafanaDatasource, GrafanaDatasourceConfig,
GrafanaDatasourceSpec, GrafanaSpec,
};
use crate::modules::monitoring::kube_prometheus::crd::crd_prometheus_rules::{
PrometheusRule, PrometheusRuleSpec, RuleGroup,
};
use crate::modules::monitoring::kube_prometheus::crd::grafana_default_dashboard::build_default_dashboard;
use crate::modules::monitoring::kube_prometheus::crd::service_monitor::{
ServiceMonitor, ServiceMonitorSpec,
};
use crate::topology::oberservability::monitoring::AlertReceiver;
use crate::topology::{K8sclient, Topology, k8s::K8sClient};
use crate::{
data::{Id, Version},
interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome},
inventory::Inventory,
modules::monitoring::kube_prometheus::crd::{
crd_alertmanagers::{Alertmanager, AlertmanagerSpec},
crd_prometheuses::{
AlertmanagerEndpoints, LabelSelector, Prometheus, PrometheusSpec,
PrometheusSpecAlerting,
},
role::{build_prom_role, build_prom_rolebinding, build_prom_service_account},
},
score::Score,
};
use super::prometheus::PrometheusApplicationMonitoring;
#[derive(Clone, Debug, Serialize)]
pub struct K8sPrometheusCRDAlertingScore {
pub sender: CRDPrometheus,
pub receivers: Vec<Box<dyn AlertReceiver<CRDPrometheus>>>,
pub service_monitors: Vec<ServiceMonitor>,
pub prometheus_rules: Vec<RuleGroup>,
}
impl<T: Topology + K8sclient + PrometheusApplicationMonitoring<CRDPrometheus>> Score<T>
for K8sPrometheusCRDAlertingScore
{
fn create_interpret(&self) -> Box<dyn crate::interpret::Interpret<T>> {
Box::new(K8sPrometheusCRDAlertingInterpret {
sender: self.sender.clone(),
receivers: self.receivers.clone(),
service_monitors: self.service_monitors.clone(),
prometheus_rules: self.prometheus_rules.clone(),
})
}
fn name(&self) -> String {
"CRDApplicationAlertingScore".into()
}
}
#[derive(Clone, Debug)]
pub struct K8sPrometheusCRDAlertingInterpret {
pub sender: CRDPrometheus,
pub receivers: Vec<Box<dyn AlertReceiver<CRDPrometheus>>>,
pub service_monitors: Vec<ServiceMonitor>,
pub prometheus_rules: Vec<RuleGroup>,
}
#[async_trait]
impl<T: Topology + K8sclient + PrometheusApplicationMonitoring<CRDPrometheus>> Interpret<T>
for K8sPrometheusCRDAlertingInterpret
{
async fn execute(
&self,
_inventory: &Inventory,
topology: &T,
) -> Result<Outcome, InterpretError> {
let client = topology.k8s_client().await.unwrap();
self.ensure_grafana_operator().await?;
self.install_prometheus(&client).await?;
self.install_alert_manager(&client).await?;
self.install_client_kube_metrics().await?;
self.install_grafana(&client).await?;
self.install_receivers(&self.sender, &self.receivers)
.await?;
self.install_rules(&self.prometheus_rules, &client).await?;
self.install_monitors(self.service_monitors.clone(), &client)
.await?;
Ok(Outcome::success(format!(
"deployed application monitoring composants"
)))
}
fn get_name(&self) -> InterpretName {
todo!()
}
fn get_version(&self) -> Version {
todo!()
}
fn get_status(&self) -> InterpretStatus {
todo!()
}
fn get_children(&self) -> Vec<Id> {
todo!()
}
}
impl K8sPrometheusCRDAlertingInterpret {
async fn crd_exists(&self, crd: &str) -> bool {
let status = Command::new("sh")
.args(["-c", "kubectl get crd -A | grep -i", crd])
.status()
.map_err(|e| InterpretError::new(format!("could not connect to cluster: {}", e)))
.unwrap();
status.success()
}
async fn install_chart(
&self,
chart_path: String,
chart_name: String,
) -> Result<(), InterpretError> {
let temp_dir =
tempdir().map_err(|e| InterpretError::new(format!("Tempdir error: {}", e)))?;
let temp_path = temp_dir.path().to_path_buf();
debug!("Using temp directory: {}", temp_path.display());
let chart = format!("{}/{}", chart_path, chart_name);
let pull_output = Command::new("helm")
.args(["pull", &chart, "--destination", temp_path.to_str().unwrap()])
.output()
.map_err(|e| InterpretError::new(format!("Helm pull error: {}", e)))?;
if !pull_output.status.success() {
return Err(InterpretError::new(format!(
"Helm pull failed: {}",
String::from_utf8_lossy(&pull_output.stderr)
)));
}
let tgz_path = fs::read_dir(&temp_path)
.unwrap()
.filter_map(|entry| {
let entry = entry.ok()?;
let path = entry.path();
if path.extension()? == "tgz" {
Some(path)
} else {
None
}
})
.next()
.ok_or_else(|| InterpretError::new("Could not find pulled Helm chart".into()))?;
debug!("Installing chart from: {}", tgz_path.display());
let install_output = Command::new("helm")
.args([
"install",
&chart_name,
tgz_path.to_str().unwrap(),
"--namespace",
&self.sender.namespace.clone(),
"--create-namespace",
"--wait",
"--atomic",
])
.output()
.map_err(|e| InterpretError::new(format!("Helm install error: {}", e)))?;
if !install_output.status.success() {
return Err(InterpretError::new(format!(
"Helm install failed: {}",
String::from_utf8_lossy(&install_output.stderr)
)));
}
debug!(
"Installed chart {}/{} in namespace: {}",
&chart_path,
&chart_name,
self.sender.namespace.clone()
);
Ok(())
}
async fn ensure_grafana_operator(&self) -> Result<Outcome, InterpretError> {
if self.crd_exists("grafanas.grafana.integreatly.org").await {
debug!("grafana CRDs already exist — skipping install.");
return Ok(Outcome::success("Grafana CRDs already exist".to_string()));
}
let _ = Command::new("helm")
.args([
"repo",
"add",
"grafana-operator",
"https://grafana.github.io/helm-charts",
])
.output()
.unwrap();
let _ = Command::new("helm")
.args(["repo", "update"])
.output()
.unwrap();
let output = Command::new("helm")
.args([
"install",
"grafana-operator",
"grafana-operator/grafana-operator",
"--namespace",
&self.sender.namespace.clone(),
"--create-namespace",
"--set",
"namespaceScope=true",
])
.output()
.unwrap();
if !output.status.success() {
return Err(InterpretError::new(format!(
"helm install failed:\nstdout: {}\nstderr: {}",
String::from_utf8_lossy(&output.stdout),
String::from_utf8_lossy(&output.stderr)
)));
}
Ok(Outcome::success(format!(
"installed grafana operator in ns {}",
self.sender.namespace.clone()
)))
}
async fn install_prometheus(&self, client: &Arc<K8sClient>) -> Result<Outcome, InterpretError> {
debug!(
"installing crd-prometheuses in namespace {}",
self.sender.namespace.clone()
);
debug!("building role/rolebinding/serviceaccount for crd-prometheus");
let rolename = format!("{}-prom", self.sender.namespace.clone());
let sa_name = format!("{}-prom-sa", self.sender.namespace.clone());
let role = build_prom_role(rolename.clone(), self.sender.namespace.clone());
let rolebinding = build_prom_rolebinding(
rolename.clone(),
self.sender.namespace.clone(),
sa_name.clone(),
);
let sa = build_prom_service_account(sa_name.clone(), self.sender.namespace.clone());
let prom_spec = PrometheusSpec {
alerting: Some(PrometheusSpecAlerting {
alertmanagers: Some(vec![AlertmanagerEndpoints {
name: Some("alertmanager-operated".into()),
namespace: Some(self.sender.namespace.clone()),
port: Some("web".into()),
scheme: Some("http".into()),
}]),
}),
service_account_name: sa_name.clone(),
service_monitor_namespace_selector: Some(LabelSelector {
match_labels: BTreeMap::from([(
"kubernetes.io/metadata.name".to_string(),
self.sender.namespace.clone(),
)]),
match_expressions: vec![],
}),
service_monitor_selector: Some(LabelSelector {
match_labels: BTreeMap::from([("client".to_string(), "prometheus".to_string())]),
..Default::default()
}),
service_discovery_role: Some("Endpoints".into()),
pod_monitor_selector: Some(LabelSelector {
match_labels: BTreeMap::from([("client".to_string(), "prometheus".to_string())]),
..Default::default()
}),
rule_selector: Some(LabelSelector {
match_labels: BTreeMap::from([("role".to_string(), "prometheus-rule".to_string())]),
..Default::default()
}),
rule_namespace_selector: Some(LabelSelector {
match_labels: BTreeMap::from([(
"kubernetes.io/metadata.name".to_string(),
self.sender.namespace.clone(),
)]),
match_expressions: vec![],
}),
};
let prom = Prometheus {
metadata: ObjectMeta {
name: Some(self.sender.namespace.clone()),
labels: Some(std::collections::BTreeMap::from([
("alertmanagerConfig".to_string(), "enabled".to_string()),
("client".to_string(), "prometheus".to_string()),
])),
namespace: Some(self.sender.namespace.clone()),
..Default::default()
},
spec: prom_spec,
};
client
.apply(&role, Some(&self.sender.namespace.clone()))
.await
.map_err(|e| InterpretError::new(e.to_string()))?;
info!(
"installed prometheus role: {:#?} in ns {:#?}",
role.metadata.name.unwrap(),
role.metadata.namespace.unwrap()
);
client
.apply(&rolebinding, Some(&self.sender.namespace.clone()))
.await
.map_err(|e| InterpretError::new(e.to_string()))?;
info!(
"installed prometheus rolebinding: {:#?} in ns {:#?}",
rolebinding.metadata.name.unwrap(),
rolebinding.metadata.namespace.unwrap()
);
client
.apply(&sa, Some(&self.sender.namespace.clone()))
.await
.map_err(|e| InterpretError::new(e.to_string()))?;
info!(
"installed prometheus service account: {:#?} in ns {:#?}",
sa.metadata.name.unwrap(),
sa.metadata.namespace.unwrap()
);
client
.apply(&prom, Some(&self.sender.namespace.clone()))
.await
.map_err(|e| InterpretError::new(e.to_string()))?;
info!(
"installed prometheus: {:#?} in ns {:#?}",
&prom.metadata.name.clone().unwrap(),
&prom.metadata.namespace.clone().unwrap()
);
Ok(Outcome::success(format!(
"successfully deployed crd-prometheus {:#?}",
prom
)))
}
async fn install_alert_manager(
&self,
client: &Arc<K8sClient>,
) -> Result<Outcome, InterpretError> {
let am = Alertmanager {
metadata: ObjectMeta {
name: Some(self.sender.namespace.clone()),
labels: Some(std::collections::BTreeMap::from([(
"alertmanagerConfig".to_string(),
"enabled".to_string(),
)])),
namespace: Some(self.sender.namespace.clone()),
..Default::default()
},
spec: AlertmanagerSpec::default(),
};
client
.apply(&am, Some(&self.sender.namespace.clone()))
.await
.map_err(|e| InterpretError::new(e.to_string()))?;
Ok(Outcome::success(format!(
"successfully deployed service monitor {:#?}",
am.metadata.name
)))
}
async fn install_monitors(
&self,
mut monitors: Vec<ServiceMonitor>,
client: &Arc<K8sClient>,
) -> Result<Outcome, InterpretError> {
let default_service_monitor = ServiceMonitor {
metadata: ObjectMeta {
name: Some(self.sender.namespace.clone()),
labels: Some(std::collections::BTreeMap::from([
("alertmanagerConfig".to_string(), "enabled".to_string()),
("client".to_string(), "prometheus".to_string()),
(
"app.kubernetes.io/name".to_string(),
"kube-state-metrics".to_string(),
),
])),
namespace: Some(self.sender.namespace.clone()),
..Default::default()
},
spec: ServiceMonitorSpec::default(),
};
monitors.push(default_service_monitor);
for monitor in monitors.iter() {
client
.apply(monitor, Some(&self.sender.namespace.clone()))
.await
.map_err(|e| InterpretError::new(e.to_string()))?;
}
Ok(Outcome::success(
"succesfully deployed service monitors".to_string(),
))
}
async fn install_rules(
&self,
rules: &Vec<RuleGroup>,
client: &Arc<K8sClient>,
) -> Result<Outcome, InterpretError> {
let mut prom_rule_spec = PrometheusRuleSpec {
groups: rules.clone(),
};
let default_rules_group = RuleGroup {
name: format!("default-rules"),
rules: build_default_application_rules(),
};
prom_rule_spec.groups.push(default_rules_group);
let prom_rules = PrometheusRule {
metadata: ObjectMeta {
name: Some(self.sender.namespace.clone()),
labels: Some(std::collections::BTreeMap::from([
("alertmanagerConfig".to_string(), "enabled".to_string()),
("role".to_string(), "prometheus-rule".to_string()),
])),
namespace: Some(self.sender.namespace.clone()),
..Default::default()
},
spec: prom_rule_spec,
};
client
.apply(&prom_rules, Some(&self.sender.namespace.clone()))
.await
.map_err(|e| InterpretError::new(e.to_string()))?;
Ok(Outcome::success(format!(
"successfully deployed rules {:#?}",
prom_rules.metadata.name
)))
}
async fn install_client_kube_metrics(&self) -> Result<Outcome, InterpretError> {
self.install_chart(
"oci://hub.nationtech.io/harmony".to_string(),
"nt-kube-metrics".to_string(),
)
.await?;
Ok(Outcome::success(format!(
"Installed client kube metrics in ns {}",
&self.sender.namespace.clone()
)))
}
async fn install_grafana(&self, client: &Arc<K8sClient>) -> Result<Outcome, InterpretError> {
let mut label = BTreeMap::new();
label.insert("dashboards".to_string(), "grafana".to_string());
let labels = LabelSelector {
match_labels: label.clone(),
match_expressions: vec![],
};
let mut json_data = BTreeMap::new();
json_data.insert("timeInterval".to_string(), "5s".to_string());
let namespace = self.sender.namespace.clone();
let json = build_default_dashboard(&namespace);
let graf_data_source = GrafanaDatasource {
metadata: ObjectMeta {
name: Some(format!(
"grafana-datasource-{}",
self.sender.namespace.clone()
)),
namespace: Some(self.sender.namespace.clone()),
..Default::default()
},
spec: GrafanaDatasourceSpec {
instance_selector: labels.clone(),
allow_cross_namespace_import: Some(false),
datasource: GrafanaDatasourceConfig {
access: "proxy".to_string(),
database: Some("prometheus".to_string()),
json_data: Some(json_data),
//this is fragile
name: format!("prometheus-{}-0", self.sender.namespace.clone()),
r#type: "prometheus".to_string(),
url: format!(
"http://prometheus-operated.{}.svc.cluster.local:9090",
self.sender.namespace.clone()
),
},
},
};
client
.apply(&graf_data_source, Some(&self.sender.namespace.clone()))
.await
.map_err(|e| InterpretError::new(e.to_string()))?;
let graf_dashboard = GrafanaDashboard {
metadata: ObjectMeta {
name: Some(format!(
"grafana-dashboard-{}",
self.sender.namespace.clone()
)),
namespace: Some(self.sender.namespace.clone()),
..Default::default()
},
spec: GrafanaDashboardSpec {
resync_period: Some("30s".to_string()),
instance_selector: labels.clone(),
json,
},
};
client
.apply(&graf_dashboard, Some(&self.sender.namespace.clone()))
.await
.map_err(|e| InterpretError::new(e.to_string()))?;
let grafana = Grafana {
metadata: ObjectMeta {
name: Some(format!("grafana-{}", self.sender.namespace.clone())),
namespace: Some(self.sender.namespace.clone()),
labels: Some(label.clone()),
..Default::default()
},
spec: GrafanaSpec {
config: None,
admin_user: None,
admin_password: None,
ingress: None,
persistence: None,
resources: None,
},
};
client
.apply(&grafana, Some(&self.sender.namespace.clone()))
.await
.map_err(|e| InterpretError::new(e.to_string()))?;
Ok(Outcome::success(format!(
"successfully deployed grafana instance {:#?}",
grafana.metadata.name
)))
}
async fn install_receivers(
&self,
sender: &CRDPrometheus,
receivers: &Vec<Box<dyn AlertReceiver<CRDPrometheus>>>,
) -> Result<Outcome, InterpretError> {
for receiver in receivers.iter() {
receiver.install(sender).await.map_err(|err| {
InterpretError::new(format!("failed to install receiver: {}", err))
})?;
}
Ok(Outcome::success("successfully deployed receivers".into()))
}
}

View File

@ -1 +1,3 @@
pub mod alerts; pub mod alerts;
pub mod k8s_prometheus_alerting_score;
pub mod prometheus;

View File

@ -0,0 +1,17 @@
use async_trait::async_trait;
use crate::{
interpret::{InterpretError, Outcome},
inventory::Inventory,
topology::oberservability::monitoring::{AlertReceiver, AlertSender},
};
#[async_trait]
pub trait PrometheusApplicationMonitoring<S: AlertSender> {
async fn install_prometheus(
&self,
sender: &S,
inventory: &Inventory,
receivers: Option<Vec<Box<dyn AlertReceiver<S>>>>,
) -> Result<Outcome, InterpretError>;
}