Compare commits

...

22 Commits

Author SHA1 Message Date
Ian Letourneau
5cc93d3107 use new harmony_cli::run
All checks were successful
Run Check Script / check (pull_request) Successful in -44s
2025-08-04 17:22:08 -04:00
Ian Letourneau
569839bf66 Merge branch 'master' into feat/crd-alertmanager-configs 2025-08-04 17:04:50 -04:00
Ian Letourneau
e078f5c062 quick cleanup
All checks were successful
Run Check Script / check (pull_request) Successful in -44s
2025-08-04 15:28:01 -04:00
Ian Letourneau
a8394cda47 simpler check to see if a crd exists + cleanup
All checks were successful
Run Check Script / check (pull_request) Successful in -41s
2025-08-02 11:58:39 -04:00
Ian Letourneau
064f6d88ba quick cleanup
All checks were successful
Run Check Script / check (pull_request) Successful in -42s
2025-08-02 11:10:50 -04:00
Ian Letourneau
9403581be5 fix check to ensure prometheus operator is installed 2025-08-02 11:10:20 -04:00
Ian Letourneau
056152a1e5 remove comment
All checks were successful
Run Check Script / check (pull_request) Successful in -40s
2025-08-01 23:52:26 -04:00
Ian Letourneau
c6b255d0bd merge configure_receiver with AlertReceiver::install & cleanup unused stuff
All checks were successful
Run Check Script / check (pull_request) Successful in -39s
2025-08-01 23:09:12 -04:00
Ian Letourneau
4b6bebcaf1 remove unnecessary configure_receivers method from trait 2025-08-01 18:26:05 -04:00
Ian Letourneau
961a300154 cleanup unused k3d prometheus monitoring score & simplify design 2025-08-01 17:59:18 -04:00
a5deda647b wip: need to convert the generic type AlertReceiver<CRDPrometheus> to CRDAlertManagerReceiver in k8sAnywhereTopology which extends AlertReceiver<CRDPrometheus> in order to be able to configure and install the receiver and its associated crd-alertmanagerconfigs to the cluster 2025-07-31 16:17:30 -04:00
0b965b6570 Merge remote-tracking branch 'origin/master' into feat/crd-alertmanager-configs
All checks were successful
Run Check Script / check (pull_request) Successful in -37s
2025-07-28 15:22:24 -04:00
d7bce37b69 fix: cargo fmt
All checks were successful
Run Check Script / check (pull_request) Successful in -37s
2025-07-28 15:18:46 -04:00
b56a30de3c fix: prometheus operator and grafana operator deploy application namespace on local k3d
Some checks failed
Run Check Script / check (pull_request) Failing after -1m5s
if kube-prometheus-operator is present installation of prometheus-operator will skip
outside of local k3d installation installation of operator is skipped
2025-07-28 15:15:10 -04:00
b9e208f4cf feat: added default prometheus rules and grafana dashboard for application monitoring
All checks were successful
Run Check Script / check (pull_request) Successful in -32s
2025-07-22 13:26:03 -04:00
1d8b503bd2 Xwip: uses a helm chart to deploy a prometheus operator if crd are ont present in cluster, and deploys a grafana operator.
All checks were successful
Run Check Script / check (pull_request) Successful in -32s
added a sample dashboard and prometheus data source to grafana
2025-07-21 17:59:35 -04:00
114219385f wip:added impl for prometheuses, alertmanagers, prometheusrules, servicemonitors, and some default rules that are deployed for application monitor
All checks were successful
Run Check Script / check (pull_request) Successful in 2m19s
working on implementing grafana crds via grafana operator
need to link prometheus rules and alert managers in prometheus, testing it shows that prometheus isnt detecting them automatically
2025-07-16 15:56:00 -04:00
1525ac2226 fix: git conflict
All checks were successful
Run Check Script / check (pull_request) Successful in -19s
2025-07-14 14:34:53 -04:00
55a4e79ec4 fix: added updated Cargo
All checks were successful
Run Check Script / check (pull_request) Successful in 1m52s
2025-07-14 14:18:32 -04:00
7b91088828 feat: added impl for webhook receiver for crd alertmanagerconfigs
Some checks failed
Run Check Script / check (pull_request) Failing after 49s
2025-07-14 13:41:48 -04:00
e61ec015ab feat: added impl for Discordwebhook receiver to receive application alerts from namespaces from application feature
Some checks failed
Run Check Script / check (pull_request) Failing after 49s
2025-07-14 13:06:47 -04:00
819f4a32fd wip: added an implementation of CRDalertmanagerconfigs that can be used to add a discord webhook receiver, currently the namespace is hard coded and there are a bunch of todos!() that need to be cleaned up, and flags need to be added so that alertmanager will automatically register the crd 2025-07-11 16:01:52 -04:00
43 changed files with 2292 additions and 177 deletions

86
Cargo.lock generated
View File

@ -96,6 +96,12 @@ dependencies = [
"libc",
]
[[package]]
name = "ansi_term"
version = "0.10.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6b3568b48b7cefa6b8ce125f9bb4989e52fbcc29ebea88df04cc7c5f12f70455"
[[package]]
name = "anstream"
version = "0.6.19"
@ -1259,6 +1265,18 @@ dependencies = [
name = "example"
version = "0.0.0"
[[package]]
name = "example-application-monitoring-with-tenant"
version = "0.1.0"
dependencies = [
"env_logger",
"harmony",
"harmony_cli",
"logging",
"tokio",
"url",
]
[[package]]
name = "example-cli"
version = "0.1.0"
@ -1779,6 +1797,7 @@ dependencies = [
"k3d-rs",
"k8s-openapi",
"kube",
"kube-derive",
"lazy_static",
"libredfish",
"log",
@ -1791,6 +1810,7 @@ dependencies = [
"reqwest 0.11.27",
"russh",
"rust-ipmi",
"schemars 0.8.22",
"semver",
"serde",
"serde-value",
@ -2669,6 +2689,7 @@ dependencies = [
"k8s-openapi",
"kube-client",
"kube-core",
"kube-derive",
"kube-runtime",
]
@ -2722,12 +2743,27 @@ dependencies = [
"http 1.3.1",
"json-patch",
"k8s-openapi",
"schemars 0.8.22",
"serde",
"serde-value",
"serde_json",
"thiserror 2.0.12",
]
[[package]]
name = "kube-derive"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "079fc8c1c397538628309cfdee20696ebdcc26745f9fb17f89b78782205bd995"
dependencies = [
"darling",
"proc-macro2",
"quote",
"serde",
"serde_json",
"syn",
]
[[package]]
name = "kube-runtime"
version = "1.1.0"
@ -2843,6 +2879,15 @@ dependencies = [
"log",
]
[[package]]
name = "logging"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "461a8beca676e8ab1bd468c92e9b4436d6368e11e96ae038209e520cfe665e46"
dependencies = [
"ansi_term",
]
[[package]]
name = "lru"
version = "0.12.5"
@ -4140,6 +4185,18 @@ dependencies = [
"windows-sys 0.59.0",
]
[[package]]
name = "schemars"
version = "0.8.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3fbf2ae1b8bc8e02df939598064d22402220cd5bbcca1c76f7d6a310974d5615"
dependencies = [
"dyn-clone",
"schemars_derive",
"serde",
"serde_json",
]
[[package]]
name = "schemars"
version = "0.9.0"
@ -4154,9 +4211,9 @@ dependencies = [
[[package]]
name = "schemars"
version = "1.0.3"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1375ba8ef45a6f15d83fa8748f1079428295d403d6ea991d09ab100155fbc06d"
checksum = "82d20c4491bc164fa2f6c5d44565947a52ad80b9505d8e36f8d54c27c739fcd0"
dependencies = [
"dyn-clone",
"ref-cast",
@ -4164,6 +4221,18 @@ dependencies = [
"serde_json",
]
[[package]]
name = "schemars_derive"
version = "0.8.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32e265784ad618884abaea0600a9adf15393368d840e0222d101a072f3f7534d"
dependencies = [
"proc-macro2",
"quote",
"serde_derive_internals",
"syn",
]
[[package]]
name = "scopeguard"
version = "1.2.0"
@ -4296,6 +4365,17 @@ dependencies = [
"syn",
]
[[package]]
name = "serde_derive_internals"
version = "0.29.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "serde_json"
version = "1.0.140"
@ -4374,7 +4454,7 @@ dependencies = [
"indexmap 1.9.3",
"indexmap 2.10.0",
"schemars 0.9.0",
"schemars 1.0.3",
"schemars 1.0.4",
"serde",
"serde_derive",
"serde_json",

View File

@ -0,0 +1,14 @@
[package]
name = "example-application-monitoring-with-tenant"
edition = "2024"
version.workspace = true
readme.workspace = true
license.workspace = true
[dependencies]
env_logger.workspace = true
harmony = { version = "0.1.0", path = "../../harmony" }
harmony_cli = { version = "0.1.0", path = "../../harmony_cli" }
logging = "0.1.0"
tokio.workspace = true
url.workspace = true

View File

@ -0,0 +1,61 @@
use std::{path::PathBuf, sync::Arc};
use harmony::{
data::Id,
inventory::Inventory,
maestro::Maestro,
modules::{
application::{
ApplicationScore, RustWebFramework, RustWebapp,
features::{ContinuousDelivery, Monitoring},
},
monitoring::alert_channel::{
discord_alert_channel::DiscordWebhook, webhook_receiver::WebhookReceiver,
},
tenant::TenantScore,
},
topology::{K8sAnywhereTopology, Url, tenant::TenantConfig},
};
#[tokio::main]
async fn main() {
//TODO there is a bug where the application is deployed into the namespace matching the
//application name and the tenant is created in the namesapce matching the tenant name
//in order for the application to be deployed in the tenant namespace the application.name and
//the TenantConfig.name must match
let tenant = TenantScore {
config: TenantConfig {
id: Id::from_str("test-tenant-id"),
name: "example-monitoring".to_string(),
..Default::default()
},
};
let application = Arc::new(RustWebapp {
name: "example-monitoring".to_string(),
domain: Url::Url(url::Url::parse("https://rustapp.harmony.example.com").unwrap()),
project_root: PathBuf::from("./examples/rust/webapp"),
framework: Some(RustWebFramework::Leptos),
});
let webhook_receiver = WebhookReceiver {
name: "sample-webhook-receiver".to_string(),
url: Url::Url(url::Url::parse("https://webhook-doesnt-exist.com").unwrap()),
};
let app = ApplicationScore {
features: vec![Box::new(Monitoring {
alert_receiver: vec![Box::new(webhook_receiver)],
application: application.clone(),
})],
application,
};
harmony_cli::run(
Inventory::autoload(),
K8sAnywhereTopology::from_env(),
vec![Box::new(tenant), Box::new(app)],
None,
)
.await
.unwrap();
}

View File

@ -50,8 +50,8 @@ async fn main() {
let service_monitor_endpoint = ServiceMonitorEndpoint {
port: Some("80".to_string()),
path: "/metrics".to_string(),
scheme: HTTPScheme::HTTP,
path: Some("/metrics".to_string()),
scheme: Some(HTTPScheme::HTTP),
..Default::default()
};

View File

@ -53,8 +53,8 @@ async fn main() {
let service_monitor_endpoint = ServiceMonitorEndpoint {
port: Some("80".to_string()),
path: "/metrics".to_string(),
scheme: HTTPScheme::HTTP,
path: Some("/metrics".to_string()),
scheme: Some(HTTPScheme::HTTP),
..Default::default()
};

View File

@ -2,10 +2,15 @@ use std::{path::PathBuf, sync::Arc};
use harmony::{
inventory::Inventory,
modules::application::{
modules::{
application::{
ApplicationScore, RustWebFramework, RustWebapp,
features::{ContinuousDelivery, Monitoring},
},
monitoring::alert_channel::{
discord_alert_channel::DiscordWebhook, webhook_receiver::WebhookReceiver,
},
},
topology::{K8sAnywhereTopology, Url},
};
@ -18,6 +23,16 @@ async fn main() {
framework: Some(RustWebFramework::Leptos),
});
let discord_receiver = DiscordWebhook {
name: "test-discord".to_string(),
url: Url::Url(url::Url::parse("https://discord.doesnt.exist.com").unwrap()),
};
let webhook_receiver = WebhookReceiver {
name: "sample-webhook-receiver".to_string(),
url: Url::Url(url::Url::parse("https://webhook-doesnt-exist.com").unwrap()),
};
let app = ApplicationScore {
features: vec![
Box::new(ContinuousDelivery {
@ -25,7 +40,9 @@ async fn main() {
}),
Box::new(Monitoring {
application: application.clone(),
}), // TODO: add backups, multisite ha, etc.
alert_receiver: vec![Box::new(discord_receiver), Box::new(webhook_receiver)],
}),
// TODO add backups, multisite ha, etc
],
application,
};

View File

@ -27,7 +27,7 @@ harmony_macros = { path = "../harmony_macros" }
harmony_types = { path = "../harmony_types" }
uuid.workspace = true
url.workspace = true
kube.workspace = true
kube = { workspace = true, features = ["derive"] }
k8s-openapi.workspace = true
serde_yaml.workspace = true
http.workspace = true
@ -58,6 +58,8 @@ tokio-util = "0.7.15"
strum = { version = "0.27.1", features = ["derive"] }
tempfile = "3.20.0"
serde_with = "3.14.0"
schemars = "0.8.22"
kube-derive = "1.1.0"
bollard.workspace = true
tar.workspace = true
base64.workspace = true

View File

@ -17,7 +17,7 @@ use kube::{
runtime::wait::await_condition,
};
use log::{debug, error, trace};
use serde::de::DeserializeOwned;
use serde::{Serialize, de::DeserializeOwned};
use similar::{DiffableStr, TextDiff};
#[derive(new, Clone)]
@ -25,6 +25,15 @@ pub struct K8sClient {
client: Client,
}
impl Serialize for K8sClient {
fn serialize<S>(&self, _serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
todo!()
}
}
impl std::fmt::Debug for K8sClient {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
// This is a poor man's debug implementation for now as kube::Client does not provide much

View File

@ -7,22 +7,33 @@ use tokio::sync::OnceCell;
use crate::{
executors::ExecutorError,
interpret::{InterpretError, Outcome},
interpret::{InterpretError, InterpretStatus, Outcome},
inventory::Inventory,
modules::k3d::K3DInstallationScore,
modules::{
k3d::K3DInstallationScore,
monitoring::kube_prometheus::crd::{
crd_alertmanager_config::CRDPrometheus,
prometheus_operator::prometheus_operator_helm_chart_score,
},
prometheus::{
k8s_prometheus_alerting_score::K8sPrometheusCRDAlertingScore,
prometheus::PrometheusApplicationMonitoring,
},
},
score::Score,
};
use super::{
DeploymentTarget, HelmCommand, K8sclient, MultiTargetTopology, Topology,
k8s::K8sClient,
oberservability::monitoring::AlertReceiver,
tenant::{TenantConfig, TenantManager, k8s::K8sTenantManager},
};
#[derive(Clone, Debug)]
struct K8sState {
client: Arc<K8sClient>,
_source: K8sSource,
source: K8sSource,
message: String,
}
@ -56,8 +67,32 @@ impl K8sclient for K8sAnywhereTopology {
}
}
#[async_trait]
impl PrometheusApplicationMonitoring<CRDPrometheus> for K8sAnywhereTopology {
async fn install_prometheus(
&self,
sender: &CRDPrometheus,
inventory: &Inventory,
receivers: Option<Vec<Box<dyn AlertReceiver<CRDPrometheus>>>>,
) -> Result<Outcome, InterpretError> {
let po_result = self.ensure_prometheus_operator(sender).await?;
if po_result.status == InterpretStatus::NOOP {
debug!("Skipping Prometheus CR installation due to missing operator.");
return Ok(Outcome::noop());
}
self.get_k8s_prometheus_application_score(sender.clone(), receivers)
.await
.create_interpret()
.execute(inventory, self)
.await?;
Ok(Outcome::success(format!("No action, working on cluster ")))
}
}
impl Serialize for K8sAnywhereTopology {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
fn serialize<S>(&self, _serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
@ -82,6 +117,19 @@ impl K8sAnywhereTopology {
}
}
async fn get_k8s_prometheus_application_score(
&self,
sender: CRDPrometheus,
receivers: Option<Vec<Box<dyn AlertReceiver<CRDPrometheus>>>>,
) -> K8sPrometheusCRDAlertingScore {
K8sPrometheusCRDAlertingScore {
sender,
receivers: receivers.unwrap_or_else(Vec::new),
service_monitors: vec![],
prometheus_rules: vec![],
}
}
fn is_helm_available(&self) -> Result<(), String> {
let version_result = Command::new("helm")
.arg("version")
@ -132,7 +180,7 @@ impl K8sAnywhereTopology {
Some(client) => {
return Ok(Some(K8sState {
client: Arc::new(client),
_source: K8sSource::Kubeconfig,
source: K8sSource::Kubeconfig,
message: format!("Loaded k8s client from kubeconfig {kubeconfig}"),
}));
}
@ -174,7 +222,7 @@ impl K8sAnywhereTopology {
let state = match k3d.get_client().await {
Ok(client) => K8sState {
client: Arc::new(K8sClient::new(client)),
_source: K8sSource::LocalK3d,
source: K8sSource::LocalK3d,
message: "K8s client ready".to_string(),
},
Err(_) => todo!(),
@ -190,6 +238,7 @@ impl K8sAnywhereTopology {
self.tenant_manager
.get_or_try_init(async || -> Result<K8sTenantManager, String> {
// TOOD: checker si K8s ou K3d/s tenant manager (ref. issue https://git.nationtech.io/NationTech/harmony/issues/94)
let k8s_client = self.k8s_client().await?;
Ok(K8sTenantManager::new(k8s_client))
})
@ -206,6 +255,48 @@ impl K8sAnywhereTopology {
)),
}
}
async fn ensure_prometheus_operator(
&self,
sender: &CRDPrometheus,
) -> Result<Outcome, InterpretError> {
let status = Command::new("sh")
.args(["-c", "kubectl get crd -A | grep -i prometheuses"])
.status()
.map_err(|e| InterpretError::new(format!("could not connect to cluster: {}", e)))?;
if !status.success() {
if let Some(Some(k8s_state)) = self.k8s_state.get() {
match k8s_state.source {
K8sSource::LocalK3d => {
debug!("installing prometheus operator");
let op_score =
prometheus_operator_helm_chart_score(sender.namespace.clone());
op_score
.create_interpret()
.execute(&Inventory::empty(), self)
.await?;
return Ok(Outcome::success(
"installed prometheus operator".to_string(),
));
}
K8sSource::Kubeconfig => {
debug!("unable to install prometheus operator, contact cluster admin");
return Ok(Outcome::noop());
}
}
} else {
warn!("Unable to detect k8s_state. Skipping Prometheus Operator install.");
return Ok(Outcome::noop());
}
}
debug!("Prometheus operator is already present, skipping install");
Ok(Outcome::success(
"prometheus operator present in cluster".to_string(),
))
}
}
#[derive(Clone, Debug)]

View File

@ -1,3 +1,5 @@
use std::any::Any;
use async_trait::async_trait;
use log::debug;
@ -62,7 +64,9 @@ impl<S: AlertSender + Installable<T>, T: Topology> Interpret<T> for AlertingInte
#[async_trait]
pub trait AlertReceiver<S: AlertSender>: std::fmt::Debug + Send + Sync {
async fn install(&self, sender: &S) -> Result<Outcome, InterpretError>;
fn name(&self) -> String;
fn clone_box(&self) -> Box<dyn AlertReceiver<S>>;
fn as_any(&self) -> &dyn Any;
}
#[async_trait]
@ -72,6 +76,6 @@ pub trait AlertRule<S: AlertSender>: std::fmt::Debug + Send + Sync {
}
#[async_trait]
pub trait ScrapeTarger<S: AlertSender> {
pub trait ScrapeTarget<S: AlertSender> {
async fn install(&self, sender: &S) -> Result<(), InterpretError>;
}

View File

@ -231,8 +231,13 @@ impl K8sTenantManager {
{
"to": [
{
//TODO this ip is from the docker network that k3d is running on
//since k3d does not deploy kube-api-server as a pod it needs to ahve the ip
//address opened up
//need to find a way to automatically detect the ip address from the docker
//network
"ipBlock": {
"cidr": "172.23.0.0/16",
"cidr": "172.24.0.0/16",
}
}
]

View File

@ -1,45 +1,60 @@
use std::sync::Arc;
use async_trait::async_trait;
use base64::{Engine as _, engine::general_purpose};
use log::{debug, info};
use crate::modules::application::{Application, ApplicationFeature};
use crate::modules::monitoring::application_monitoring::application_monitoring_score::ApplicationMonitoringScore;
use crate::modules::monitoring::kube_prometheus::crd::crd_alertmanager_config::CRDPrometheus;
use crate::{
inventory::Inventory,
modules::{
application::{ApplicationFeature, OCICompliant},
monitoring::{
alert_channel::webhook_receiver::WebhookReceiver,
kube_prometheus::{
helm_prometheus_alert_score::HelmPrometheusAlertingScore,
types::{NamespaceSelector, ServiceMonitor},
},
ntfy::ntfy::NtfyScore,
},
modules::monitoring::{
alert_channel::webhook_receiver::WebhookReceiver, ntfy::ntfy::NtfyScore,
},
score::Score,
topology::{HelmCommand, K8sclient, Topology, Url, tenant::TenantManager},
};
use crate::{
modules::prometheus::prometheus::PrometheusApplicationMonitoring,
topology::oberservability::monitoring::AlertReceiver,
};
use async_trait::async_trait;
use base64::{Engine as _, engine::general_purpose};
use log::{debug, info};
#[derive(Debug, Clone)]
pub struct Monitoring {
pub application: Arc<dyn OCICompliant>,
pub application: Arc<dyn Application>,
pub alert_receiver: Vec<Box<dyn AlertReceiver<CRDPrometheus>>>,
}
#[async_trait]
impl<T: Topology + HelmCommand + K8sclient + 'static + TenantManager> ApplicationFeature<T>
for Monitoring
impl<
T: Topology
+ HelmCommand
+ 'static
+ TenantManager
+ K8sclient
+ std::fmt::Debug
+ PrometheusApplicationMonitoring<CRDPrometheus>,
> ApplicationFeature<T> for Monitoring
{
async fn ensure_installed(&self, topology: &T) -> Result<(), String> {
info!("Ensuring monitoring is available for application");
let namespace = topology
.get_tenant_config()
.await
.map(|ns| ns.name.clone())
.unwrap_or_else(|| self.application.name());
let mut alerting_score = ApplicationMonitoringScore {
sender: CRDPrometheus {
namespace: namespace.clone(),
client: topology.k8s_client().await.unwrap(),
},
application: self.application.clone(),
receivers: self.alert_receiver.clone(),
};
let ntfy = NtfyScore {
// namespace: topology
// .get_tenant_config()
// .await
// .expect("couldn't get tenant config")
// .name,
namespace: self.application.name(),
namespace: namespace.clone(),
host: "localhost".to_string(),
};
ntfy.create_interpret()
@ -70,7 +85,7 @@ impl<T: Topology + HelmCommand + K8sclient + 'static + TenantManager> Applicatio
url::Url::parse(
format!(
"http://ntfy.{}.svc.cluster.local/rust-web-app?auth={ntfy_default_auth_param}",
self.application.name()
namespace.clone()
)
.as_str(),
)
@ -78,26 +93,7 @@ impl<T: Topology + HelmCommand + K8sclient + 'static + TenantManager> Applicatio
),
};
let mut service_monitor = ServiceMonitor::default();
service_monitor.namespace_selector = Some(NamespaceSelector {
any: true,
match_names: vec![],
});
service_monitor.name = "rust-webapp".to_string();
// let alerting_score = ApplicationPrometheusMonitoringScore {
// receivers: vec![Box::new(ntfy_receiver)],
// rules: vec![],
// service_monitors: vec![service_monitor],
// };
let alerting_score = HelmPrometheusAlertingScore {
receivers: vec![Box::new(ntfy_receiver)],
rules: vec![],
service_monitors: vec![service_monitor],
};
alerting_score.receivers.push(Box::new(ntfy_receiver));
alerting_score
.create_interpret()
.execute(&Inventory::empty(), topology)

View File

@ -10,6 +10,7 @@ pub use oci::*;
pub use rust::*;
use async_trait::async_trait;
use serde::Serialize;
use crate::{
data::{Id, Version},
@ -78,3 +79,12 @@ impl<A: Application, T: Topology + std::fmt::Debug> Interpret<T> for Application
todo!()
}
}
impl Serialize for dyn Application {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
todo!()
}
}

View File

@ -1,7 +1,16 @@
use std::any::Any;
use std::collections::BTreeMap;
use async_trait::async_trait;
use k8s_openapi::api::core::v1::Secret;
use kube::api::ObjectMeta;
use serde::Serialize;
use serde_json::json;
use serde_yaml::{Mapping, Value};
use crate::modules::monitoring::kube_prometheus::crd::crd_alertmanager_config::{
AlertmanagerConfig, AlertmanagerConfigSpec, CRDPrometheus,
};
use crate::{
interpret::{InterpretError, Outcome},
modules::monitoring::{
@ -20,14 +29,98 @@ pub struct DiscordWebhook {
pub url: Url,
}
#[async_trait]
impl AlertReceiver<CRDPrometheus> for DiscordWebhook {
async fn install(&self, sender: &CRDPrometheus) -> Result<Outcome, InterpretError> {
let ns = sender.namespace.clone();
let secret_name = format!("{}-secret", self.name.clone());
let webhook_key = format!("{}", self.url.clone());
let mut string_data = BTreeMap::new();
string_data.insert("webhook-url".to_string(), webhook_key.clone());
let secret = Secret {
metadata: kube::core::ObjectMeta {
name: Some(secret_name.clone()),
..Default::default()
},
string_data: Some(string_data),
type_: Some("Opaque".to_string()),
..Default::default()
};
let _ = sender.client.apply(&secret, Some(&ns)).await;
let spec = AlertmanagerConfigSpec {
data: json!({
"route": {
"receiver": self.name,
},
"receivers": [
{
"name": self.name,
"discordConfigs": [
{
"apiURL": {
"name": secret_name,
"key": "webhook-url",
},
"title": "{{ template \"discord.default.title\" . }}",
"message": "{{ template \"discord.default.message\" . }}"
}
]
}
]
}),
};
let alertmanager_configs = AlertmanagerConfig {
metadata: ObjectMeta {
name: Some(self.name.clone()),
labels: Some(std::collections::BTreeMap::from([(
"alertmanagerConfig".to_string(),
"enabled".to_string(),
)])),
namespace: Some(ns),
..Default::default()
},
spec,
};
sender
.client
.apply(&alertmanager_configs, Some(&sender.namespace))
.await?;
Ok(Outcome::success(format!(
"installed crd-alertmanagerconfigs for {}",
self.name
)))
}
fn name(&self) -> String {
"discord-webhook".to_string()
}
fn clone_box(&self) -> Box<dyn AlertReceiver<CRDPrometheus>> {
Box::new(self.clone())
}
fn as_any(&self) -> &dyn Any {
self
}
}
#[async_trait]
impl AlertReceiver<Prometheus> for DiscordWebhook {
async fn install(&self, sender: &Prometheus) -> Result<Outcome, InterpretError> {
sender.install_receiver(self).await
}
fn name(&self) -> String {
"discord-webhook".to_string()
}
fn clone_box(&self) -> Box<dyn AlertReceiver<Prometheus>> {
Box::new(self.clone())
}
fn as_any(&self) -> &dyn Any {
self
}
}
#[async_trait]
@ -48,6 +141,12 @@ impl AlertReceiver<KubePrometheus> for DiscordWebhook {
fn clone_box(&self) -> Box<dyn AlertReceiver<KubePrometheus>> {
Box::new(self.clone())
}
fn name(&self) -> String {
"discord-webhook".to_string()
}
fn as_any(&self) -> &dyn Any {
self
}
}
#[async_trait]

View File

@ -1,11 +1,19 @@
use std::any::Any;
use async_trait::async_trait;
use kube::api::ObjectMeta;
use log::debug;
use serde::Serialize;
use serde_json::json;
use serde_yaml::{Mapping, Value};
use crate::{
interpret::{InterpretError, Outcome},
modules::monitoring::{
kube_prometheus::{
crd::crd_alertmanager_config::{
AlertmanagerConfig, AlertmanagerConfigSpec, CRDPrometheus,
},
prometheus::{KubePrometheus, KubePrometheusReceiver},
types::{AlertChannelConfig, AlertManagerChannelConfig},
},
@ -20,14 +28,81 @@ pub struct WebhookReceiver {
pub url: Url,
}
#[async_trait]
impl AlertReceiver<CRDPrometheus> for WebhookReceiver {
async fn install(&self, sender: &CRDPrometheus) -> Result<Outcome, InterpretError> {
let spec = AlertmanagerConfigSpec {
data: json!({
"route": {
"receiver": self.name,
},
"receivers": [
{
"name": self.name,
"webhookConfigs": [
{
"url": self.url,
}
]
}
]
}),
};
let alertmanager_configs = AlertmanagerConfig {
metadata: ObjectMeta {
name: Some(self.name.clone()),
labels: Some(std::collections::BTreeMap::from([(
"alertmanagerConfig".to_string(),
"enabled".to_string(),
)])),
namespace: Some(sender.namespace.clone()),
..Default::default()
},
spec,
};
debug!(
"alert manager configs: \n{:#?}",
alertmanager_configs.clone()
);
sender
.client
.apply(&alertmanager_configs, Some(&sender.namespace))
.await?;
Ok(Outcome::success(format!(
"installed crd-alertmanagerconfigs for {}",
self.name
)))
}
fn name(&self) -> String {
"webhook-receiver".to_string()
}
fn clone_box(&self) -> Box<dyn AlertReceiver<CRDPrometheus>> {
Box::new(self.clone())
}
fn as_any(&self) -> &dyn Any {
self
}
}
#[async_trait]
impl AlertReceiver<Prometheus> for WebhookReceiver {
async fn install(&self, sender: &Prometheus) -> Result<Outcome, InterpretError> {
sender.install_receiver(self).await
}
fn name(&self) -> String {
"webhook-receiver".to_string()
}
fn clone_box(&self) -> Box<dyn AlertReceiver<Prometheus>> {
Box::new(self.clone())
}
fn as_any(&self) -> &dyn Any {
self
}
}
#[async_trait]
@ -44,9 +119,15 @@ impl AlertReceiver<KubePrometheus> for WebhookReceiver {
async fn install(&self, sender: &KubePrometheus) -> Result<Outcome, InterpretError> {
sender.install_receiver(self).await
}
fn name(&self) -> String {
"webhook-receiver".to_string()
}
fn clone_box(&self) -> Box<dyn AlertReceiver<KubePrometheus>> {
Box::new(self.clone())
}
fn as_any(&self) -> &dyn Any {
self
}
}
#[async_trait]

View File

@ -0,0 +1,78 @@
use std::sync::Arc;
use async_trait::async_trait;
use serde::Serialize;
use crate::{
data::{Id, Version},
interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome},
inventory::Inventory,
modules::{
application::Application,
monitoring::kube_prometheus::crd::crd_alertmanager_config::CRDPrometheus,
prometheus::prometheus::PrometheusApplicationMonitoring,
},
score::Score,
topology::{Topology, oberservability::monitoring::AlertReceiver},
};
#[derive(Debug, Clone, Serialize)]
pub struct ApplicationMonitoringScore {
pub sender: CRDPrometheus,
pub application: Arc<dyn Application>,
pub receivers: Vec<Box<dyn AlertReceiver<CRDPrometheus>>>,
}
impl<T: Topology + PrometheusApplicationMonitoring<CRDPrometheus>> Score<T>
for ApplicationMonitoringScore
{
fn create_interpret(&self) -> Box<dyn Interpret<T>> {
Box::new(ApplicationMonitoringInterpret {
score: self.clone(),
})
}
fn name(&self) -> String {
"ApplicationMonitoringScore".to_string()
}
}
#[derive(Debug)]
pub struct ApplicationMonitoringInterpret {
score: ApplicationMonitoringScore,
}
#[async_trait]
impl<T: Topology + PrometheusApplicationMonitoring<CRDPrometheus>> Interpret<T>
for ApplicationMonitoringInterpret
{
async fn execute(
&self,
inventory: &Inventory,
topology: &T,
) -> Result<Outcome, InterpretError> {
topology
.install_prometheus(
&self.score.sender,
inventory,
Some(self.score.receivers.clone()),
)
.await
}
fn get_name(&self) -> InterpretName {
todo!()
}
fn get_version(&self) -> Version {
todo!()
}
fn get_status(&self) -> InterpretStatus {
todo!()
}
fn get_children(&self) -> Vec<Id> {
todo!()
}
}

View File

@ -1,44 +0,0 @@
use std::sync::{Arc, Mutex};
use serde::Serialize;
use crate::{
modules::monitoring::{
kube_prometheus::types::ServiceMonitor,
prometheus::{prometheus::Prometheus, prometheus_config::PrometheusConfig},
},
score::Score,
topology::{
HelmCommand, Topology,
oberservability::monitoring::{AlertReceiver, AlertRule, AlertingInterpret},
tenant::TenantManager,
},
};
#[derive(Clone, Debug, Serialize)]
pub struct ApplicationPrometheusMonitoringScore {
pub receivers: Vec<Box<dyn AlertReceiver<Prometheus>>>,
pub rules: Vec<Box<dyn AlertRule<Prometheus>>>,
pub service_monitors: Vec<ServiceMonitor>,
}
impl<T: Topology + HelmCommand + TenantManager> Score<T> for ApplicationPrometheusMonitoringScore {
fn create_interpret(&self) -> Box<dyn crate::interpret::Interpret<T>> {
let mut prom_config = PrometheusConfig::new();
prom_config.alert_manager = true;
let config = Arc::new(Mutex::new(prom_config));
config
.try_lock()
.expect("couldn't lock config")
.additional_service_monitors = self.service_monitors.clone();
Box::new(AlertingInterpret {
sender: Prometheus::new(),
receivers: self.receivers.clone(),
rules: self.rules.clone(),
})
}
fn name(&self) -> String {
"ApplicationPrometheusMonitoringScore".to_string()
}
}

View File

@ -1 +1 @@
pub mod k8s_application_monitoring_score;
pub mod application_monitoring_score;

View File

@ -0,0 +1,50 @@
use std::sync::Arc;
use kube::CustomResource;
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use crate::topology::{
k8s::K8sClient,
oberservability::monitoring::{AlertReceiver, AlertSender},
};
#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)]
#[kube(
group = "monitoring.coreos.com",
version = "v1alpha1",
kind = "AlertmanagerConfig",
plural = "alertmanagerconfigs",
namespaced
)]
pub struct AlertmanagerConfigSpec {
#[serde(flatten)]
pub data: serde_json::Value,
}
#[derive(Debug, Clone, Serialize)]
pub struct CRDPrometheus {
pub namespace: String,
pub client: Arc<K8sClient>,
}
impl AlertSender for CRDPrometheus {
fn name(&self) -> String {
"CRDAlertManager".to_string()
}
}
impl Clone for Box<dyn AlertReceiver<CRDPrometheus>> {
fn clone(&self) -> Self {
self.clone_box()
}
}
impl Serialize for Box<dyn AlertReceiver<CRDPrometheus>> {
fn serialize<S>(&self, _serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
todo!()
}
}

View File

@ -0,0 +1,53 @@
use kube::CustomResource;
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use std::collections::BTreeMap;
use super::crd_prometheuses::LabelSelector;
/// Rust CRD for `Alertmanager` from Prometheus Operator
#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)]
#[kube(
group = "monitoring.coreos.com",
version = "v1",
kind = "Alertmanager",
plural = "alertmanagers",
namespaced
)]
#[serde(rename_all = "camelCase")]
pub struct AlertmanagerSpec {
/// Number of replicas for HA
pub replicas: i32,
/// Selectors for AlertmanagerConfig CRDs
#[serde(default, skip_serializing_if = "Option::is_none")]
pub alertmanager_config_selector: Option<LabelSelector>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub alertmanager_config_namespace_selector: Option<LabelSelector>,
/// Optional pod template metadata (annotations, labels)
#[serde(default, skip_serializing_if = "Option::is_none")]
pub pod_metadata: Option<LabelSelector>,
/// Optional topology spread settings
#[serde(default, skip_serializing_if = "Option::is_none")]
pub version: Option<String>,
}
impl Default for AlertmanagerSpec {
fn default() -> Self {
AlertmanagerSpec {
replicas: 1,
// Match all AlertmanagerConfigs in the same namespace
alertmanager_config_namespace_selector: None,
// Empty selector matches all AlertmanagerConfigs in that namespace
alertmanager_config_selector: Some(LabelSelector::default()),
pod_metadata: None,
version: None,
}
}
}

View File

@ -0,0 +1,30 @@
use std::collections::BTreeMap;
use crate::modules::{
monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule,
prometheus::alerts::k8s::{
deployment::alert_deployment_unavailable,
pod::{alert_container_restarting, alert_pod_not_ready, pod_failed},
pvc::high_pvc_fill_rate_over_two_days,
service::alert_service_down,
},
};
use super::crd_prometheus_rules::Rule;
pub fn build_default_application_rules() -> Vec<Rule> {
let pod_failed: Rule = pod_failed().into();
let container_restarting: Rule = alert_container_restarting().into();
let pod_not_ready: Rule = alert_pod_not_ready().into();
let service_down: Rule = alert_service_down().into();
let deployment_unavailable: Rule = alert_deployment_unavailable().into();
let high_pvc_fill_rate: Rule = high_pvc_fill_rate_over_two_days().into();
vec![
pod_failed,
container_restarting,
pod_not_ready,
service_down,
deployment_unavailable,
high_pvc_fill_rate,
]
}

View File

@ -0,0 +1,153 @@
use std::collections::BTreeMap;
use kube::CustomResource;
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use super::crd_prometheuses::LabelSelector;
#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)]
#[kube(
group = "grafana.integreatly.org",
version = "v1beta1",
kind = "Grafana",
plural = "grafanas",
namespaced
)]
#[serde(rename_all = "camelCase")]
pub struct GrafanaSpec {
#[serde(default, skip_serializing_if = "Option::is_none")]
pub config: Option<GrafanaConfig>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub admin_user: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub admin_password: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub ingress: Option<GrafanaIngress>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub persistence: Option<GrafanaPersistence>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub resources: Option<ResourceRequirements>,
}
#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)]
#[serde(rename_all = "camelCase")]
pub struct GrafanaConfig {
#[serde(default, skip_serializing_if = "Option::is_none")]
pub log: Option<GrafanaLogConfig>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub security: Option<GrafanaSecurityConfig>,
}
#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)]
#[serde(rename_all = "camelCase")]
pub struct GrafanaLogConfig {
#[serde(default, skip_serializing_if = "Option::is_none")]
pub mode: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub level: Option<String>,
}
#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)]
#[serde(rename_all = "camelCase")]
pub struct GrafanaSecurityConfig {
#[serde(default, skip_serializing_if = "Option::is_none")]
pub admin_user: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub admin_password: Option<String>,
}
#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)]
#[serde(rename_all = "camelCase")]
pub struct GrafanaIngress {
#[serde(default, skip_serializing_if = "Option::is_none")]
pub enabled: Option<bool>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub hosts: Option<Vec<String>>,
}
#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)]
#[serde(rename_all = "camelCase")]
pub struct GrafanaPersistence {
#[serde(default, skip_serializing_if = "Option::is_none")]
pub enabled: Option<bool>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub storage_class_name: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub size: Option<String>,
}
// ------------------------------------------------------------------------------------------------
#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)]
#[kube(
group = "grafana.integreatly.org",
version = "v1beta1",
kind = "GrafanaDashboard",
plural = "grafanadashboards",
namespaced
)]
#[serde(rename_all = "camelCase")]
pub struct GrafanaDashboardSpec {
#[serde(default, skip_serializing_if = "Option::is_none")]
pub resync_period: Option<String>,
pub instance_selector: LabelSelector,
pub json: String,
}
// ------------------------------------------------------------------------------------------------
#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)]
#[kube(
group = "grafana.integreatly.org",
version = "v1beta1",
kind = "GrafanaDatasource",
plural = "grafanadatasources",
namespaced
)]
#[serde(rename_all = "camelCase")]
pub struct GrafanaDatasourceSpec {
pub instance_selector: LabelSelector,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub allow_cross_namespace_import: Option<bool>,
pub datasource: GrafanaDatasourceConfig,
}
#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)]
#[serde(rename_all = "camelCase")]
pub struct GrafanaDatasourceConfig {
pub access: String,
pub database: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub json_data: Option<BTreeMap<String, String>>,
pub name: String,
pub r#type: String,
pub url: String,
}
// ------------------------------------------------------------------------------------------------
#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema, Default)]
#[serde(rename_all = "camelCase")]
pub struct ResourceRequirements {
#[serde(default, skip_serializing_if = "BTreeMap::is_empty")]
pub limits: BTreeMap<String, String>,
#[serde(default, skip_serializing_if = "BTreeMap::is_empty")]
pub requests: BTreeMap<String, String>,
}

View File

@ -0,0 +1,59 @@
use std::collections::BTreeMap;
use kube::CustomResource;
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;
use super::crd_default_rules::build_default_application_rules;
#[derive(CustomResource, Debug, Serialize, Deserialize, Clone, JsonSchema)]
#[kube(
group = "monitoring.coreos.com",
version = "v1",
kind = "PrometheusRule",
plural = "prometheusrules",
namespaced
)]
#[serde(rename_all = "camelCase")]
pub struct PrometheusRuleSpec {
pub groups: Vec<RuleGroup>,
}
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct RuleGroup {
pub name: String,
pub rules: Vec<Rule>,
}
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
#[serde(rename_all = "camelCase")]
pub struct Rule {
#[serde(default, skip_serializing_if = "Option::is_none")]
pub alert: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub expr: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub for_: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub labels: Option<std::collections::BTreeMap<String, String>>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub annotations: Option<std::collections::BTreeMap<String, String>>,
}
impl From<PrometheusAlertRule> for Rule {
fn from(value: PrometheusAlertRule) -> Self {
Rule {
alert: Some(value.alert),
expr: Some(value.expr),
for_: value.r#for,
labels: Some(value.labels.into_iter().collect::<BTreeMap<_, _>>()),
annotations: Some(value.annotations.into_iter().collect::<BTreeMap<_, _>>()),
}
}
}

View File

@ -0,0 +1,118 @@
use std::collections::BTreeMap;
use kube::CustomResource;
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use crate::modules::monitoring::kube_prometheus::types::Operator;
#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)]
#[kube(
group = "monitoring.coreos.com",
version = "v1",
kind = "Prometheus",
plural = "prometheuses",
namespaced
)]
#[serde(rename_all = "camelCase")]
pub struct PrometheusSpec {
#[serde(default, skip_serializing_if = "Option::is_none")]
pub alerting: Option<PrometheusSpecAlerting>,
pub service_account_name: String,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub service_monitor_namespace_selector: Option<LabelSelector>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub service_monitor_selector: Option<LabelSelector>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub service_discovery_role: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub pod_monitor_selector: Option<LabelSelector>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub rule_selector: Option<LabelSelector>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub rule_namespace_selector: Option<LabelSelector>,
}
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, Default)]
#[serde(rename_all = "camelCase")]
pub struct NamespaceSelector {
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub match_names: Vec<String>,
}
/// Contains alerting configuration, specifically Alertmanager endpoints.
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, Default)]
pub struct PrometheusSpecAlerting {
#[serde(skip_serializing_if = "Option::is_none")]
pub alertmanagers: Option<Vec<AlertmanagerEndpoints>>,
}
/// Represents an Alertmanager endpoint configuration used by Prometheus.
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, Default)]
pub struct AlertmanagerEndpoints {
/// Name of the Alertmanager Service.
#[serde(skip_serializing_if = "Option::is_none")]
pub name: Option<String>,
/// Namespace of the Alertmanager Service.
#[serde(skip_serializing_if = "Option::is_none")]
pub namespace: Option<String>,
/// Port to access on the Alertmanager Service (e.g. "web").
#[serde(skip_serializing_if = "Option::is_none")]
pub port: Option<String>,
/// Scheme to use for connecting (e.g. "http").
#[serde(skip_serializing_if = "Option::is_none")]
pub scheme: Option<String>,
// Other fields like `tls_config`, `path_prefix`, etc., can be added if needed.
}
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, Default)]
#[serde(rename_all = "camelCase")]
pub struct LabelSelector {
#[serde(default, skip_serializing_if = "BTreeMap::is_empty")]
pub match_labels: BTreeMap<String, String>,
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub match_expressions: Vec<LabelSelectorRequirement>,
}
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
#[serde(rename_all = "camelCase")]
pub struct LabelSelectorRequirement {
pub key: String,
pub operator: Operator,
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub values: Vec<String>,
}
impl Default for PrometheusSpec {
fn default() -> Self {
PrometheusSpec {
alerting: None,
service_account_name: "prometheus".into(),
// null means "only my namespace"
service_monitor_namespace_selector: None,
// empty selector means match all ServiceMonitors in that namespace
service_monitor_selector: Some(LabelSelector::default()),
service_discovery_role: Some("Endpoints".into()),
pod_monitor_selector: None,
rule_selector: None,
rule_namespace_selector: Some(LabelSelector::default()),
}
}
}

View File

@ -0,0 +1,203 @@
pub fn build_default_dashboard(namespace: &str) -> String {
let dashboard = format!(
r#"{{
"annotations": {{
"list": []
}},
"editable": true,
"gnetId": null,
"graphTooltip": 0,
"id": null,
"iteration": 171105,
"panels": [
{{
"datasource": "$datasource",
"fieldConfig": {{
"defaults": {{
"unit": "short"
}},
"overrides": []
}},
"gridPos": {{
"h": 6,
"w": 6,
"x": 0,
"y": 0
}},
"id": 1,
"options": {{
"reduceOptions": {{
"calcs": ["lastNotNull"],
"fields": "",
"values": false
}}
}},
"pluginVersion": "9.0.0",
"targets": [
{{
"expr": "sum(kube_pod_status_phase{{namespace=\"{namespace}\", phase=\"Running\"}})",
"legendFormat": "",
"refId": "A"
}}
],
"title": "Pods in Namespace",
"type": "stat"
}},
{{
"datasource": "$datasource",
"fieldConfig": {{
"defaults": {{
"unit": "short"
}},
"overrides": []
}},
"gridPos": {{
"h": 6,
"w": 6,
"x": 6,
"y": 0
}},
"id": 2,
"options": {{
"reduceOptions": {{
"calcs": ["lastNotNull"],
"fields": "",
"values": false
}}
}},
"pluginVersion": "9.0.0",
"targets": [
{{
"expr": "sum(kube_pod_status_phase{{phase=\"Failed\", namespace=\"{namespace}\"}})",
"legendFormat": "",
"refId": "A"
}}
],
"title": "Pods in Failed State",
"type": "stat"
}},
{{
"datasource": "$datasource",
"fieldConfig": {{
"defaults": {{
"unit": "percentunit"
}},
"overrides": []
}},
"gridPos": {{
"h": 6,
"w": 12,
"x": 0,
"y": 6
}},
"id": 3,
"options": {{
"reduceOptions": {{
"calcs": ["lastNotNull"],
"fields": "",
"values": false
}}
}},
"pluginVersion": "9.0.0",
"targets": [
{{
"expr": "sum(kube_deployment_status_replicas_available{{namespace=\"{namespace}\"}}) / sum(kube_deployment_spec_replicas{{namespace=\"{namespace}\"}})",
"legendFormat": "",
"refId": "A"
}}
],
"title": "Deployment Health (Available / Desired)",
"type": "stat"
}},
{{
"datasource": "$datasource",
"fieldConfig": {{
"defaults": {{
"unit": "short"
}},
"overrides": []
}},
"gridPos": {{
"h": 6,
"w": 12,
"x": 0,
"y": 12
}},
"id": 4,
"options": {{
"reduceOptions": {{
"calcs": ["lastNotNull"],
"fields": "",
"values": false
}}
}},
"pluginVersion": "9.0.0",
"targets": [
{{
"expr": "sum by(pod) (rate(kube_pod_container_status_restarts_total{{namespace=\"{namespace}\"}}[5m]))",
"legendFormat": "{{{{pod}}}}",
"refId": "A"
}}
],
"title": "Container Restarts (per pod)",
"type": "timeseries"
}},
{{
"datasource": "$datasource",
"fieldConfig": {{
"defaults": {{
"unit": "short"
}},
"overrides": []
}},
"gridPos": {{
"h": 6,
"w": 12,
"x": 0,
"y": 18
}},
"id": 5,
"options": {{
"reduceOptions": {{
"calcs": ["lastNotNull"],
"fields": "",
"values": false
}}
}},
"pluginVersion": "9.0.0",
"targets": [
{{
"expr": "sum(ALERTS{{alertstate=\"firing\", namespace=\"{namespace}\"}}) or vector(0)",
"legendFormat": "",
"refId": "A"
}}
],
"title": "Firing Alerts in Namespace",
"type": "stat"
}}
],
"schemaVersion": 36,
"templating": {{
"list": [
{{
"name": "datasource",
"type": "datasource",
"pluginId": "prometheus",
"label": "Prometheus",
"query": "prometheus",
"refresh": 1,
"hide": 0,
"current": {{
"selected": true,
"text": "Prometheus",
"value": "Prometheus"
}}
}}
]
}},
"title": "Tenant Namespace Overview",
"version": 1
}}"#
);
dashboard
}

View File

@ -0,0 +1,20 @@
use std::str::FromStr;
use non_blank_string_rs::NonBlankString;
use crate::modules::helm::chart::HelmChartScore;
pub fn grafana_operator_helm_chart_score(ns: String) -> HelmChartScore {
HelmChartScore {
namespace: Some(NonBlankString::from_str(&ns).unwrap()),
release_name: NonBlankString::from_str("grafana_operator").unwrap(),
chart_name: NonBlankString::from_str("oci://ghcr.io/grafana/helm-charts/grafana-operator")
.unwrap(),
chart_version: None,
values_overrides: None,
values_yaml: None,
create_namespace: true,
install_only: true,
repository: None,
}
}

View File

@ -0,0 +1,11 @@
pub mod crd_alertmanager_config;
pub mod crd_alertmanagers;
pub mod crd_default_rules;
pub mod crd_grafana;
pub mod crd_prometheus_rules;
pub mod crd_prometheuses;
pub mod grafana_default_dashboard;
pub mod grafana_operator;
pub mod prometheus_operator;
pub mod role;
pub mod service_monitor;

View File

@ -0,0 +1,22 @@
use std::str::FromStr;
use non_blank_string_rs::NonBlankString;
use crate::modules::helm::chart::HelmChartScore;
pub fn prometheus_operator_helm_chart_score(ns: String) -> HelmChartScore {
HelmChartScore {
namespace: Some(NonBlankString::from_str(&ns).unwrap()),
release_name: NonBlankString::from_str("prometheus-operator").unwrap(),
chart_name: NonBlankString::from_str(
"oci://hub.nationtech.io/harmony/nt-prometheus-operator",
)
.unwrap(),
chart_version: None,
values_overrides: None,
values_yaml: None,
create_namespace: true,
install_only: true,
repository: None,
}
}

View File

@ -0,0 +1,62 @@
use k8s_openapi::api::{
core::v1::ServiceAccount,
rbac::v1::{PolicyRule, Role, RoleBinding, RoleRef, Subject},
};
use kube::api::ObjectMeta;
pub fn build_prom_role(role_name: String, namespace: String) -> Role {
Role {
metadata: ObjectMeta {
name: Some(role_name),
namespace: Some(namespace),
..Default::default()
},
rules: Some(vec![PolicyRule {
api_groups: Some(vec!["".into()]), // core API group
resources: Some(vec!["services".into(), "endpoints".into(), "pods".into()]),
verbs: vec!["get".into(), "list".into(), "watch".into()],
..Default::default()
}]),
}
}
pub fn build_prom_rolebinding(
role_name: String,
namespace: String,
service_account_name: String,
) -> RoleBinding {
RoleBinding {
metadata: ObjectMeta {
name: Some(format!("{}-rolebinding", role_name)),
namespace: Some(namespace.clone()),
..Default::default()
},
role_ref: RoleRef {
api_group: "rbac.authorization.k8s.io".into(),
kind: "Role".into(),
name: role_name,
},
subjects: Some(vec![Subject {
kind: "ServiceAccount".into(),
name: service_account_name,
namespace: Some(namespace.clone()),
..Default::default()
}]),
}
}
pub fn build_prom_service_account(
service_account_name: String,
namespace: String,
) -> ServiceAccount {
ServiceAccount {
automount_service_account_token: None,
image_pull_secrets: None,
metadata: ObjectMeta {
name: Some(service_account_name),
namespace: Some(namespace),
..Default::default()
},
secrets: None,
}
}

View File

@ -0,0 +1,89 @@
use std::collections::{BTreeMap, HashMap};
use kube::{CustomResource, Resource, api::ObjectMeta};
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use crate::interpret::InterpretError;
use crate::modules::monitoring::kube_prometheus::types::{
HTTPScheme, MatchExpression, NamespaceSelector, Operator, Selector,
ServiceMonitor as KubeServiceMonitor, ServiceMonitorEndpoint,
};
/// This is the top-level struct for the ServiceMonitor Custom Resource.
/// The `#[derive(CustomResource)]` macro handles all the boilerplate for you,
/// including the `impl Resource`.
#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)]
#[kube(
group = "monitoring.coreos.com",
version = "v1",
kind = "ServiceMonitor",
plural = "servicemonitors",
namespaced
)]
#[serde(rename_all = "camelCase")]
pub struct ServiceMonitorSpec {
/// A label selector to select services to monitor.
pub selector: Selector,
/// A list of endpoints on the selected services to be monitored.
pub endpoints: Vec<ServiceMonitorEndpoint>,
/// Selector to select which namespaces the Kubernetes Endpoints objects
/// are discovered from.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub namespace_selector: Option<NamespaceSelector>,
/// The label to use to retrieve the job name from.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub job_label: Option<String>,
/// Pod-based target labels to transfer from the Kubernetes Pod onto the target.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub pod_target_labels: Vec<String>,
/// TargetLabels transfers labels on the Kubernetes Service object to the target.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub target_labels: Vec<String>,
}
impl Default for ServiceMonitorSpec {
fn default() -> Self {
let mut labels = HashMap::new();
Self {
selector: Selector {
match_labels: { labels },
match_expressions: vec![MatchExpression {
key: "app.kubernetes.io/name".into(),
operator: Operator::Exists,
values: vec![],
}],
},
endpoints: vec![ServiceMonitorEndpoint {
port: Some("http".to_string()),
path: Some("/metrics".into()),
interval: Some("30s".into()),
scheme: Some(HTTPScheme::HTTP),
..Default::default()
}],
namespace_selector: None, // only the same namespace
job_label: Some("app".into()),
pod_target_labels: vec![],
target_labels: vec![],
}
}
}
impl From<KubeServiceMonitor> for ServiceMonitorSpec {
fn from(value: KubeServiceMonitor) -> Self {
Self {
selector: value.selector,
endpoints: value.endpoints,
namespace_selector: value.namespace_selector,
job_label: value.job_label,
pod_target_labels: value.pod_target_labels,
target_labels: value.target_labels,
}
}
}

View File

@ -35,7 +35,7 @@ impl KubePrometheusConfig {
windows_monitoring: false,
alert_manager: true,
grafana: true,
node_exporter: false,
node_exporter: true,
prometheus: true,
kubernetes_service_monitors: true,
kubernetes_api_server: true,

View File

@ -12,8 +12,8 @@ use crate::modules::{
helm::chart::HelmChartScore,
monitoring::kube_prometheus::types::{
AlertGroup, AlertManager, AlertManagerAdditionalPromRules, AlertManagerConfig,
AlertManagerRoute, AlertManagerSpec, AlertManagerValues, ConfigReloader, Limits,
PrometheusConfig, Requests, Resources,
AlertManagerConfigSelector, AlertManagerRoute, AlertManagerSpec, AlertManagerValues,
ConfigReloader, Limits, PrometheusConfig, Requests, Resources,
},
};
@ -332,6 +332,11 @@ prometheusOperator:
.push(receiver.channel_receiver.clone());
}
let mut labels = BTreeMap::new();
labels.insert("alertmanagerConfig".to_string(), "enabled".to_string());
let alert_manager_config_selector = AlertManagerConfigSelector {
match_labels: labels,
};
let alert_manager_values = AlertManagerValues {
alertmanager: AlertManager {
enabled: config.alert_manager,
@ -347,6 +352,8 @@ prometheusOperator:
cpu: "100m".to_string(),
},
},
alert_manager_config_selector,
replicas: 2,
},
init_config_reloader: ConfigReloader {
resources: Resources {

View File

@ -1,3 +1,4 @@
pub mod crd;
pub mod helm;
pub mod helm_prometheus_alert_score;
pub mod prometheus;

View File

@ -1,7 +1,8 @@
use std::collections::{BTreeMap, HashMap};
use async_trait::async_trait;
use serde::Serialize;
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use serde_yaml::{Mapping, Sequence, Value};
use crate::modules::monitoring::alert_rule::prometheus_alert_rule::AlertManagerRuleGroup;
@ -55,6 +56,14 @@ pub struct AlertManagerChannelConfig {
#[serde(rename_all = "camelCase")]
pub struct AlertManagerSpec {
pub(crate) resources: Resources,
pub replicas: u32,
pub alert_manager_config_selector: AlertManagerConfigSelector,
}
#[derive(Debug, Clone, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct AlertManagerConfigSelector {
pub match_labels: BTreeMap<String, String>,
}
#[derive(Debug, Clone, Serialize)]
@ -86,7 +95,7 @@ pub struct AlertGroup {
pub groups: Vec<AlertManagerRuleGroup>,
}
#[derive(Debug, Clone, Serialize)]
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub enum HTTPScheme {
#[serde(rename = "http")]
HTTP,
@ -94,7 +103,7 @@ pub enum HTTPScheme {
HTTPS,
}
#[derive(Debug, Clone, Serialize)]
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub enum Operator {
In,
NotIn,
@ -139,74 +148,83 @@ pub struct ServiceMonitorTLSConfig {
pub server_name: Option<String>,
}
#[derive(Debug, Clone, Serialize)]
#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema, Default)]
#[serde(rename_all = "camelCase")]
pub struct ServiceMonitorEndpoint {
// ## Name of the endpoint's service port
// ## Mutually exclusive with targetPort
/// Name of the service port this endpoint refers to.
pub port: Option<String>,
// ## Name or number of the endpoint's target port
// ## Mutually exclusive with port
pub target_port: Option<String>,
// ## File containing bearer token to be used when scraping targets
// ##
pub bearer_token_file: Option<String>,
// ## Interval at which metrics should be scraped
// ##
/// Interval at which metrics should be scraped.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub interval: Option<String>,
// ## HTTP path to scrape for metrics
// ##
pub path: String,
/// The HTTP path to scrape for metrics.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub path: Option<String>,
// ## HTTP scheme to use for scraping
// ##
pub scheme: HTTPScheme,
/// HTTP scheme to use for scraping.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub scheme: Option<HTTPScheme>,
// ## TLS configuration to use when scraping the endpoint
// ##
pub tls_config: Option<ServiceMonitorTLSConfig>,
/// Relabelings to apply to samples before scraping.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub relabelings: Vec<RelabelConfig>,
// ## MetricRelabelConfigs to apply to samples after scraping, but before ingestion.
// ## ref: https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api-reference/api.md#relabelconfig
// ##
// # - action: keep
// # regex: 'kube_(daemonset|deployment|pod|namespace|node|statefulset).+'
// # sourceLabels: [__name__]
pub metric_relabelings: Vec<Mapping>,
// ## RelabelConfigs to apply to samples before scraping
// ## ref: https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api-reference/api.md#relabelconfig
// ##
// # - sourceLabels: [__meta_kubernetes_pod_node_name]
// # separator: ;
// # regex: ^(.*)$
// # targetLabel: nodename
// # replacement: $1
// # action: replace
pub relabelings: Vec<Mapping>,
/// MetricRelabelings to apply to samples after scraping, but before ingestion.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub metric_relabelings: Vec<RelabelConfig>,
}
#[derive(Debug, Clone, Serialize)]
#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)]
#[serde(rename_all = "camelCase")]
pub struct RelabelConfig {
/// The action to perform based on the regex matching.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub action: Option<String>,
/// A list of labels from which to extract values.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub source_labels: Vec<String>,
/// Separator to be used when concatenating source_labels.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub separator: Option<String>,
/// The label to which the resulting value is written.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub target_label: Option<String>,
/// A regular expression to match against the concatenated source label values.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub regex: Option<String>,
/// The replacement value to use.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub replacement: Option<String>,
}
#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)]
#[serde(rename_all = "camelCase")]
pub struct MatchExpression {
pub key: String,
pub operator: Operator,
pub operator: Operator, // "In", "NotIn", "Exists", "DoesNotExist"
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub values: Vec<String>,
}
#[derive(Debug, Clone, Serialize)]
#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema, Default)]
#[serde(rename_all = "camelCase")]
pub struct Selector {
// # label selector for services
/// A map of key-value pairs to match.
#[serde(default, skip_serializing_if = "HashMap::is_empty")]
pub match_labels: HashMap<String, String>,
/// A list of label selector requirements.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub match_expressions: Vec<MatchExpression>,
}
#[derive(Debug, Clone, Serialize)]
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct ServiceMonitor {
pub name: String,
@ -250,10 +268,15 @@ pub struct ServiceMonitor {
pub fallback_scrape_protocol: Option<String>,
}
#[derive(Debug, Serialize, Clone)]
#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema, Default)]
#[serde(rename_all = "camelCase")]
pub struct NamespaceSelector {
/// Select all namespaces.
#[serde(default, skip_serializing_if = "std::ops::Not::not")]
pub any: bool,
/// List of namespace names to select from.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub match_names: Vec<String>,
}
@ -275,19 +298,3 @@ impl Default for ServiceMonitor {
}
}
}
impl Default for ServiceMonitorEndpoint {
fn default() -> Self {
Self {
port: Some("80".to_string()),
target_port: Default::default(),
bearer_token_file: Default::default(),
interval: Default::default(),
path: "/metrics".to_string(),
scheme: HTTPScheme::HTTP,
tls_config: Default::default(),
metric_relabelings: Default::default(),
relabelings: Default::default(),
}
}
}

View File

@ -58,6 +58,7 @@ config:
# web-root: "disable"
enable-signup: false
enable-login: "true"
enable-metrics: "true"
persistence:
enabled: true

View File

@ -0,0 +1,23 @@
use std::collections::HashMap;
use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;
pub fn alert_deployment_unavailable() -> PrometheusAlertRule {
PrometheusAlertRule {
alert: "DeploymentUnavailable".into(),
expr: "kube_deployment_status_replicas_unavailable > 0".into(),
r#for: Some("2m".into()),
labels: HashMap::from([("severity".into(), "warning".into())]),
annotations: HashMap::from([
(
"summary".into(),
"Deployment has unavailable replicas".into(),
),
(
"description".into(),
"A deployment in this namespace has unavailable replicas for over 2 minutes."
.into(),
),
]),
}
}

View File

@ -0,0 +1,37 @@
use std::collections::HashMap;
use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;
pub fn alert_high_memory_usage() -> PrometheusAlertRule {
PrometheusAlertRule {
alert: "HighMemoryUsage".into(),
expr: "container_memory_working_set_bytes{container!=\"\",namespace!=\"\"} > 500000000"
.into(),
r#for: Some("2m".into()),
labels: HashMap::from([("severity".into(), "warning".into())]),
annotations: HashMap::from([
("summary".into(), "Pod is using high memory".into()),
(
"description".into(),
"A pod is consuming more than 500Mi of memory.".into(),
),
]),
}
}
pub fn alert_high_cpu_usage() -> PrometheusAlertRule {
PrometheusAlertRule {
alert: "HighCPUUsage".into(),
expr: "rate(container_cpu_usage_seconds_total{container!=\"\",namespace!=\"\"}[1m]) > 0.9"
.into(),
r#for: Some("1m".into()),
labels: HashMap::from([("severity".into(), "warning".into())]),
annotations: HashMap::from([
("summary".into(), "Pod is using high CPU".into()),
(
"description".into(),
"A pod is using more than 90% of a core over 1 minute.".into(),
),
]),
}
}

View File

@ -1 +1,5 @@
pub mod deployment;
pub mod memory_usage;
pub mod pod;
pub mod pvc;
pub mod service;

View File

@ -0,0 +1,55 @@
use std::collections::HashMap;
use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;
pub fn pod_failed() -> PrometheusAlertRule {
PrometheusAlertRule {
alert: "PodFailed".into(),
expr: "kube_pod_status_phase{phase=\"Failed\"} > 2".into(),
r#for: Some("2m".into()),
labels: HashMap::from([("severity".into(), "critical".into())]),
annotations: HashMap::from([
("summary".into(), "A pod has failed".into()),
(
"description".into(),
"One or more pods are in Failed phase.".into(),
),
]),
}
}
pub fn alert_container_restarting() -> PrometheusAlertRule {
PrometheusAlertRule {
alert: "ContainerRestarting".into(),
expr: "increase(kube_pod_container_status_restarts_total[5m]) > 3".into(),
r#for: Some("5m".into()),
labels: HashMap::from([("severity".into(), "warning".into())]),
annotations: HashMap::from([
(
"summary".into(),
"Container is restarting frequently".into(),
),
(
"description".into(),
"A container in this namespace has restarted more than 3 times in 5 minutes."
.into(),
),
]),
}
}
pub fn alert_pod_not_ready() -> PrometheusAlertRule {
PrometheusAlertRule {
alert: "PodNotReady".into(),
expr: "kube_pod_status_ready{condition=\"true\"} == 0".into(),
r#for: Some("2m".into()),
labels: HashMap::from([("severity".into(), "warning".into())]),
annotations: HashMap::from([
("summary".into(), "Pod is not ready".into()),
(
"description".into(),
"A pod in the namespace is not reporting Ready status.".into(),
),
]),
}
}

View File

@ -0,0 +1,19 @@
use std::collections::HashMap;
use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;
pub fn alert_service_down() -> PrometheusAlertRule {
PrometheusAlertRule {
alert: "ServiceDown".into(),
expr: "up == 0".into(),
r#for: Some("1m".into()),
labels: HashMap::from([("severity".into(), "critical".into())]),
annotations: HashMap::from([
("summary".into(), "Service is down".into()),
(
"description".into(),
"A target service in the namespace is not responding to Prometheus scrapes.".into(),
),
]),
}
}

View File

@ -0,0 +1,569 @@
use std::fs;
use std::{collections::BTreeMap, sync::Arc};
use tempfile::tempdir;
use async_trait::async_trait;
use kube::api::ObjectMeta;
use log::{debug, info};
use serde::Serialize;
use std::process::Command;
use crate::modules::monitoring::kube_prometheus::crd::crd_alertmanager_config::CRDPrometheus;
use crate::modules::monitoring::kube_prometheus::crd::crd_default_rules::build_default_application_rules;
use crate::modules::monitoring::kube_prometheus::crd::crd_grafana::{
Grafana, GrafanaDashboard, GrafanaDashboardSpec, GrafanaDatasource, GrafanaDatasourceConfig,
GrafanaDatasourceSpec, GrafanaSpec,
};
use crate::modules::monitoring::kube_prometheus::crd::crd_prometheus_rules::{
PrometheusRule, PrometheusRuleSpec, RuleGroup,
};
use crate::modules::monitoring::kube_prometheus::crd::grafana_default_dashboard::build_default_dashboard;
use crate::modules::monitoring::kube_prometheus::crd::service_monitor::{
ServiceMonitor, ServiceMonitorSpec,
};
use crate::topology::oberservability::monitoring::AlertReceiver;
use crate::topology::{K8sclient, Topology, k8s::K8sClient};
use crate::{
data::{Id, Version},
interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome},
inventory::Inventory,
modules::monitoring::kube_prometheus::crd::{
crd_alertmanagers::{Alertmanager, AlertmanagerSpec},
crd_prometheuses::{
AlertmanagerEndpoints, LabelSelector, Prometheus, PrometheusSpec,
PrometheusSpecAlerting,
},
role::{build_prom_role, build_prom_rolebinding, build_prom_service_account},
},
score::Score,
};
use super::prometheus::PrometheusApplicationMonitoring;
#[derive(Clone, Debug, Serialize)]
pub struct K8sPrometheusCRDAlertingScore {
pub sender: CRDPrometheus,
pub receivers: Vec<Box<dyn AlertReceiver<CRDPrometheus>>>,
pub service_monitors: Vec<ServiceMonitor>,
pub prometheus_rules: Vec<RuleGroup>,
}
impl<T: Topology + K8sclient + PrometheusApplicationMonitoring<CRDPrometheus>> Score<T>
for K8sPrometheusCRDAlertingScore
{
fn create_interpret(&self) -> Box<dyn crate::interpret::Interpret<T>> {
Box::new(K8sPrometheusCRDAlertingInterpret {
sender: self.sender.clone(),
receivers: self.receivers.clone(),
service_monitors: self.service_monitors.clone(),
prometheus_rules: self.prometheus_rules.clone(),
})
}
fn name(&self) -> String {
"CRDApplicationAlertingScore".into()
}
}
#[derive(Clone, Debug)]
pub struct K8sPrometheusCRDAlertingInterpret {
pub sender: CRDPrometheus,
pub receivers: Vec<Box<dyn AlertReceiver<CRDPrometheus>>>,
pub service_monitors: Vec<ServiceMonitor>,
pub prometheus_rules: Vec<RuleGroup>,
}
#[async_trait]
impl<T: Topology + K8sclient + PrometheusApplicationMonitoring<CRDPrometheus>> Interpret<T>
for K8sPrometheusCRDAlertingInterpret
{
async fn execute(
&self,
_inventory: &Inventory,
topology: &T,
) -> Result<Outcome, InterpretError> {
let client = topology.k8s_client().await.unwrap();
self.ensure_grafana_operator().await?;
self.install_prometheus(&client).await?;
self.install_alert_manager(&client).await?;
self.install_client_kube_metrics().await?;
self.install_grafana(&client).await?;
self.install_receivers(&self.sender, &self.receivers)
.await?;
self.install_rules(&self.prometheus_rules, &client).await?;
self.install_monitors(self.service_monitors.clone(), &client)
.await?;
Ok(Outcome::success(format!(
"deployed application monitoring composants"
)))
}
fn get_name(&self) -> InterpretName {
todo!()
}
fn get_version(&self) -> Version {
todo!()
}
fn get_status(&self) -> InterpretStatus {
todo!()
}
fn get_children(&self) -> Vec<Id> {
todo!()
}
}
impl K8sPrometheusCRDAlertingInterpret {
async fn crd_exists(&self, crd: &str) -> bool {
let status = Command::new("sh")
.args(["-c", "kubectl get crd -A | grep -i", crd])
.status()
.map_err(|e| InterpretError::new(format!("could not connect to cluster: {}", e)))
.unwrap();
status.success()
}
async fn install_chart(
&self,
chart_path: String,
chart_name: String,
) -> Result<(), InterpretError> {
let temp_dir =
tempdir().map_err(|e| InterpretError::new(format!("Tempdir error: {}", e)))?;
let temp_path = temp_dir.path().to_path_buf();
debug!("Using temp directory: {}", temp_path.display());
let chart = format!("{}/{}", chart_path, chart_name);
let pull_output = Command::new("helm")
.args(["pull", &chart, "--destination", temp_path.to_str().unwrap()])
.output()
.map_err(|e| InterpretError::new(format!("Helm pull error: {}", e)))?;
if !pull_output.status.success() {
return Err(InterpretError::new(format!(
"Helm pull failed: {}",
String::from_utf8_lossy(&pull_output.stderr)
)));
}
let tgz_path = fs::read_dir(&temp_path)
.unwrap()
.filter_map(|entry| {
let entry = entry.ok()?;
let path = entry.path();
if path.extension()? == "tgz" {
Some(path)
} else {
None
}
})
.next()
.ok_or_else(|| InterpretError::new("Could not find pulled Helm chart".into()))?;
debug!("Installing chart from: {}", tgz_path.display());
let install_output = Command::new("helm")
.args([
"install",
&chart_name,
tgz_path.to_str().unwrap(),
"--namespace",
&self.sender.namespace.clone(),
"--create-namespace",
"--wait",
"--atomic",
])
.output()
.map_err(|e| InterpretError::new(format!("Helm install error: {}", e)))?;
if !install_output.status.success() {
return Err(InterpretError::new(format!(
"Helm install failed: {}",
String::from_utf8_lossy(&install_output.stderr)
)));
}
debug!(
"Installed chart {}/{} in namespace: {}",
&chart_path,
&chart_name,
self.sender.namespace.clone()
);
Ok(())
}
async fn ensure_grafana_operator(&self) -> Result<Outcome, InterpretError> {
if self.crd_exists("grafanas.grafana.integreatly.org").await {
debug!("grafana CRDs already exist — skipping install.");
return Ok(Outcome::success("Grafana CRDs already exist".to_string()));
}
let _ = Command::new("helm")
.args([
"repo",
"add",
"grafana-operator",
"https://grafana.github.io/helm-charts",
])
.output()
.unwrap();
let _ = Command::new("helm")
.args(["repo", "update"])
.output()
.unwrap();
let output = Command::new("helm")
.args([
"install",
"grafana-operator",
"grafana-operator/grafana-operator",
"--namespace",
&self.sender.namespace.clone(),
"--create-namespace",
"--set",
"namespaceScope=true",
])
.output()
.unwrap();
if !output.status.success() {
return Err(InterpretError::new(format!(
"helm install failed:\nstdout: {}\nstderr: {}",
String::from_utf8_lossy(&output.stdout),
String::from_utf8_lossy(&output.stderr)
)));
}
Ok(Outcome::success(format!(
"installed grafana operator in ns {}",
self.sender.namespace.clone()
)))
}
async fn install_prometheus(&self, client: &Arc<K8sClient>) -> Result<Outcome, InterpretError> {
debug!(
"installing crd-prometheuses in namespace {}",
self.sender.namespace.clone()
);
debug!("building role/rolebinding/serviceaccount for crd-prometheus");
let rolename = format!("{}-prom", self.sender.namespace.clone());
let sa_name = format!("{}-prom-sa", self.sender.namespace.clone());
let role = build_prom_role(rolename.clone(), self.sender.namespace.clone());
let rolebinding = build_prom_rolebinding(
rolename.clone(),
self.sender.namespace.clone(),
sa_name.clone(),
);
let sa = build_prom_service_account(sa_name.clone(), self.sender.namespace.clone());
let prom_spec = PrometheusSpec {
alerting: Some(PrometheusSpecAlerting {
alertmanagers: Some(vec![AlertmanagerEndpoints {
name: Some("alertmanager-operated".into()),
namespace: Some(self.sender.namespace.clone()),
port: Some("web".into()),
scheme: Some("http".into()),
}]),
}),
service_account_name: sa_name.clone(),
service_monitor_namespace_selector: Some(LabelSelector {
match_labels: BTreeMap::from([(
"kubernetes.io/metadata.name".to_string(),
self.sender.namespace.clone(),
)]),
match_expressions: vec![],
}),
service_monitor_selector: Some(LabelSelector {
match_labels: BTreeMap::from([("client".to_string(), "prometheus".to_string())]),
..Default::default()
}),
service_discovery_role: Some("Endpoints".into()),
pod_monitor_selector: Some(LabelSelector {
match_labels: BTreeMap::from([("client".to_string(), "prometheus".to_string())]),
..Default::default()
}),
rule_selector: Some(LabelSelector {
match_labels: BTreeMap::from([("role".to_string(), "prometheus-rule".to_string())]),
..Default::default()
}),
rule_namespace_selector: Some(LabelSelector {
match_labels: BTreeMap::from([(
"kubernetes.io/metadata.name".to_string(),
self.sender.namespace.clone(),
)]),
match_expressions: vec![],
}),
};
let prom = Prometheus {
metadata: ObjectMeta {
name: Some(self.sender.namespace.clone()),
labels: Some(std::collections::BTreeMap::from([
("alertmanagerConfig".to_string(), "enabled".to_string()),
("client".to_string(), "prometheus".to_string()),
])),
namespace: Some(self.sender.namespace.clone()),
..Default::default()
},
spec: prom_spec,
};
client
.apply(&role, Some(&self.sender.namespace.clone()))
.await
.map_err(|e| InterpretError::new(e.to_string()))?;
info!(
"installed prometheus role: {:#?} in ns {:#?}",
role.metadata.name.unwrap(),
role.metadata.namespace.unwrap()
);
client
.apply(&rolebinding, Some(&self.sender.namespace.clone()))
.await
.map_err(|e| InterpretError::new(e.to_string()))?;
info!(
"installed prometheus rolebinding: {:#?} in ns {:#?}",
rolebinding.metadata.name.unwrap(),
rolebinding.metadata.namespace.unwrap()
);
client
.apply(&sa, Some(&self.sender.namespace.clone()))
.await
.map_err(|e| InterpretError::new(e.to_string()))?;
info!(
"installed prometheus service account: {:#?} in ns {:#?}",
sa.metadata.name.unwrap(),
sa.metadata.namespace.unwrap()
);
client
.apply(&prom, Some(&self.sender.namespace.clone()))
.await
.map_err(|e| InterpretError::new(e.to_string()))?;
info!(
"installed prometheus: {:#?} in ns {:#?}",
&prom.metadata.name.clone().unwrap(),
&prom.metadata.namespace.clone().unwrap()
);
Ok(Outcome::success(format!(
"successfully deployed crd-prometheus {:#?}",
prom
)))
}
async fn install_alert_manager(
&self,
client: &Arc<K8sClient>,
) -> Result<Outcome, InterpretError> {
let am = Alertmanager {
metadata: ObjectMeta {
name: Some(self.sender.namespace.clone()),
labels: Some(std::collections::BTreeMap::from([(
"alertmanagerConfig".to_string(),
"enabled".to_string(),
)])),
namespace: Some(self.sender.namespace.clone()),
..Default::default()
},
spec: AlertmanagerSpec::default(),
};
client
.apply(&am, Some(&self.sender.namespace.clone()))
.await
.map_err(|e| InterpretError::new(e.to_string()))?;
Ok(Outcome::success(format!(
"successfully deployed service monitor {:#?}",
am.metadata.name
)))
}
async fn install_monitors(
&self,
mut monitors: Vec<ServiceMonitor>,
client: &Arc<K8sClient>,
) -> Result<Outcome, InterpretError> {
let default_service_monitor = ServiceMonitor {
metadata: ObjectMeta {
name: Some(self.sender.namespace.clone()),
labels: Some(std::collections::BTreeMap::from([
("alertmanagerConfig".to_string(), "enabled".to_string()),
("client".to_string(), "prometheus".to_string()),
(
"app.kubernetes.io/name".to_string(),
"kube-state-metrics".to_string(),
),
])),
namespace: Some(self.sender.namespace.clone()),
..Default::default()
},
spec: ServiceMonitorSpec::default(),
};
monitors.push(default_service_monitor);
for monitor in monitors.iter() {
client
.apply(monitor, Some(&self.sender.namespace.clone()))
.await
.map_err(|e| InterpretError::new(e.to_string()))?;
}
Ok(Outcome::success(
"succesfully deployed service monitors".to_string(),
))
}
async fn install_rules(
&self,
rules: &Vec<RuleGroup>,
client: &Arc<K8sClient>,
) -> Result<Outcome, InterpretError> {
let mut prom_rule_spec = PrometheusRuleSpec {
groups: rules.clone(),
};
let default_rules_group = RuleGroup {
name: format!("default-rules"),
rules: build_default_application_rules(),
};
prom_rule_spec.groups.push(default_rules_group);
let prom_rules = PrometheusRule {
metadata: ObjectMeta {
name: Some(self.sender.namespace.clone()),
labels: Some(std::collections::BTreeMap::from([
("alertmanagerConfig".to_string(), "enabled".to_string()),
("role".to_string(), "prometheus-rule".to_string()),
])),
namespace: Some(self.sender.namespace.clone()),
..Default::default()
},
spec: prom_rule_spec,
};
client
.apply(&prom_rules, Some(&self.sender.namespace.clone()))
.await
.map_err(|e| InterpretError::new(e.to_string()))?;
Ok(Outcome::success(format!(
"successfully deployed rules {:#?}",
prom_rules.metadata.name
)))
}
async fn install_client_kube_metrics(&self) -> Result<Outcome, InterpretError> {
self.install_chart(
"oci://hub.nationtech.io/harmony".to_string(),
"nt-kube-metrics".to_string(),
)
.await?;
Ok(Outcome::success(format!(
"Installed client kube metrics in ns {}",
&self.sender.namespace.clone()
)))
}
async fn install_grafana(&self, client: &Arc<K8sClient>) -> Result<Outcome, InterpretError> {
let mut label = BTreeMap::new();
label.insert("dashboards".to_string(), "grafana".to_string());
let labels = LabelSelector {
match_labels: label.clone(),
match_expressions: vec![],
};
let mut json_data = BTreeMap::new();
json_data.insert("timeInterval".to_string(), "5s".to_string());
let namespace = self.sender.namespace.clone();
let json = build_default_dashboard(&namespace);
let graf_data_source = GrafanaDatasource {
metadata: ObjectMeta {
name: Some(format!(
"grafana-datasource-{}",
self.sender.namespace.clone()
)),
namespace: Some(self.sender.namespace.clone()),
..Default::default()
},
spec: GrafanaDatasourceSpec {
instance_selector: labels.clone(),
allow_cross_namespace_import: Some(false),
datasource: GrafanaDatasourceConfig {
access: "proxy".to_string(),
database: Some("prometheus".to_string()),
json_data: Some(json_data),
//this is fragile
name: format!("prometheus-{}-0", self.sender.namespace.clone()),
r#type: "prometheus".to_string(),
url: format!(
"http://prometheus-operated.{}.svc.cluster.local:9090",
self.sender.namespace.clone()
),
},
},
};
client
.apply(&graf_data_source, Some(&self.sender.namespace.clone()))
.await
.map_err(|e| InterpretError::new(e.to_string()))?;
let graf_dashboard = GrafanaDashboard {
metadata: ObjectMeta {
name: Some(format!(
"grafana-dashboard-{}",
self.sender.namespace.clone()
)),
namespace: Some(self.sender.namespace.clone()),
..Default::default()
},
spec: GrafanaDashboardSpec {
resync_period: Some("30s".to_string()),
instance_selector: labels.clone(),
json,
},
};
client
.apply(&graf_dashboard, Some(&self.sender.namespace.clone()))
.await
.map_err(|e| InterpretError::new(e.to_string()))?;
let grafana = Grafana {
metadata: ObjectMeta {
name: Some(format!("grafana-{}", self.sender.namespace.clone())),
namespace: Some(self.sender.namespace.clone()),
labels: Some(label.clone()),
..Default::default()
},
spec: GrafanaSpec {
config: None,
admin_user: None,
admin_password: None,
ingress: None,
persistence: None,
resources: None,
},
};
client
.apply(&grafana, Some(&self.sender.namespace.clone()))
.await
.map_err(|e| InterpretError::new(e.to_string()))?;
Ok(Outcome::success(format!(
"successfully deployed grafana instance {:#?}",
grafana.metadata.name
)))
}
async fn install_receivers(
&self,
sender: &CRDPrometheus,
receivers: &Vec<Box<dyn AlertReceiver<CRDPrometheus>>>,
) -> Result<Outcome, InterpretError> {
for receiver in receivers.iter() {
receiver.install(sender).await.map_err(|err| {
InterpretError::new(format!("failed to install receiver: {}", err))
})?;
}
Ok(Outcome::success("successfully deployed receivers".into()))
}
}

View File

@ -1 +1,3 @@
pub mod alerts;
pub mod k8s_prometheus_alerting_score;
pub mod prometheus;

View File

@ -0,0 +1,17 @@
use async_trait::async_trait;
use crate::{
interpret::{InterpretError, Outcome},
inventory::Inventory,
topology::oberservability::monitoring::{AlertReceiver, AlertSender},
};
#[async_trait]
pub trait PrometheusApplicationMonitoring<S: AlertSender> {
async fn install_prometheus(
&self,
sender: &S,
inventory: &Inventory,
receivers: Option<Vec<Box<dyn AlertReceiver<S>>>>,
) -> Result<Outcome, InterpretError>;
}