monitoring-alerting #30

Merged
wjro merged 12 commits from monitoring-alerting into master 2025-05-06 17:50:57 +00:00
6 changed files with 306 additions and 0 deletions

View File

@ -1,3 +1,4 @@
pub mod monitoring_alerting;
mod ha_cluster;
mod host_binding;
mod http;

View File

@ -0,0 +1,108 @@
use std::sync::Arc;
use log::warn;
use tokio::sync::OnceCell;
use k8s_openapi::api::core::v1::Pod;
use kube::{
Client,
api::{Api, ListParams},
};
use async_trait::async_trait;
use crate::{
interpret::{InterpretError, Outcome},
inventory::Inventory,
maestro::Maestro,
modules::monitoring::monitoring_alerting::MonitoringAlertingStackScore,
score::Score,
};
use super::{HelmCommand, K8sAnywhereTopology, Topology, k8s::K8sClient};
#[derive(Clone, Debug)]
struct MonitoringState {
message: String,
}
#[derive(Debug)]
pub struct MonitoringAlertingTopology {
monitoring_state: OnceCell<Option<MonitoringState>>,
}
impl MonitoringAlertingTopology {
pub fn new() -> Self {
Self {
monitoring_state: OnceCell::new(),
}
}
async fn get_monitoring_state(&self) -> Result<Option<MonitoringState>, InterpretError> {
let client = Client::try_default()
.await
.map_err(|e| InterpretError::new(format!("Kubernetes client error: {}", e)))?;
for ns in &["monitoring", "openshift-monitoring"] {
let pods: Api<Pod> = Api::namespaced(client.clone(), ns);
//TODO hardcoding the label is a problem
//check all pods are ready
let lp = ListParams::default().labels("app.kubernetes.io/name=prometheus");
match pods.list(&lp).await {
Ok(pod_list) => {
for p in pod_list.items {
if let Some(status) = p.status {
if let Some(conditions) = status.conditions {
if conditions
.iter()
.any(|c| c.type_ == "Ready" && c.status == "True")
{
return Ok(Some(MonitoringState {
message: format!(
"Prometheus is ready in namespace: {}",
ns
),
}));
}
}
}
}
}
Err(e) => {
warn!("Failed to query pods in ns {}: {}", ns, e);
}
}
}
Ok(None)
}
}
impl<T: Topology> Clone for Box<dyn Score<T>> {
fn clone(&self) -> Box<dyn Score<T>> {
self.clone_box()
}
}
#[async_trait]
impl Topology for MonitoringAlertingTopology {
fn name(&self) -> &str {
"MonitoringAlertingTopology"
}
async fn ensure_ready(&self) -> Result<Outcome, InterpretError> {
if let Some(state) = self.get_monitoring_state().await? {
// Monitoring stack is already ready — stop app.
println!("{}", state.message);
std::process::exit(0);
}
// Monitoring not found — proceed with installation.
Ok(Outcome::success(
"Monitoring stack installation started.".to_string(),
))
}
}
impl HelmCommand for MonitoringAlertingTopology {}

View File

@ -10,3 +10,4 @@ pub mod load_balancer;
pub mod okd;
pub mod opnsense;
pub mod tftp;
pub mod monitoring;

View File

@ -0,0 +1,49 @@
use std::str::FromStr;
use non_blank_string_rs::NonBlankString;
use crate::modules::helm::chart::HelmChartScore;
pub fn kube_prometheus_score(ns: &str) -> HelmChartScore {
//TODO this should be make into a rule with default formatting that can be easily passed as a vec
//to the overrides or something leaving the user to deal with formatting here seems bad
let values = r#"
additionalPrometheusRulesMap:
pvc-alerts:
groups:
- name: pvc-alerts
rules:
- alert: 'PVC Fill Over 95 Percent In 2 Days'
expr: |
(
kubelet_volume_stats_used_bytes
/
kubelet_volume_stats_capacity_bytes
) > 0.95
AND
predict_linear(kubelet_volume_stats_used_bytes[2d], 2 * 24 * 60 * 60)
/
kubelet_volume_stats_capacity_bytes
> 0.95
for: 1m
labels:
severity: warning
annotations:
description: The PVC {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} is predicted to fill over 95% in less than 2 days.
title: PVC {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} will fill over 95% in less than 2 days
"#;
HelmChartScore {
namespace: Some(NonBlankString::from_str(ns).unwrap()),
release_name: NonBlankString::from_str("kube-prometheus").unwrap(),
chart_name: NonBlankString::from_str(
"oci://ghcr.io/prometheus-community/charts/kube-prometheus-stack", //use kube prometheus chart which includes grafana, prometheus, alert
//manager, etc
)
.unwrap(),
chart_version: None,
values_overrides: None,
values_yaml: Some(values.to_string()),
create_namespace: true,
install_only: true,
}
}

View File

@ -0,0 +1,3 @@
pub mod monitoring_alerting;
mod kube_prometheus;

View File

@ -0,0 +1,144 @@
use async_trait::async_trait;
use log::info;
use serde::Serialize;
use crate::{
data::{Id, Version},
interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome},
inventory::Inventory,
maestro::Maestro,
score::{CloneBoxScore, Score},
topology::{HelmCommand, Topology, monitoring_alerting::MonitoringAlertingTopology},
};
use super::kube_prometheus::kube_prometheus_score;
#[derive(Debug)]
pub struct MonitoringAlertingStackScore {
//TODO add documenation to explain why its here
//keeps it open for the end user to specify which stack they want
//if it isnt default kube-prometheus
pub monitoring_stack: Vec<Box<dyn Score<MonitoringAlertingTopology>>>,
pub namespace: String,
}
impl MonitoringAlertingStackScore {
pub fn new(
monitoring_stack: Vec<Box<dyn Score<MonitoringAlertingTopology>>>,
namespace: String,
) -> Self {
Self {
monitoring_stack,
namespace,
}
}
}
impl Default for MonitoringAlertingStackScore {
fn default() -> Self {
let ns = "monitoring";
Self {
monitoring_stack: vec![Box::new(kube_prometheus_score(ns))],
namespace: ns.to_string(),
}
}
}
impl Clone for MonitoringAlertingStackScore {
fn clone(&self) -> Self {
Self {
monitoring_stack: self
.monitoring_stack
.iter()
.map(|s| s.clone_box())
.collect(),
namespace: self.namespace.clone(),
}
}
}
impl Serialize for MonitoringAlertingStackScore {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
use serde::ser::SerializeStruct;
let mut s = serializer.serialize_struct("MonitoringAlertingStackScore", 1)?;
let monitoring_values: Vec<_> = self
.monitoring_stack
.iter()
.map(|m| m.serialize())
.collect();
s.serialize_field("monitoring", &monitoring_values)?;
s.end()
}
}
impl<T:Topology> Score<T> for MonitoringAlertingStackScore {
fn create_interpret(&self) -> Box<dyn Interpret<T>> {
Box::new(MonitoringAlertingStackInterpret {
score: MonitoringAlertingStackScore {
monitoring_stack: self
.monitoring_stack
.iter()
.map(|s| s.clone_box())
.collect(),
namespace: self.namespace.clone(),
},
})
}
fn name(&self) -> String {
format!("MonitoringAlertingStackScore")
}
}
#[derive(Debug)]
struct MonitoringAlertingStackInterpret {
pub score: MonitoringAlertingStackScore,
}
#[async_trait]
impl <T: Topology> Interpret<T> for MonitoringAlertingStackInterpret {
async fn execute(
&self,
_inventory: &Inventory,
_topology: &T,
) -> Result<Outcome, InterpretError> {
let inventory = Inventory::autoload();
let topology = MonitoringAlertingTopology::new();
let maestro = match Maestro::initialize(inventory, topology).await {
Ok(m) => m,
Err(e) => {
println!("failed to initialize Maestro: {}", e);
std::process::exit(1);
}
};
let scores_vec = self.score.monitoring_stack.clone();
for s in scores_vec{
info!("Running: {}", s.name());
maestro.interpret(s).await?;
}
Ok(Outcome::success(format!(
"monitoring stack installed in {} namespace",
self.score.namespace
)))
}
fn get_name(&self) -> InterpretName {
todo!()
}
fn get_version(&self) -> Version {
todo!()
}
fn get_status(&self) -> InterpretStatus {
todo!()
}
fn get_children(&self) -> Vec<Id> {
todo!()
}
}