monitoring-alerting #30

Merged
wjro merged 12 commits from monitoring-alerting into master 2025-05-06 17:50:57 +00:00
6 changed files with 306 additions and 0 deletions

View File

@ -1,3 +1,4 @@
pub mod monitoring_alerting;
mod ha_cluster;
mod host_binding;
mod http;

View File

@ -0,0 +1,108 @@
use std::sync::Arc;
use log::warn;
use tokio::sync::OnceCell;
use k8s_openapi::api::core::v1::Pod;
use kube::{
Client,
api::{Api, ListParams},
};
use async_trait::async_trait;
use crate::{
interpret::{InterpretError, Outcome},
inventory::Inventory,
maestro::Maestro,
modules::monitoring::monitoring_alerting::MonitoringAlertingStackScore,
score::Score,
};
use super::{HelmCommand, K8sAnywhereTopology, Topology, k8s::K8sClient};
#[derive(Clone, Debug)]
struct MonitoringState {
message: String,
}
#[derive(Debug)]
pub struct MonitoringAlertingTopology {
monitoring_state: OnceCell<Option<MonitoringState>>,
}
impl MonitoringAlertingTopology {
pub fn new() -> Self {
Self {
monitoring_state: OnceCell::new(),
}
}
async fn get_monitoring_state(&self) -> Result<Option<MonitoringState>, InterpretError> {
let client = Client::try_default()
.await
.map_err(|e| InterpretError::new(format!("Kubernetes client error: {}", e)))?;
for ns in &["monitoring", "openshift-monitoring"] {
let pods: Api<Pod> = Api::namespaced(client.clone(), ns);
//TODO hardcoding the label is a problem
//check all pods are ready
let lp = ListParams::default().labels("app.kubernetes.io/name=prometheus");
match pods.list(&lp).await {
Ok(pod_list) => {
for p in pod_list.items {
if let Some(status) = p.status {
if let Some(conditions) = status.conditions {
if conditions
.iter()
.any(|c| c.type_ == "Ready" && c.status == "True")
{
return Ok(Some(MonitoringState {
message: format!(
"Prometheus is ready in namespace: {}",
ns
),
}));
}
}
}
}
}
Err(e) => {
warn!("Failed to query pods in ns {}: {}", ns, e);
}
}
}
Ok(None)
}
}
impl<T: Topology> Clone for Box<dyn Score<T>> {
fn clone(&self) -> Box<dyn Score<T>> {
self.clone_box()
}
}
#[async_trait]
impl Topology for MonitoringAlertingTopology {
fn name(&self) -> &str {
"MonitoringAlertingTopology"
}
async fn ensure_ready(&self) -> Result<Outcome, InterpretError> {
if let Some(state) = self.get_monitoring_state().await? {
// Monitoring stack is already ready — stop app.
println!("{}", state.message);
std::process::exit(0);
}
// Monitoring not found — proceed with installation.
Ok(Outcome::success(
"Monitoring stack installation started.".to_string(),
))
}
}
impl HelmCommand for MonitoringAlertingTopology {}

View File

@ -10,3 +10,4 @@ pub mod load_balancer;
pub mod okd;
pub mod opnsense;
pub mod tftp;
pub mod monitoring;

View File

@ -0,0 +1,49 @@
use std::str::FromStr;
use non_blank_string_rs::NonBlankString;
use crate::modules::helm::chart::HelmChartScore;
pub fn kube_prometheus_score(ns: &str) -> HelmChartScore {
//TODO this should be make into a rule with default formatting that can be easily passed as a vec
//to the overrides or something leaving the user to deal with formatting here seems bad
let values = r#"
additionalPrometheusRulesMap:
pvc-alerts:
groups:
- name: pvc-alerts
rules:
- alert: 'PVC Fill Over 95 Percent In 2 Days'
expr: |
(
kubelet_volume_stats_used_bytes
/
kubelet_volume_stats_capacity_bytes
) > 0.95
AND
predict_linear(kubelet_volume_stats_used_bytes[2d], 2 * 24 * 60 * 60)
/
kubelet_volume_stats_capacity_bytes
> 0.95
for: 1m
labels:
severity: warning
annotations:
description: The PVC {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} is predicted to fill over 95% in less than 2 days.
title: PVC {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} will fill over 95% in less than 2 days
"#;
HelmChartScore {
namespace: Some(NonBlankString::from_str(ns).unwrap()),
release_name: NonBlankString::from_str("kube-prometheus").unwrap(),
chart_name: NonBlankString::from_str(
"oci://ghcr.io/prometheus-community/charts/kube-prometheus-stack", //use kube prometheus chart which includes grafana, prometheus, alert
//manager, etc
)
.unwrap(),
chart_version: None,
values_overrides: None,
values_yaml: Some(values.to_string()),
create_namespace: true,
install_only: true,
}
}

View File

@ -0,0 +1,3 @@
pub mod monitoring_alerting;
mod kube_prometheus;

View File

@ -0,0 +1,144 @@
use async_trait::async_trait;
use log::info;
use serde::Serialize;
use crate::{
data::{Id, Version},
interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome},
inventory::Inventory,
maestro::Maestro,
score::{CloneBoxScore, Score},
topology::{HelmCommand, Topology, monitoring_alerting::MonitoringAlertingTopology},
};
use super::kube_prometheus::kube_prometheus_score;
#[derive(Debug)]
pub struct MonitoringAlertingStackScore {
//TODO add documenation to explain why its here
//keeps it open for the end user to specify which stack they want
Outdated
Review

Why does the MonitoringAlertingStackScore need a Vec<Box<dyn Score<MonitoringAlertingTopology>>>? It seems like it's never used for anything.

A score's variables should be what is needed to set it up, i.e. any values, rules, etc you want to pass in for the interpret to use to spin up the score

Why does the `MonitoringAlertingStackScore` need a `Vec<Box<dyn Score<MonitoringAlertingTopology>>>`? It seems like it's never used for anything. A score's variables should be what is needed to set it up, i.e. any values, rules, etc you want to pass in for the interpret to use to spin up the score
//if it isnt default kube-prometheus
pub monitoring_stack: Vec<Box<dyn Score<MonitoringAlertingTopology>>>,
pub namespace: String,
}
impl MonitoringAlertingStackScore {
pub fn new(
monitoring_stack: Vec<Box<dyn Score<MonitoringAlertingTopology>>>,
namespace: String,
) -> Self {
Self {
monitoring_stack,
namespace,
}
}
}
impl Default for MonitoringAlertingStackScore {
fn default() -> Self {
let ns = "monitoring";
Self {
monitoring_stack: vec![Box::new(kube_prometheus_score(ns))],
namespace: ns.to_string(),
}
}
}
impl Clone for MonitoringAlertingStackScore {
fn clone(&self) -> Self {
Self {
monitoring_stack: self
.monitoring_stack
.iter()
.map(|s| s.clone_box())
.collect(),
namespace: self.namespace.clone(),
}
}
}
impl Serialize for MonitoringAlertingStackScore {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
use serde::ser::SerializeStruct;
let mut s = serializer.serialize_struct("MonitoringAlertingStackScore", 1)?;
let monitoring_values: Vec<_> = self
.monitoring_stack
.iter()
.map(|m| m.serialize())
.collect();
s.serialize_field("monitoring", &monitoring_values)?;
s.end()
}
}
impl<T:Topology> Score<T> for MonitoringAlertingStackScore {
fn create_interpret(&self) -> Box<dyn Interpret<T>> {
Box::new(MonitoringAlertingStackInterpret {
score: MonitoringAlertingStackScore {
monitoring_stack: self
.monitoring_stack
.iter()
.map(|s| s.clone_box())
.collect(),
namespace: self.namespace.clone(),
},
})
}
fn name(&self) -> String {
format!("MonitoringAlertingStackScore")
}
}
#[derive(Debug)]
struct MonitoringAlertingStackInterpret {
pub score: MonitoringAlertingStackScore,
}
#[async_trait]
impl <T: Topology> Interpret<T> for MonitoringAlertingStackInterpret {
async fn execute(
&self,
_inventory: &Inventory,
_topology: &T,
) -> Result<Outcome, InterpretError> {
let inventory = Inventory::autoload();
let topology = MonitoringAlertingTopology::new();
let maestro = match Maestro::initialize(inventory, topology).await {
Ok(m) => m,
Err(e) => {
println!("failed to initialize Maestro: {}", e);
std::process::exit(1);
}
};
let scores_vec = self.score.monitoring_stack.clone();
for s in scores_vec{
info!("Running: {}", s.name());
maestro.interpret(s).await?;
}
Ok(Outcome::success(format!(
"monitoring stack installed in {} namespace",
self.score.namespace
)))
}
fn get_name(&self) -> InterpretName {
todo!()
}
fn get_version(&self) -> Version {
todo!()
}
fn get_status(&self) -> InterpretStatus {
todo!()
}
fn get_children(&self) -> Vec<Id> {
todo!()
}
}