added a monitoring stack that works with openshift/okd. Okd needs to use the cluster observability operator in order to deploy namespaced prometheuses and alertmanagers #134

Merged
letian merged 5 commits from feat/monitoring_cluster_observability into master 2025-09-08 14:22:06 +00:00
24 changed files with 1695 additions and 7 deletions

15
Cargo.lock generated
View File

@ -4631,6 +4631,21 @@ dependencies = [
"subtle", "subtle",
] ]
[[package]]
name = "rhob-application-monitoring"
version = "0.1.0"
dependencies = [
"base64 0.22.1",
"env_logger",
"harmony",
"harmony_cli",
"harmony_macros",
"harmony_types",
"log",
"tokio",
"url",
]
[[package]] [[package]]
name = "ring" name = "ring"
version = "0.17.14" version = "0.17.14"

View File

@ -0,0 +1,17 @@
[package]
name = "rhob-application-monitoring"
edition = "2024"
version.workspace = true
readme.workspace = true
license.workspace = true
[dependencies]
harmony = { path = "../../harmony" }
harmony_cli = { path = "../../harmony_cli" }
harmony_types = { path = "../../harmony_types" }
harmony_macros = { path = "../../harmony_macros" }
tokio = { workspace = true }
log = { workspace = true }
env_logger = { workspace = true }
url = { workspace = true }
base64.workspace = true

View File

@ -0,0 +1,50 @@
use std::{path::PathBuf, sync::Arc};
use harmony::{
inventory::Inventory,
modules::{
application::{
ApplicationScore, RustWebFramework, RustWebapp,
features::rhob_monitoring::RHOBMonitoring,
},
monitoring::alert_channel::discord_alert_channel::DiscordWebhook,
},
topology::K8sAnywhereTopology,
};
use harmony_types::net::Url;
#[tokio::main]
async fn main() {
let application = Arc::new(RustWebapp {
name: "test-rhob-monitoring".to_string(),
domain: Url::Url(url::Url::parse("htps://some-fake-url").unwrap()),
project_root: PathBuf::from("./webapp"), // Relative from 'harmony-path' param
framework: Some(RustWebFramework::Leptos),
service_port: 3000,
});
let discord_receiver = DiscordWebhook {
name: "test-discord".to_string(),
url: Url::Url(url::Url::parse("https://discord.doesnt.exist.com").unwrap()),
};
let app = ApplicationScore {
features: vec![
Box::new(RHOBMonitoring {
application: application.clone(),
Review

Do we really want to keep different monitoring strategies? Instead of just providing one (opinionated) Monitoring feature?

Especially when some part of it is copy-paste from the previous monitoring strategy.

Do we really want to keep different monitoring strategies? Instead of just providing one (opinionated) `Monitoring` feature? Especially when some part of it is copy-paste from the previous monitoring strategy.
alert_receiver: vec![Box::new(discord_receiver)],
}),
// TODO add backups, multisite ha, etc
],
application,
};
harmony_cli::run(
Inventory::autoload(),
K8sAnywhereTopology::from_env(),
vec![Box::new(app)],
None,
)
.await
.unwrap();
}

View File

@ -32,6 +32,7 @@ pub enum InterpretName {
K8sPrometheusCrdAlerting, K8sPrometheusCrdAlerting,
DiscoverInventoryAgent, DiscoverInventoryAgent,
CephClusterHealth, CephClusterHealth,
RHOBAlerting,
} }
impl std::fmt::Display for InterpretName { impl std::fmt::Display for InterpretName {
@ -60,6 +61,7 @@ impl std::fmt::Display for InterpretName {
InterpretName::K8sPrometheusCrdAlerting => f.write_str("K8sPrometheusCrdAlerting"), InterpretName::K8sPrometheusCrdAlerting => f.write_str("K8sPrometheusCrdAlerting"),
InterpretName::DiscoverInventoryAgent => f.write_str("DiscoverInventoryAgent"), InterpretName::DiscoverInventoryAgent => f.write_str("DiscoverInventoryAgent"),
InterpretName::CephClusterHealth => f.write_str("CephClusterHealth"), InterpretName::CephClusterHealth => f.write_str("CephClusterHealth"),
InterpretName::RHOBAlerting => f.write_str("RHOBAlerting"),
} }
} }
} }

View File

@ -14,10 +14,11 @@ use crate::{
monitoring::kube_prometheus::crd::{ monitoring::kube_prometheus::crd::{
crd_alertmanager_config::CRDPrometheus, crd_alertmanager_config::CRDPrometheus,
prometheus_operator::prometheus_operator_helm_chart_score, prometheus_operator::prometheus_operator_helm_chart_score,
rhob_alertmanager_config::RHOBObservability,
}, },
prometheus::{ prometheus::{
k8s_prometheus_alerting_score::K8sPrometheusCRDAlertingScore, k8s_prometheus_alerting_score::K8sPrometheusCRDAlertingScore,
prometheus::PrometheusApplicationMonitoring, prometheus::PrometheusApplicationMonitoring, rhob_alerting_score::RHOBAlertingScore,
}, },
}, },
score::Score, score::Score,
@ -108,6 +109,43 @@ impl PrometheusApplicationMonitoring<CRDPrometheus> for K8sAnywhereTopology {
} }
} }
#[async_trait]
impl PrometheusApplicationMonitoring<RHOBObservability> for K8sAnywhereTopology {
async fn install_prometheus(
&self,
sender: &RHOBObservability,
inventory: &Inventory,
receivers: Option<Vec<Box<dyn AlertReceiver<RHOBObservability>>>>,
) -> Result<PreparationOutcome, PreparationError> {
let po_result = self.ensure_cluster_observability_operator(sender).await?;
if po_result == PreparationOutcome::Noop {
debug!("Skipping Prometheus CR installation due to missing operator.");
return Ok(po_result);
}
let result = self
.get_cluster_observability_operator_prometheus_application_score(
sender.clone(),
receivers,
)
.await
.interpret(inventory, self)
.await;
match result {
Ok(outcome) => match outcome.status {
InterpretStatus::SUCCESS => Ok(PreparationOutcome::Success {
details: outcome.message,
}),
InterpretStatus::NOOP => Ok(PreparationOutcome::Noop),
_ => Err(PreparationError::new(outcome.message)),
},
Err(err) => Err(PreparationError::new(err.to_string())),
}
}
}
impl Serialize for K8sAnywhereTopology { impl Serialize for K8sAnywhereTopology {
fn serialize<S>(&self, _serializer: S) -> Result<S::Ok, S::Error> fn serialize<S>(&self, _serializer: S) -> Result<S::Ok, S::Error>
where where
@ -134,6 +172,19 @@ impl K8sAnywhereTopology {
} }
} }
async fn get_cluster_observability_operator_prometheus_application_score(
&self,
sender: RHOBObservability,
receivers: Option<Vec<Box<dyn AlertReceiver<RHOBObservability>>>>,
) -> RHOBAlertingScore {
RHOBAlertingScore {
sender,
receivers: receivers.unwrap_or_default(),
service_monitors: vec![],
prometheus_rules: vec![],
}
}
async fn get_k8s_prometheus_application_score( async fn get_k8s_prometheus_application_score(
&self, &self,
sender: CRDPrometheus, sender: CRDPrometheus,
@ -286,6 +337,60 @@ impl K8sAnywhereTopology {
} }
} }
async fn ensure_cluster_observability_operator(
&self,
sender: &RHOBObservability,
) -> Result<PreparationOutcome, PreparationError> {
let status = Command::new("sh")
.args(["-c", "kubectl get crd -A | grep -i rhobs"])
.status()
.map_err(|e| PreparationError::new(format!("could not connect to cluster: {}", e)))?;
if !status.success() {
if let Some(Some(k8s_state)) = self.k8s_state.get() {
match k8s_state.source {
K8sSource::LocalK3d => {
debug!("installing cluster observability operator");
todo!();
let op_score =
prometheus_operator_helm_chart_score(sender.namespace.clone());
let result = op_score.interpret(&Inventory::empty(), self).await;
return match result {
Ok(outcome) => match outcome.status {
InterpretStatus::SUCCESS => Ok(PreparationOutcome::Success {
details: "installed cluster observability operator".into(),
}),
InterpretStatus::NOOP => Ok(PreparationOutcome::Noop),
_ => Err(PreparationError::new(
"failed to install cluster observability operator (unknown error)".into(),
)),
},
Err(err) => Err(PreparationError::new(err.to_string())),
};
}
K8sSource::Kubeconfig => {
debug!(
"unable to install cluster observability operator, contact cluster admin"
);
return Ok(PreparationOutcome::Noop);
}
}
} else {
warn!(
"Unable to detect k8s_state. Skipping Cluster Observability Operator install."
);
return Ok(PreparationOutcome::Noop);
}
}
debug!("Cluster Observability Operator is already present, skipping install");
Ok(PreparationOutcome::Success {
details: "cluster observability operator present in cluster".into(),
})
}
async fn ensure_prometheus_operator( async fn ensure_prometheus_operator(
&self, &self,
sender: &CRDPrometheus, sender: &CRDPrometheus,

View File

@ -1,4 +1,5 @@
mod endpoint; mod endpoint;
pub mod rhob_monitoring;
pub use endpoint::*; pub use endpoint::*;
mod monitoring; mod monitoring;

View File

@ -0,0 +1,109 @@
use std::sync::Arc;
use crate::modules::application::{Application, ApplicationFeature};
use crate::modules::monitoring::application_monitoring::application_monitoring_score::ApplicationMonitoringScore;
use crate::modules::monitoring::application_monitoring::rhobs_application_monitoring_score::ApplicationRHOBMonitoringScore;
use crate::modules::monitoring::kube_prometheus::crd::rhob_alertmanager_config::RHOBObservability;
use crate::topology::MultiTargetTopology;
use crate::{
inventory::Inventory,
modules::monitoring::{
alert_channel::webhook_receiver::WebhookReceiver, ntfy::ntfy::NtfyScore,
},
score::Score,
topology::{HelmCommand, K8sclient, Topology, tenant::TenantManager},
};
use crate::{
modules::prometheus::prometheus::PrometheusApplicationMonitoring,
topology::oberservability::monitoring::AlertReceiver,
};
use async_trait::async_trait;
use base64::{Engine as _, engine::general_purpose};
use harmony_types::net::Url;
use log::{debug, info};
#[derive(Debug, Clone)]
pub struct RHOBMonitoring {
pub application: Arc<dyn Application>,
pub alert_receiver: Vec<Box<dyn AlertReceiver<RHOBObservability>>>,
}
#[async_trait]
impl<
T: Topology
+ HelmCommand
+ 'static
+ TenantManager
+ K8sclient
+ MultiTargetTopology
+ std::fmt::Debug
+ PrometheusApplicationMonitoring<RHOBObservability>,
> ApplicationFeature<T> for RHOBMonitoring
{
async fn ensure_installed(&self, topology: &T) -> Result<(), String> {
info!("Ensuring monitoring is available for application");
let namespace = topology
.get_tenant_config()
.await
.map(|ns| ns.name.clone())
.unwrap_or_else(|| self.application.name());
let mut alerting_score = ApplicationRHOBMonitoringScore {
sender: RHOBObservability {
namespace: namespace.clone(),
client: topology.k8s_client().await.unwrap(),
},
application: self.application.clone(),
receivers: self.alert_receiver.clone(),
};
let ntfy = NtfyScore {
namespace: namespace.clone(),
host: "ntfy.harmonydemo.apps.ncd0.harmony.mcd".to_string(),
};
ntfy.interpret(&Inventory::empty(), topology)
.await
.map_err(|e| e.to_string())?;
let ntfy_default_auth_username = "harmony";
let ntfy_default_auth_password = "harmony";
let ntfy_default_auth_header = format!(
"Basic {}",
general_purpose::STANDARD.encode(format!(
"{ntfy_default_auth_username}:{ntfy_default_auth_password}"
))
);
debug!("ntfy_default_auth_header: {ntfy_default_auth_header}");
let ntfy_default_auth_param = general_purpose::STANDARD
.encode(ntfy_default_auth_header)
.replace("=", "");
debug!("ntfy_default_auth_param: {ntfy_default_auth_param}");
let ntfy_receiver = WebhookReceiver {
name: "ntfy-webhook".to_string(),
url: Url::Url(
url::Url::parse(
format!(
"http://ntfy.{}.svc.cluster.local/rust-web-app?auth={ntfy_default_auth_param}",
namespace.clone()
)
.as_str(),
)
.unwrap(),
),
};
alerting_score.receivers.push(Box::new(ntfy_receiver));
alerting_score
.interpret(&Inventory::empty(), topology)
.await
.map_err(|e| e.to_string())?;
Ok(())
}
fn name(&self) -> String {
"Monitoring".to_string()
}
}

View File

@ -4,6 +4,7 @@ use std::collections::BTreeMap;
use async_trait::async_trait; use async_trait::async_trait;
use k8s_openapi::api::core::v1::Secret; use k8s_openapi::api::core::v1::Secret;
use kube::api::ObjectMeta; use kube::api::ObjectMeta;
use log::debug;
use serde::Serialize; use serde::Serialize;
use serde_json::json; use serde_json::json;
use serde_yaml::{Mapping, Value}; use serde_yaml::{Mapping, Value};
@ -11,6 +12,7 @@ use serde_yaml::{Mapping, Value};
use crate::modules::monitoring::kube_prometheus::crd::crd_alertmanager_config::{ use crate::modules::monitoring::kube_prometheus::crd::crd_alertmanager_config::{
AlertmanagerConfig, AlertmanagerConfigSpec, CRDPrometheus, AlertmanagerConfig, AlertmanagerConfigSpec, CRDPrometheus,
}; };
use crate::modules::monitoring::kube_prometheus::crd::rhob_alertmanager_config::RHOBObservability;
use crate::{ use crate::{
interpret::{InterpretError, Outcome}, interpret::{InterpretError, Outcome},
modules::monitoring::{ modules::monitoring::{
@ -30,6 +32,71 @@ pub struct DiscordWebhook {
pub url: Url, pub url: Url,
} }
#[async_trait]
impl AlertReceiver<RHOBObservability> for DiscordWebhook {
async fn install(&self, sender: &RHOBObservability) -> Result<Outcome, InterpretError> {
let spec = crate::modules::monitoring::kube_prometheus::crd::rhob_alertmanager_config::AlertmanagerConfigSpec {
data: json!({
"route": {
"receiver": self.name,
},
"receivers": [
{
"name": self.name,
"webhookConfigs": [
{
"url": self.url,
}
]
}
]
}),
};
let alertmanager_configs = crate::modules::monitoring::kube_prometheus::crd::rhob_alertmanager_config::AlertmanagerConfig {
metadata: ObjectMeta {
name: Some(self.name.clone()),
labels: Some(std::collections::BTreeMap::from([(
"alertmanagerConfig".to_string(),
"enabled".to_string(),
)])),
namespace: Some(sender.namespace.clone()),
..Default::default()
},
spec,
};
debug!(
"alertmanager_configs yaml:\n{:#?}",
serde_yaml::to_string(&alertmanager_configs)
);
debug!(
"alert manager configs: \n{:#?}",
alertmanager_configs.clone()
);
sender
.client
.apply(&alertmanager_configs, Some(&sender.namespace))
.await?;
Ok(Outcome::success(format!(
"installed rhob-alertmanagerconfigs for {}",
self.name
)))
}
fn name(&self) -> String {
"webhook-receiver".to_string()
}
fn clone_box(&self) -> Box<dyn AlertReceiver<RHOBObservability>> {
Box::new(self.clone())
}
fn as_any(&self) -> &dyn Any {
self
}
}
#[async_trait] #[async_trait]
impl AlertReceiver<CRDPrometheus> for DiscordWebhook { impl AlertReceiver<CRDPrometheus> for DiscordWebhook {
async fn install(&self, sender: &CRDPrometheus) -> Result<Outcome, InterpretError> { async fn install(&self, sender: &CRDPrometheus) -> Result<Outcome, InterpretError> {

View File

@ -11,8 +11,8 @@ use crate::{
interpret::{InterpretError, Outcome}, interpret::{InterpretError, Outcome},
modules::monitoring::{ modules::monitoring::{
kube_prometheus::{ kube_prometheus::{
crd::crd_alertmanager_config::{ crd::{
AlertmanagerConfig, AlertmanagerConfigSpec, CRDPrometheus, crd_alertmanager_config::CRDPrometheus, rhob_alertmanager_config::RHOBObservability,
}, },
prometheus::{KubePrometheus, KubePrometheusReceiver}, prometheus::{KubePrometheus, KubePrometheusReceiver},
types::{AlertChannelConfig, AlertManagerChannelConfig}, types::{AlertChannelConfig, AlertManagerChannelConfig},
@ -30,9 +30,9 @@ pub struct WebhookReceiver {
} }
#[async_trait] #[async_trait]
impl AlertReceiver<CRDPrometheus> for WebhookReceiver { impl AlertReceiver<RHOBObservability> for WebhookReceiver {
async fn install(&self, sender: &CRDPrometheus) -> Result<Outcome, InterpretError> { async fn install(&self, sender: &RHOBObservability) -> Result<Outcome, InterpretError> {
let spec = AlertmanagerConfigSpec { let spec = crate::modules::monitoring::kube_prometheus::crd::rhob_alertmanager_config::AlertmanagerConfigSpec {
data: json!({ data: json!({
"route": { "route": {
"receiver": self.name, "receiver": self.name,
@ -50,7 +50,68 @@ impl AlertReceiver<CRDPrometheus> for WebhookReceiver {
}), }),
}; };
let alertmanager_configs = AlertmanagerConfig { let alertmanager_configs = crate::modules::monitoring::kube_prometheus::crd::rhob_alertmanager_config::AlertmanagerConfig {
metadata: ObjectMeta {
name: Some(self.name.clone()),
labels: Some(std::collections::BTreeMap::from([(
"alertmanagerConfig".to_string(),
"enabled".to_string(),
)])),
namespace: Some(sender.namespace.clone()),
..Default::default()
},
spec,
};
debug!(
"alert manager configs: \n{:#?}",
alertmanager_configs.clone()
);
sender
.client
.apply(&alertmanager_configs, Some(&sender.namespace))
.await?;
Ok(Outcome::success(format!(
"installed rhob-alertmanagerconfigs for {}",
self.name
)))
}
fn name(&self) -> String {
"webhook-receiver".to_string()
}
fn clone_box(&self) -> Box<dyn AlertReceiver<RHOBObservability>> {
Box::new(self.clone())
}
fn as_any(&self) -> &dyn Any {
self
}
}
#[async_trait]
impl AlertReceiver<CRDPrometheus> for WebhookReceiver {
async fn install(&self, sender: &CRDPrometheus) -> Result<Outcome, InterpretError> {
let spec = crate::modules::monitoring::kube_prometheus::crd::crd_alertmanager_config::AlertmanagerConfigSpec {
data: json!({
"route": {
"receiver": self.name,
},
"receivers": [
{
"name": self.name,
"webhookConfigs": [
{
"url": self.url,
}
]
}
]
}),
};
let alertmanager_configs = crate::modules::monitoring::kube_prometheus::crd::crd_alertmanager_config::AlertmanagerConfig {
metadata: ObjectMeta { metadata: ObjectMeta {
name: Some(self.name.clone()), name: Some(self.name.clone()),
labels: Some(std::collections::BTreeMap::from([( labels: Some(std::collections::BTreeMap::from([(
@ -115,6 +176,7 @@ impl PrometheusReceiver for WebhookReceiver {
self.get_config().await self.get_config().await
} }
} }
#[async_trait] #[async_trait]
impl AlertReceiver<KubePrometheus> for WebhookReceiver { impl AlertReceiver<KubePrometheus> for WebhookReceiver {
async fn install(&self, sender: &KubePrometheus) -> Result<Outcome, InterpretError> { async fn install(&self, sender: &KubePrometheus) -> Result<Outcome, InterpretError> {

View File

@ -1 +1,2 @@
pub mod application_monitoring_score; pub mod application_monitoring_score;
pub mod rhobs_application_monitoring_score;

View File

@ -0,0 +1,94 @@
use std::sync::Arc;
use async_trait::async_trait;
use serde::Serialize;
use crate::{
data::Version,
interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome},
inventory::Inventory,
modules::{
application::Application,
monitoring::kube_prometheus::crd::{
crd_alertmanager_config::CRDPrometheus, rhob_alertmanager_config::RHOBObservability,
},
prometheus::prometheus::PrometheusApplicationMonitoring,
},
score::Score,
topology::{PreparationOutcome, Topology, oberservability::monitoring::AlertReceiver},
};
use harmony_types::id::Id;
#[derive(Debug, Clone, Serialize)]
pub struct ApplicationRHOBMonitoringScore {
pub sender: RHOBObservability,
pub application: Arc<dyn Application>,
pub receivers: Vec<Box<dyn AlertReceiver<RHOBObservability>>>,
}
impl<T: Topology + PrometheusApplicationMonitoring<RHOBObservability>> Score<T>
for ApplicationRHOBMonitoringScore
{
fn create_interpret(&self) -> Box<dyn Interpret<T>> {
Box::new(ApplicationRHOBMonitoringInterpret {
score: self.clone(),
})
}
fn name(&self) -> String {
format!(
"{} monitoring [ApplicationRHOBMonitoringScore]",
self.application.name()
)
}
}
#[derive(Debug)]
pub struct ApplicationRHOBMonitoringInterpret {
score: ApplicationRHOBMonitoringScore,
}
#[async_trait]
impl<T: Topology + PrometheusApplicationMonitoring<RHOBObservability>> Interpret<T>
for ApplicationRHOBMonitoringInterpret
{
async fn execute(
&self,
inventory: &Inventory,
topology: &T,
) -> Result<Outcome, InterpretError> {
let result = topology
.install_prometheus(
&self.score.sender,
inventory,
Some(self.score.receivers.clone()),
)
.await;
match result {
Ok(outcome) => match outcome {
PreparationOutcome::Success { details: _ } => {
Ok(Outcome::success("Prometheus installed".into()))
}
PreparationOutcome::Noop => Ok(Outcome::noop()),
},
Err(err) => Err(InterpretError::from(err)),
}
}
fn get_name(&self) -> InterpretName {
InterpretName::ApplicationMonitoring
}
fn get_version(&self) -> Version {
todo!()
}
fn get_status(&self) -> InterpretStatus {
todo!()
}
fn get_children(&self) -> Vec<Id> {
todo!()
}
}

View File

@ -7,5 +7,15 @@ pub mod crd_prometheuses;
pub mod grafana_default_dashboard; pub mod grafana_default_dashboard;
pub mod grafana_operator; pub mod grafana_operator;
pub mod prometheus_operator; pub mod prometheus_operator;
pub mod rhob_alertmanager_config;
pub mod rhob_alertmanagers;
pub mod rhob_cluster_observability_operator;
pub mod rhob_default_rules;
pub mod rhob_grafana;
pub mod rhob_monitoring_stack;
pub mod rhob_prometheus_rules;
pub mod rhob_prometheuses;
pub mod rhob_role;
pub mod rhob_service_monitor;
pub mod role; pub mod role;
pub mod service_monitor; pub mod service_monitor;

View File

@ -0,0 +1,50 @@
use std::sync::Arc;
use kube::CustomResource;
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use crate::topology::{
k8s::K8sClient,
oberservability::monitoring::{AlertReceiver, AlertSender},
};
#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)]
#[kube(
group = "monitoring.rhobs",
version = "v1alpha1",
kind = "AlertmanagerConfig",
plural = "alertmanagerconfigs",
namespaced
)]
pub struct AlertmanagerConfigSpec {
#[serde(flatten)]
pub data: serde_json::Value,
}
#[derive(Debug, Clone, Serialize)]
pub struct RHOBObservability {
pub namespace: String,
pub client: Arc<K8sClient>,
}
impl AlertSender for RHOBObservability {
fn name(&self) -> String {
"RHOBAlertManager".to_string()
}
}
impl Clone for Box<dyn AlertReceiver<RHOBObservability>> {
fn clone(&self) -> Self {
self.clone_box()
}
}
impl Serialize for Box<dyn AlertReceiver<RHOBObservability>> {
fn serialize<S>(&self, _serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
todo!()
}
}

View File

@ -0,0 +1,52 @@
use kube::CustomResource;
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use super::crd_prometheuses::LabelSelector;
/// Rust CRD for `Alertmanager` from Prometheus Operator
#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)]
#[kube(
group = "monitoring.rhobs",
version = "v1",
kind = "Alertmanager",
plural = "alertmanagers",
namespaced
)]
#[serde(rename_all = "camelCase")]
pub struct AlertmanagerSpec {
/// Number of replicas for HA
pub replicas: i32,
/// Selectors for AlertmanagerConfig CRDs
#[serde(default, skip_serializing_if = "Option::is_none")]
pub alertmanager_config_selector: Option<LabelSelector>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub alertmanager_config_namespace_selector: Option<LabelSelector>,
/// Optional pod template metadata (annotations, labels)
#[serde(default, skip_serializing_if = "Option::is_none")]
pub pod_metadata: Option<LabelSelector>,
/// Optional topology spread settings
#[serde(default, skip_serializing_if = "Option::is_none")]
pub version: Option<String>,
}
impl Default for AlertmanagerSpec {
fn default() -> Self {
AlertmanagerSpec {
replicas: 1,
// Match all AlertmanagerConfigs in the same namespace
alertmanager_config_namespace_selector: None,
// Empty selector matches all AlertmanagerConfigs in that namespace
alertmanager_config_selector: Some(LabelSelector::default()),
pod_metadata: None,
version: None,
}
}
}

View File

@ -0,0 +1,22 @@
use std::str::FromStr;
use non_blank_string_rs::NonBlankString;
use crate::modules::helm::chart::HelmChartScore;
//TODO package chart or something for COO okd
pub fn rhob_cluster_observability_operator() -> HelmChartScore {
HelmChartScore {
namespace: None,
release_name: NonBlankString::from_str("").unwrap(),
letian marked this conversation as resolved
Review

shouldn't it have a release name?

shouldn't it have a release name?
chart_name: NonBlankString::from_str(
"oci://hub.nationtech.io/harmony/nt-prometheus-operator",
)
.unwrap(),
chart_version: None,
values_overrides: None,
values_yaml: None,
create_namespace: true,
install_only: true,
repository: None,
}
}

View File

@ -0,0 +1,26 @@
use crate::modules::{
monitoring::kube_prometheus::crd::rhob_prometheus_rules::Rule,
prometheus::alerts::k8s::{
deployment::alert_deployment_unavailable,
pod::{alert_container_restarting, alert_pod_not_ready, pod_failed},
pvc::high_pvc_fill_rate_over_two_days,
service::alert_service_down,
},
};
pub fn build_default_application_rules() -> Vec<Rule> {
let pod_failed: Rule = pod_failed().into();
let container_restarting: Rule = alert_container_restarting().into();
let pod_not_ready: Rule = alert_pod_not_ready().into();
let service_down: Rule = alert_service_down().into();
let deployment_unavailable: Rule = alert_deployment_unavailable().into();
let high_pvc_fill_rate: Rule = high_pvc_fill_rate_over_two_days().into();
vec![
pod_failed,
container_restarting,
pod_not_ready,
service_down,
deployment_unavailable,
high_pvc_fill_rate,
]
}

View File

@ -0,0 +1,153 @@
use std::collections::BTreeMap;
use kube::CustomResource;
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use crate::modules::monitoring::kube_prometheus::crd::rhob_prometheuses::LabelSelector;
#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)]
#[kube(
group = "grafana.integreatly.org",
version = "v1beta1",
kind = "Grafana",
plural = "grafanas",
namespaced
)]
#[serde(rename_all = "camelCase")]
pub struct GrafanaSpec {
#[serde(default, skip_serializing_if = "Option::is_none")]
pub config: Option<GrafanaConfig>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub admin_user: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub admin_password: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub ingress: Option<GrafanaIngress>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub persistence: Option<GrafanaPersistence>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub resources: Option<ResourceRequirements>,
}
#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)]
#[serde(rename_all = "camelCase")]
pub struct GrafanaConfig {
#[serde(default, skip_serializing_if = "Option::is_none")]
pub log: Option<GrafanaLogConfig>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub security: Option<GrafanaSecurityConfig>,
}
#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)]
#[serde(rename_all = "camelCase")]
pub struct GrafanaLogConfig {
#[serde(default, skip_serializing_if = "Option::is_none")]
pub mode: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub level: Option<String>,
}
#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)]
#[serde(rename_all = "camelCase")]
pub struct GrafanaSecurityConfig {
#[serde(default, skip_serializing_if = "Option::is_none")]
pub admin_user: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub admin_password: Option<String>,
}
#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)]
#[serde(rename_all = "camelCase")]
pub struct GrafanaIngress {
#[serde(default, skip_serializing_if = "Option::is_none")]
pub enabled: Option<bool>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub hosts: Option<Vec<String>>,
}
#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)]
#[serde(rename_all = "camelCase")]
pub struct GrafanaPersistence {
#[serde(default, skip_serializing_if = "Option::is_none")]
pub enabled: Option<bool>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub storage_class_name: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub size: Option<String>,
}
// ------------------------------------------------------------------------------------------------
#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)]
#[kube(
group = "grafana.integreatly.org",
version = "v1beta1",
kind = "GrafanaDashboard",
plural = "grafanadashboards",
namespaced
)]
#[serde(rename_all = "camelCase")]
pub struct GrafanaDashboardSpec {
#[serde(default, skip_serializing_if = "Option::is_none")]
pub resync_period: Option<String>,
pub instance_selector: LabelSelector,
pub json: String,
}
// ------------------------------------------------------------------------------------------------
#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)]
#[kube(
group = "grafana.integreatly.org",
version = "v1beta1",
kind = "GrafanaDatasource",
plural = "grafanadatasources",
namespaced
)]
#[serde(rename_all = "camelCase")]
pub struct GrafanaDatasourceSpec {
pub instance_selector: LabelSelector,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub allow_cross_namespace_import: Option<bool>,
pub datasource: GrafanaDatasourceConfig,
}
#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema)]
#[serde(rename_all = "camelCase")]
pub struct GrafanaDatasourceConfig {
pub access: String,
pub database: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub json_data: Option<BTreeMap<String, String>>,
pub name: String,
pub r#type: String,
pub url: String,
}
// ------------------------------------------------------------------------------------------------
#[derive(Serialize, Deserialize, Debug, Clone, JsonSchema, Default)]
#[serde(rename_all = "camelCase")]
pub struct ResourceRequirements {
#[serde(default, skip_serializing_if = "BTreeMap::is_empty")]
pub limits: BTreeMap<String, String>,
#[serde(default, skip_serializing_if = "BTreeMap::is_empty")]
pub requests: BTreeMap<String, String>,
}

View File

@ -0,0 +1,41 @@
use std::collections::BTreeMap;
use kube::CustomResource;
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use crate::modules::monitoring::kube_prometheus::crd::rhob_prometheuses::LabelSelector;
/// MonitoringStack CRD for monitoring.rhobs/v1alpha1
#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)]
#[kube(
group = "monitoring.rhobs",
version = "v1alpha1",
kind = "MonitoringStack",
plural = "monitoringstacks",
namespaced
)]
#[serde(rename_all = "camelCase")]
pub struct MonitoringStackSpec {
/// Verbosity of logs (e.g. "debug", "info", "warn", "error").
#[serde(default, skip_serializing_if = "Option::is_none")]
pub log_level: Option<String>,
/// Retention period for Prometheus TSDB data (e.g. "1d").
#[serde(default, skip_serializing_if = "Option::is_none")]
pub retention: Option<String>,
/// Resource selector for workloads monitored by this stack.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub resource_selector: Option<LabelSelector>,
}
impl Default for MonitoringStackSpec {
fn default() -> Self {
MonitoringStackSpec {
log_level: Some("info".into()),
retention: Some("7d".into()),
resource_selector: Some(LabelSelector::default()),
}
}
}

View File

@ -0,0 +1,57 @@
use std::collections::BTreeMap;
use kube::CustomResource;
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;
#[derive(CustomResource, Debug, Serialize, Deserialize, Clone, JsonSchema)]
#[kube(
group = "monitoring.rhobs",
version = "v1",
kind = "PrometheusRule",
plural = "prometheusrules",
namespaced
)]
#[serde(rename_all = "camelCase")]
pub struct PrometheusRuleSpec {
pub groups: Vec<RuleGroup>,
}
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct RuleGroup {
pub name: String,
pub rules: Vec<Rule>,
}
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
#[serde(rename_all = "camelCase")]
pub struct Rule {
#[serde(default, skip_serializing_if = "Option::is_none")]
pub alert: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub expr: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub for_: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub labels: Option<std::collections::BTreeMap<String, String>>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub annotations: Option<std::collections::BTreeMap<String, String>>,
}
impl From<PrometheusAlertRule> for Rule {
fn from(value: PrometheusAlertRule) -> Self {
Rule {
alert: Some(value.alert),
expr: Some(value.expr),
for_: value.r#for,
labels: Some(value.labels.into_iter().collect::<BTreeMap<_, _>>()),
annotations: Some(value.annotations.into_iter().collect::<BTreeMap<_, _>>()),
}
}
}

View File

@ -0,0 +1,118 @@
use std::collections::BTreeMap;
use kube::CustomResource;
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use crate::modules::monitoring::kube_prometheus::types::Operator;
#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)]
#[kube(
group = "monitoring.rhobs",
version = "v1",
kind = "Prometheus",
plural = "prometheuses",
namespaced
)]
#[serde(rename_all = "camelCase")]
pub struct PrometheusSpec {
#[serde(default, skip_serializing_if = "Option::is_none")]
pub alerting: Option<PrometheusSpecAlerting>,
pub service_account_name: String,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub service_monitor_namespace_selector: Option<LabelSelector>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub service_monitor_selector: Option<LabelSelector>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub service_discovery_role: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub pod_monitor_selector: Option<LabelSelector>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub rule_selector: Option<LabelSelector>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub rule_namespace_selector: Option<LabelSelector>,
}
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, Default)]
#[serde(rename_all = "camelCase")]
pub struct NamespaceSelector {
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub match_names: Vec<String>,
}
/// Contains alerting configuration, specifically Alertmanager endpoints.
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, Default)]
pub struct PrometheusSpecAlerting {
#[serde(skip_serializing_if = "Option::is_none")]
pub alertmanagers: Option<Vec<AlertmanagerEndpoints>>,
}
/// Represents an Alertmanager endpoint configuration used by Prometheus.
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, Default)]
pub struct AlertmanagerEndpoints {
/// Name of the Alertmanager Service.
#[serde(skip_serializing_if = "Option::is_none")]
pub name: Option<String>,
/// Namespace of the Alertmanager Service.
#[serde(skip_serializing_if = "Option::is_none")]
pub namespace: Option<String>,
/// Port to access on the Alertmanager Service (e.g. "web").
#[serde(skip_serializing_if = "Option::is_none")]
pub port: Option<String>,
/// Scheme to use for connecting (e.g. "http").
#[serde(skip_serializing_if = "Option::is_none")]
pub scheme: Option<String>,
// Other fields like `tls_config`, `path_prefix`, etc., can be added if needed.
}
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, Default)]
#[serde(rename_all = "camelCase")]
pub struct LabelSelector {
#[serde(default, skip_serializing_if = "BTreeMap::is_empty")]
pub match_labels: BTreeMap<String, String>,
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub match_expressions: Vec<LabelSelectorRequirement>,
}
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
#[serde(rename_all = "camelCase")]
pub struct LabelSelectorRequirement {
pub key: String,
pub operator: Operator,
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub values: Vec<String>,
}
impl Default for PrometheusSpec {
fn default() -> Self {
PrometheusSpec {
alerting: None,
service_account_name: "prometheus".into(),
// null means "only my namespace"
service_monitor_namespace_selector: None,
// empty selector means match all ServiceMonitors in that namespace
service_monitor_selector: Some(LabelSelector::default()),
service_discovery_role: Some("Endpoints".into()),
pod_monitor_selector: None,
rule_selector: None,
rule_namespace_selector: Some(LabelSelector::default()),
}
}
}

View File

@ -0,0 +1,62 @@
use k8s_openapi::api::{
core::v1::ServiceAccount,
rbac::v1::{PolicyRule, Role, RoleBinding, RoleRef, Subject},
};
use kube::api::ObjectMeta;
pub fn build_prom_role(role_name: String, namespace: String) -> Role {
Role {
metadata: ObjectMeta {
name: Some(role_name),
namespace: Some(namespace),
..Default::default()
},
rules: Some(vec![PolicyRule {
api_groups: Some(vec!["".into()]), // core API group
resources: Some(vec!["services".into(), "endpoints".into(), "pods".into()]),
verbs: vec!["get".into(), "list".into(), "watch".into()],
..Default::default()
}]),
}
}
pub fn build_prom_rolebinding(
role_name: String,
namespace: String,
service_account_name: String,
) -> RoleBinding {
RoleBinding {
metadata: ObjectMeta {
name: Some(format!("{}-rolebinding", role_name)),
namespace: Some(namespace.clone()),
..Default::default()
},
role_ref: RoleRef {
api_group: "rbac.authorization.k8s.io".into(),
kind: "Role".into(),
name: role_name,
},
subjects: Some(vec![Subject {
kind: "ServiceAccount".into(),
name: service_account_name,
namespace: Some(namespace.clone()),
..Default::default()
}]),
}
}
pub fn build_prom_service_account(
service_account_name: String,
namespace: String,
) -> ServiceAccount {
ServiceAccount {
automount_service_account_token: None,
image_pull_secrets: None,
metadata: ObjectMeta {
name: Some(service_account_name),
namespace: Some(namespace),
..Default::default()
},
secrets: None,
}
}

View File

@ -0,0 +1,87 @@
use std::collections::HashMap;
use kube::CustomResource;
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use crate::modules::monitoring::kube_prometheus::types::{
HTTPScheme, MatchExpression, NamespaceSelector, Operator, Selector,
ServiceMonitor as KubeServiceMonitor, ServiceMonitorEndpoint,
};
/// This is the top-level struct for the ServiceMonitor Custom Resource.
/// The `#[derive(CustomResource)]` macro handles all the boilerplate for you,
/// including the `impl Resource`.
#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)]
#[kube(
group = "monitoring.rhobs",
version = "v1",
kind = "ServiceMonitor",
plural = "servicemonitors",
namespaced
)]
#[serde(rename_all = "camelCase")]
pub struct ServiceMonitorSpec {
/// A label selector to select services to monitor.
pub selector: Selector,
/// A list of endpoints on the selected services to be monitored.
pub endpoints: Vec<ServiceMonitorEndpoint>,
/// Selector to select which namespaces the Kubernetes Endpoints objects
/// are discovered from.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub namespace_selector: Option<NamespaceSelector>,
/// The label to use to retrieve the job name from.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub job_label: Option<String>,
/// Pod-based target labels to transfer from the Kubernetes Pod onto the target.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub pod_target_labels: Vec<String>,
/// TargetLabels transfers labels on the Kubernetes Service object to the target.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub target_labels: Vec<String>,
}
impl Default for ServiceMonitorSpec {
fn default() -> Self {
let labels = HashMap::new();
Self {
selector: Selector {
match_labels: { labels },
match_expressions: vec![MatchExpression {
key: "app.kubernetes.io/name".into(),
operator: Operator::Exists,
values: vec![],
}],
},
endpoints: vec![ServiceMonitorEndpoint {
port: Some("http".to_string()),
path: Some("/metrics".into()),
interval: Some("30s".into()),
scheme: Some(HTTPScheme::HTTP),
..Default::default()
}],
namespace_selector: None, // only the same namespace
job_label: Some("app".into()),
pod_target_labels: vec![],
target_labels: vec![],
}
}
}
impl From<KubeServiceMonitor> for ServiceMonitorSpec {
fn from(value: KubeServiceMonitor) -> Self {
Self {
selector: value.selector,
endpoints: value.endpoints,
namespace_selector: value.namespace_selector,
job_label: value.job_label,
pod_target_labels: value.pod_target_labels,
target_labels: value.target_labels,
}
}
}

View File

@ -2,3 +2,4 @@ pub mod alerts;
pub mod k8s_prometheus_alerting_score; pub mod k8s_prometheus_alerting_score;
#[allow(clippy::module_inception)] #[allow(clippy::module_inception)]
pub mod prometheus; pub mod prometheus;
pub mod rhob_alerting_score;

View File

@ -0,0 +1,486 @@
use std::fs;
use std::{collections::BTreeMap, sync::Arc};
use tempfile::tempdir;
use async_trait::async_trait;
use kube::api::ObjectMeta;
use log::{debug, info};
use serde::Serialize;
use std::process::Command;
use crate::modules::monitoring::kube_prometheus::crd::grafana_default_dashboard::build_default_dashboard;
use crate::modules::monitoring::kube_prometheus::crd::rhob_alertmanager_config::RHOBObservability;
use crate::modules::monitoring::kube_prometheus::crd::rhob_alertmanagers::{
Alertmanager, AlertmanagerSpec,
};
use crate::modules::monitoring::kube_prometheus::crd::rhob_grafana::{
Grafana, GrafanaDashboard, GrafanaDashboardSpec, GrafanaDatasource, GrafanaDatasourceConfig,
GrafanaDatasourceSpec, GrafanaSpec,
};
use crate::modules::monitoring::kube_prometheus::crd::rhob_monitoring_stack::{
MonitoringStack, MonitoringStackSpec,
};
use crate::modules::monitoring::kube_prometheus::crd::rhob_prometheus_rules::{
PrometheusRule, PrometheusRuleSpec, RuleGroup,
};
use crate::modules::monitoring::kube_prometheus::crd::rhob_prometheuses::LabelSelector;
use crate::modules::monitoring::kube_prometheus::crd::rhob_service_monitor::{
ServiceMonitor, ServiceMonitorSpec,
};
use crate::score::Score;
use crate::topology::oberservability::monitoring::AlertReceiver;
use crate::topology::{K8sclient, Topology, k8s::K8sClient};
use crate::{
data::Version,
interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome},
inventory::Inventory,
};
use harmony_types::id::Id;
use super::prometheus::PrometheusApplicationMonitoring;
#[derive(Clone, Debug, Serialize)]
pub struct RHOBAlertingScore {
pub sender: RHOBObservability,
pub receivers: Vec<Box<dyn AlertReceiver<RHOBObservability>>>,
pub service_monitors: Vec<ServiceMonitor>,
pub prometheus_rules: Vec<RuleGroup>,
}
impl<T: Topology + K8sclient + PrometheusApplicationMonitoring<RHOBObservability>> Score<T>
for RHOBAlertingScore
{
fn create_interpret(&self) -> Box<dyn crate::interpret::Interpret<T>> {
Box::new(RHOBAlertingInterpret {
sender: self.sender.clone(),
receivers: self.receivers.clone(),
service_monitors: self.service_monitors.clone(),
prometheus_rules: self.prometheus_rules.clone(),
})
}
fn name(&self) -> String {
"RHOB alerting [RHOBAlertingScore]".into()
}
}
#[derive(Clone, Debug)]
pub struct RHOBAlertingInterpret {
pub sender: RHOBObservability,
pub receivers: Vec<Box<dyn AlertReceiver<RHOBObservability>>>,
pub service_monitors: Vec<ServiceMonitor>,
pub prometheus_rules: Vec<RuleGroup>,
}
#[async_trait]
impl<T: Topology + K8sclient + PrometheusApplicationMonitoring<RHOBObservability>> Interpret<T>
for RHOBAlertingInterpret
{
async fn execute(
&self,
_inventory: &Inventory,
topology: &T,
) -> Result<Outcome, InterpretError> {
let client = topology.k8s_client().await.unwrap();
self.ensure_grafana_operator().await?;
self.install_prometheus(&client).await?;
self.install_client_kube_metrics().await?;
self.install_grafana(&client).await?;
self.install_receivers(&self.sender, &self.receivers)
.await?;
self.install_rules(&self.prometheus_rules, &client).await?;
self.install_monitors(self.service_monitors.clone(), &client)
.await?;
Ok(Outcome::success(
"K8s monitoring components installed".to_string(),
))
}
fn get_name(&self) -> InterpretName {
InterpretName::RHOBAlerting
}
fn get_version(&self) -> Version {
todo!()
}
fn get_status(&self) -> InterpretStatus {
todo!()
}
fn get_children(&self) -> Vec<Id> {
todo!()
}
}
impl RHOBAlertingInterpret {
async fn crd_exists(&self, crd: &str) -> bool {
let status = Command::new("sh")
.args(["-c", &format!("kubectl get crd -A | grep -i {crd}")])
.status()
.map_err(|e| InterpretError::new(format!("could not connect to cluster: {}", e)))
.unwrap();
status.success()
}
async fn install_chart(
&self,
chart_path: String,
chart_name: String,
) -> Result<(), InterpretError> {
let temp_dir =
tempdir().map_err(|e| InterpretError::new(format!("Tempdir error: {}", e)))?;
let temp_path = temp_dir.path().to_path_buf();
debug!("Using temp directory: {}", temp_path.display());
let chart = format!("{}/{}", chart_path, chart_name);
let pull_output = Command::new("helm")
.args(["pull", &chart, "--destination", temp_path.to_str().unwrap()])
.output()
.map_err(|e| InterpretError::new(format!("Helm pull error: {}", e)))?;
if !pull_output.status.success() {
return Err(InterpretError::new(format!(
"Helm pull failed: {}",
String::from_utf8_lossy(&pull_output.stderr)
)));
}
let tgz_path = fs::read_dir(&temp_path)
.unwrap()
.filter_map(|entry| {
let entry = entry.ok()?;
let path = entry.path();
if path.extension()? == "tgz" {
Some(path)
} else {
None
}
})
.next()
.ok_or_else(|| InterpretError::new("Could not find pulled Helm chart".into()))?;
debug!("Installing chart from: {}", tgz_path.display());
let install_output = Command::new("helm")
.args([
"upgrade",
"--install",
&chart_name,
tgz_path.to_str().unwrap(),
"--namespace",
&self.sender.namespace.clone(),
"--create-namespace",
"--wait",
"--atomic",
])
.output()
.map_err(|e| InterpretError::new(format!("Helm install error: {}", e)))?;
if !install_output.status.success() {
return Err(InterpretError::new(format!(
"Helm install failed: {}",
String::from_utf8_lossy(&install_output.stderr)
)));
}
debug!(
"Installed chart {}/{} in namespace: {}",
&chart_path,
&chart_name,
self.sender.namespace.clone()
);
Ok(())
}
async fn ensure_grafana_operator(&self) -> Result<Outcome, InterpretError> {
let _ = Command::new("helm")
.args([
"repo",
"add",
letian marked this conversation as resolved
Review

considering we run the update below, I don't think we need to keep this

considering we run the update below, I don't think we need to keep this
"grafana-operator",
"https://grafana.github.io/helm-charts",
])
.output()
.unwrap();
let _ = Command::new("helm")
.args(["repo", "update"])
.output()
.unwrap();
let output = Command::new("helm")
.args([
"install",
"grafana-operator",
"grafana-operator/grafana-operator",
"--namespace",
&self.sender.namespace.clone(),
"--create-namespace",
"--set",
"namespaceScope=true",
])
.output()
.unwrap();
if !output.status.success() {
return Err(InterpretError::new(format!(
"helm install failed:\nstdout: {}\nstderr: {}",
String::from_utf8_lossy(&output.stdout),
String::from_utf8_lossy(&output.stderr)
)));
}
Ok(Outcome::success(format!(
"installed grafana operator in ns {}",
self.sender.namespace.clone()
)))
}
async fn install_prometheus(&self, client: &Arc<K8sClient>) -> Result<Outcome, InterpretError> {
debug!(
"installing crd-prometheuses in namespace {}",
self.sender.namespace.clone()
);
let stack = MonitoringStack {
metadata: ObjectMeta {
name: Some(format!("{}-monitoring", self.sender.namespace.clone()).into()),
namespace: Some(self.sender.namespace.clone()),
labels: Some([("coo".into(), "example".into())].into()),
..Default::default()
},
spec: MonitoringStackSpec {
log_level: Some("debug".into()),
retention: Some("1d".into()),
resource_selector: Some(LabelSelector {
match_labels: [("app".into(), "demo".into())].into(),
..Default::default()
}),
},
};
client
.apply(&stack, Some(&self.sender.namespace.clone()))
.await
.map_err(|e| InterpretError::new(e.to_string()))?;
info!("installed rhob monitoring stack",);
Ok(Outcome::success(format!(
"successfully deployed rhob-prometheus {:#?}",
stack
)))
}
async fn install_alert_manager(
&self,
client: &Arc<K8sClient>,
) -> Result<Outcome, InterpretError> {
let am = Alertmanager {
metadata: ObjectMeta {
name: Some(self.sender.namespace.clone()),
labels: Some(std::collections::BTreeMap::from([(
"alertmanagerConfig".to_string(),
"enabled".to_string(),
)])),
namespace: Some(self.sender.namespace.clone()),
..Default::default()
},
spec: AlertmanagerSpec::default(),
};
client
.apply(&am, Some(&self.sender.namespace.clone()))
.await
.map_err(|e| InterpretError::new(e.to_string()))?;
Ok(Outcome::success(format!(
"successfully deployed service monitor {:#?}",
am.metadata.name
)))
}
async fn install_monitors(
&self,
mut monitors: Vec<ServiceMonitor>,
client: &Arc<K8sClient>,
) -> Result<Outcome, InterpretError> {
let default_service_monitor = ServiceMonitor {
metadata: ObjectMeta {
name: Some(self.sender.namespace.clone()),
labels: Some(std::collections::BTreeMap::from([
("alertmanagerConfig".to_string(), "enabled".to_string()),
("client".to_string(), "prometheus".to_string()),
(
"app.kubernetes.io/name".to_string(),
"kube-state-metrics".to_string(),
),
])),
namespace: Some(self.sender.namespace.clone()),
..Default::default()
},
spec: ServiceMonitorSpec::default(),
};
monitors.push(default_service_monitor);
for monitor in monitors.iter() {
client
.apply(monitor, Some(&self.sender.namespace.clone()))
.await
.map_err(|e| InterpretError::new(e.to_string()))?;
}
Ok(Outcome::success(
"succesfully deployed service monitors".to_string(),
))
}
async fn install_rules(
&self,
#[allow(clippy::ptr_arg)] rules: &Vec<RuleGroup>,
client: &Arc<K8sClient>,
) -> Result<Outcome, InterpretError> {
let mut prom_rule_spec = PrometheusRuleSpec {
groups: rules.clone(),
};
let default_rules_group = RuleGroup {
name: "default-rules".to_string(),
rules: crate::modules::monitoring::kube_prometheus::crd::rhob_default_rules::build_default_application_rules(),
};
prom_rule_spec.groups.push(default_rules_group);
let prom_rules = PrometheusRule {
metadata: ObjectMeta {
name: Some(self.sender.namespace.clone()),
labels: Some(std::collections::BTreeMap::from([
("alertmanagerConfig".to_string(), "enabled".to_string()),
("role".to_string(), "prometheus-rule".to_string()),
])),
namespace: Some(self.sender.namespace.clone()),
..Default::default()
},
spec: prom_rule_spec,
};
client
.apply(&prom_rules, Some(&self.sender.namespace.clone()))
.await
.map_err(|e| InterpretError::new(e.to_string()))?;
Ok(Outcome::success(format!(
"successfully deployed rules {:#?}",
prom_rules.metadata.name
)))
}
async fn install_client_kube_metrics(&self) -> Result<Outcome, InterpretError> {
self.install_chart(
"oci://hub.nationtech.io/harmony".to_string(),
"nt-kube-metrics".to_string(),
)
.await?;
Ok(Outcome::success(format!(
"Installed client kube metrics in ns {}",
&self.sender.namespace.clone()
)))
}
async fn install_grafana(&self, client: &Arc<K8sClient>) -> Result<Outcome, InterpretError> {
let mut label = BTreeMap::new();
label.insert("dashboards".to_string(), "grafana".to_string());
let labels = LabelSelector {
match_labels: label.clone(),
match_expressions: vec![],
};
let mut json_data = BTreeMap::new();
json_data.insert("timeInterval".to_string(), "5s".to_string());
let namespace = self.sender.namespace.clone();
let json = build_default_dashboard(&namespace);
let graf_data_source = GrafanaDatasource {
metadata: ObjectMeta {
name: Some(format!(
"grafana-datasource-{}",
self.sender.namespace.clone()
)),
namespace: Some(self.sender.namespace.clone()),
..Default::default()
},
spec: GrafanaDatasourceSpec {
instance_selector: labels.clone(),
allow_cross_namespace_import: Some(false),
datasource: GrafanaDatasourceConfig {
access: "proxy".to_string(),
database: Some("prometheus".to_string()),
json_data: Some(json_data),
//this is fragile
name: format!("prometheus-{}-0", self.sender.namespace.clone()),
r#type: "prometheus".to_string(),
url: format!(
"http://prometheus-operated.{}.svc.cluster.local:9090",
self.sender.namespace.clone()
),
},
},
};
client
.apply(&graf_data_source, Some(&self.sender.namespace.clone()))
.await
.map_err(|e| InterpretError::new(e.to_string()))?;
let graf_dashboard = GrafanaDashboard {
metadata: ObjectMeta {
name: Some(format!(
"grafana-dashboard-{}",
self.sender.namespace.clone()
)),
namespace: Some(self.sender.namespace.clone()),
..Default::default()
},
spec: GrafanaDashboardSpec {
resync_period: Some("30s".to_string()),
instance_selector: labels.clone(),
json,
},
};
client
.apply(&graf_dashboard, Some(&self.sender.namespace.clone()))
.await
.map_err(|e| InterpretError::new(e.to_string()))?;
let grafana = Grafana {
metadata: ObjectMeta {
name: Some(format!("grafana-{}", self.sender.namespace.clone())),
namespace: Some(self.sender.namespace.clone()),
labels: Some(label.clone()),
..Default::default()
},
spec: GrafanaSpec {
config: None,
admin_user: None,
admin_password: None,
ingress: None,
persistence: None,
resources: None,
},
};
client
.apply(&grafana, Some(&self.sender.namespace.clone()))
.await
.map_err(|e| InterpretError::new(e.to_string()))?;
Ok(Outcome::success(format!(
"successfully deployed grafana instance {:#?}",
grafana.metadata.name
)))
}
async fn install_receivers(
&self,
sender: &RHOBObservability,
receivers: &Vec<Box<dyn AlertReceiver<RHOBObservability>>>,
) -> Result<Outcome, InterpretError> {
for receiver in receivers.iter() {
receiver.install(sender).await.map_err(|err| {
InterpretError::new(format!("failed to install receiver: {}", err))
})?;
}
Ok(Outcome::success("successfully deployed receivers".into()))
}
}