feat: Initial setup for monitoring and alerting #48
@@ -49,3 +49,4 @@ fqdn = { version = "0.4.6", features = [
|
||||
"serde",
|
||||
] }
|
||||
temp-dir = "0.1.14"
|
||||
dyn-clone = "1.0.19"
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use std::{io::Error, process::Command, sync::Arc};
|
||||
use std::{process::Command, sync::Arc};
|
||||
|
||||
use async_trait::async_trait;
|
||||
use inquire::Confirm;
|
||||
|
||||
@@ -3,6 +3,7 @@ mod host_binding;
|
||||
mod http;
|
||||
mod k8s_anywhere;
|
||||
mod localhost;
|
||||
pub mod oberservability;
|
||||
pub mod tenant;
|
||||
pub use k8s_anywhere::*;
|
||||
pub use localhost::*;
|
||||
|
||||
1
harmony/src/domain/topology/oberservability/mod.rs
Normal file
@@ -0,0 +1 @@
|
||||
pub mod monitoring;
|
||||
31
harmony/src/domain/topology/oberservability/monitoring.rs
Normal file
@@ -0,0 +1,31 @@
|
||||
use async_trait::async_trait;
|
||||
|
||||
use std::fmt::Debug;
|
||||
use url::Url;
|
||||
|
||||
use crate::interpret::InterpretError;
|
||||
|
||||
use crate::{interpret::Outcome, topology::Topology};
|
||||
|
||||
/// Represents an entity responsible for collecting and organizing observability data
|
||||
/// from various telemetry sources
|
||||
/// A `Monitor` abstracts the logic required to scrape, aggregate, and structure
|
||||
|
johnride marked this conversation as resolved
Outdated
|
||||
/// monitoring data, enabling consistent processing regardless of the underlying data source.
|
||||
|
johnride
commented
This rustdoc is pretty good to explain the abstraction, but giving a concrete example such as Prometheus would make a complete understanding much easier. Even I still have a bit of trouble to figure it out as-is. This rustdoc is pretty good to explain the abstraction, but giving a concrete example such as Prometheus would make a complete understanding much easier. Even I still have a bit of trouble to figure it out as-is.
|
||||
#[async_trait]
|
||||
pub trait Monitor<T: Topology>: Debug + Send + Sync {
|
||||
async fn deploy_monitor(
|
||||
&self,
|
||||
topology: &T,
|
||||
alert_receivers: Vec<AlertReceiver>,
|
||||
) -> Result<Outcome, InterpretError>;
|
||||
|
||||
async fn delete_monitor(
|
||||
&self,
|
||||
topolgy: &T,
|
||||
alert_receivers: Vec<AlertReceiver>,
|
||||
) -> Result<Outcome, InterpretError>;
|
||||
}
|
||||
|
johnride marked this conversation as resolved
Outdated
johnride
commented
I think this trait is correct but the way it is implemented feels wrong. I understand that you currently only have the discord implementation and you interact directly with it. But I feel like there will be a use case where we will want to have simple getter and setters for common attributes of AlertChannelConfig. I don't see a 100% clear use case right now though. Food for thought. Which makes me think : is this trait required at all? Probably not. Each AlertChannel implementation will have its own config and either the Monitor or AlertChannel trait will provide standardization for identification and compilation safety. But it is probably not necessary to try to expose a config specific trait. I think this trait is correct but the way it is implemented feels wrong. I understand that you currently only have the discord implementation and you interact directly with it. But I feel like there will be a use case where we will want to have simple getter and setters for common attributes of AlertChannelConfig. I don't see a 100% clear use case right now though.
Food for thought.
Which makes me think : is this trait required at all? Probably not. Each AlertChannel implementation will have its own config and either the Monitor or AlertChannel trait will provide standardization for identification and compilation safety. But it is probably not necessary to try to expose a **config** specific trait.
|
||||
|
||||
pub struct AlertReceiver {
|
||||
pub receiver_id: String,
|
||||
|
johnride marked this conversation as resolved
Outdated
johnride
commented
I don't think webhook url deserves to be in this abstract trait. It is possible that the AlertChannel does not have a webhook, such as SMS, phone calls, SMTP. I don't think webhook url deserves to be in this abstract trait. It is possible that the AlertChannel does not have a webhook, such as SMS, phone calls, SMTP.
|
||||
}
|
||||
@@ -1,30 +1,25 @@
|
||||
use std::str::FromStr;
|
||||
|
||||
use non_blank_string_rs::NonBlankString;
|
||||
use url::Url;
|
||||
|
||||
use crate::modules::helm::chart::HelmChartScore;
|
||||
|
||||
use super::{config::KubePrometheusConfig, monitoring_alerting::AlertChannel};
|
||||
|
||||
fn get_discord_alert_manager_score(config: &KubePrometheusConfig) -> Option<HelmChartScore> {
|
||||
let (url, name) = config.alert_channel.iter().find_map(|channel| {
|
||||
if let AlertChannel::Discord { webhook_url, name } = channel {
|
||||
Some((webhook_url, name))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})?;
|
||||
|
||||
pub fn discord_alert_manager_score(
|
||||
webhook_url: Url,
|
||||
namespace: String,
|
||||
name: String,
|
||||
) -> HelmChartScore {
|
||||
let values = format!(
|
||||
r#"
|
||||
environment:
|
||||
- name: "DISCORD_WEBHOOK"
|
||||
value: "{url}"
|
||||
value: "{webhook_url}"
|
||||
"#,
|
||||
);
|
||||
|
||||
Some(HelmChartScore {
|
||||
namespace: Some(NonBlankString::from_str(&config.namespace).unwrap()),
|
||||
HelmChartScore {
|
||||
namespace: Some(NonBlankString::from_str(&namespace).unwrap()),
|
||||
release_name: NonBlankString::from_str(&name).unwrap(),
|
||||
chart_name: NonBlankString::from_str(
|
||||
"oci://hub.nationtech.io/library/alertmanager-discord",
|
||||
@@ -36,13 +31,5 @@ environment:
|
||||
create_namespace: true,
|
||||
install_only: true,
|
||||
repository: None,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn discord_alert_manager_score(config: &KubePrometheusConfig) -> HelmChartScore {
|
||||
if let Some(chart) = get_discord_alert_manager_score(config) {
|
||||
chart
|
||||
} else {
|
||||
panic!("Expected discord alert manager helm chart");
|
||||
}
|
||||
}
|
||||
|
||||
55
harmony/src/modules/monitoring/discord_webhook_sender.rs
Normal file
@@ -0,0 +1,55 @@
|
||||
use async_trait::async_trait;
|
||||
use serde_json::Value;
|
||||
use url::Url;
|
||||
|
||||
use crate::{
|
||||
interpret::{InterpretError, Outcome},
|
||||
topology::K8sAnywhereTopology,
|
||||
};
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct DiscordWebhookConfig {
|
||||
pub webhook_url: Url,
|
||||
pub name: String,
|
||||
pub send_resolved_notifications: bool,
|
||||
}
|
||||
|
||||
pub trait DiscordWebhookReceiver {
|
||||
fn deploy_discord_webhook_receiver(
|
||||
&self,
|
||||
_notification_adapter_id: &str,
|
||||
) -> Result<Outcome, InterpretError>;
|
||||
|
||||
fn delete_discord_webhook_receiver(
|
||||
&self,
|
||||
_notification_adapter_id: &str,
|
||||
) -> Result<Outcome, InterpretError>;
|
||||
}
|
||||
|
||||
// trait used to generate alert manager config values impl<T: Topology + AlertManagerConfig> Monitor for KubePrometheus
|
||||
|
johnride marked this conversation as resolved
Outdated
johnride
commented
You don't have to specify Topology here. I think it's better to keep the trait bindings to the minimum. DiscordWebhookSender should be enough. You don't have to specify Topology here. I think it's better to keep the trait bindings to the minimum. DiscordWebhookSender should be enough.
|
||||
pub trait AlertManagerConfig<T> {
|
||||
fn get_alert_manager_config(&self) -> Result<Value, InterpretError>;
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl<T: DiscordWebhookReceiver> AlertManagerConfig<T> for DiscordWebhookConfig {
|
||||
fn get_alert_manager_config(&self) -> Result<Value, InterpretError> {
|
||||
todo!()
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl DiscordWebhookReceiver for K8sAnywhereTopology {
|
||||
fn deploy_discord_webhook_receiver(
|
||||
|
johnride marked this conversation as resolved
Outdated
johnride
commented
Nice 👍 We probably will have to add a mechanism for Topologies so that dependencies can register their installation mechanisms themselves to make sure they are initialized properly when maestro initializes the topology. But that should not be too hard and is out of scope of this p-r. Nice 👍
We probably will have to add a mechanism for Topologies so that dependencies can register their installation mechanisms themselves to make sure they are initialized properly when maestro initializes the topology. But that should not be too hard and is out of scope of this p-r.
|
||||
&self,
|
||||
_notification_adapter_id: &str,
|
||||
) -> Result<Outcome, InterpretError> {
|
||||
todo!()
|
||||
}
|
||||
fn delete_discord_webhook_receiver(
|
||||
&self,
|
||||
_notification_adapter_id: &str,
|
||||
) -> Result<Outcome, InterpretError> {
|
||||
todo!()
|
||||
}
|
||||
}
|
||||
@@ -1,4 +1,5 @@
|
||||
mod config;
|
||||
mod discord_alert_manager;
|
||||
pub mod discord_webhook_sender;
|
||||
mod kube_prometheus;
|
||||
pub mod monitoring_alerting;
|
||||
|
||||
@@ -96,28 +96,28 @@ impl MonitoringAlertingStackInterpret {
|
||||
topology: &T,
|
||||
config: &KubePrometheusConfig,
|
||||
) -> Result<Outcome, InterpretError> {
|
||||
let mut outcomes = vec![];
|
||||
//let mut outcomes = vec![];
|
||||
|
||||
for channel in &self.score.alert_channel {
|
||||
let outcome = match channel {
|
||||
AlertChannel::Discord { .. } => {
|
||||
discord_alert_manager_score(config)
|
||||
.create_interpret()
|
||||
.execute(inventory, topology)
|
||||
.await
|
||||
}
|
||||
AlertChannel::Slack { .. } => Ok(Outcome::success(
|
||||
"No extra configs for slack alerting".to_string(),
|
||||
)),
|
||||
AlertChannel::Smpt { .. } => {
|
||||
todo!()
|
||||
}
|
||||
};
|
||||
outcomes.push(outcome);
|
||||
}
|
||||
for result in outcomes {
|
||||
result?;
|
||||
}
|
||||
//for channel in &self.score.alert_channel {
|
||||
// let outcome = match channel {
|
||||
// AlertChannel::Discord { .. } => {
|
||||
// discord_alert_manager_score(config)
|
||||
// .create_interpret()
|
||||
// .execute(inventory, topology)
|
||||
// .await
|
||||
// }
|
||||
// AlertChannel::Slack { .. } => Ok(Outcome::success(
|
||||
// "No extra configs for slack alerting".to_string(),
|
||||
// )),
|
||||
// AlertChannel::Smpt { .. } => {
|
||||
// todo!()
|
||||
// }
|
||||
// };
|
||||
// outcomes.push(outcome);
|
||||
//}
|
||||
//for result in outcomes {
|
||||
// result?;
|
||||
//}
|
||||
|
||||
Ok(Outcome::success("All alert channels deployed".to_string()))
|
||||
}
|
||||
|
||||
A bit of rustdoc would be relevant here to explain the intended use of this trait.