WIP: added alert receivers and alert rules, which are built and added to the YAML before deploying Prometheus, and added a few dashboards to Grafana. Still trying to fix the prometheus-server ClusterRole/Role/ServiceAccount so that it can discover targets and Kubernetes in a namespaced release where it does not have access to a ClusterRole.
This commit is contained in:
parent
31661aaaf1
commit
5c628b37b7
@ -3,9 +3,18 @@ use std::{path::PathBuf, sync::Arc};
|
|||||||
use harmony::{
|
use harmony::{
|
||||||
inventory::Inventory,
|
inventory::Inventory,
|
||||||
maestro::Maestro,
|
maestro::Maestro,
|
||||||
modules::application::{
|
modules::{
|
||||||
ApplicationScore, RustWebFramework, RustWebapp,
|
application::{
|
||||||
features::{ContinuousDelivery, Monitoring},
|
ApplicationScore, RustWebFramework, RustWebapp,
|
||||||
|
features::{ContinuousDelivery, PrometheusMonitoring},
|
||||||
|
},
|
||||||
|
monitoring::{
|
||||||
|
alert_channel::discord_alert_channel::DiscordWebhook,
|
||||||
|
alert_rule::prometheus_alert_rule::AlertManagerRuleGroup,
|
||||||
|
},
|
||||||
|
prometheus::alerts::k8s::{
|
||||||
|
pod::pod_in_failed_state, pvc::high_pvc_fill_rate_over_two_days,
|
||||||
|
},
|
||||||
},
|
},
|
||||||
topology::{K8sAnywhereTopology, Url},
|
topology::{K8sAnywhereTopology, Url},
|
||||||
};
|
};
|
||||||
@ -20,12 +29,25 @@ async fn main() {
|
|||||||
framework: Some(RustWebFramework::Leptos),
|
framework: Some(RustWebFramework::Leptos),
|
||||||
});
|
});
|
||||||
|
|
||||||
|
let pod_failed = pod_in_failed_state();
|
||||||
|
let pod_failed_2 = pod_in_failed_state();
|
||||||
|
let pod_failed_3 = pod_in_failed_state();
|
||||||
|
|
||||||
|
let additional_rules = AlertManagerRuleGroup::new("pod-alerts", vec![pod_failed]);
|
||||||
|
let additional_rules_2 = AlertManagerRuleGroup::new("pod-alerts-2", vec![pod_failed_2, pod_failed_3]);
|
||||||
let app = ApplicationScore {
|
let app = ApplicationScore {
|
||||||
features: vec![
|
features: vec![
|
||||||
Box::new(ContinuousDelivery {
|
//Box::new(ContinuousDelivery {
|
||||||
|
// application: application.clone(),
|
||||||
|
//}),
|
||||||
|
Box::new(PrometheusMonitoring {
|
||||||
application: application.clone(),
|
application: application.clone(),
|
||||||
|
alert_receivers: vec![Box::new(DiscordWebhook {
|
||||||
|
name: "dummy-discord".to_string(),
|
||||||
|
url: Url::Url(url::Url::parse("https://discord.doesnt.exist.com").unwrap()),
|
||||||
|
})],
|
||||||
|
alert_rules: vec![Box::new(additional_rules), Box::new(additional_rules_2)],
|
||||||
}),
|
}),
|
||||||
Box::new(Monitoring {}),
|
|
||||||
// TODO add monitoring, backups, multisite ha, etc
|
// TODO add monitoring, backups, multisite ha, etc
|
||||||
],
|
],
|
||||||
application,
|
application,
|
||||||
|
@ -1,3 +1,5 @@
|
|||||||
|
use std::any::Any;
|
||||||
|
|
||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
use log::debug;
|
use log::debug;
|
||||||
|
|
||||||
@ -9,7 +11,7 @@ use crate::{
|
|||||||
};
|
};
|
||||||
|
|
||||||
#[async_trait]
|
#[async_trait]
|
||||||
pub trait AlertSender: Send + Sync + std::fmt::Debug {
|
pub trait AlertSender: Any + Send + Sync + std::fmt::Debug {
|
||||||
fn name(&self) -> String;
|
fn name(&self) -> String;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -9,32 +9,36 @@ use crate::{
|
|||||||
application::{Application, ApplicationFeature},
|
application::{Application, ApplicationFeature},
|
||||||
monitoring::{
|
monitoring::{
|
||||||
application_monitoring::k8s_application_monitoring_score::ApplicationPrometheusMonitoringScore,
|
application_monitoring::k8s_application_monitoring_score::ApplicationPrometheusMonitoringScore,
|
||||||
kube_prometheus::types::{NamespaceSelector, ServiceMonitor},
|
kube_prometheus::types::{NamespaceSelector, ServiceMonitor}, prometheus::prometheus::Prometheus,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
score::Score,
|
score::Score,
|
||||||
topology::{HelmCommand, Topology, tenant::TenantManager},
|
topology::{oberservability::monitoring::{AlertReceiver, AlertRule, AlertSender}, tenant::TenantManager, HelmCommand, K8sclient, Topology},
|
||||||
};
|
};
|
||||||
|
|
||||||
#[derive(Debug, Clone)]
|
#[derive(Debug, Clone)]
|
||||||
pub struct Monitoring {
|
pub struct PrometheusMonitoring {
|
||||||
pub application: Arc<dyn Application>,
|
pub application: Arc<dyn Application>,
|
||||||
|
pub alert_receivers: Vec<Box<dyn AlertReceiver<Prometheus>>>,
|
||||||
|
pub alert_rules: Vec<Box<dyn AlertRule<Prometheus>>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[async_trait]
|
#[async_trait]
|
||||||
impl<T: Topology + HelmCommand + 'static + TenantManager> ApplicationFeature<T> for Monitoring {
|
impl<T: Topology + HelmCommand + 'static + TenantManager> ApplicationFeature<T> for PrometheusMonitoring {
|
||||||
async fn ensure_installed(&self, topology: &T) -> Result<(), String> {
|
async fn ensure_installed(&self, topology: &T) -> Result<(), String> {
|
||||||
info!("Ensuring monitoring is available for application");
|
info!("Ensuring monitoring is available for application");
|
||||||
let ns = self.application.name();
|
let ns = self.application.name();
|
||||||
let mut service_monitor = ServiceMonitor::default();
|
let mut service_monitor = ServiceMonitor::default();
|
||||||
|
service_monitor.name = ns.clone();
|
||||||
|
service_monitor.namespace = ns.clone();
|
||||||
service_monitor.namespace_selector = Some(NamespaceSelector {
|
service_monitor.namespace_selector = Some(NamespaceSelector {
|
||||||
any: true,
|
any: true,
|
||||||
match_names: vec![ns.clone()],
|
match_names: vec![ns.clone()],
|
||||||
});
|
});
|
||||||
let alerting_score = ApplicationPrometheusMonitoringScore {
|
let alerting_score = ApplicationPrometheusMonitoringScore {
|
||||||
namespace: ns,
|
namespace: ns,
|
||||||
receivers: vec![],
|
receivers: self.alert_receivers.clone(),
|
||||||
rules: vec![],
|
rules: self.alert_rules.clone(),
|
||||||
service_monitors: vec![service_monitor],
|
service_monitors: vec![service_monitor],
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -220,7 +220,6 @@ impl<T: Topology + HelmCommand> Interpret<T> for HelmChartInterpret {
|
|||||||
yaml_path,
|
yaml_path,
|
||||||
Some(&helm_options),
|
Some(&helm_options),
|
||||||
);
|
);
|
||||||
|
|
||||||
let status = match res {
|
let status = match res {
|
||||||
Ok(status) => status,
|
Ok(status) => status,
|
||||||
Err(err) => return Err(InterpretError::new(err.to_string())),
|
Err(err) => return Err(InterpretError::new(err.to_string())),
|
||||||
|
@ -1,3 +1,5 @@
|
|||||||
|
use std::any::Any;
|
||||||
|
|
||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
use serde::Serialize;
|
use serde::Serialize;
|
||||||
use serde_yaml::{Mapping, Value};
|
use serde_yaml::{Mapping, Value};
|
||||||
@ -11,7 +13,10 @@ use crate::{
|
|||||||
},
|
},
|
||||||
prometheus::prometheus::{Prometheus, PrometheusReceiver},
|
prometheus::prometheus::{Prometheus, PrometheusReceiver},
|
||||||
},
|
},
|
||||||
topology::{Url, oberservability::monitoring::AlertReceiver},
|
topology::{
|
||||||
|
Url,
|
||||||
|
oberservability::monitoring::{AlertReceiver, AlertSender},
|
||||||
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize)]
|
#[derive(Debug, Clone, Serialize)]
|
||||||
|
@ -6,13 +6,11 @@ use serde::Serialize;
|
|||||||
use crate::{
|
use crate::{
|
||||||
modules::monitoring::{
|
modules::monitoring::{
|
||||||
kube_prometheus::types::ServiceMonitor,
|
kube_prometheus::types::ServiceMonitor,
|
||||||
prometheus::{prometheus::Prometheus, prometheus_config::PrometheusConfig},
|
prometheus::{prometheus::Prometheus, prometheus_config::HelmPrometheusConfig},
|
||||||
},
|
},
|
||||||
score::Score,
|
score::Score,
|
||||||
topology::{
|
topology::{
|
||||||
HelmCommand, Topology,
|
oberservability::monitoring::{AlertReceiver, AlertRule, AlertingInterpret}, tenant::TenantManager, HelmCommand, K8sclient, Topology
|
||||||
oberservability::monitoring::{AlertReceiver, AlertRule, AlertingInterpret},
|
|
||||||
tenant::TenantManager,
|
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -26,7 +24,7 @@ pub struct ApplicationPrometheusMonitoringScore {
|
|||||||
|
|
||||||
impl<T: Topology + HelmCommand + TenantManager> Score<T> for ApplicationPrometheusMonitoringScore {
|
impl<T: Topology + HelmCommand + TenantManager> Score<T> for ApplicationPrometheusMonitoringScore {
|
||||||
fn create_interpret(&self) -> Box<dyn crate::interpret::Interpret<T>> {
|
fn create_interpret(&self) -> Box<dyn crate::interpret::Interpret<T>> {
|
||||||
let config = Arc::new(Mutex::new(PrometheusConfig::new()));
|
let config = Arc::new(Mutex::new(HelmPrometheusConfig::new()));
|
||||||
config
|
config
|
||||||
.try_lock()
|
.try_lock()
|
||||||
.expect("couldn't lock config")
|
.expect("couldn't lock config")
|
||||||
|
@ -1,6 +1,5 @@
|
|||||||
use non_blank_string_rs::NonBlankString;
|
|
||||||
use std::str::FromStr;
|
use std::str::FromStr;
|
||||||
|
use non_blank_string_rs::NonBlankString;
|
||||||
use crate::modules::helm::chart::HelmChartScore;
|
use crate::modules::helm::chart::HelmChartScore;
|
||||||
|
|
||||||
pub fn grafana_helm_chart_score(ns: &str) -> HelmChartScore {
|
pub fn grafana_helm_chart_score(ns: &str) -> HelmChartScore {
|
||||||
@ -8,35 +7,46 @@ pub fn grafana_helm_chart_score(ns: &str) -> HelmChartScore {
|
|||||||
r#"
|
r#"
|
||||||
rbac:
|
rbac:
|
||||||
namespaced: true
|
namespaced: true
|
||||||
|
|
||||||
datasources:
|
datasources:
|
||||||
datasources.yaml:
|
datasources.yaml:
|
||||||
apiVersion: 1
|
apiVersion: 1
|
||||||
datasources:
|
datasources:
|
||||||
- name: Prometheus
|
- name: Prometheus
|
||||||
type: prometheus
|
type: prometheus
|
||||||
access: proxy
|
access: proxy
|
||||||
url: http://prometheus-server.{ns}.svc.cluster.local
|
url: http://prometheus-server.{ns}.svc.cluster.local
|
||||||
isDefault: true
|
isDefault: true
|
||||||
downloadDashboards:
|
|
||||||
dashboards:
|
|
||||||
- url: https://raw.githubusercontent.com/grafana/grafana/main/devenv/dev-dashboards/node-exporter-full_rev1.json
|
|
||||||
file: node-exporter-full.json
|
|
||||||
|
|
||||||
- url: https://grafana.com/api/dashboards/7685/revisions/1/download
|
|
||||||
file: kubernetes-pvs-usage.json
|
|
||||||
|
|
||||||
# Namespace resource usage vs quotas
|
dashboardProviders:
|
||||||
- url: https://grafana.com/api/dashboards/17044/revisions/1/download
|
dashboardproviders.yaml:
|
||||||
file: namespace-resources-vs-quotas.json
|
apiVersion: 1
|
||||||
|
providers:
|
||||||
|
- name: 'default'
|
||||||
|
orgId: 1
|
||||||
|
folder: ''
|
||||||
|
type: file
|
||||||
|
disableDeletion: false
|
||||||
|
updateIntervalSeconds: 10
|
||||||
|
allowUiUpdates: true
|
||||||
|
editable: true
|
||||||
|
options:
|
||||||
|
path: /var/lib/grafana/dashboards/default
|
||||||
|
|
||||||
# Kubernetes namespace resources (CPU, RAM, network)
|
dashboards:
|
||||||
- url: https://grafana.com/api/dashboards/9809/revisions/1/download
|
default:
|
||||||
file: kubernetes-namespace-resources.json
|
compute-usage:
|
||||||
|
url: https://grafana.com/api/dashboards/315/revisions/1/download
|
||||||
# Top 10 namespaces by memory usage
|
pod-health:
|
||||||
- url: https://grafana.com/api/dashboards/10678/revisions/1/download
|
url: https://grafana.com/api/dashboards/15758/revisions/1/download
|
||||||
file: top10-namespace-memory.json
|
namespace-resources:
|
||||||
"#
|
url: https://grafana.com/api/dashboards/9809/revisions/1/download
|
||||||
|
namespace-resources-vs-quotas:
|
||||||
|
url: https://grafana.com/api/dashboards/17044/revisions/1/download
|
||||||
|
persistent-volume-usage:
|
||||||
|
url: https://grafana.com/api/dashboards/7685/revisions/1/download
|
||||||
|
"#,
|
||||||
|
ns = ns
|
||||||
);
|
);
|
||||||
|
|
||||||
HelmChartScore {
|
HelmChartScore {
|
||||||
@ -45,9 +55,10 @@ downloadDashboards:
|
|||||||
chart_name: NonBlankString::from_str("oci://ghcr.io/grafana/helm-charts/grafana").unwrap(),
|
chart_name: NonBlankString::from_str("oci://ghcr.io/grafana/helm-charts/grafana").unwrap(),
|
||||||
chart_version: None,
|
chart_version: None,
|
||||||
values_overrides: None,
|
values_overrides: None,
|
||||||
values_yaml: Some(values.to_string()),
|
values_yaml: Some(values),
|
||||||
create_namespace: true,
|
create_namespace: true,
|
||||||
install_only: false,
|
install_only: false,
|
||||||
repository: None,
|
repository: None,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -211,6 +211,8 @@ pub struct Selector {
|
|||||||
pub struct ServiceMonitor {
|
pub struct ServiceMonitor {
|
||||||
pub name: String,
|
pub name: String,
|
||||||
|
|
||||||
|
pub namespace: String,
|
||||||
|
|
||||||
// # Additional labels to set used for the ServiceMonitorSelector. Together with standard labels from the chart
|
// # Additional labels to set used for the ServiceMonitorSelector. Together with standard labels from the chart
|
||||||
pub additional_labels: Option<HashMap<String, String>>,
|
pub additional_labels: Option<HashMap<String, String>>,
|
||||||
|
|
||||||
@ -261,6 +263,7 @@ impl Default for ServiceMonitor {
|
|||||||
fn default() -> Self {
|
fn default() -> Self {
|
||||||
Self {
|
Self {
|
||||||
name: Default::default(),
|
name: Default::default(),
|
||||||
|
namespace: Default::default(),
|
||||||
additional_labels: Default::default(),
|
additional_labels: Default::default(),
|
||||||
job_label: Default::default(),
|
job_label: Default::default(),
|
||||||
target_labels: Default::default(),
|
target_labels: Default::default(),
|
||||||
|
@ -1 +1,2 @@
|
|||||||
pub mod prometheus_helm;
|
pub mod prometheus_helm;
|
||||||
|
pub mod types;
|
||||||
|
@ -1,56 +1,145 @@
|
|||||||
|
use std::collections::BTreeMap;
|
||||||
use std::str::FromStr;
|
use std::str::FromStr;
|
||||||
use std::sync::{Arc, Mutex};
|
use std::sync::{Arc, Mutex};
|
||||||
|
|
||||||
|
use log::debug;
|
||||||
use non_blank_string_rs::NonBlankString;
|
use non_blank_string_rs::NonBlankString;
|
||||||
|
use serde_yaml::{Mapping, Value};
|
||||||
|
|
||||||
use crate::modules::{
|
use crate::modules::helm::chart::HelmChartScore;
|
||||||
helm::chart::HelmChartScore, monitoring::prometheus::prometheus_config::PrometheusConfig,
|
use crate::modules::monitoring::kube_prometheus::types::{
|
||||||
|
AlertGroup, AlertManager, AlertManagerConfig, AlertManagerRoute, AlertManagerSpec,
|
||||||
|
ConfigReloader, Limits, Requests, Resources,
|
||||||
};
|
};
|
||||||
|
use crate::modules::monitoring::prometheus::helm::types::{
|
||||||
|
AlertFile, EnabledConfig, KsmRbacConfig, KubeStateMetricsConfig, LabelSelector, Monitor,
|
||||||
|
Prometheus, PrometheusHelmValues, RbacConfig, ServerConfig, ServerRbacConfig,
|
||||||
|
};
|
||||||
|
use crate::modules::monitoring::prometheus::prometheus_config::HelmPrometheusConfig;
|
||||||
|
|
||||||
pub fn prometheus_helm_chart_score(config: Arc<Mutex<PrometheusConfig>>) -> HelmChartScore {
|
pub fn prometheus_helm_chart_score(config: Arc<Mutex<HelmPrometheusConfig>>) -> HelmChartScore {
|
||||||
let config = config.lock().unwrap();
|
let config = config.lock().unwrap();
|
||||||
let ns = config.namespace.clone().unwrap();
|
let ns = config.namespace.clone().unwrap();
|
||||||
let values = format!(
|
|
||||||
r#"
|
|
||||||
rbac:
|
|
||||||
create: false
|
|
||||||
kube-state-metrics:
|
|
||||||
enabled: false
|
|
||||||
prometheus-node-exporter:
|
|
||||||
enabled: false
|
|
||||||
prometheus-pushgateway:
|
|
||||||
enabled: false
|
|
||||||
|
|
||||||
server:
|
let rbac_config = RbacConfig { create: false };
|
||||||
releaseNamespace: true
|
|
||||||
clusterRole: false
|
|
||||||
clusterRoleBinding: false
|
|
||||||
rbac:
|
|
||||||
create: true
|
|
||||||
namespaced: true
|
|
||||||
|
|
||||||
serverFiles:
|
let ksm_config = KubeStateMetricsConfig {
|
||||||
prometheus.yml:
|
enabled: true,
|
||||||
scrape_configs:
|
rbac: KsmRbacConfig {
|
||||||
- job_name: 'prometheus'
|
use_cluster_role: false,
|
||||||
static_configs:
|
},
|
||||||
- targets: ['localhost:9090']
|
prometheus: Prometheus {
|
||||||
serviceMonitorNamespaceSelector:
|
monitor: Monitor { enabled: true },
|
||||||
matchLabels:
|
},
|
||||||
kubernetes.io/metadata.name: {ns}
|
};
|
||||||
podMonitorNamespaceSelector:
|
|
||||||
matchLabels:
|
|
||||||
kubernetes.io/metadata.name: {ns}
|
|
||||||
|
|
||||||
alertmanager:
|
let mut selector_labels = BTreeMap::new();
|
||||||
enabled: true
|
selector_labels.insert("kubernetes.io/metadata.name".to_string(), ns.clone());
|
||||||
rbac:
|
let mut kube_state_metrics_labels = BTreeMap::new();
|
||||||
create: true
|
kube_state_metrics_labels.insert(
|
||||||
namespaced: true
|
"app.kubernetes.io/name".to_string(),
|
||||||
"#
|
"kube-state-metrics".to_string(),
|
||||||
);
|
);
|
||||||
|
let selector = LabelSelector {
|
||||||
|
match_labels: selector_labels,
|
||||||
|
};
|
||||||
|
|
||||||
|
let server_config = ServerConfig {
|
||||||
|
namespaces: vec![ns.clone()],
|
||||||
|
use_existing_cluster_role_name: false,
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut null_receiver = Mapping::new();
|
||||||
|
null_receiver.insert(
|
||||||
|
Value::String("receiver".to_string()),
|
||||||
|
Value::String("default-receiver".to_string()),
|
||||||
|
);
|
||||||
|
null_receiver.insert(
|
||||||
|
Value::String("matchers".to_string()),
|
||||||
|
Value::Sequence(vec![Value::String("alertname!=Watchdog".to_string())]),
|
||||||
|
);
|
||||||
|
null_receiver.insert(Value::String("continue".to_string()), Value::Bool(true));
|
||||||
|
|
||||||
|
let mut alert_manager_channel_config = AlertManagerConfig {
|
||||||
|
global: Mapping::new(),
|
||||||
|
route: AlertManagerRoute {
|
||||||
|
routes: vec![Value::Mapping(null_receiver)],
|
||||||
|
},
|
||||||
|
receivers: vec![serde_yaml::from_str("name: 'default-receiver'").unwrap()],
|
||||||
|
};
|
||||||
|
for receiver in config.alert_receiver_configs.iter() {
|
||||||
|
if let Some(global) = receiver.channel_global_config.clone() {
|
||||||
|
alert_manager_channel_config
|
||||||
|
.global
|
||||||
|
.insert(global.0, global.1);
|
||||||
|
}
|
||||||
|
alert_manager_channel_config
|
||||||
|
.route
|
||||||
|
.routes
|
||||||
|
.push(receiver.channel_route.clone());
|
||||||
|
alert_manager_channel_config
|
||||||
|
.receivers
|
||||||
|
.push(receiver.channel_receiver.clone());
|
||||||
|
}
|
||||||
|
let alert_manager_values = AlertManager {
|
||||||
|
enabled: config.alert_manager,
|
||||||
|
config: alert_manager_channel_config,
|
||||||
|
alertmanager_spec: AlertManagerSpec {
|
||||||
|
resources: Resources {
|
||||||
|
limits: Limits {
|
||||||
|
memory: "100Mi".to_string(),
|
||||||
|
cpu: "100m".to_string(),
|
||||||
|
},
|
||||||
|
requests: Requests {
|
||||||
|
memory: "100Mi".to_string(),
|
||||||
|
cpu: "100m".to_string(),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
init_config_reloader: ConfigReloader {
|
||||||
|
resources: Resources {
|
||||||
|
limits: Limits {
|
||||||
|
memory: "100Mi".to_string(),
|
||||||
|
cpu: "100m".to_string(),
|
||||||
|
},
|
||||||
|
requests: Requests {
|
||||||
|
memory: "100Mi".to_string(),
|
||||||
|
cpu: "100m".to_string(),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut result: BTreeMap<String, AlertFile> = BTreeMap::new();
|
||||||
|
for rule in config.alert_rules.clone().iter() {
|
||||||
|
for (name, group) in &rule.rules {
|
||||||
|
result
|
||||||
|
.entry("alerting_rules.yml".to_string())
|
||||||
|
.and_modify(|e| e.groups.extend(group.groups.clone()))
|
||||||
|
.or_insert(AlertFile {
|
||||||
|
groups: group.groups.clone(),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let final_values = PrometheusHelmValues {
|
||||||
|
rbac: rbac_config,
|
||||||
|
kube_state_metrics: ksm_config,
|
||||||
|
server: server_config,
|
||||||
|
alertmanager: alert_manager_values,
|
||||||
|
server_files: result,
|
||||||
|
additional_service_monitors: config.additional_service_monitors.clone(),
|
||||||
|
prometheus_node_exporter: EnabledConfig { enabled: false },
|
||||||
|
prometheus_pushgateway: EnabledConfig { enabled: false },
|
||||||
|
};
|
||||||
|
|
||||||
|
let values_yaml =
|
||||||
|
serde_yaml::to_string(&final_values).expect("Failed to serialize final Helm values");
|
||||||
|
|
||||||
|
debug!("full values.yaml: \n{}", values_yaml);
|
||||||
|
|
||||||
HelmChartScore {
|
HelmChartScore {
|
||||||
namespace: Some(NonBlankString::from_str(&config.namespace.clone().unwrap()).unwrap()),
|
namespace: Some(NonBlankString::from_str(&ns).unwrap()),
|
||||||
release_name: NonBlankString::from_str("prometheus").unwrap(),
|
release_name: NonBlankString::from_str("prometheus").unwrap(),
|
||||||
chart_name: NonBlankString::from_str(
|
chart_name: NonBlankString::from_str(
|
||||||
"oci://ghcr.io/prometheus-community/charts/prometheus",
|
"oci://ghcr.io/prometheus-community/charts/prometheus",
|
||||||
@ -58,9 +147,9 @@ alertmanager:
|
|||||||
.unwrap(),
|
.unwrap(),
|
||||||
chart_version: None,
|
chart_version: None,
|
||||||
values_overrides: None,
|
values_overrides: None,
|
||||||
values_yaml: Some(values.to_string()),
|
values_yaml: Some(values_yaml),
|
||||||
create_namespace: true,
|
create_namespace: true,
|
||||||
install_only: false,
|
install_only: true,
|
||||||
repository: None,
|
repository: None,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
94
harmony/src/modules/monitoring/prometheus/helm/types.rs
Normal file
94
harmony/src/modules/monitoring/prometheus/helm/types.rs
Normal file
@ -0,0 +1,94 @@
|
|||||||
|
use std::collections::BTreeMap;
|
||||||
|
|
||||||
|
use serde::Serialize;
|
||||||
|
|
||||||
|
use crate::modules::monitoring::{alert_rule::prometheus_alert_rule::AlertManagerRuleGroup, kube_prometheus::types::{
|
||||||
|
AlertGroup, AlertManager, AlertManagerAdditionalPromRules, AlertManagerValues, ServiceMonitor
|
||||||
|
}};
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Serialize)]
|
||||||
|
pub struct RuleFilesConfig {
|
||||||
|
#[serde(rename = "ruleFiles")]
|
||||||
|
pub files: BTreeMap<String, AlertGroup>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Debug)]
|
||||||
|
#[serde(rename_all = "camelCase")]
|
||||||
|
pub struct PrometheusHelmValues {
|
||||||
|
pub rbac: RbacConfig,
|
||||||
|
#[serde(rename = "kube-state-metrics")]
|
||||||
|
pub kube_state_metrics: KubeStateMetricsConfig,
|
||||||
|
pub server: ServerConfig,
|
||||||
|
pub alertmanager: AlertManager, // You already have this
|
||||||
|
#[serde(rename = "serverFiles")]
|
||||||
|
pub server_files: BTreeMap<String, AlertFile>, // You already have this
|
||||||
|
pub additional_service_monitors: Vec<ServiceMonitor>, // You already have this
|
||||||
|
#[serde(rename = "prometheus-node-exporter")]
|
||||||
|
pub prometheus_node_exporter: EnabledConfig,
|
||||||
|
#[serde(rename = "prometheus-pushgateway")]
|
||||||
|
pub prometheus_pushgateway: EnabledConfig,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Debug, Clone)]
|
||||||
|
pub struct AlertFile {
|
||||||
|
pub groups: Vec<AlertManagerRuleGroup>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Debug)]
|
||||||
|
#[serde(rename_all = "camelCase")]
|
||||||
|
pub struct RbacConfig {
|
||||||
|
pub create: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Debug)]
|
||||||
|
#[serde(rename_all = "camelCase")]
|
||||||
|
pub struct KubeStateMetricsConfig {
|
||||||
|
pub enabled: bool,
|
||||||
|
pub rbac: KsmRbacConfig,
|
||||||
|
pub prometheus: Prometheus,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Debug)]
|
||||||
|
#[serde(rename_all = "camelCase")]
|
||||||
|
pub struct Prometheus {
|
||||||
|
pub monitor: Monitor
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Debug)]
|
||||||
|
#[serde(rename_all = "camelCase")]
|
||||||
|
pub struct Monitor{
|
||||||
|
pub enabled: bool
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Debug)]
|
||||||
|
#[serde(rename_all = "camelCase")]
|
||||||
|
pub struct KsmRbacConfig {
|
||||||
|
pub use_cluster_role: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Debug)]
|
||||||
|
#[serde(rename_all = "camelCase")]
|
||||||
|
pub struct ServerConfig {
|
||||||
|
pub namespaces: Vec<String>,
|
||||||
|
pub use_existing_cluster_role_name: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Debug)]
|
||||||
|
#[serde(rename_all = "camelCase")]
|
||||||
|
pub struct ServerRbacConfig {
|
||||||
|
pub create: bool,
|
||||||
|
pub use_cluster_role: bool,
|
||||||
|
pub namespaced: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Debug, Clone)]
|
||||||
|
#[serde(rename_all = "camelCase")]
|
||||||
|
pub struct LabelSelector {
|
||||||
|
#[serde(rename = "matchLabels")]
|
||||||
|
pub match_labels: BTreeMap<String, String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Debug)]
|
||||||
|
pub struct EnabledConfig {
|
||||||
|
pub enabled: bool,
|
||||||
|
}
|
@ -14,7 +14,7 @@ use crate::{
|
|||||||
},
|
},
|
||||||
score::Score,
|
score::Score,
|
||||||
topology::{
|
topology::{
|
||||||
HelmCommand, Topology,
|
HelmCommand, K8sclient, Topology,
|
||||||
installable::Installable,
|
installable::Installable,
|
||||||
oberservability::monitoring::{AlertReceiver, AlertRule, AlertSender},
|
oberservability::monitoring::{AlertReceiver, AlertRule, AlertSender},
|
||||||
tenant::TenantManager,
|
tenant::TenantManager,
|
||||||
@ -22,12 +22,12 @@ use crate::{
|
|||||||
};
|
};
|
||||||
|
|
||||||
use super::{
|
use super::{
|
||||||
helm::prometheus_helm::prometheus_helm_chart_score, prometheus_config::PrometheusConfig,
|
helm::prometheus_helm::prometheus_helm_chart_score, prometheus_config::HelmPrometheusConfig,
|
||||||
};
|
};
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub struct Prometheus {
|
pub struct Prometheus {
|
||||||
pub config: Arc<Mutex<PrometheusConfig>>,
|
pub config: Arc<Mutex<HelmPrometheusConfig>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[async_trait]
|
#[async_trait]
|
||||||
@ -40,7 +40,7 @@ impl AlertSender for Prometheus {
|
|||||||
impl Prometheus {
|
impl Prometheus {
|
||||||
pub fn new() -> Self {
|
pub fn new() -> Self {
|
||||||
Self {
|
Self {
|
||||||
config: Arc::new(Mutex::new(PrometheusConfig::new())),
|
config: Arc::new(Mutex::new(HelmPrometheusConfig::new())),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
pub async fn configure_with_topology<T: TenantManager>(&self, topology: &T) {
|
pub async fn configure_with_topology<T: TenantManager>(&self, topology: &T) {
|
||||||
|
@ -3,9 +3,8 @@ use crate::modules::monitoring::kube_prometheus::types::{
|
|||||||
};
|
};
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub struct PrometheusConfig {
|
pub struct HelmPrometheusConfig {
|
||||||
pub namespace: Option<String>,
|
pub namespace: Option<String>,
|
||||||
pub default_rules: bool,
|
|
||||||
pub alert_manager: bool,
|
pub alert_manager: bool,
|
||||||
pub node_exporter: bool,
|
pub node_exporter: bool,
|
||||||
pub kube_state_metrics: bool,
|
pub kube_state_metrics: bool,
|
||||||
@ -16,11 +15,10 @@ pub struct PrometheusConfig {
|
|||||||
pub additional_service_monitors: Vec<ServiceMonitor>,
|
pub additional_service_monitors: Vec<ServiceMonitor>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl PrometheusConfig {
|
impl HelmPrometheusConfig {
|
||||||
pub fn new() -> Self {
|
pub fn new() -> Self {
|
||||||
Self {
|
Self {
|
||||||
namespace: None,
|
namespace: None,
|
||||||
default_rules: true,
|
|
||||||
alert_manager: true,
|
alert_manager: true,
|
||||||
node_exporter: false,
|
node_exporter: false,
|
||||||
kube_state_metrics: false,
|
kube_state_metrics: false,
|
||||||
|
@ -1 +1,2 @@
|
|||||||
pub mod pvc;
|
pub mod pvc;
|
||||||
|
pub mod pod;
|
||||||
|
38
harmony/src/modules/prometheus/alerts/k8s/pod.rs
Normal file
38
harmony/src/modules/prometheus/alerts/k8s/pod.rs
Normal file
@ -0,0 +1,38 @@
|
|||||||
|
use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;
|
||||||
|
|
||||||
|
pub fn pod_in_failed_state() -> PrometheusAlertRule {
|
||||||
|
PrometheusAlertRule::new(
|
||||||
|
"PodInFailedState",
|
||||||
|
// This expression checks for any pod where the status phase is 'Failed' and the value is 1 (true).
|
||||||
|
"kube_pod_status_phase{phase=\"Failed\"} == 1",
|
||||||
|
)
|
||||||
|
.for_duration("1m") // Fire if the pod is in this state for 1 minute.
|
||||||
|
.label("severity", "critical") // A failed pod is a critical issue.
|
||||||
|
.annotation(
|
||||||
|
"summary",
|
||||||
|
"Pod {{ $labels.pod }} in namespace {{ $labels.namespace }} has failed.",
|
||||||
|
)
|
||||||
|
.annotation(
|
||||||
|
"description",
|
||||||
|
"The pod {{ $labels.pod }} in namespace {{ $labels.namespace }} has entered the 'Failed' state. This is a terminal error and the pod will not be automatically restarted. Please check the pod logs to diagnose the issue.",
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn pod_restarting_frequently() -> PrometheusAlertRule {
|
||||||
|
PrometheusAlertRule::new(
|
||||||
|
"PodRestartingFrequently",
|
||||||
|
// This expression calculates the increase in the restart count over the last 30 minutes.
|
||||||
|
// Alert if a container has restarted more than 5 times.
|
||||||
|
"increase(kube_pod_container_status_restarts_total[30m]) > 5",
|
||||||
|
)
|
||||||
|
.for_duration("15m") // The condition must persist for 15 minutes to avoid alerts for minor flaps.
|
||||||
|
.label("severity", "critical") // A crash-looping pod is effectively down.
|
||||||
|
.annotation(
|
||||||
|
"summary",
|
||||||
|
"Container {{ $labels.container }} in pod {{ $labels.pod }} is restarting frequently.",
|
||||||
|
)
|
||||||
|
.annotation(
|
||||||
|
"description",
|
||||||
|
"The container '{{ $labels.container }}' in pod '{{ $labels.pod }}' (namespace '{{ $labels.namespace }}') has restarted more than 5 times in the last 30 minutes. The pod is likely in a CrashLoopBackOff state.",
|
||||||
|
)
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user