refactor/ns #74

Merged
wjro merged 8 commits from refactor/ns into master 2025-07-02 19:54:33 +00:00
14 changed files with 502 additions and 67 deletions

Cargo.lock generated
View File

@ -1300,6 +1300,17 @@ dependencies = [
"url",
]
[[package]]
name = "example-monitoring-with-tenant"
version = "0.1.0"
dependencies = [
"cidr",
"harmony",
"harmony_cli",
"tokio",
"url",
]
[[package]]
name = "example-nanodc"
version = "0.1.0"

View File

@ -7,7 +7,13 @@ use harmony::{
monitoring::{
alert_channel::discord_alert_channel::DiscordWebhook,
alert_rule::prometheus_alert_rule::AlertManagerRuleGroup,
kube_prometheus::helm_prometheus_alert_score::HelmPrometheusAlertingScore,
kube_prometheus::{
helm_prometheus_alert_score::HelmPrometheusAlertingScore,
types::{
HTTPScheme, MatchExpression, Operator, Selector, ServiceMonitor,
ServiceMonitorEndpoint,
},
},
},
prometheus::alerts::{
infra::dell_server::{

View File

@ -0,0 +1,13 @@
[package]
name = "example-monitoring-with-tenant"
edition = "2024"
version.workspace = true
readme.workspace = true
license.workspace = true
[dependencies]
cidr.workspace = true
harmony = { version = "0.1.0", path = "../../harmony" }
harmony_cli = { version = "0.1.0", path = "../../harmony_cli" }
tokio.workspace = true
url.workspace = true

View File

@ -0,0 +1,90 @@
use std::collections::HashMap;
use harmony::{
data::Id,
inventory::Inventory,
maestro::Maestro,
modules::{
monitoring::{
alert_channel::discord_alert_channel::DiscordWebhook,
alert_rule::prometheus_alert_rule::AlertManagerRuleGroup,
kube_prometheus::{
helm_prometheus_alert_score::HelmPrometheusAlertingScore,
types::{
HTTPScheme, MatchExpression, Operator, Selector, ServiceMonitor,
ServiceMonitorEndpoint,
},
},
},
prometheus::alerts::k8s::pvc::high_pvc_fill_rate_over_two_days,
tenant::TenantScore,
},
topology::{
K8sAnywhereTopology, Url,
tenant::{ResourceLimits, TenantConfig, TenantNetworkPolicy},
},
};
#[tokio::main]
async fn main() {
let tenant = TenantScore {
config: TenantConfig {
id: Id::from_string("1234".to_string()),
name: "test-tenant".to_string(),
resource_limits: ResourceLimits {
cpu_request_cores: 4.0,
cpu_limit_cores: 6.0,
memory_request_gb: 4.0,
memory_limit_gb: 4.0,
storage_total_gb: 10.0,
},
network_policy: TenantNetworkPolicy::default(),
},
};
let discord_receiver = DiscordWebhook {
name: "test-discord".to_string(),
url: Url::Url(url::Url::parse("https://discord.doesnt.exist.com").unwrap()),
};
let high_pvc_fill_rate_over_two_days_alert = high_pvc_fill_rate_over_two_days();
let additional_rules =
AlertManagerRuleGroup::new("pvc-alerts", vec![high_pvc_fill_rate_over_two_days_alert]);
let service_monitor_endpoint = ServiceMonitorEndpoint {
port: Some("80".to_string()),
path: "/metrics".to_string(),
scheme: HTTPScheme::HTTP,
..Default::default()
};
let service_monitor = ServiceMonitor {
name: "test-service-monitor".to_string(),
selector: Selector {
match_labels: HashMap::new(),
match_expressions: vec![MatchExpression {
key: "test".to_string(),
operator: Operator::In,
values: vec!["test-service".to_string()],
}],
},
endpoints: vec![service_monitor_endpoint],
..Default::default()
};
let alerting_score = HelmPrometheusAlertingScore {
receivers: vec![Box::new(discord_receiver)],
rules: vec![Box::new(additional_rules)],
service_monitors: vec![service_monitor],
};
let mut maestro = Maestro::<K8sAnywhereTopology>::initialize(
Inventory::autoload(),
K8sAnywhereTopology::from_env(),
)
.await
.unwrap();
maestro.register_all(vec![Box::new(tenant), Box::new(alerting_score)]);
harmony_cli::init(maestro, None).await.unwrap();
}

View File

@ -4,6 +4,8 @@ use crate::{interpret::InterpretError, inventory::Inventory};
#[async_trait]
pub trait Installable<T>: Send + Sync {
async fn configure(&self, inventory: &Inventory, topology: &T) -> Result<(), InterpretError>;
async fn ensure_installed(
&self,
inventory: &Inventory,

View File

@ -196,8 +196,7 @@ impl K8sAnywhereTopology {
let k8s_client = self.k8s_client().await?;
Ok(K8sTenantManager::new(k8s_client))
})
.await
.unwrap();
.await?;
Ok(())
}
@ -285,4 +284,11 @@ impl TenantManager for K8sAnywhereTopology {
.provision_tenant(config)
.await
}
async fn get_tenant_config(&self) -> Option<TenantConfig> {
self.get_k8s_tenant_manager()
.ok()?
.get_tenant_config()
.await
}
}

View File

@ -27,6 +27,7 @@ impl<S: AlertSender + Installable<T>, T: Topology> Interpret<T> for AlertingInte
inventory: &Inventory,
topology: &T,
) -> Result<Outcome, InterpretError> {
self.sender.configure(inventory, topology).await?;
for receiver in self.receivers.iter() {
receiver.install(&self.sender).await?;
}

View File

@ -5,7 +5,6 @@ use crate::{
topology::k8s::{ApplyStrategy, K8sClient},
};
use async_trait::async_trait;
use derive_new::new;
use k8s_openapi::{
api::{
core::v1::{LimitRange, Namespace, ResourceQuota},
@ -19,12 +18,23 @@ use kube::Resource;
use log::{debug, info, warn};
use serde::de::DeserializeOwned;
use serde_json::json;
use tokio::sync::OnceCell;
use super::{TenantConfig, TenantManager};
#[derive(new, Clone, Debug)]
#[derive(Clone, Debug)]
pub struct K8sTenantManager {
k8s_client: Arc<K8sClient>,
k8s_tenant_config: Arc<OnceCell<TenantConfig>>,
}
impl K8sTenantManager {
pub fn new(client: Arc<K8sClient>) -> Self {
Self {
k8s_client: client,
k8s_tenant_config: Arc::new(OnceCell::new()),
}
}
}
impl K8sTenantManager {
@ -112,8 +122,8 @@ impl K8sTenantManager {
"requests.storage": format!("{:.3}Gi", config.resource_limits.storage_total_gb),
"pods": "20",
"services": "10",
"configmaps": "30",
"secrets": "30",
"configmaps": "60",
"secrets": "60",
"persistentvolumeclaims": "15",
"services.loadbalancers": "2",
"services.nodeports": "5",
@ -147,7 +157,7 @@ impl K8sTenantManager {
"spec": {
"limits": [
{
"type": "Container",
"type": "Container",
"default": {
"cpu": "500m",
"memory": "500Mi"
@ -180,60 +190,94 @@ impl K8sTenantManager {
"spec": {
"podSelector": {},
"egress": [
{ "to": [ {"podSelector": {}}]},
{ "to":
[
{
"podSelector": {},
"namespaceSelector": {
"matchLabels": {
"kubernetes.io/metadata.name":"openshift-dns"
}
}
},
]
},
{ "to": [
{
"ipBlock": {
"cidr": "0.0.0.0/0",
// See https://en.wikipedia.org/wiki/Reserved_IP_addresses
"except": [
"10.0.0.0/8",
"172.16.0.0/12",
"192.168.0.0/16",
"192.0.0.0/24",
"192.0.2.0/24",
"192.88.99.0/24",
"192.18.0.0/15",
"198.51.100.0/24",
"169.254.0.0/16",
"203.0.113.0/24",
"127.0.0.0/8",
// Not sure we should block this one as it is
// used for multicast. But better block more than less.
"224.0.0.0/4",
"240.0.0.0/4",
"100.64.0.0/10",
"233.252.0.0/24",
"0.0.0.0/8",
],
}
{
"to": [
{ "podSelector": {} }
]
},
{
"to": [
{
"podSelector": {},
"namespaceSelector": {
"matchLabels": {
"kubernetes.io/metadata.name": "kube-system"
}
}
}
]
},
{
"to": [
{
"podSelector": {},
"namespaceSelector": {
"matchLabels": {
"kubernetes.io/metadata.name": "openshift-dns"
}
}
}
]
},
{
"to": [
{
"ipBlock": {
"cidr": "10.43.0.1/32",
}
}
]
},
{
"to": [
{
"ipBlock": {
"cidr": "172.23.0.0/16",
}
}
]
},
{
"to": [
{
"ipBlock": {
"cidr": "0.0.0.0/0",
"except": [
"10.0.0.0/8",
"172.16.0.0/12",
"192.168.0.0/16",
"192.0.0.0/24",
"192.0.2.0/24",
"192.88.99.0/24",
"192.18.0.0/15",
"198.51.100.0/24",
"169.254.0.0/16",
"203.0.113.0/24",
"127.0.0.0/8",
"224.0.0.0/4",
"240.0.0.0/4",
"100.64.0.0/10",
"233.252.0.0/24",
"0.0.0.0/8"
]
}
}
]
}
],
"ingress": [
{ "from": [ {"podSelector": {}}]}
{
"from": [
{ "podSelector": {} }
]
}
],
"policyTypes": [
"Ingress", "Egress",
"Ingress",
"Egress"
]
}
});
let mut network_policy: NetworkPolicy =
serde_json::from_value(network_policy).map_err(|e| {
ExecutorError::ConfigurationError(format!(
@ -357,6 +401,9 @@ impl K8sTenantManager {
Ok(network_policy)
}
fn store_config(&self, config: &TenantConfig) {
let _ = self.k8s_tenant_config.set(config.clone());
}
}
#[async_trait]
@ -385,6 +432,10 @@ impl TenantManager for K8sTenantManager {
"Success provisionning K8s tenant id {} name {}",
config.id, config.name
);
self.store_config(config);
Ok(())
}
async fn get_tenant_config(&self) -> Option<TenantConfig> {
self.k8s_tenant_config.get().cloned()
}
}

View File

@ -15,4 +15,6 @@ pub trait TenantManager {
/// # Arguments
/// * `config`: The desired configuration for the new tenant.
async fn provision_tenant(&self, config: &TenantConfig) -> Result<(), ExecutorError>;
async fn get_tenant_config(&self) -> Option<TenantConfig>;
}

View File

@ -1,12 +1,12 @@
use serde::Serialize;
use crate::modules::monitoring::kube_prometheus::types::{
AlertManagerAdditionalPromRules, AlertManagerChannelConfig,
AlertManagerAdditionalPromRules, AlertManagerChannelConfig, ServiceMonitor,
};
#[derive(Debug, Clone, Serialize)]
pub struct KubePrometheusConfig {
pub namespace: String,
pub namespace: Option<String>,
pub default_rules: bool,
pub windows_monitoring: bool,
pub alert_manager: bool,
@ -30,7 +30,7 @@ pub struct KubePrometheusConfig {
impl KubePrometheusConfig {
pub fn new() -> Self {
Self {
namespace: "monitoring".into(),
namespace: None,
default_rules: true,
windows_monitoring: false,
alert_manager: true,
@ -39,7 +39,7 @@ impl KubePrometheusConfig {
prometheus: true,
kubernetes_service_monitors: true,

// FIXME only required components.
kubernetes_api_server: false,
kubelet: false,
kubelet: true,
kube_controller_manager: false,
kube_etcd: false,
kube_proxy: false,

View File

@ -12,7 +12,8 @@ use crate::modules::{
helm::chart::HelmChartScore,
monitoring::kube_prometheus::types::{
AlertGroup, AlertManager, AlertManagerAdditionalPromRules, AlertManagerConfig,
AlertManagerRoute, AlertManagerValues, PrometheusConfig,
AlertManagerRoute, AlertManagerSpec, AlertManagerValues, ConfigReloader, Limits,
PrometheusConfig, Requests, Resources,
},
};
@ -36,8 +37,47 @@ pub fn kube_prometheus_helm_chart_score(
let node_exporter = config.node_exporter.to_string();
let prometheus_operator = config.prometheus_operator.to_string();
let prometheus = config.prometheus.to_string();
let resource_limit = Resources {
limits: Limits {
memory: "100Mi".to_string(),
cpu: "100m".to_string(),
},
requests: Requests {
memory: "100Mi".to_string(),
cpu: "100m".to_string(),
},
};
fn indent_lines(s: &str, spaces: usize) -> String {
let pad = " ".repeat(spaces);
s.lines()
.map(|line| format!("{pad}{line}"))
.collect::<Vec<_>>()
.join("\n")
}
fn resource_block(resource: &Resources, indent_level: usize) -> String {
let yaml = serde_yaml::to_string(resource).unwrap();
format!(
"{}resources:\n{}",
" ".repeat(indent_level),
indent_lines(&yaml, indent_level + 2)
)
}
let resource_section = resource_block(&resource_limit, 2);
let mut values = format!(
r#"
prometheus:
enabled: {prometheus}
prometheusSpec:
resources:
requests:
cpu: 100m
memory: 500Mi
limits:
cpu: 200m
memory: 1000Mi
defaultRules:
create: {default_rules}
rules:
@ -77,31 +117,164 @@ defaultRules:
windows: true
windowsMonitoring:
enabled: {windows_monitoring}
resources:
requests:
cpu: 100m
memory: 150Mi
limits:
cpu: 200m
memory: 250Mi
grafana:
enabled: {grafana}
resources:
requests:
cpu: 100m
memory: 150Mi
limits:
cpu: 200m
memory: 250Mi
initChownData:
resources:
requests:
cpu: 10m
memory: 50Mi
limits:
cpu: 50m
memory: 100Mi
sidecar:
resources:
requests:
cpu: 10m
memory: 50Mi
limits:
cpu: 50m
memory: 100Mi
kubernetesServiceMonitors:
enabled: {kubernetes_service_monitors}
kubeApiServer:
enabled: {kubernetes_api_server}
resources:
requests:
cpu: 100m
memory: 150Mi
limits:
cpu: 200m
memory: 250Mi
kubelet:
enabled: {kubelet}
resources:
requests:
cpu: 100m
memory: 150Mi
limits:
cpu: 200m
memory: 250Mi
kubeControllerManager:
enabled: {kube_controller_manager}
resources:
requests:
cpu: 100m
memory: 150Mi
limits:
cpu: 200m
memory: 250Mi
coreDns:
enabled: {core_dns}
resources:
requests:
cpu: 100m
memory: 150Mi
limits:
cpu: 200m
memory: 250Mi
kubeEtcd:
enabled: {kube_etcd}
resources:
requests:
cpu: 100m
memory: 150Mi
limits:
cpu: 200m
memory: 250Mi
kubeScheduler:
enabled: {kube_scheduler}
resources:
requests:
cpu: 100m
memory: 150Mi
limits:
cpu: 200m
memory: 250Mi
kubeProxy:
enabled: {kube_proxy}
resources:
requests:
cpu: 100m
memory: 150Mi
limits:
cpu: 200m
memory: 250Mi
kubeStateMetrics:
enabled: {kube_state_metrics}
kube-state-metrics:
resources:
requests:
cpu: 100m
memory: 150Mi
limits:
cpu: 200m
memory: 250Mi
nodeExporter:
enabled: {node_exporter}
resources:
requests:
cpu: 100m
memory: 150Mi
limits:
cpu: 200m
memory: 250Mi
prometheus-node-exporter:
resources:
requests:
cpu: 100m
memory: 150Mi
limits:
cpu: 200m
memory: 250Mi
prometheusOperator:
enabled: {prometheus_operator}
resources:
requests:
cpu: 100m
memory: 150Mi
limits:
cpu: 100m
memory: 200Mi
prometheusConfigReloader:
resources:
requests:
cpu: 100m
memory: 150Mi
limits:
cpu: 100m
memory: 200Mi
admissionWebhooks:
deployment:
resources:
limits:
cpu: 10m
memory: 100Mi
requests:
cpu: 10m
memory: 100Mi
patch:
resources:
limits:
cpu: 10m
memory: 100Mi
requests:
cpu: 10m
memory: 100Mi
"#,
);
@ -160,6 +333,30 @@ prometheusOperator:
alertmanager: AlertManager {
enabled: config.alert_manager,
config: alert_manager_channel_config,
alertmanager_spec: AlertManagerSpec {
resources: Resources {
limits: Limits {
memory: "100Mi".to_string(),
cpu: "100m".to_string(),
},
requests: Requests {
memory: "100Mi".to_string(),
cpu: "100m".to_string(),
},
},
},
init_config_reloader: ConfigReloader {
resources: Resources {
limits: Limits {
memory: "100Mi".to_string(),
cpu: "100m".to_string(),
},
requests: Requests {
memory: "100Mi".to_string(),
cpu: "100m".to_string(),
},
},
},
},
};
@ -200,7 +397,7 @@ prometheusOperator:
debug!("full values.yaml: \n {:#}", values);
HelmChartScore {
namespace: Some(NonBlankString::from_str(&config.namespace).unwrap()),
namespace: Some(NonBlankString::from_str(&config.namespace.clone().unwrap()).unwrap()),
release_name: NonBlankString::from_str("kube-prometheus").unwrap(),
chart_name: NonBlankString::from_str(
"oci://ghcr.io/prometheus-community/charts/kube-prometheus-stack",

View File

@ -9,6 +9,7 @@ use crate::{
topology::{
HelmCommand, Topology,
oberservability::monitoring::{AlertReceiver, AlertRule, AlertingInterpret},
tenant::TenantManager,
},
};
@ -19,7 +20,7 @@ pub struct HelmPrometheusAlertingScore {
pub service_monitors: Vec<ServiceMonitor>,
}
impl<T: Topology + HelmCommand> Score<T> for HelmPrometheusAlertingScore {
impl<T: Topology + HelmCommand + TenantManager> Score<T> for HelmPrometheusAlertingScore {
Review

as noted in a comment below, should remove TenantManager later (a brief sketch of what the bound does for now follows this file's diff)
fn create_interpret(&self) -> Box<dyn crate::interpret::Interpret<T>> {
let config = Arc::new(Mutex::new(KubePrometheusConfig::new()));
config
@ -27,7 +28,7 @@ impl<T: Topology + HelmCommand> Score<T> for HelmPrometheusAlertingScore {
.expect("couldn't lock config")
.additional_service_monitors = self.service_monitors.clone();
Box::new(AlertingInterpret {
sender: Prometheus { config },
sender: Prometheus::new(),
receivers: self.receivers.clone(),
rules: self.rules.clone(),
})
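For context on the review note above, a self-contained sketch of what the temporary TenantManager bound buys (names are hypothetical apart from the trait names in this PR): the interpret built by this score ultimately asks the topology for a tenant namespace, so T must carry the bound until that lookup moves into the topology.

trait Topology {}
trait HelmCommand {}
trait TenantManager {
    fn tenant_namespace(&self) -> Option<String>;
}

// Mirrors how the score resolves a namespace through the bound.
fn resolve_namespace<T: Topology + HelmCommand + TenantManager>(topology: &T) -> String {
    topology
        .tenant_namespace()
        .unwrap_or_else(|| "monitoring".to_string())
}

struct DemoTopology;
impl Topology for DemoTopology {}
impl HelmCommand for DemoTopology {}
impl TenantManager for DemoTopology {
    fn tenant_namespace(&self) -> Option<String> {
        Some("test-tenant".to_string())
    }
}

fn main() {
    assert_eq!(resolve_namespace(&DemoTopology), "test-tenant");
}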

View File

@ -1,7 +1,7 @@
use std::sync::{Arc, Mutex};
use async_trait::async_trait;
use log::debug;
use log::{debug, error};
use serde::Serialize;
use crate::{
@ -10,9 +10,10 @@ use crate::{
modules::monitoring::alert_rule::prometheus_alert_rule::AlertManagerRuleGroup,
score,
topology::{
HelmCommand, Topology,
HelmCommand, K8sAnywhereTopology, Topology,
installable::Installable,
oberservability::monitoring::{AlertReceiver, AlertRule, AlertSender},
tenant::TenantManager,
},
};
@ -33,7 +34,12 @@ impl AlertSender for Prometheus {
}
#[async_trait]
impl<T: Topology + HelmCommand> Installable<T> for Prometheus {
impl<T: Topology + HelmCommand + TenantManager> Installable<T> for Prometheus {
async fn configure(&self, _inventory: &Inventory, topology: &T) -> Result<(), InterpretError> {
self.configure_with_topology(topology).await;
Ok(())
}
async fn ensure_installed(
&self,
inventory: &Inventory,
@ -50,6 +56,23 @@ pub struct Prometheus {
}
impl Prometheus {
pub fn new() -> Self {
Self {
config: Arc::new(Mutex::new(KubePrometheusConfig::new())),
}
}
pub async fn configure_with_topology<T: TenantManager>(&self, topology: &T) {
let ns = topology
Review

Higher-level components such as Monitoring, Alerting, etc. should not be aware of the Tenant concept.

The topology itself should manage internally the logic related to the tenant (a minimal sketch of this direction follows this file's diff).

error!("This must be refactored, see comments in pr #74");
.get_tenant_config()
.await
.map(|cfg| cfg.name.clone())
.unwrap_or_else(|| "monitoring".to_string());
error!("This must be refactored, see comments in pr #74");
debug!("NS: {}", ns);
self.config.lock().unwrap().namespace = Some(ns);
}
pub async fn install_receiver(
&self,
prometheus_receiver: &dyn PrometheusReceiver,
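To make the review above concrete, a minimal, self-contained sketch of the proposed end-state (all names hypothetical): the topology owns the tenant-to-namespace decision internally, and monitoring code only ever asks for a namespace string instead of calling get_tenant_config().

struct TenantConfig {
    name: String,
}

struct ExampleTopology {
    tenant: Option<TenantConfig>,
}

impl ExampleTopology {
    // Tenant logic stays internal; callers get a plain namespace string.
    fn target_namespace(&self) -> String {
        self.tenant
            .as_ref()
            .map(|t| t.name.clone())
            .unwrap_or_else(|| "monitoring".to_string())
    }
}

fn main() {
    let topo = ExampleTopology {
        tenant: Some(TenantConfig { name: "test-tenant".into() }),
    };
    assert_eq!(topo.target_namespace(), "test-tenant");
}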

View File

@ -1,4 +1,4 @@
use std::collections::BTreeMap;
use std::collections::{BTreeMap, HashMap};
use async_trait::async_trait;
use serde::Serialize;
@ -16,9 +16,17 @@ pub struct AlertManagerValues {
pub alertmanager: AlertManager,
}
#[derive(Debug, Clone, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct AlertManager {
pub enabled: bool,
pub config: AlertManagerConfig,
pub alertmanager_spec: AlertManagerSpec,
pub init_config_reloader: ConfigReloader,
}
#[derive(Debug, Clone, Serialize)]
pub struct ConfigReloader {
pub resources: Resources,
}
#[derive(Debug, Clone, Serialize)]
@ -43,6 +51,30 @@ pub struct AlertManagerChannelConfig {
pub channel_receiver: Value,
}
#[derive(Debug, Clone, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct AlertManagerSpec {
pub(crate) resources: Resources,
}
#[derive(Debug, Clone, Serialize)]
pub struct Resources {
Review

I wonder if we can replace this with structs from k8s-openapi or kube-rs (a hedged sketch follows the Requests struct below)
pub limits: Limits,
pub requests: Requests,
}
#[derive(Debug, Clone, Serialize)]
pub struct Limits {
pub memory: String,
pub cpu: String,
}
#[derive(Debug, Clone, Serialize)]
pub struct Requests {
pub memory: String,
pub cpu: String,
}
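Following up on the review note above, a hedged sketch of the suggestion, assuming the k8s-openapi and serde_json crates are available: k8s-openapi's ResourceRequirements already models limits and requests as maps of Quantity and serializes with the camelCase field names Helm expects, so it could replace the hand-rolled Resources, Limits, and Requests structs.

use std::collections::BTreeMap;

use k8s_openapi::api::core::v1::ResourceRequirements;
use k8s_openapi::apimachinery::pkg::api::resource::Quantity;

// Builds the same 100m CPU / 100Mi memory defaults used in this PR, but
// with the upstream k8s-openapi types instead of custom structs.
fn default_resources() -> ResourceRequirements {
    let quantities = |cpu: &str, memory: &str| {
        let mut map = BTreeMap::new();
        map.insert("cpu".to_string(), Quantity(cpu.to_string()));
        map.insert("memory".to_string(), Quantity(memory.to_string()));
        map
    };

    ResourceRequirements {
        limits: Some(quantities("100m", "100Mi")),
        requests: Some(quantities("100m", "100Mi")),
        ..Default::default()
    }
}

fn main() {
    // ResourceRequirements serializes with the correct camelCase keys.
    println!("{}", serde_json::to_string_pretty(&default_resources()).unwrap());
}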
#[derive(Debug, Clone, Serialize)]
pub struct AlertManagerAdditionalPromRules {
#[serde(flatten)]