diff --git a/adr/020-monitoring-alerting-architecture.md b/adr/020-monitoring-alerting-architecture.md new file mode 100644 index 00000000..b529b99f --- /dev/null +++ b/adr/020-monitoring-alerting-architecture.md @@ -0,0 +1,318 @@ +# Architecture Decision Record: Monitoring and Alerting Architecture + +Initial Author: Willem Rolleman, Jean-Gabriel Carrier + +Initial Date: March 9, 2026 + +Last Updated Date: March 9, 2026 + +## Status + +Accepted + +Supersedes: [ADR-010](010-monitoring-and-alerting.md) + +## Context + +Harmony needs a unified approach to monitoring and alerting across different infrastructure targets: + +1. **Cluster-level monitoring**: Administrators managing entire Kubernetes/OKD clusters need to define cluster-wide alerts, receivers, and scrape targets. + +2. **Tenant-level monitoring**: Multi-tenant clusters where teams are confined to namespaces need monitoring scoped to their resources. + +3. **Application-level monitoring**: Developers deploying applications want zero-config monitoring that "just works" for their services. + +The monitoring landscape is fragmented: +- **OKD/OpenShift**: Built-in Prometheus with AlertmanagerConfig CRDs +- **KubePrometheus**: Helm-based stack with PrometheusRule CRDs +- **RHOB (Red Hat Observability)**: Operator-based with MonitoringStack CRDs +- **Standalone Prometheus**: Raw Prometheus deployments + +Each system has different CRDs, different installation methods, and different configuration APIs. + +## Decision + +We implement a **trait-based architecture with compile-time capability verification** that provides: + +1. **Type-safe abstractions** via parameterized traits: `AlertReceiver`, `AlertRule`, `ScrapeTarget` +2. **Compile-time topology compatibility** via the `Observability` capability bound +3. **Three levels of abstraction**: Cluster, Tenant, and Application monitoring +4. 
**Pre-built alert rules** as functions that return typed structs + +### Core Traits + +```rust +// domain/topology/monitoring.rs + +/// Marker trait for systems that send alerts (Prometheus, etc.) +pub trait AlertSender: Send + Sync + std::fmt::Debug { + fn name(&self) -> String; +} + +/// Defines how a receiver (Discord, Slack, etc.) builds its configuration +/// for a specific sender type +pub trait AlertReceiver: std::fmt::Debug + Send + Sync { + fn build(&self) -> Result; + fn name(&self) -> String; + fn clone_box(&self) -> Box>; +} + +/// Defines how an alert rule builds its PrometheusRule configuration +pub trait AlertRule: std::fmt::Debug + Send + Sync { + fn build_rule(&self) -> Result; + fn name(&self) -> String; + fn clone_box(&self) -> Box>; +} + +/// Capability that topologies implement to support monitoring +pub trait Observability { + async fn install_alert_sender(&self, sender: &S, inventory: &Inventory) + -> Result; + async fn install_receivers(&self, sender: &S, inventory: &Inventory, + receivers: Option>>>) -> Result<...>; + async fn install_rules(&self, sender: &S, inventory: &Inventory, + rules: Option>>>) -> Result<...>; + async fn add_scrape_targets(&self, sender: &S, inventory: &Inventory, + scrape_targets: Option>>>) -> Result<...>; + async fn ensure_monitoring_installed(&self, sender: &S, inventory: &Inventory) + -> Result<...>; +} +``` + +### Alert Sender Types + +Each monitoring stack is a distinct `AlertSender`: + +| Sender | Module | Use Case | +|--------|--------|----------| +| `OpenshiftClusterAlertSender` | `monitoring/okd/` | OKD/OpenShift built-in monitoring | +| `KubePrometheus` | `monitoring/kube_prometheus/` | Helm-deployed kube-prometheus-stack | +| `Prometheus` | `monitoring/prometheus/` | Standalone Prometheus via Helm | +| `RedHatClusterObservability` | `monitoring/red_hat_cluster_observability/` | RHOB operator | +| `Grafana` | `monitoring/grafana/` | Grafana-managed alerting | + +### Three Levels of Monitoring + +#### 1. 
Cluster-Level Monitoring + +For cluster administrators. Full control over monitoring infrastructure. + +```rust +// examples/okd_cluster_alerts/src/main.rs +OpenshiftClusterAlertScore { + sender: OpenshiftClusterAlertSender, + receivers: vec![Box::new(DiscordReceiver { ... })], + rules: vec![Box::new(alert_rules)], + scrape_targets: Some(vec![Box::new(external_exporters)]), +} +``` + +**Characteristics:** +- Cluster-scoped CRDs and resources +- Can add external scrape targets (outside cluster) +- Manages Alertmanager configuration +- Requires cluster-admin privileges + +#### 2. Tenant-Level Monitoring + +For teams confined to namespaces. The topology determines tenant context. + +```rust +// The topology's Observability impl handles namespace scoping +impl Observability for K8sAnywhereTopology { + async fn install_rules(&self, sender: &KubePrometheus, ...) { + // Topology knows if it's tenant-scoped + let namespace = self.get_tenant_config().await + .map(|t| t.name) + .unwrap_or("default"); + // Install rules in tenant namespace + } +} +``` + +**Characteristics:** +- Namespace-scoped resources +- Cannot modify cluster-level monitoring config +- May have restricted receiver types +- Runtime validation of permissions (cannot be fully compile-time) + +#### 3. Application-Level Monitoring + +For developers. Zero-config, opinionated monitoring. 
+ +```rust +// modules/application/features/monitoring.rs +pub struct Monitoring { + pub application: Arc, + pub alert_receiver: Vec>>, +} + +impl + TenantManager + ...> + ApplicationFeature for Monitoring +{ + async fn ensure_installed(&self, topology: &T) -> Result<...> { + // Auto-creates ServiceMonitor + // Auto-installs Ntfy for notifications + // Handles tenant namespace automatically + // Wires up sensible defaults + } +} +``` + +**Characteristics:** +- Automatic ServiceMonitor creation +- Opinionated notification channel (Ntfy) +- Tenant-aware via topology +- Minimal configuration required + +## Rationale + +### Why Generic Traits Instead of Unified Types? + +Each monitoring stack (OKD, KubePrometheus, RHOB) has fundamentally different CRDs: + +```rust +// OKD uses AlertmanagerConfig with different structure +AlertmanagerConfig { spec: { receivers: [...] } } + +// RHOB uses secret references for webhook URLs +MonitoringStack { spec: { alertmanagerConfig: { discordConfigs: [{ apiURL: { key: "..." } }] } } } + +// KubePrometheus uses Alertmanager CRD with different field names +Alertmanager { spec: { config: { receivers: [...] } } } +``` + +A unified type would either: +1. Be a lowest-common-denominator (loses stack-specific features) +2. Be a complex union type (hard to use, easy to misconfigure) + +Generic traits let each stack express its configuration naturally while providing a consistent interface. + +### Why Compile-Time Capability Bounds? + +```rust +impl> Score + for OpenshiftClusterAlertScore { ... } +``` + +This fails at compile time if you try to use `OpenshiftClusterAlertScore` with a topology that doesn't support OKD monitoring. This prevents the "config-is-valid-but-platform-is-wrong" errors that Harmony was designed to eliminate. + +### Why Not a MonitoringStack Abstraction (V2 Approach)? 
+ +The V2 approach proposed a unified `MonitoringStack` that hides sender selection: + +```rust +// V2 approach - rejected +MonitoringStack::new(MonitoringApiVersion::V2CRD) + .add_alert_channel(discord) +``` + +**Problems:** +1. Hides which sender you're using, losing compile-time guarantees +2. "Version selection" actually chooses between fundamentally different systems +3. Would need to handle all stack-specific features through a generic interface + +The current approach is explicit: you choose `OpenshiftClusterAlertSender` and the compiler verifies compatibility. + +### Why Runtime Validation for Tenants? + +Tenant confinement is determined at runtime by the topology and K8s RBAC. We cannot know at compile time whether a user has cluster-admin or namespace-only access. + +Options considered: +1. **Compile-time tenant markers** - Would require modeling entire RBAC hierarchy in types. Over-engineering. +2. **Runtime validation** - Current approach. Fails with clear K8s permission errors if insufficient access. +3. **No tenant support** - Would exclude a major use case. + +Runtime validation is the pragmatic choice. The failure mode is clear (K8s API error) and occurs early in execution. + +> Note : we will eventually have compile time validation for such things. Rust macros are powerful and we could discover the actual capabilities we're dealing with, similar to sqlx approach in query! macros. + +## Consequences + +### Pros + +1. **Type Safety**: Invalid configurations are caught at compile time +2. **Extensibility**: Adding a new monitoring stack requires implementing traits, not modifying core code +3. **Clear Separation**: Cluster/Tenant/Application levels have distinct entry points +4. **Reusable Rules**: Pre-built alert rules as functions (`high_pvc_fill_rate_over_two_days()`) +5. **CRD Accuracy**: Type definitions match actual Kubernetes CRDs exactly + +### Cons + +1. 
**Implementation Explosion**: `DiscordReceiver` implements `AlertReceiver` for each sender type (3+ implementations) +2. **Learning Curve**: Understanding the trait hierarchy takes time +3. **clone_box Boilerplate**: Required for trait object cloning (3 lines per impl) + +### Mitigations + +- Implementation explosion is contained: each receiver type has O(senders) implementations, but receivers are rare compared to rules +- Learning curve is documented with examples at each level +- clone_box boilerplate is minimal and copy-paste + +## Alternatives Considered + +### Unified MonitoringStack Type + +See "Why Not a MonitoringStack Abstraction" above. Rejected for losing compile-time safety. + +### Helm-Only Approach + +Use `HelmScore` directly for each monitoring deployment. Rejected because: +- No type safety for alert rules +- Cannot compose with application features +- No tenant awareness + +### Separate Modules Per Use Case + +Have `cluster_monitoring/`, `tenant_monitoring/`, `app_monitoring/` as separate modules. Rejected because: +- Massive code duplication +- No shared abstraction for receivers/rules +- Adding a feature requires three implementations + +## Implementation Notes + +### Module Structure + +``` +modules/monitoring/ +├── mod.rs # Public exports +├── alert_channel/ # Receivers (Discord, Webhook) +├── alert_rule/ # Rules and pre-built alerts +│ ├── prometheus_alert_rule.rs +│ └── alerts/ # Library of pre-built rules +│ ├── k8s/ # K8s-specific (pvc, pod, memory) +│ └── infra/ # Infrastructure (opnsense, dell) +├── okd/ # OpenshiftClusterAlertSender +├── kube_prometheus/ # KubePrometheus +├── prometheus/ # Prometheus +├── red_hat_cluster_observability/ # RHOB +├── grafana/ # Grafana +├── application_monitoring/ # Application-level scores +└── scrape_target/ # External scrape targets +``` + +### Adding a New Alert Sender + +1. Create sender type: `pub struct MySender; impl AlertSender for MySender { ... }` +2. 
Implement `Observability` for topologies that support it +3. Create CRD types in `crd/` subdirectory +4. Implement `AlertReceiver` for existing receivers +5. Implement `AlertRule` for `AlertManagerRuleGroup` + +### Adding a New Alert Rule + +```rust +pub fn my_custom_alert() -> PrometheusAlertRule { + PrometheusAlertRule::new("MyAlert", "up == 0") + .for_duration("5m") + .label("severity", "critical") + .annotation("summary", "Service is down") +} +``` + +No trait implementation needed - `AlertManagerRuleGroup` already handles conversion. + +## Related ADRs + +- [ADR-013](013-monitoring-notifications.md): Notification channel selection (ntfy) +- [ADR-011](011-multi-tenant-cluster.md): Multi-tenant cluster architecture diff --git a/adr/020-monitoring-alerting-architecture/monitoring_v2/Cargo.toml b/adr/020-monitoring-alerting-architecture/monitoring_v2/Cargo.toml new file mode 100644 index 00000000..4638606e --- /dev/null +++ b/adr/020-monitoring-alerting-architecture/monitoring_v2/Cargo.toml @@ -0,0 +1,21 @@ +[package] +name = "example-monitoring-v2" +edition = "2024" +version.workspace = true +readme.workspace = true +license.workspace = true + +[dependencies] +harmony = { path = "../../harmony" } +harmony_cli = { path = "../../harmony_cli" } +harmony-k8s = { path = "../../harmony-k8s" } +harmony_types = { path = "../../harmony_types" } +kube = { workspace = true } +schemars = "0.8" +serde = { workspace = true, features = ["derive"] } +serde_json = { workspace = true } +serde_yaml = { workspace = true } +url = { workspace = true } +log = { workspace = true } +async-trait = { workspace = true } +k8s-openapi = { workspace = true } diff --git a/adr/020-monitoring-alerting-architecture/monitoring_v2/README.md b/adr/020-monitoring-alerting-architecture/monitoring_v2/README.md new file mode 100644 index 00000000..311b55e7 --- /dev/null +++ b/adr/020-monitoring-alerting-architecture/monitoring_v2/README.md @@ -0,0 +1,91 @@ +# Monitoring v2 - Improved Architecture + +This 
example demonstrates the improved monitoring architecture that addresses the "WTF/minute" issues in the original design. + +## Key Improvements + +### 1. **Single AlertChannel Trait with Generic Sender** + +The original design required 9-12 implementations for each alert channel (Discord, Webhook, etc.) - one for each sender type. The new design uses a single trait with generic sender parameterization: + +pub trait AlertChannel { + async fn install_config(&self, sender: &Sender) -> Result; + fn name(&self) -> String; + fn as_any(&self) -> &dyn std::any::Any; +} + +**Benefits:** +- One Discord implementation works with all sender types +- Type safety at compile time +- No runtime dispatch overhead + +### 2. **MonitoringStack Abstraction** + +Instead of manually selecting CRDPrometheus vs KubePrometheus vs RHOBObservability, you now have a unified MonitoringStack that handles versioning: + +let monitoring_stack = MonitoringStack::new(MonitoringApiVersion::V2CRD) + .set_namespace("monitoring") + .add_alert_channel(discord_receiver) + .set_scrape_targets(vec![...]); + +**Benefits:** +- Single source of truth for monitoring configuration +- Easy to switch between monitoring versions +- Automatic version-specific configuration + +### 3. **TenantMonitoringScore - True Composition** + +The original monitoring_with_tenant example just put tenant and monitoring as separate items in a vec. The new design truly composes them: + +let tenant_score = TenantMonitoringScore::new("test-tenant", monitoring_stack); + +This creates a single score that: +- Has tenant context +- Has monitoring configuration +- Automatically installs monitoring scoped to tenant namespace + +**Benefits:** +- No more "two separate things" confusion +- Automatic tenant namespace scoping +- Clear ownership: tenant owns its monitoring + +### 4. 
**Versioned Monitoring APIs** + +Clear versioning makes it obvious which monitoring stack you're using: + +pub enum MonitoringApiVersion { + V1Helm, // Old Helm charts + V2CRD, // Current CRDs + V3RHOB, // RHOB (future) +} + +**Benefits:** +- No guessing which API version you're using +- Easy to migrate between versions +- Backward compatibility path + +## Comparison + +### Original Design (monitoring_with_tenant) +- Manual selection of each component +- Manual installation of both components +- Need to remember to pass both to harmony_cli::run +- Monitoring not scoped to tenant automatically + +### New Design (monitoring_v2) +- Single composed score +- One score does it all + +## Usage + +cd examples/monitoring_v2 +cargo run + +## Migration Path + +To migrate from the old design to the new: + +1. Replace individual alert channel implementations with AlertChannel +2. Use MonitoringStack instead of manual *Prometheus selection +3. Use TenantMonitoringScore instead of separate TenantScore + monitoring scores +4. 
Select monitoring version via MonitoringApiVersion
diff --git a/adr/020-monitoring-alerting-architecture/monitoring_v2/src/lib.rs b/adr/020-monitoring-alerting-architecture/monitoring_v2/src/lib.rs
new file mode 100644
index 00000000..5c0047c6
--- /dev/null
+++ b/adr/020-monitoring-alerting-architecture/monitoring_v2/src/lib.rs
@@ -0,0 +1,343 @@
+use std::collections::HashMap;
+use std::sync::{Arc, Mutex};
+
+
+use log::debug;
+use serde::{Deserialize, Serialize};
+use serde_yaml::{Mapping, Value};
+
+use harmony::data::Version;
+use harmony::interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome};
+use harmony::inventory::Inventory;
+use harmony::score::Score;
+use harmony::topology::{Topology, tenant::TenantManager};
+
+use harmony_k8s::K8sClient;
+use harmony_types::k8s_name::K8sName;
+use harmony_types::net::Url;
+
+pub trait AlertSender: Send + Sync + std::fmt::Debug {
+    fn name(&self) -> String;
+    fn namespace(&self) -> String;
+}
+
+#[derive(Debug)]
+pub struct CRDPrometheus {
+    pub namespace: String,
+    pub client: Arc<K8sClient>,
+}
+
+impl AlertSender for CRDPrometheus {
+    fn name(&self) -> String {
+        "CRDPrometheus".to_string()
+    }
+
+    fn namespace(&self) -> String {
+        self.namespace.clone()
+    }
+}
+
+#[derive(Debug)]
+pub struct RHOBObservability {
+    pub namespace: String,
+    pub client: Arc<K8sClient>,
+}
+
+impl AlertSender for RHOBObservability {
+    fn name(&self) -> String {
+        "RHOBObservability".to_string()
+    }
+
+    fn namespace(&self) -> String {
+        self.namespace.clone()
+    }
+}
+
+#[derive(Debug)]
+pub struct KubePrometheus {
+    pub config: Arc<Mutex<KubePrometheusConfig>>,
+}
+
+impl Default for KubePrometheus {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl KubePrometheus {
+    pub fn new() -> Self {
+        Self {
+            config: Arc::new(Mutex::new(KubePrometheusConfig::new())),
+        }
+    }
+}
+
+impl AlertSender for KubePrometheus {
+    fn name(&self) -> String {
+        "KubePrometheus".to_string()
+    }
+
+    fn namespace(&self) -> String {
+        
self.config.lock().unwrap().namespace.clone().unwrap_or_else(|| "monitoring".to_string())
+    }
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct KubePrometheusConfig {
+    pub namespace: Option<String>,
+    #[serde(skip)]
+    pub alert_receiver_configs: Vec<AlertManagerChannelConfig>,
+}
+
+impl KubePrometheusConfig {
+    pub fn new() -> Self {
+        Self {
+            namespace: None,
+            alert_receiver_configs: Vec::new(),
+        }
+    }
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct AlertManagerChannelConfig {
+    pub channel_receiver: serde_yaml::Value,
+    pub channel_route: serde_yaml::Value,
+}
+
+impl Default for AlertManagerChannelConfig {
+    fn default() -> Self {
+        Self {
+            channel_receiver: serde_yaml::Value::Mapping(Default::default()),
+            channel_route: serde_yaml::Value::Mapping(Default::default()),
+        }
+    }
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ScrapeTargetConfig {
+    pub service_name: String,
+    pub port: String,
+    pub path: String,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub enum MonitoringApiVersion {
+    V1Helm,
+    V2CRD,
+    V3RHOB,
+}
+
+#[derive(Debug, Clone)]
+pub struct MonitoringStack {
+    pub version: MonitoringApiVersion,
+    pub namespace: String,
+    pub alert_channels: Vec<Arc<dyn AlertSender>>,
+    pub scrape_targets: Vec<ScrapeTargetConfig>,
+}
+
+impl MonitoringStack {
+    pub fn new(version: MonitoringApiVersion) -> Self {
+        Self {
+            version,
+            namespace: "monitoring".to_string(),
+            alert_channels: Vec::new(),
+            scrape_targets: Vec::new(),
+        }
+    }
+
+    pub fn set_namespace(mut self, namespace: &str) -> Self {
+        self.namespace = namespace.to_string();
+        self
+    }
+
+    pub fn add_alert_channel(mut self, channel: impl AlertSender + 'static) -> Self {
+        self.alert_channels.push(Arc::new(channel));
+        self
+    }
+
+    pub fn set_scrape_targets(mut self, targets: Vec<(&str, &str, String)>) -> Self {
+        self.scrape_targets = targets
+            .into_iter()
+            .map(|(name, port, path)| ScrapeTargetConfig {
+                service_name: name.to_string(),
+                port: port.to_string(),
+                path,
+            })
+            .collect();
+        self
+    }
+}
+
+pub trait AlertChannel<Sender: AlertSender> {
+    fn install_config(&self, sender: &Sender);
+    fn name(&self) -> String;
+}
+
+#[derive(Debug, Clone)]
+pub struct DiscordWebhook {
+    pub name: K8sName,
+    pub url: Url,
+    pub selectors: Vec<HashMap<String, String>>,
+}
+
+impl DiscordWebhook {
+    fn get_config(&self) -> AlertManagerChannelConfig {
+        let mut route = Mapping::new();
+        route.insert(
+            Value::String("receiver".to_string()),
+            Value::String(self.name.to_string()),
+        );
+        route.insert(
+            Value::String("matchers".to_string()),
+            Value::Sequence(vec![Value::String("alertname!=Watchdog".to_string())]),
+        );
+
+        let mut receiver = Mapping::new();
+        receiver.insert(
+            Value::String("name".to_string()),
+            Value::String(self.name.to_string()),
+        );
+
+        let mut discord_config = Mapping::new();
+        discord_config.insert(
+            Value::String("webhook_url".to_string()),
+            Value::String(self.url.to_string()),
+        );
+
+        receiver.insert(
+            Value::String("discord_configs".to_string()),
+            Value::Sequence(vec![Value::Mapping(discord_config)]),
+        );
+
+        AlertManagerChannelConfig {
+            channel_receiver: Value::Mapping(receiver),
+            channel_route: Value::Mapping(route),
+        }
+    }
+}
+
+impl AlertChannel<CRDPrometheus> for DiscordWebhook {
+    fn install_config(&self, sender: &CRDPrometheus) {
+        debug!("Installing Discord webhook for CRDPrometheus in namespace: {}", sender.namespace());
+        debug!("Config: {:?}", self.get_config());
+        debug!("Installed!");
+    }
+
+    fn name(&self) -> String {
+        "discord-webhook".to_string()
+    }
+}
+
+impl AlertChannel<RHOBObservability> for DiscordWebhook {
+    fn install_config(&self, sender: &RHOBObservability) {
+        debug!("Installing Discord webhook for RHOBObservability in namespace: {}", sender.namespace());
+        debug!("Config: {:?}", self.get_config());
+        debug!("Installed!");
+    }
+
+    fn name(&self) -> String {
+        "webhook-receiver".to_string()
+    }
+}
+
+impl AlertChannel<KubePrometheus> for DiscordWebhook {
+    fn install_config(&self, sender: &KubePrometheus) {
+        debug!("Installing Discord webhook for KubePrometheus in namespace: {}", sender.namespace());
+        
let mut config = sender.config.lock().unwrap();
+        let ns = config.namespace.clone().unwrap_or_else(|| "monitoring".to_string());
+        debug!("Namespace: {}", ns);
+        // Reuse the guard taken above: locking `sender.config` a second time here would deadlock.
+        config.alert_receiver_configs.push(self.get_config());
+        debug!("Installed!");
+    }
+
+    fn name(&self) -> String {
+        "discord-webhook".to_string()
+    }
+}
+
+fn default_monitoring_stack() -> MonitoringStack {
+    MonitoringStack::new(MonitoringApiVersion::V2CRD)
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct TenantMonitoringScore {
+    pub tenant_id: harmony_types::id::Id,
+    pub tenant_name: String,
+    #[serde(skip)]
+    #[serde(default = "default_monitoring_stack")]
+    pub monitoring_stack: MonitoringStack,
+}
+
+impl TenantMonitoringScore {
+    pub fn new(tenant_name: &str, monitoring_stack: MonitoringStack) -> Self {
+        Self {
+            tenant_id: harmony_types::id::Id::default(),
+            tenant_name: tenant_name.to_string(),
+            monitoring_stack,
+        }
+    }
+}
+
+impl<T: Topology + TenantManager> Score<T> for TenantMonitoringScore {
+    fn create_interpret(&self) -> Box<dyn Interpret<T>> {
+        Box::new(TenantMonitoringInterpret {
+            score: self.clone(),
+        })
+    }
+
+    fn name(&self) -> String {
+        format!("{} monitoring [TenantMonitoringScore]", self.tenant_name)
+    }
+}
+
+#[derive(Debug)]
+pub struct TenantMonitoringInterpret {
+    pub score: TenantMonitoringScore,
+}
+
+#[async_trait::async_trait]
+impl<T: Topology + TenantManager> Interpret<T> for TenantMonitoringInterpret {
+    async fn execute(
+        &self,
+        _inventory: &Inventory,
+        topology: &T,
+    ) -> Result<Outcome, InterpretError> {
+        let tenant_config = topology.get_tenant_config().await.unwrap();
+        let tenant_ns = tenant_config.name.clone();
+
+        match self.score.monitoring_stack.version {
+            MonitoringApiVersion::V1Helm => {
+                debug!("Installing Helm monitoring for tenant {}", tenant_ns);
+            }
+            MonitoringApiVersion::V2CRD => {
+                debug!("Installing CRD monitoring for tenant {}", tenant_ns);
+            }
+            MonitoringApiVersion::V3RHOB => {
+                debug!("Installing RHOB monitoring for tenant {}", tenant_ns);
+            }
+        }
+
+        
Ok(Outcome::success(format!( + "Installed monitoring stack for tenant {} with version {:?}", + self.score.tenant_name, + self.score.monitoring_stack.version + ))) + } + + fn get_name(&self) -> InterpretName { + InterpretName::Custom("TenantMonitoringInterpret") + } + + fn get_version(&self) -> Version { + Version::from("1.0.0").unwrap() + } + + fn get_status(&self) -> InterpretStatus { + InterpretStatus::SUCCESS + } + + fn get_children(&self) -> Vec { + Vec::new() + } +} diff --git a/docs/README.md b/docs/README.md index e05f8872..fa4b431b 100644 --- a/docs/README.md +++ b/docs/README.md @@ -31,3 +31,16 @@ Ready to build your own components? These guides show you how. - [**Writing a Score**](./guides/writing-a-score.md): Learn how to create your own `Score` and `Interpret` logic to define a new desired state. - [**Writing a Topology**](./guides/writing-a-topology.md): Learn how to model a new environment (like AWS, GCP, or custom hardware) as a `Topology`. - [**Adding Capabilities**](./guides/adding-capabilities.md): See how to add a `Capability` to your custom `Topology`. +- [**Coding Guide**](./coding-guide.md): Conventions and best practices for writing Harmony code. + +## 5. Module Documentation + +Deep dives into specific Harmony modules and features. + +- [**Monitoring and Alerting**](./monitoring.md): Comprehensive guide to cluster, tenant, and application-level monitoring with support for OKD, KubePrometheus, RHOB, and more. + +## 6. Architecture Decision Records + +Important architectural decisions are documented in the `adr/` directory: + +- [Full ADR Index](../adr/) diff --git a/docs/monitoring.md b/docs/monitoring.md new file mode 100644 index 00000000..b4508e74 --- /dev/null +++ b/docs/monitoring.md @@ -0,0 +1,443 @@ +# Monitoring and Alerting in Harmony + +Harmony provides a unified, type-safe approach to monitoring and alerting across Kubernetes, OpenShift, and bare-metal infrastructure. 
This guide explains the architecture and how to use it at different levels of abstraction. + +## Overview + +Harmony's monitoring module supports three distinct use cases: + +| Level | Who Uses It | What It Provides | +|-------|-------------|------------------| +| **Cluster** | Cluster administrators | Full control over monitoring stack, cluster-wide alerts, external scrape targets | +| **Tenant** | Platform teams | Namespace-scoped monitoring in multi-tenant environments | +| **Application** | Application developers | Zero-config monitoring that "just works" | + +Each level builds on the same underlying abstractions, ensuring consistency while providing appropriate complexity for each audience. + +## Core Concepts + +### AlertSender + +An `AlertSender` represents the system that evaluates alert rules and sends notifications. Harmony supports multiple monitoring stacks: + +| Sender | Description | Use When | +|--------|-------------|----------| +| `OpenshiftClusterAlertSender` | OKD/OpenShift built-in monitoring | Running on OKD/OpenShift | +| `KubePrometheus` | kube-prometheus-stack via Helm | Standard Kubernetes, need full stack | +| `Prometheus` | Standalone Prometheus | Custom Prometheus deployment | +| `RedHatClusterObservability` | RHOB operator | Red Hat managed clusters | +| `Grafana` | Grafana-managed alerting | Grafana as primary alerting layer | + +### AlertReceiver + +An `AlertReceiver` defines where alerts are sent (Discord, Slack, email, webhook, etc.). Receivers are parameterized by sender type because each monitoring stack has different configuration formats. + +```rust +pub trait AlertReceiver { + fn build(&self) -> Result; + fn name(&self) -> String; +} +``` + +Built-in receivers: +- `DiscordReceiver` - Discord webhooks +- `WebhookReceiver` - Generic HTTP webhooks + +### AlertRule + +An `AlertRule` defines a Prometheus alert expression. Rules are also parameterized by sender to handle different CRD formats. 
+ +```rust +pub trait AlertRule { + fn build_rule(&self) -> Result; + fn name(&self) -> String; +} +``` + +### Observability Capability + +Topologies implement `Observability` to indicate they support a specific alert sender: + +```rust +impl Observability for K8sAnywhereTopology { + async fn install_receivers(&self, sender, inventory, receivers) { ... } + async fn install_rules(&self, sender, inventory, rules) { ... } + // ... +} +``` + +This provides **compile-time verification**: if you try to use `OpenshiftClusterAlertScore` with a topology that doesn't implement `Observability`, the code won't compile. + +--- + +## Level 1: Cluster Monitoring + +Cluster monitoring is for administrators who need full control over the monitoring infrastructure. This includes: +- Installing/managing the monitoring stack +- Configuring cluster-wide alert receivers +- Defining cluster-level alert rules +- Adding external scrape targets (e.g., bare-metal servers, firewalls) + +### Example: OKD Cluster Alerts + +```rust +use harmony::{ + modules::monitoring::{ + alert_channel::discord_alert_channel::DiscordReceiver, + alert_rule::{alerts::k8s::pvc::high_pvc_fill_rate_over_two_days, prometheus_alert_rule::AlertManagerRuleGroup}, + okd::openshift_cluster_alerting_score::OpenshiftClusterAlertScore, + scrape_target::prometheus_node_exporter::PrometheusNodeExporter, + }, + topology::{K8sAnywhereTopology, monitoring::{AlertMatcher, AlertRoute, MatchOp}}, +}; + +let severity_matcher = AlertMatcher { + label: "severity".to_string(), + operator: MatchOp::Eq, + value: "critical".to_string(), +}; + +let rule_group = AlertManagerRuleGroup::new( + "cluster-rules", + vec![high_pvc_fill_rate_over_two_days()], +); + +let external_exporter = PrometheusNodeExporter { + job_name: "firewall".to_string(), + metrics_path: "/metrics".to_string(), + listen_address: ip!("192.168.1.1"), + port: 9100, + ..Default::default() +}; + +harmony_cli::run( + Inventory::autoload(), + K8sAnywhereTopology::from_env(), + 
vec![Box::new(OpenshiftClusterAlertScore { + sender: OpenshiftClusterAlertSender, + receivers: vec![Box::new(DiscordReceiver { + name: "critical-alerts".to_string(), + url: hurl!("https://discord.com/api/webhooks/..."), + route: AlertRoute { + matchers: vec![severity_matcher], + ..AlertRoute::default("critical-alerts".to_string()) + }, + })], + rules: vec![Box::new(rule_group)], + scrape_targets: Some(vec![Box::new(external_exporter)]), + })], + None, +).await?; +``` + +### What This Does + +1. **Enables cluster monitoring** - Activates OKD's built-in Prometheus +2. **Enables user workload monitoring** - Allows namespace-scoped rules +3. **Configures Alertmanager** - Adds Discord receiver with route matching +4. **Deploys alert rules** - Creates `AlertingRule` CRD with PVC fill rate alert +5. **Adds external scrape target** - Configures Prometheus to scrape the firewall + +### Compile-Time Safety + +The `OpenshiftClusterAlertScore` requires: + +```rust +impl> Score + for OpenshiftClusterAlertScore +``` + +If `K8sAnywhereTopology` didn't implement `Observability`, this code would fail to compile. You cannot accidentally deploy OKD alerts to a cluster that doesn't support them. + +--- + +## Level 2: Tenant Monitoring + +In multi-tenant clusters, teams are often confined to specific namespaces. 
Tenant monitoring adapts to this constraint: + +- Resources are deployed in the tenant's namespace +- Cannot modify cluster-level monitoring configuration +- The topology determines namespace context at runtime + +### How It Works + +The topology's `Observability` implementation handles tenant scoping: + +```rust +impl Observability for K8sAnywhereTopology { + async fn install_rules(&self, sender, inventory, rules) { + // Topology knows if it's tenant-scoped + let namespace = self.get_tenant_config().await + .map(|t| t.name) + .unwrap_or_else(|| "monitoring".to_string()); + + // Rules are installed in the appropriate namespace + for rule in rules.unwrap_or_default() { + let score = KubePrometheusRuleScore { + sender: sender.clone(), + rule, + namespace: namespace.clone(), // Tenant namespace + }; + score.create_interpret().execute(inventory, self).await?; + } + } +} +``` + +### Tenant vs Cluster Resources + +| Resource | Cluster-Level | Tenant-Level | +|----------|---------------|--------------| +| Alertmanager config | Global receivers | Namespaced receivers (where supported) | +| PrometheusRules | Cluster-wide alerts | Namespace alerts only | +| ServiceMonitors | Any namespace | Own namespace only | +| External scrape targets | Can add | Cannot add (cluster config) | + +### Runtime Validation + +Tenant constraints are validated at runtime via Kubernetes RBAC. If a tenant-scoped deployment attempts cluster-level operations, it fails with a clear permission error from the Kubernetes API. + +This cannot be fully compile-time because tenant context is determined by who's running the code and what permissions they have—information only available at runtime. + +--- + +## Level 3: Application Monitoring + +Application monitoring provides zero-config, opinionated monitoring for developers. Just add the `Monitoring` feature to your application and it works. 
+ +### Example + +```rust +use harmony::modules::{ + application::{Application, ApplicationFeature}, + monitoring::alert_channel::webhook_receiver::WebhookReceiver, +}; + +// Define your application +let my_app = MyApplication::new(); + +// Add monitoring as a feature +let monitoring = Monitoring { + application: Arc::new(my_app), + alert_receiver: vec![], // Uses defaults +}; + +// Install with the application +my_app.add_feature(monitoring); +``` + +### What Application Monitoring Provides + +1. **Automatic ServiceMonitor** - Creates a ServiceMonitor for your application's pods +2. **Ntfy Notification Channel** - Auto-installs and configures Ntfy for push notifications +3. **Tenant Awareness** - Automatically scopes to the correct namespace +4. **Sensible Defaults** - Pre-configured alert routes and receivers + +### Under the Hood + +```rust +impl + TenantManager> + ApplicationFeature for Monitoring +{ + async fn ensure_installed(&self, topology: &T) -> Result<...> { + // 1. Get tenant namespace (or use app name) + let namespace = topology.get_tenant_config().await + .map(|ns| ns.name.clone()) + .unwrap_or_else(|| self.application.name()); + + // 2. Create ServiceMonitor for the app + let app_service_monitor = ServiceMonitor { + metadata: ObjectMeta { + name: Some(self.application.name()), + namespace: Some(namespace.clone()), + ..Default::default() + }, + spec: ServiceMonitorSpec::default(), + }; + + // 3. Install Ntfy for notifications + let ntfy = NtfyScore { namespace, host }; + ntfy.interpret(&Inventory::empty(), topology).await?; + + // 4. Wire up webhook receiver to Ntfy + let ntfy_receiver = WebhookReceiver { ... }; + + // 5. 
Execute monitoring score + alerting_score.interpret(&Inventory::empty(), topology).await?; + } +} +``` + +--- + +## Pre-Built Alert Rules + +Harmony provides a library of common alert rules in `modules/monitoring/alert_rule/alerts/`: + +### Kubernetes Alerts (`alerts/k8s/`) + +```rust +use harmony::modules::monitoring::alert_rule::alerts::k8s::{ + pod::pod_failed, + pvc::high_pvc_fill_rate_over_two_days, + memory_usage::alert_high_memory_usage, +}; + +let rules = AlertManagerRuleGroup::new("k8s-rules", vec![ + pod_failed(), + high_pvc_fill_rate_over_two_days(), + alert_high_memory_usage(), +]); +``` + +Available rules: +- `pod_failed()` - Pod in failed state +- `alert_container_restarting()` - Container restart loop +- `alert_pod_not_ready()` - Pod not ready for extended period +- `high_pvc_fill_rate_over_two_days()` - PVC will fill within 2 days +- `alert_high_memory_usage()` - Memory usage above threshold +- `alert_high_cpu_usage()` - CPU usage above threshold + +### Infrastructure Alerts (`alerts/infra/`) + +```rust +use harmony::modules::monitoring::alert_rule::alerts::infra::opnsense::high_http_error_rate; + +let rules = AlertManagerRuleGroup::new("infra-rules", vec![ + high_http_error_rate(), +]); +``` + +### Creating Custom Rules + +```rust +use harmony::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule; + +pub fn my_custom_alert() -> PrometheusAlertRule { + PrometheusAlertRule::new("MyServiceDown", "up{job=\"my-service\"} == 0") + .for_duration("5m") + .label("severity", "critical") + .annotation("summary", "My service is down") + .annotation("description", "The my-service job has been down for more than 5 minutes") +} +``` + +--- + +## Alert Receivers + +### Discord Webhook + +```rust +use harmony::modules::monitoring::alert_channel::discord_alert_channel::DiscordReceiver; +use harmony::topology::monitoring::{AlertRoute, AlertMatcher, MatchOp}; + +let discord = DiscordReceiver { + name: "ops-alerts".to_string(), + url: 
hurl!("https://discord.com/api/webhooks/123456/abcdef"), + route: AlertRoute { + receiver: "ops-alerts".to_string(), + matchers: vec![AlertMatcher { + label: "severity".to_string(), + operator: MatchOp::Eq, + value: "critical".to_string(), + }], + group_by: vec!["alertname".to_string()], + repeat_interval: Some("30m".to_string()), + continue_matching: false, + children: vec![], + }, +}; +``` + +### Generic Webhook + +```rust +use harmony::modules::monitoring::alert_channel::webhook_receiver::WebhookReceiver; + +let webhook = WebhookReceiver { + name: "custom-webhook".to_string(), + url: hurl!("https://api.example.com/alerts"), + route: AlertRoute::default("custom-webhook".to_string()), +}; +``` + +--- + +## Adding a New Monitoring Stack + +To add support for a new monitoring stack: + +1. **Create the sender type** in `modules/monitoring/my_sender/mod.rs`: + ```rust + #[derive(Debug, Clone)] + pub struct MySender; + + impl AlertSender for MySender { + fn name(&self) -> String { "MySender".to_string() } + } + ``` + +2. **Define CRD types** in `modules/monitoring/my_sender/crd/`: + ```rust + #[derive(CustomResource, Debug, Serialize, Deserialize, Clone)] + #[kube(group = "monitoring.example.com", version = "v1", kind = "MyAlertRule")] + pub struct MyAlertRuleSpec { ... } + ``` + +3. **Implement Observability** in `domain/topology/k8s_anywhere/observability/my_sender.rs`: + ```rust + impl Observability for K8sAnywhereTopology { + async fn install_receivers(&self, sender, inventory, receivers) { ... } + async fn install_rules(&self, sender, inventory, rules) { ... } + // ... + } + ``` + +4. **Implement receiver conversions** for existing receivers: + ```rust + impl AlertReceiver for DiscordReceiver { + fn build(&self) -> Result { + // Convert DiscordReceiver to MySender's format + } + } + ``` + +5. 
**Create score types**: + ```rust + pub struct MySenderAlertScore { + pub sender: MySender, + pub receivers: Vec>>, + pub rules: Vec>>, + } + ``` + +--- + +## Architecture Principles + +### Type Safety Over Flexibility + +Each monitoring stack has distinct CRDs and configuration formats. Rather than a unified "MonitoringStack" type that loses stack-specific features, we use generic traits that provide type safety while allowing each stack to express its unique configuration. + +### Compile-Time Capability Verification + +The `Observability` bound ensures you can't deploy OKD alerts to a KubePrometheus cluster. The compiler catches platform mismatches before deployment. + +### Explicit Over Implicit + +Monitoring stacks are chosen explicitly (`OpenshiftClusterAlertSender` vs `KubePrometheus`). There's no "auto-detection" that could lead to surprising behavior. + +### Three Levels, One Foundation + +Cluster, tenant, and application monitoring all use the same traits (`AlertSender`, `AlertReceiver`, `AlertRule`). The difference is in how scores are constructed and how topologies interpret them. 
+ +--- + +## Related Documentation + +- [ADR-020: Monitoring and Alerting Architecture](../adr/020-monitoring-alerting-architecture.md) +- [ADR-013: Monitoring Notifications (ntfy)](../adr/013-monitoring-notifications.md) +- [ADR-011: Multi-Tenant Cluster Architecture](../adr/011-multi-tenant-cluster.md) +- [Coding Guide](coding-guide.md) +- [Core Concepts](concepts.md) diff --git a/examples/application_monitoring_with_tenant/src/main.rs b/examples/application_monitoring_with_tenant/src/main.rs index 8f6783c3..7e9ffd5f 100644 --- a/examples/application_monitoring_with_tenant/src/main.rs +++ b/examples/application_monitoring_with_tenant/src/main.rs @@ -7,7 +7,7 @@ use harmony::{ monitoring::alert_channel::webhook_receiver::WebhookReceiver, tenant::TenantScore, }, - topology::{K8sAnywhereTopology, tenant::TenantConfig}, + topology::{K8sAnywhereTopology, monitoring::AlertRoute, tenant::TenantConfig}, }; use harmony_types::id::Id; use harmony_types::net::Url; @@ -33,9 +33,14 @@ async fn main() { service_port: 3000, }); + let receiver_name = "sample-webhook-receiver".to_string(); + let webhook_receiver = WebhookReceiver { - name: "sample-webhook-receiver".to_string(), + name: receiver_name.clone(), url: Url::Url(url::Url::parse("https://webhook-doesnt-exist.com").unwrap()), + route: AlertRoute { + ..AlertRoute::default(receiver_name) + }, }; let app = ApplicationScore { diff --git a/examples/monitoring/src/main.rs b/examples/monitoring/src/main.rs index 7037094d..d1304db7 100644 --- a/examples/monitoring/src/main.rs +++ b/examples/monitoring/src/main.rs @@ -1,37 +1,45 @@ -use std::collections::HashMap; +use std::{ + collections::HashMap, + sync::{Arc, Mutex}, +}; use harmony::{ inventory::Inventory, - modules::{ - monitoring::{ - alert_channel::discord_alert_channel::DiscordWebhook, - alert_rule::prometheus_alert_rule::AlertManagerRuleGroup, - kube_prometheus::{ - helm_prometheus_alert_score::HelmPrometheusAlertingScore, - types::{ - HTTPScheme, MatchExpression, 
Operator, Selector, ServiceMonitor, - ServiceMonitorEndpoint, + modules::monitoring::{ + alert_channel::discord_alert_channel::DiscordReceiver, + alert_rule::{ + alerts::{ + infra::dell_server::{ + alert_global_storage_status_critical, + alert_global_storage_status_non_recoverable, + global_storage_status_degraded_non_critical, }, + k8s::pvc::high_pvc_fill_rate_over_two_days, }, + prometheus_alert_rule::AlertManagerRuleGroup, }, - prometheus::alerts::{ - infra::dell_server::{ - alert_global_storage_status_critical, alert_global_storage_status_non_recoverable, - global_storage_status_degraded_non_critical, + kube_prometheus::{ + helm::config::KubePrometheusConfig, + kube_prometheus_alerting_score::KubePrometheusAlertingScore, + types::{ + HTTPScheme, MatchExpression, Operator, Selector, ServiceMonitor, + ServiceMonitorEndpoint, }, - k8s::pvc::high_pvc_fill_rate_over_two_days, }, }, - topology::K8sAnywhereTopology, + topology::{K8sAnywhereTopology, monitoring::AlertRoute}, }; use harmony_types::{k8s_name::K8sName, net::Url}; #[tokio::main] async fn main() { - let discord_receiver = DiscordWebhook { - name: K8sName("test-discord".to_string()), + let receiver_name = "test-discord".to_string(); + let discord_receiver = DiscordReceiver { + name: receiver_name.clone(), url: Url::Url(url::Url::parse("https://discord.doesnt.exist.com").unwrap()), - selectors: vec![], + route: AlertRoute { + ..AlertRoute::default(receiver_name) + }, }; let high_pvc_fill_rate_over_two_days_alert = high_pvc_fill_rate_over_two_days(); @@ -70,10 +78,15 @@ async fn main() { endpoints: vec![service_monitor_endpoint], ..Default::default() }; - let alerting_score = HelmPrometheusAlertingScore { + + let config = Arc::new(Mutex::new(KubePrometheusConfig::new())); + + let alerting_score = KubePrometheusAlertingScore { receivers: vec![Box::new(discord_receiver)], rules: vec![Box::new(additional_rules), Box::new(additional_rules2)], service_monitors: vec![service_monitor], + scrape_targets: None, + 
config, }; harmony_cli::run( diff --git a/examples/monitoring_with_tenant/src/main.rs b/examples/monitoring_with_tenant/src/main.rs index f67f9d8a..6afc302c 100644 --- a/examples/monitoring_with_tenant/src/main.rs +++ b/examples/monitoring_with_tenant/src/main.rs @@ -1,24 +1,32 @@ -use std::{collections::HashMap, str::FromStr}; +use std::{ + collections::HashMap, + str::FromStr, + sync::{Arc, Mutex}, +}; use harmony::{ inventory::Inventory, modules::{ monitoring::{ - alert_channel::discord_alert_channel::DiscordWebhook, - alert_rule::prometheus_alert_rule::AlertManagerRuleGroup, + alert_channel::discord_alert_channel::DiscordReceiver, + alert_rule::{ + alerts::k8s::pvc::high_pvc_fill_rate_over_two_days, + prometheus_alert_rule::AlertManagerRuleGroup, + }, kube_prometheus::{ - helm_prometheus_alert_score::HelmPrometheusAlertingScore, + helm::config::KubePrometheusConfig, + kube_prometheus_alerting_score::KubePrometheusAlertingScore, types::{ HTTPScheme, MatchExpression, Operator, Selector, ServiceMonitor, ServiceMonitorEndpoint, }, }, }, - prometheus::alerts::k8s::pvc::high_pvc_fill_rate_over_two_days, tenant::TenantScore, }, topology::{ K8sAnywhereTopology, + monitoring::AlertRoute, tenant::{ResourceLimits, TenantConfig, TenantNetworkPolicy}, }, }; @@ -42,10 +50,13 @@ async fn main() { }, }; - let discord_receiver = DiscordWebhook { - name: K8sName("test-discord".to_string()), + let receiver_name = "test-discord".to_string(); + let discord_receiver = DiscordReceiver { + name: receiver_name.clone(), url: Url::Url(url::Url::parse("https://discord.doesnt.exist.com").unwrap()), - selectors: vec![], + route: AlertRoute { + ..AlertRoute::default(receiver_name) + }, }; let high_pvc_fill_rate_over_two_days_alert = high_pvc_fill_rate_over_two_days(); @@ -74,10 +85,14 @@ async fn main() { ..Default::default() }; - let alerting_score = HelmPrometheusAlertingScore { + let config = Arc::new(Mutex::new(KubePrometheusConfig::new())); + + let alerting_score = 
KubePrometheusAlertingScore { receivers: vec![Box::new(discord_receiver)], rules: vec![Box::new(additional_rules)], service_monitors: vec![service_monitor], + scrape_targets: None, + config, }; harmony_cli::run( diff --git a/examples/okd_cluster_alerts/src/main.rs b/examples/okd_cluster_alerts/src/main.rs index 93dac3b4..beed679f 100644 --- a/examples/okd_cluster_alerts/src/main.rs +++ b/examples/okd_cluster_alerts/src/main.rs @@ -1,35 +1,64 @@ -use std::collections::HashMap; - use harmony::{ inventory::Inventory, modules::monitoring::{ - alert_channel::discord_alert_channel::DiscordWebhook, - okd::cluster_monitoring::OpenshiftClusterAlertScore, + alert_channel::discord_alert_channel::DiscordReceiver, + alert_rule::{ + alerts::{ + infra::opnsense::high_http_error_rate, k8s::pvc::high_pvc_fill_rate_over_two_days, + }, + prometheus_alert_rule::AlertManagerRuleGroup, + }, + okd::openshift_cluster_alerting_score::OpenshiftClusterAlertScore, + scrape_target::prometheus_node_exporter::PrometheusNodeExporter, + }, + topology::{ + K8sAnywhereTopology, + monitoring::{AlertMatcher, AlertRoute, MatchOp}, }, - topology::K8sAnywhereTopology, }; -use harmony_macros::hurl; -use harmony_types::k8s_name::K8sName; + +use harmony_macros::{hurl, ip}; #[tokio::main] async fn main() { - let mut sel = HashMap::new(); - sel.insert( - "openshift_io_alert_source".to_string(), - "platform".to_string(), - ); - let mut sel2 = HashMap::new(); - sel2.insert("openshift_io_alert_source".to_string(), "".to_string()); - let selectors = vec![sel, sel2]; + let platform_matcher = AlertMatcher { + label: "prometheus".to_string(), + operator: MatchOp::Eq, + value: "openshift-monitoring/k8s".to_string(), + }; + let severity = AlertMatcher { + label: "severity".to_string(), + operator: MatchOp::Eq, + value: "critical".to_string(), + }; + + let high_http_error_rate = high_http_error_rate(); + + let additional_rules = AlertManagerRuleGroup::new("test-rule", vec![high_http_error_rate]); + + let scrape_target 
= PrometheusNodeExporter { + job_name: "firewall".to_string(), + metrics_path: "/metrics".to_string(), + listen_address: ip!("192.168.1.1"), + port: 9100, + ..Default::default() + }; + harmony_cli::run( Inventory::autoload(), K8sAnywhereTopology::from_env(), vec![Box::new(OpenshiftClusterAlertScore { - receivers: vec![Box::new(DiscordWebhook { - name: K8sName("wills-discord-webhook-example".to_string()), - url: hurl!("https://something.io"), - selectors: selectors, + receivers: vec![Box::new(DiscordReceiver { + name: "crit-wills-discord-channel-example".to_string(), + url: hurl!("https://test.io"), + route: AlertRoute { + matchers: vec![severity], + ..AlertRoute::default("crit-wills-discord-channel-example".to_string()) + }, })], + sender: harmony::modules::monitoring::okd::OpenshiftClusterAlertSender, + rules: vec![Box::new(additional_rules)], + scrape_targets: Some(vec![Box::new(scrape_target)]), })], None, ) diff --git a/examples/rhob_application_monitoring/src/main.rs b/examples/rhob_application_monitoring/src/main.rs index a2596fa0..1d2df53d 100644 --- a/examples/rhob_application_monitoring/src/main.rs +++ b/examples/rhob_application_monitoring/src/main.rs @@ -6,9 +6,9 @@ use harmony::{ application::{ ApplicationScore, RustWebFramework, RustWebapp, features::rhob_monitoring::Monitoring, }, - monitoring::alert_channel::discord_alert_channel::DiscordWebhook, + monitoring::alert_channel::discord_alert_channel::DiscordReceiver, }, - topology::K8sAnywhereTopology, + topology::{K8sAnywhereTopology, monitoring::AlertRoute}, }; use harmony_types::{k8s_name::K8sName, net::Url}; @@ -22,18 +22,21 @@ async fn main() { service_port: 3000, }); - let discord_receiver = DiscordWebhook { - name: K8sName("test-discord".to_string()), + let receiver_name = "test-discord".to_string(); + let discord_receiver = DiscordReceiver { + name: receiver_name.clone(), url: Url::Url(url::Url::parse("https://discord.doesnt.exist.com").unwrap()), - selectors: vec![], + route: AlertRoute { + 
..AlertRoute::default(receiver_name) + }, }; let app = ApplicationScore { features: vec![ - Box::new(Monitoring { - application: application.clone(), - alert_receiver: vec![Box::new(discord_receiver)], - }), + // Box::new(Monitoring { + // application: application.clone(), + // alert_receiver: vec![Box::new(discord_receiver)], + // }), // TODO add backups, multisite ha, etc ], application, diff --git a/examples/rust/src/main.rs b/examples/rust/src/main.rs index f100b41f..e66ee235 100644 --- a/examples/rust/src/main.rs +++ b/examples/rust/src/main.rs @@ -8,13 +8,13 @@ use harmony::{ features::{Monitoring, PackagingDeployment}, }, monitoring::alert_channel::{ - discord_alert_channel::DiscordWebhook, webhook_receiver::WebhookReceiver, + discord_alert_channel::DiscordReceiver, webhook_receiver::WebhookReceiver, }, }, - topology::K8sAnywhereTopology, + topology::{K8sAnywhereTopology, monitoring::AlertRoute}, }; use harmony_macros::hurl; -use harmony_types::k8s_name::K8sName; +use harmony_types::{k8s_name::K8sName, net::Url}; #[tokio::main] async fn main() { @@ -26,15 +26,23 @@ async fn main() { service_port: 3000, }); - let discord_receiver = DiscordWebhook { - name: K8sName("test-discord".to_string()), - url: hurl!("https://discord.doesnt.exist.com"), - selectors: vec![], + let receiver_name = "test-discord".to_string(); + let discord_receiver = DiscordReceiver { + name: receiver_name.clone(), + url: Url::Url(url::Url::parse("https://discord.doesnt.exist.com").unwrap()), + route: AlertRoute { + ..AlertRoute::default(receiver_name) + }, }; + let receiver_name = "sample-webhook-receiver".to_string(); + let webhook_receiver = WebhookReceiver { - name: "sample-webhook-receiver".to_string(), + name: receiver_name.clone(), url: hurl!("https://webhook-doesnt-exist.com"), + route: AlertRoute { + ..AlertRoute::default(receiver_name) + }, }; let app = ApplicationScore { @@ -42,10 +50,10 @@ async fn main() { Box::new(PackagingDeployment { application: application.clone(), }), - 
Box::new(Monitoring { - application: application.clone(), - alert_receiver: vec![Box::new(discord_receiver), Box::new(webhook_receiver)], - }), + // Box::new(Monitoring { + // application: application.clone(), + // alert_receiver: vec![Box::new(discord_receiver), Box::new(webhook_receiver)], + // }), // TODO add backups, multisite ha, etc ], application, diff --git a/examples/try_rust_webapp/src/main.rs b/examples/try_rust_webapp/src/main.rs index d3c88b32..aa954f63 100644 --- a/examples/try_rust_webapp/src/main.rs +++ b/examples/try_rust_webapp/src/main.rs @@ -1,11 +1,8 @@ use harmony::{ inventory::Inventory, - modules::{ - application::{ - ApplicationScore, RustWebFramework, RustWebapp, - features::{Monitoring, PackagingDeployment}, - }, - monitoring::alert_channel::discord_alert_channel::DiscordWebhook, + modules::application::{ + ApplicationScore, RustWebFramework, RustWebapp, + features::{Monitoring, PackagingDeployment}, }, topology::K8sAnywhereTopology, }; @@ -30,14 +27,14 @@ async fn main() { Box::new(PackagingDeployment { application: application.clone(), }), - Box::new(Monitoring { - application: application.clone(), - alert_receiver: vec![Box::new(DiscordWebhook { - name: K8sName("test-discord".to_string()), - url: hurl!("https://discord.doesnt.exist.com"), - selectors: vec![], - })], - }), + // Box::new(Monitoring { + // application: application.clone(), + // alert_receiver: vec![Box::new(DiscordWebhook { + // name: K8sName("test-discord".to_string()), + // url: hurl!("https://discord.doesnt.exist.com"), + // selectors: vec![], + // })], + // }), ], application, }; diff --git a/harmony/src/domain/topology/k8s_anywhere/k8s_anywhere.rs b/harmony/src/domain/topology/k8s_anywhere/k8s_anywhere.rs index 0f93a182..61bae5c6 100644 --- a/harmony/src/domain/topology/k8s_anywhere/k8s_anywhere.rs +++ b/harmony/src/domain/topology/k8s_anywhere/k8s_anywhere.rs @@ -1,4 +1,4 @@ -use std::{collections::BTreeMap, process::Command, sync::Arc, time::Duration}; +use 
std::{collections::BTreeMap, process::Command, sync::Arc}; use async_trait::async_trait; use base64::{Engine, engine::general_purpose}; @@ -8,7 +8,7 @@ use k8s_openapi::api::{ core::v1::{Pod, Secret}, rbac::v1::{ClusterRoleBinding, RoleRef, Subject}, }; -use kube::api::{DynamicObject, GroupVersionKind, ObjectMeta}; +use kube::api::{GroupVersionKind, ObjectMeta}; use log::{debug, info, trace, warn}; use serde::Serialize; use tokio::sync::OnceCell; @@ -29,28 +29,7 @@ use crate::{ score_cert_management::CertificateManagementScore, }, k3d::K3DInstallationScore, - k8s::ingress::{K8sIngressScore, PathType}, - monitoring::{ - grafana::{grafana::Grafana, helm::helm_grafana::grafana_helm_chart_score}, - kube_prometheus::crd::{ - crd_alertmanager_config::CRDPrometheus, - crd_grafana::{ - Grafana as GrafanaCRD, GrafanaCom, GrafanaDashboard, - GrafanaDashboardDatasource, GrafanaDashboardSpec, GrafanaDatasource, - GrafanaDatasourceConfig, GrafanaDatasourceJsonData, - GrafanaDatasourceSecureJsonData, GrafanaDatasourceSpec, GrafanaSpec, - }, - crd_prometheuses::LabelSelector, - prometheus_operator::prometheus_operator_helm_chart_score, - rhob_alertmanager_config::RHOBObservability, - service_monitor::ServiceMonitor, - }, - }, okd::{crd::ingresses_config::Ingress as IngressResource, route::OKDTlsPassthroughScore}, - prometheus::{ - k8s_prometheus_alerting_score::K8sPrometheusCRDAlertingScore, - prometheus::PrometheusMonitoring, rhob_alerting_score::RHOBAlertingScore, - }, }, score::Score, topology::{TlsRoute, TlsRouter, ingress::Ingress}, @@ -59,7 +38,6 @@ use crate::{ use super::super::{ DeploymentTarget, HelmCommand, K8sclient, MultiTargetTopology, PreparationError, PreparationOutcome, Topology, - oberservability::monitoring::AlertReceiver, tenant::{ TenantConfig, TenantManager, k8s::K8sTenantManager, @@ -166,216 +144,6 @@ impl TlsRouter for K8sAnywhereTopology { } } -#[async_trait] -impl Grafana for K8sAnywhereTopology { - async fn ensure_grafana_operator( - &self, - inventory: 
&Inventory, - ) -> Result { - debug!("ensure grafana operator"); - let client = self.k8s_client().await.unwrap(); - let grafana_gvk = GroupVersionKind { - group: "grafana.integreatly.org".to_string(), - version: "v1beta1".to_string(), - kind: "Grafana".to_string(), - }; - let name = "grafanas.grafana.integreatly.org"; - let ns = "grafana"; - - let grafana_crd = client - .get_resource_json_value(name, Some(ns), &grafana_gvk) - .await; - match grafana_crd { - Ok(_) => { - return Ok(PreparationOutcome::Success { - details: "Found grafana CRDs in cluster".to_string(), - }); - } - - Err(_) => { - return self - .install_grafana_operator(inventory, Some("grafana")) - .await; - } - }; - } - async fn install_grafana(&self) -> Result { - let ns = "grafana"; - - let mut label = BTreeMap::new(); - - label.insert("dashboards".to_string(), "grafana".to_string()); - - let label_selector = LabelSelector { - match_labels: label.clone(), - match_expressions: vec![], - }; - - let client = self.k8s_client().await?; - - let grafana = self.build_grafana(ns, &label); - - client.apply(&grafana, Some(ns)).await?; - //TODO change this to a ensure ready or something better than just a timeout - client - .wait_until_deployment_ready( - "grafana-grafana-deployment", - Some("grafana"), - Some(Duration::from_secs(30)), - ) - .await?; - - let sa_name = "grafana-grafana-sa"; - let token_secret_name = "grafana-sa-token-secret"; - - let sa_token_secret = self.build_sa_token_secret(token_secret_name, sa_name, ns); - - client.apply(&sa_token_secret, Some(ns)).await?; - let secret_gvk = GroupVersionKind { - group: "".to_string(), - version: "v1".to_string(), - kind: "Secret".to_string(), - }; - - let secret = client - .get_resource_json_value(token_secret_name, Some(ns), &secret_gvk) - .await?; - - let token = format!( - "Bearer {}", - self.extract_and_normalize_token(&secret).unwrap() - ); - - debug!("creating grafana clusterrole binding"); - - let clusterrolebinding = - 
self.build_cluster_rolebinding(sa_name, "cluster-monitoring-view", ns); - - client.apply(&clusterrolebinding, Some(ns)).await?; - - debug!("creating grafana datasource crd"); - - let thanos_url = format!( - "https://{}", - self.get_domain("thanos-querier-openshift-monitoring") - .await - .unwrap() - ); - - let thanos_openshift_datasource = self.build_grafana_datasource( - "thanos-openshift-monitoring", - ns, - &label_selector, - &thanos_url, - &token, - ); - - client.apply(&thanos_openshift_datasource, Some(ns)).await?; - - debug!("creating grafana dashboard crd"); - let dashboard = self.build_grafana_dashboard(ns, &label_selector); - - client.apply(&dashboard, Some(ns)).await?; - debug!("creating grafana ingress"); - let grafana_ingress = self.build_grafana_ingress(ns).await; - - grafana_ingress - .interpret(&Inventory::empty(), self) - .await - .map_err(|e| PreparationError::new(e.to_string()))?; - - Ok(PreparationOutcome::Success { - details: "Installed grafana composants".to_string(), - }) - } -} - -#[async_trait] -impl PrometheusMonitoring for K8sAnywhereTopology { - async fn install_prometheus( - &self, - sender: &CRDPrometheus, - _inventory: &Inventory, - _receivers: Option>>>, - ) -> Result { - let client = self.k8s_client().await?; - - for monitor in sender.service_monitor.iter() { - client - .apply(monitor, Some(&sender.namespace)) - .await - .map_err(|e| PreparationError::new(e.to_string()))?; - } - Ok(PreparationOutcome::Success { - details: "successfuly installed prometheus components".to_string(), - }) - } - - async fn ensure_prometheus_operator( - &self, - sender: &CRDPrometheus, - _inventory: &Inventory, - ) -> Result { - let po_result = self.ensure_prometheus_operator(sender).await?; - - match po_result { - PreparationOutcome::Success { details: _ } => { - debug!("Detected prometheus crds operator present in cluster."); - return Ok(po_result); - } - PreparationOutcome::Noop => { - debug!("Skipping Prometheus CR installation due to missing 
operator."); - return Ok(po_result); - } - } - } -} - -#[async_trait] -impl PrometheusMonitoring for K8sAnywhereTopology { - async fn install_prometheus( - &self, - sender: &RHOBObservability, - inventory: &Inventory, - receivers: Option>>>, - ) -> Result { - let po_result = self.ensure_cluster_observability_operator(sender).await?; - - if po_result == PreparationOutcome::Noop { - debug!("Skipping Prometheus CR installation due to missing operator."); - return Ok(po_result); - } - - let result = self - .get_cluster_observability_operator_prometheus_application_score( - sender.clone(), - receivers, - ) - .await - .interpret(inventory, self) - .await; - - match result { - Ok(outcome) => match outcome.status { - InterpretStatus::SUCCESS => Ok(PreparationOutcome::Success { - details: outcome.message, - }), - InterpretStatus::NOOP => Ok(PreparationOutcome::Noop), - _ => Err(PreparationError::new(outcome.message)), - }, - Err(err) => Err(PreparationError::new(err.to_string())), - } - } - - async fn ensure_prometheus_operator( - &self, - sender: &RHOBObservability, - inventory: &Inventory, - ) -> Result { - todo!() - } -} - impl Serialize for K8sAnywhereTopology { fn serialize(&self, _serializer: S) -> Result where @@ -580,23 +348,6 @@ impl K8sAnywhereTopology { } } - fn extract_and_normalize_token(&self, secret: &DynamicObject) -> Option { - let token_b64 = secret - .data - .get("token") - .or_else(|| secret.data.get("data").and_then(|d| d.get("token"))) - .and_then(|v| v.as_str())?; - - let bytes = general_purpose::STANDARD.decode(token_b64).ok()?; - - let s = String::from_utf8(bytes).ok()?; - - let cleaned = s - .trim_matches(|c: char| c.is_whitespace() || c == '\0') - .to_string(); - Some(cleaned) - } - pub async fn get_k8s_distribution(&self) -> Result { self.k8s_client() .await? 
@@ -656,141 +407,6 @@ impl K8sAnywhereTopology { } } - fn build_grafana_datasource( - &self, - name: &str, - ns: &str, - label_selector: &LabelSelector, - url: &str, - token: &str, - ) -> GrafanaDatasource { - let mut json_data = BTreeMap::new(); - json_data.insert("timeInterval".to_string(), "5s".to_string()); - - GrafanaDatasource { - metadata: ObjectMeta { - name: Some(name.to_string()), - namespace: Some(ns.to_string()), - ..Default::default() - }, - spec: GrafanaDatasourceSpec { - instance_selector: label_selector.clone(), - allow_cross_namespace_import: Some(true), - values_from: None, - datasource: GrafanaDatasourceConfig { - access: "proxy".to_string(), - name: name.to_string(), - r#type: "prometheus".to_string(), - url: url.to_string(), - database: None, - json_data: Some(GrafanaDatasourceJsonData { - time_interval: Some("60s".to_string()), - http_header_name1: Some("Authorization".to_string()), - tls_skip_verify: Some(true), - oauth_pass_thru: Some(true), - }), - secure_json_data: Some(GrafanaDatasourceSecureJsonData { - http_header_value1: Some(format!("Bearer {token}")), - }), - is_default: Some(false), - editable: Some(true), - }, - }, - } - } - - fn build_grafana_dashboard( - &self, - ns: &str, - label_selector: &LabelSelector, - ) -> GrafanaDashboard { - let graf_dashboard = GrafanaDashboard { - metadata: ObjectMeta { - name: Some(format!("grafana-dashboard-{}", ns)), - namespace: Some(ns.to_string()), - ..Default::default() - }, - spec: GrafanaDashboardSpec { - resync_period: Some("30s".to_string()), - instance_selector: label_selector.clone(), - datasources: Some(vec![GrafanaDashboardDatasource { - input_name: "DS_PROMETHEUS".to_string(), - datasource_name: "thanos-openshift-monitoring".to_string(), - }]), - json: None, - grafana_com: Some(GrafanaCom { - id: 17406, - revision: None, - }), - }, - }; - graf_dashboard - } - - fn build_grafana(&self, ns: &str, labels: &BTreeMap) -> GrafanaCRD { - let grafana = GrafanaCRD { - metadata: ObjectMeta { - 
name: Some(format!("grafana-{}", ns)), - namespace: Some(ns.to_string()), - labels: Some(labels.clone()), - ..Default::default() - }, - spec: GrafanaSpec { - config: None, - admin_user: None, - admin_password: None, - ingress: None, - persistence: None, - resources: None, - }, - }; - grafana - } - - async fn build_grafana_ingress(&self, ns: &str) -> K8sIngressScore { - let domain = self.get_domain(&format!("grafana-{}", ns)).await.unwrap(); - let name = format!("{}-grafana", ns); - let backend_service = format!("grafana-{}-service", ns); - - K8sIngressScore { - name: fqdn::fqdn!(&name), - host: fqdn::fqdn!(&domain), - backend_service: fqdn::fqdn!(&backend_service), - port: 3000, - path: Some("/".to_string()), - path_type: Some(PathType::Prefix), - namespace: Some(fqdn::fqdn!(&ns)), - ingress_class_name: Some("openshift-default".to_string()), - } - } - - async fn get_cluster_observability_operator_prometheus_application_score( - &self, - sender: RHOBObservability, - receivers: Option>>>, - ) -> RHOBAlertingScore { - RHOBAlertingScore { - sender, - receivers: receivers.unwrap_or_default(), - service_monitors: vec![], - prometheus_rules: vec![], - } - } - - async fn get_k8s_prometheus_application_score( - &self, - sender: CRDPrometheus, - receivers: Option>>>, - service_monitors: Option>, - ) -> K8sPrometheusCRDAlertingScore { - return K8sPrometheusCRDAlertingScore { - sender, - receivers: receivers.unwrap_or_default(), - service_monitors: service_monitors.unwrap_or_default(), - prometheus_rules: vec![], - }; - } - async fn openshift_ingress_operator_available(&self) -> Result<(), PreparationError> { let client = self.k8s_client().await?; let gvk = GroupVersionKind { @@ -956,137 +572,6 @@ impl K8sAnywhereTopology { )), } } - - async fn ensure_cluster_observability_operator( - &self, - sender: &RHOBObservability, - ) -> Result { - let status = Command::new("sh") - .args(["-c", "kubectl get crd -A | grep -i rhobs"]) - .status() - .map_err(|e| 
PreparationError::new(format!("could not connect to cluster: {}", e)))?; - - if !status.success() { - if let Some(Some(k8s_state)) = self.k8s_state.get() { - match k8s_state.source { - K8sSource::LocalK3d => { - warn!( - "Installing observability operator is not supported on LocalK3d source" - ); - return Ok(PreparationOutcome::Noop); - debug!("installing cluster observability operator"); - todo!(); - let op_score = - prometheus_operator_helm_chart_score(sender.namespace.clone()); - let result = op_score.interpret(&Inventory::empty(), self).await; - - return match result { - Ok(outcome) => match outcome.status { - InterpretStatus::SUCCESS => Ok(PreparationOutcome::Success { - details: "installed cluster observability operator".into(), - }), - InterpretStatus::NOOP => Ok(PreparationOutcome::Noop), - _ => Err(PreparationError::new( - "failed to install cluster observability operator (unknown error)".into(), - )), - }, - Err(err) => Err(PreparationError::new(err.to_string())), - }; - } - K8sSource::Kubeconfig => { - debug!( - "unable to install cluster observability operator, contact cluster admin" - ); - return Ok(PreparationOutcome::Noop); - } - } - } else { - warn!( - "Unable to detect k8s_state. Skipping Cluster Observability Operator install." 
- ); - return Ok(PreparationOutcome::Noop); - } - } - - debug!("Cluster Observability Operator is already present, skipping install"); - - Ok(PreparationOutcome::Success { - details: "cluster observability operator present in cluster".into(), - }) - } - - async fn ensure_prometheus_operator( - &self, - sender: &CRDPrometheus, - ) -> Result { - let status = Command::new("sh") - .args(["-c", "kubectl get crd -A | grep -i prometheuses"]) - .status() - .map_err(|e| PreparationError::new(format!("could not connect to cluster: {}", e)))?; - - if !status.success() { - if let Some(Some(k8s_state)) = self.k8s_state.get() { - match k8s_state.source { - K8sSource::LocalK3d => { - debug!("installing prometheus operator"); - let op_score = - prometheus_operator_helm_chart_score(sender.namespace.clone()); - let result = op_score.interpret(&Inventory::empty(), self).await; - - return match result { - Ok(outcome) => match outcome.status { - InterpretStatus::SUCCESS => Ok(PreparationOutcome::Success { - details: "installed prometheus operator".into(), - }), - InterpretStatus::NOOP => Ok(PreparationOutcome::Noop), - _ => Err(PreparationError::new( - "failed to install prometheus operator (unknown error)".into(), - )), - }, - Err(err) => Err(PreparationError::new(err.to_string())), - }; - } - K8sSource::Kubeconfig => { - debug!("unable to install prometheus operator, contact cluster admin"); - return Ok(PreparationOutcome::Noop); - } - } - } else { - warn!("Unable to detect k8s_state. 
Skipping Prometheus Operator install."); - return Ok(PreparationOutcome::Noop); - } - } - - debug!("Prometheus operator is already present, skipping install"); - - Ok(PreparationOutcome::Success { - details: "prometheus operator present in cluster".into(), - }) - } - - async fn install_grafana_operator( - &self, - inventory: &Inventory, - ns: Option<&str>, - ) -> Result { - let namespace = ns.unwrap_or("grafana"); - info!("installing grafana operator in ns {namespace}"); - let tenant = self.get_k8s_tenant_manager()?.get_tenant_config().await; - let mut namespace_scope = false; - if tenant.is_some() { - namespace_scope = true; - } - let _grafana_operator_score = grafana_helm_chart_score(namespace, namespace_scope) - .interpret(inventory, self) - .await - .map_err(|e| PreparationError::new(e.to_string())); - Ok(PreparationOutcome::Success { - details: format!( - "Successfully installed grafana operator in ns {}", - ns.unwrap() - ), - }) - } } #[derive(Clone, Debug)] diff --git a/harmony/src/domain/topology/k8s_anywhere/mod.rs b/harmony/src/domain/topology/k8s_anywhere/mod.rs index d14c28ba..a10dd38a 100644 --- a/harmony/src/domain/topology/k8s_anywhere/mod.rs +++ b/harmony/src/domain/topology/k8s_anywhere/mod.rs @@ -1,4 +1,5 @@ mod k8s_anywhere; pub mod nats; +pub mod observability; mod postgres; pub use k8s_anywhere::*; diff --git a/harmony/src/domain/topology/k8s_anywhere/observability/grafana.rs b/harmony/src/domain/topology/k8s_anywhere/observability/grafana.rs new file mode 100644 index 00000000..7f06a2af --- /dev/null +++ b/harmony/src/domain/topology/k8s_anywhere/observability/grafana.rs @@ -0,0 +1,147 @@ +use async_trait::async_trait; + +use crate::{ + inventory::Inventory, + modules::monitoring::grafana::{ + grafana::Grafana, + k8s::{ + score_ensure_grafana_ready::GrafanaK8sEnsureReadyScore, + score_grafana_alert_receiver::GrafanaK8sReceiverScore, + score_grafana_datasource::GrafanaK8sDatasourceScore, + score_grafana_rule::GrafanaK8sRuleScore, 
score_install_grafana::GrafanaK8sInstallScore, + }, + }, + score::Score, + topology::{ + K8sAnywhereTopology, PreparationError, PreparationOutcome, + monitoring::{AlertReceiver, AlertRule, Observability, ScrapeTarget}, + }, +}; + +#[async_trait] +impl Observability for K8sAnywhereTopology { + async fn install_alert_sender( + &self, + sender: &Grafana, + inventory: &Inventory, + ) -> Result { + let score = GrafanaK8sInstallScore { + sender: sender.clone(), + }; + + score + .create_interpret() + .execute(inventory, self) + .await + .map_err(|e| PreparationError::new(format!("Grafana not installed {}", e)))?; + Ok(PreparationOutcome::Success { + details: "Successfully installed grafana alert sender".to_string(), + }) + } + + async fn install_receivers( + &self, + sender: &Grafana, + inventory: &Inventory, + receivers: Option>>>, + ) -> Result { + let receivers = match receivers { + Some(r) if !r.is_empty() => r, + _ => return Ok(PreparationOutcome::Noop), + }; + + for receiver in receivers { + let score = GrafanaK8sReceiverScore { + receiver, + sender: sender.clone(), + }; + + score + .create_interpret() + .execute(inventory, self) + .await + .map_err(|e| PreparationError::new(format!("Failed to install receiver: {}", e)))?; + } + + Ok(PreparationOutcome::Success { + details: "All alert receivers installed successfully".to_string(), + }) + } + + async fn install_rules( + &self, + sender: &Grafana, + inventory: &Inventory, + rules: Option>>>, + ) -> Result { + let rules = match rules { + Some(r) if !r.is_empty() => r, + _ => return Ok(PreparationOutcome::Noop), + }; + + for rule in rules { + let score = GrafanaK8sRuleScore { + sender: sender.clone(), + rule, + }; + + score + .create_interpret() + .execute(inventory, self) + .await + .map_err(|e| PreparationError::new(format!("Failed to install rule: {}", e)))?; + } + + Ok(PreparationOutcome::Success { + details: "All alert rules installed successfully".to_string(), + }) + } + + async fn add_scrape_targets( + &self, + 
sender: &Grafana, + inventory: &Inventory, + scrape_targets: Option>>>, + ) -> Result { + let scrape_targets = match scrape_targets { + Some(r) if !r.is_empty() => r, + _ => return Ok(PreparationOutcome::Noop), + }; + + for scrape_target in scrape_targets { + let score = GrafanaK8sDatasourceScore { + scrape_target, + sender: sender.clone(), + }; + + score + .create_interpret() + .execute(inventory, self) + .await + .map_err(|e| PreparationError::new(format!("Failed to add DataSource: {}", e)))?; + } + + Ok(PreparationOutcome::Success { + details: "All datasources installed successfully".to_string(), + }) + } + + async fn ensure_monitoring_installed( + &self, + sender: &Grafana, + inventory: &Inventory, + ) -> Result { + let score = GrafanaK8sEnsureReadyScore { + sender: sender.clone(), + }; + + score + .create_interpret() + .execute(inventory, self) + .await + .map_err(|e| PreparationError::new(format!("Grafana not ready {}", e)))?; + Ok(PreparationOutcome::Success { + details: "Grafana Ready".to_string(), + }) + } +} diff --git a/harmony/src/domain/topology/k8s_anywhere/observability/kube_prometheus.rs b/harmony/src/domain/topology/k8s_anywhere/observability/kube_prometheus.rs new file mode 100644 index 00000000..05cd3f6b --- /dev/null +++ b/harmony/src/domain/topology/k8s_anywhere/observability/kube_prometheus.rs @@ -0,0 +1,142 @@ +use async_trait::async_trait; + +use crate::{ + inventory::Inventory, + modules::monitoring::kube_prometheus::{ + KubePrometheus, helm::kube_prometheus_helm_chart::kube_prometheus_helm_chart_score, + score_kube_prometheus_alert_receivers::KubePrometheusReceiverScore, + score_kube_prometheus_ensure_ready::KubePrometheusEnsureReadyScore, + score_kube_prometheus_rule::KubePrometheusRuleScore, + score_kube_prometheus_scrape_target::KubePrometheusScrapeTargetScore, + }, + score::Score, + topology::{ + K8sAnywhereTopology, PreparationError, PreparationOutcome, + monitoring::{AlertReceiver, AlertRule, Observability, ScrapeTarget}, + }, +}; + 
+#[async_trait] +impl Observability for K8sAnywhereTopology { + async fn install_alert_sender( + &self, + sender: &KubePrometheus, + inventory: &Inventory, + ) -> Result { + kube_prometheus_helm_chart_score(sender.config.clone()) + .create_interpret() + .execute(inventory, self) + .await + .map_err(|e| PreparationError::new(e.to_string()))?; + + Ok(PreparationOutcome::Success { + details: "Successfully installed kubeprometheus alert sender".to_string(), + }) + } + + async fn install_receivers( + &self, + sender: &KubePrometheus, + inventory: &Inventory, + receivers: Option>>>, + ) -> Result { + let receivers = match receivers { + Some(r) if !r.is_empty() => r, + _ => return Ok(PreparationOutcome::Noop), + }; + + for receiver in receivers { + let score = KubePrometheusReceiverScore { + receiver, + sender: sender.clone(), + }; + + score + .create_interpret() + .execute(inventory, self) + .await + .map_err(|e| PreparationError::new(format!("Failed to install receiver: {}", e)))?; + } + + Ok(PreparationOutcome::Success { + details: "All alert receivers installed successfully".to_string(), + }) + } + + async fn install_rules( + &self, + sender: &KubePrometheus, + inventory: &Inventory, + rules: Option>>>, + ) -> Result { + let rules = match rules { + Some(r) if !r.is_empty() => r, + _ => return Ok(PreparationOutcome::Noop), + }; + + for rule in rules { + let score = KubePrometheusRuleScore { + sender: sender.clone(), + rule, + }; + + score + .create_interpret() + .execute(inventory, self) + .await + .map_err(|e| PreparationError::new(format!("Failed to install rule: {}", e)))?; + } + + Ok(PreparationOutcome::Success { + details: "All alert rules installed successfully".to_string(), + }) + } + + async fn add_scrape_targets( + &self, + sender: &KubePrometheus, + inventory: &Inventory, + scrape_targets: Option>>>, + ) -> Result { + let scrape_targets = match scrape_targets { + Some(r) if !r.is_empty() => r, + _ => return Ok(PreparationOutcome::Noop), + }; + + for 
scrape_target in scrape_targets { + let score = KubePrometheusScrapeTargetScore { + scrape_target, + sender: sender.clone(), + }; + + score + .create_interpret() + .execute(inventory, self) + .await + .map_err(|e| PreparationError::new(format!("Failed to add scrape target: {}", e)))?; + } + + Ok(PreparationOutcome::Success { + details: "All scrape targets installed successfully".to_string(), + }) + } + + async fn ensure_monitoring_installed( + &self, + sender: &KubePrometheus, + inventory: &Inventory, + ) -> Result { + let score = KubePrometheusEnsureReadyScore { + sender: sender.clone(), + }; + + score + .create_interpret() + .execute(inventory, self) + .await + .map_err(|e| PreparationError::new(format!("KubePrometheus not ready {}", e)))?; + Ok(PreparationOutcome::Success { + details: "KubePrometheus Ready".to_string(), + }) + } +} diff --git a/harmony/src/domain/topology/k8s_anywhere/observability/mod.rs b/harmony/src/domain/topology/k8s_anywhere/observability/mod.rs new file mode 100644 index 00000000..6e105557 --- /dev/null +++ b/harmony/src/domain/topology/k8s_anywhere/observability/mod.rs @@ -0,0 +1,5 @@ +pub mod grafana; +pub mod kube_prometheus; +pub mod openshift_monitoring; +pub mod prometheus; +pub mod redhat_cluster_observability; diff --git a/harmony/src/domain/topology/k8s_anywhere/observability/openshift_monitoring.rs b/harmony/src/domain/topology/k8s_anywhere/observability/openshift_monitoring.rs new file mode 100644 index 00000000..aa42a220 --- /dev/null +++ b/harmony/src/domain/topology/k8s_anywhere/observability/openshift_monitoring.rs @@ -0,0 +1,142 @@ +use async_trait::async_trait; +use log::info; + +use crate::score::Score; +use crate::{ + inventory::Inventory, + modules::monitoring::okd::{ + OpenshiftClusterAlertSender, + score_enable_cluster_monitoring::OpenshiftEnableClusterMonitoringScore, + score_openshift_alert_rule::OpenshiftAlertRuleScore, + score_openshift_receiver::OpenshiftReceiverScore, + 
score_openshift_scrape_target::OpenshiftScrapeTargetScore, + score_user_workload::OpenshiftUserWorkloadMonitoring, + score_verify_user_workload_monitoring::VerifyUserWorkload, + }, + topology::{ + K8sAnywhereTopology, PreparationError, PreparationOutcome, + monitoring::{AlertReceiver, AlertRule, Observability, ScrapeTarget}, + }, +}; + +#[async_trait] +impl Observability for K8sAnywhereTopology { + async fn install_alert_sender( + &self, + _sender: &OpenshiftClusterAlertSender, + inventory: &Inventory, + ) -> Result { + info!("enabling cluster monitoring"); + let cluster_monitoring_score = OpenshiftEnableClusterMonitoringScore {}; + cluster_monitoring_score + .create_interpret() + .execute(inventory, self) + .await + .map_err(|e| PreparationError { msg: e.to_string() })?; + + info!("enabling user workload monitoring"); + let user_workload_score = OpenshiftUserWorkloadMonitoring {}; + user_workload_score + .create_interpret() + .execute(inventory, self) + .await + .map_err(|e| PreparationError { msg: e.to_string() })?; + + Ok(PreparationOutcome::Success { + details: "Successfully configured cluster monitoring".to_string(), + }) + } + + async fn install_receivers( + &self, + _sender: &OpenshiftClusterAlertSender, + inventory: &Inventory, + receivers: Option>>>, + ) -> Result { + if let Some(receivers) = receivers { + for receiver in receivers { + info!("Installing receiver {}", receiver.name()); + let receiver_score = OpenshiftReceiverScore { receiver }; + receiver_score + .create_interpret() + .execute(inventory, self) + .await + .map_err(|e| PreparationError { msg: e.to_string() })?; + } + Ok(PreparationOutcome::Success { + details: "Successfully installed receivers for OpenshiftClusterMonitoring" + .to_string(), + }) + } else { + Ok(PreparationOutcome::Noop) + } + } + + async fn install_rules( + &self, + _sender: &OpenshiftClusterAlertSender, + inventory: &Inventory, + rules: Option>>>, + ) -> Result { + if let Some(rules) = rules { + for rule in rules { + 
info!("Installing rule {}", rule.name()); + let rule_score = OpenshiftAlertRuleScore { rule }; + rule_score + .create_interpret() + .execute(inventory, self) + .await + .map_err(|e| PreparationError { msg: e.to_string() })?; + } + Ok(PreparationOutcome::Success { + details: "Successfully installed rules for OpenshiftClusterMonitoring".to_string(), + }) + } else { + Ok(PreparationOutcome::Noop) + } + } + + async fn add_scrape_targets( + &self, + _sender: &OpenshiftClusterAlertSender, + inventory: &Inventory, + scrape_targets: Option>>>, + ) -> Result { + if let Some(scrape_targets) = scrape_targets { + for scrape_target in scrape_targets { + info!("Installing scrape target {}", scrape_target.name()); + let scrape_target_score = OpenshiftScrapeTargetScore { + scrape_target, + }; + scrape_target_score + .create_interpret() + .execute(inventory, self) + .await + .map_err(|e| PreparationError { msg: e.to_string() })?; + } + Ok(PreparationOutcome::Success { + details: "Successfully added scrape targets for OpenshiftClusterMonitoring" + .to_string(), + }) + } else { + Ok(PreparationOutcome::Noop) + } + } + + async fn ensure_monitoring_installed( + &self, + _sender: &OpenshiftClusterAlertSender, + inventory: &Inventory, + ) -> Result { + let verify_monitoring_score = VerifyUserWorkload {}; + info!("Verifying user workload and cluster monitoring installed"); + verify_monitoring_score + .create_interpret() + .execute(inventory, self) + .await + .map_err(|e| PreparationError { msg: e.to_string() })?; + Ok(PreparationOutcome::Success { + details: "OpenshiftClusterMonitoring ready".to_string(), + }) + } +} diff --git a/harmony/src/domain/topology/k8s_anywhere/observability/prometheus.rs b/harmony/src/domain/topology/k8s_anywhere/observability/prometheus.rs new file mode 100644 index 00000000..9085a607 --- /dev/null +++ b/harmony/src/domain/topology/k8s_anywhere/observability/prometheus.rs @@ -0,0 +1,147 @@ +use async_trait::async_trait; + +use crate::{ + inventory::Inventory, + 
modules::monitoring::prometheus::{ + Prometheus, score_prometheus_alert_receivers::PrometheusReceiverScore, + score_prometheus_ensure_ready::PrometheusEnsureReadyScore, + score_prometheus_install::PrometheusInstallScore, + score_prometheus_rule::PrometheusRuleScore, + score_prometheus_scrape_target::PrometheusScrapeTargetScore, + }, + score::Score, + topology::{ + K8sAnywhereTopology, PreparationError, PreparationOutcome, + monitoring::{AlertReceiver, AlertRule, Observability, ScrapeTarget}, + }, +}; + +#[async_trait] +impl Observability for K8sAnywhereTopology { + async fn install_alert_sender( + &self, + sender: &Prometheus, + inventory: &Inventory, + ) -> Result { + let score = PrometheusInstallScore { + sender: sender.clone(), + }; + + score + .create_interpret() + .execute(inventory, self) + .await + .map_err(|e| PreparationError::new(format!("Prometheus not installed {}", e)))?; + + Ok(PreparationOutcome::Success { + details: "Successfully installed prometheus alert sender".to_string(), + }) + } + + async fn install_receivers( + &self, + sender: &Prometheus, + inventory: &Inventory, + receivers: Option>>>, + ) -> Result { + let receivers = match receivers { + Some(r) if !r.is_empty() => r, + _ => return Ok(PreparationOutcome::Noop), + }; + + for receiver in receivers { + let score = PrometheusReceiverScore { + receiver, + sender: sender.clone(), + }; + + score + .create_interpret() + .execute(inventory, self) + .await + .map_err(|e| PreparationError::new(format!("Failed to install receiver: {}", e)))?; + } + + Ok(PreparationOutcome::Success { + details: "All alert receivers installed successfully".to_string(), + }) + } + + async fn install_rules( + &self, + sender: &Prometheus, + inventory: &Inventory, + rules: Option>>>, + ) -> Result { + let rules = match rules { + Some(r) if !r.is_empty() => r, + _ => return Ok(PreparationOutcome::Noop), + }; + + for rule in rules { + let score = PrometheusRuleScore { + sender: sender.clone(), + rule, + }; + + score + 
.create_interpret() + .execute(inventory, self) + .await + .map_err(|e| PreparationError::new(format!("Failed to install rule: {}", e)))?; + } + + Ok(PreparationOutcome::Success { + details: "All alert rules installed successfully".to_string(), + }) + } + + async fn add_scrape_targets( + &self, + sender: &Prometheus, + inventory: &Inventory, + scrape_targets: Option>>>, + ) -> Result { + let scrape_targets = match scrape_targets { + Some(r) if !r.is_empty() => r, + _ => return Ok(PreparationOutcome::Noop), + }; + + for scrape_target in scrape_targets { + let score = PrometheusScrapeTargetScore { + scrape_target, + sender: sender.clone(), + }; + + score + .create_interpret() + .execute(inventory, self) + .await + .map_err(|e| PreparationError::new(format!("Failed to add scrape target: {}", e)))?; + } + + Ok(PreparationOutcome::Success { + details: "All scrape targets installed successfully".to_string(), + }) + } + + async fn ensure_monitoring_installed( + &self, + sender: &Prometheus, + inventory: &Inventory, + ) -> Result { + let score = PrometheusEnsureReadyScore { + sender: sender.clone(), + }; + + score + .create_interpret() + .execute(inventory, self) + .await + .map_err(|e| PreparationError::new(format!("Prometheus not ready {}", e)))?; + + Ok(PreparationOutcome::Success { + details: "Prometheus Ready".to_string(), + }) + } +} diff --git a/harmony/src/domain/topology/k8s_anywhere/observability/redhat_cluster_observability.rs b/harmony/src/domain/topology/k8s_anywhere/observability/redhat_cluster_observability.rs new file mode 100644 index 00000000..6007640f --- /dev/null +++ b/harmony/src/domain/topology/k8s_anywhere/observability/redhat_cluster_observability.rs @@ -0,0 +1,116 @@ +use crate::{ + modules::monitoring::red_hat_cluster_observability::{ + score_alert_receiver::RedHatClusterObservabilityReceiverScore, + score_coo_monitoring_stack::RedHatClusterObservabilityMonitoringStackScore, + }, + score::Score, +}; +use async_trait::async_trait; +use log::info; + +use 
crate::{ + inventory::Inventory, + modules::monitoring::red_hat_cluster_observability::{ + RedHatClusterObservability, + score_redhat_cluster_observability_operator::RedHatClusterObservabilityOperatorScore, + }, + topology::{ + K8sAnywhereTopology, PreparationError, PreparationOutcome, + monitoring::{AlertReceiver, AlertRule, Observability, ScrapeTarget}, + }, +}; + +#[async_trait] +impl Observability for K8sAnywhereTopology { + async fn install_alert_sender( + &self, + sender: &RedHatClusterObservability, + inventory: &Inventory, + ) -> Result { + info!("Verifying Redhat Cluster Observability Operator"); + + let coo_score = RedHatClusterObservabilityOperatorScore::default(); + + coo_score + .create_interpret() + .execute(inventory, self) + .await + .map_err(|e| PreparationError::new(e.to_string()))?; + + info!( + "Installing Cluster Observability Operator Monitoring Stack in ns {}", + sender.namespace.clone() + ); + + let coo_monitoring_stack_score = RedHatClusterObservabilityMonitoringStackScore { + namespace: sender.namespace.clone(), + resource_selector: sender.resource_selector.clone(), + }; + + coo_monitoring_stack_score + .create_interpret() + .execute(inventory, self) + .await + .map_err(|e| PreparationError::new(e.to_string()))?; + + Ok(PreparationOutcome::Success { + details: "Successfully installed RedHatClusterObservability Operator".to_string(), + }) + } + + async fn install_receivers( + &self, + sender: &RedHatClusterObservability, + inventory: &Inventory, + receivers: Option>>>, + ) -> Result { + let receivers = match receivers { + Some(r) if !r.is_empty() => r, + _ => return Ok(PreparationOutcome::Noop), + }; + + for receiver in receivers { + info!("Installing receiver {}", receiver.name()); + + let receiver_score = RedHatClusterObservabilityReceiverScore { + receiver, + sender: sender.clone(), + }; + receiver_score + .create_interpret() + .execute(inventory, self) + .await + .map_err(|e| PreparationError::new(e.to_string()))?; + } + + 
Ok(PreparationOutcome::Success { + details: "Successfully installed receivers for RedHatClusterObservability".to_string(), + }) + } + + async fn install_rules( + &self, + _sender: &RedHatClusterObservability, + _inventory: &Inventory, + _rules: Option>>>, + ) -> Result { + todo!() + } + + async fn add_scrape_targets( + &self, + _sender: &RedHatClusterObservability, + _inventory: &Inventory, + _scrape_targets: Option>>>, + ) -> Result { + todo!() + } + + async fn ensure_monitoring_installed( + &self, + _sender: &RedHatClusterObservability, + _inventory: &Inventory, + ) -> Result { + todo!() + } +} diff --git a/harmony/src/domain/topology/mod.rs b/harmony/src/domain/topology/mod.rs index c661e8ad..d9148918 100644 --- a/harmony/src/domain/topology/mod.rs +++ b/harmony/src/domain/topology/mod.rs @@ -2,6 +2,7 @@ pub mod decentralized; mod failover; mod ha_cluster; pub mod ingress; +pub mod monitoring; pub mod node_exporter; pub mod opnsense; pub use failover::*; @@ -11,7 +12,6 @@ mod http; pub mod installable; mod k8s_anywhere; mod localhost; -pub mod oberservability; pub mod tenant; use derive_new::new; pub use k8s_anywhere::*; diff --git a/harmony/src/domain/topology/monitoring.rs b/harmony/src/domain/topology/monitoring.rs new file mode 100644 index 00000000..41cced41 --- /dev/null +++ b/harmony/src/domain/topology/monitoring.rs @@ -0,0 +1,256 @@ +use std::{ + any::Any, + collections::{BTreeMap, HashMap}, + net::IpAddr, +}; + +use async_trait::async_trait; +use kube::api::DynamicObject; +use log::{debug, info}; +use serde::{Deserialize, Serialize}; + +use crate::{ + data::Version, + interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome}, + inventory::Inventory, + topology::{PreparationError, PreparationOutcome, Topology, installable::Installable}, +}; +use harmony_types::id::Id; + +/// Defines the application that sends the alerts to receivers, +/// for example prometheus +#[async_trait] +pub trait AlertSender: Send + Sync + std::fmt::Debug 
{ + fn name(&self) -> String; +} + +/// Trait which defines how an alert sender is implemented for a specific topology +#[async_trait] +pub trait Observability { + async fn install_alert_sender( + &self, + sender: &S, + inventory: &Inventory, + ) -> Result; + + async fn install_receivers( + &self, + sender: &S, + inventory: &Inventory, + receivers: Option>>>, + ) -> Result; + + async fn install_rules( + &self, + sender: &S, + inventory: &Inventory, + rules: Option>>>, + ) -> Result; + + async fn add_scrape_targets( + &self, + sender: &S, + inventory: &Inventory, + scrape_targets: Option>>>, + ) -> Result; + + async fn ensure_monitoring_installed( + &self, + sender: &S, + inventory: &Inventory, + ) -> Result; +} + +/// Defines the entity that receives the alerts from a sender. For example Discord, Slack, etc +/// +pub trait AlertReceiver: std::fmt::Debug + Send + Sync { + fn build(&self) -> Result; + fn name(&self) -> String; + fn clone_box(&self) -> Box>; +} + +/// Defines a generic rule that can be applied to a sender, such as a prometheus alert rule +pub trait AlertRule: std::fmt::Debug + Send + Sync { + fn build_rule(&self) -> Result; + fn name(&self) -> String; + fn clone_box(&self) -> Box>; +} + +/// A generic scrape target that can be added to a sender to scrape metrics from, for example a +/// server outside of the cluster
pub trait ScrapeTarget: std::fmt::Debug + Send + Sync { + fn build_scrape_target(&self) -> Result; + fn name(&self) -> String; + fn clone_box(&self) -> Box>; +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ExternalScrapeTarget { + pub ip: IpAddr, + pub port: i32, + pub interval: Option, + pub path: Option, + pub labels: Option>, +} + +/// Alerting interpret to install an alert sender on a given topology +#[derive(Debug)] +pub struct AlertingInterpret { + pub sender: S, + pub receivers: Vec>>, + pub rules: Vec>>, + pub scrape_targets: Option>>>, +} + +#[async_trait] +impl> Interpret for AlertingInterpret { + async fn 
execute( + &self, + inventory: &Inventory, + topology: &T, + ) -> Result { + info!("Configuring alert sender {}", self.sender.name()); + topology + .install_alert_sender(&self.sender, inventory) + .await?; + + info!("Installing receivers"); + topology + .install_receivers(&self.sender, inventory, Some(self.receivers.clone())) + .await?; + + info!("Installing rules"); + topology + .install_rules(&self.sender, inventory, Some(self.rules.clone())) + .await?; + + info!("Adding extra scrape targets"); + topology + .add_scrape_targets(&self.sender, inventory, self.scrape_targets.clone()) + .await?; + + info!("Ensuring alert sender {} is ready", self.sender.name()); + topology + .ensure_monitoring_installed(&self.sender, inventory) + .await?; + + Ok(Outcome::success(format!( + "successfully installed alert sender {}", + self.sender.name() + ))) + } + + fn get_name(&self) -> InterpretName { + InterpretName::Alerting + } + + fn get_version(&self) -> Version { + todo!() + } + + fn get_status(&self) -> InterpretStatus { + todo!() + } + + fn get_children(&self) -> Vec { + todo!() + } +} + +impl Clone for Box> { + fn clone(&self) -> Self { + self.clone_box() + } +} + +impl Clone for Box> { + fn clone(&self) -> Self { + self.clone_box() + } +} + +impl Clone for Box> { + fn clone(&self) -> Self { + self.clone_box() + } +} + +pub struct ReceiverInstallPlan { + pub install_operation: Option>, + pub route: Option, + pub receiver: Option, +} + +impl Default for ReceiverInstallPlan { + fn default() -> Self { + Self { + install_operation: None, + route: None, + receiver: None, + } + } +} + +pub enum InstallOperation { + CreateSecret { + name: String, + data: BTreeMap, + }, +} + +///Generic routing that can map to various alert sender backends +#[derive(Debug, Clone, Serialize)] +pub struct AlertRoute { + pub receiver: String, + #[serde(skip_serializing_if = "Vec::is_empty")] + pub matchers: Vec, + #[serde(skip_serializing_if = "Vec::is_empty")] + pub group_by: Vec, + 
#[serde(skip_serializing_if = "Option::is_none")] + pub repeat_interval: Option, + #[serde(rename = "continue")] + pub continue_matching: bool, + #[serde(skip_serializing_if = "Vec::is_empty")] + pub children: Vec, +} + +impl AlertRoute { + pub fn default(name: String) -> Self { + Self { + receiver: name, + matchers: vec![], + group_by: vec![], + repeat_interval: Some("30s".to_string()), + continue_matching: true, + children: vec![], + } + } +} + +#[derive(Debug, Clone, Serialize)] +pub struct AlertMatcher { + pub label: String, + pub operator: MatchOp, + pub value: String, +} + +#[derive(Debug, Clone)] +pub enum MatchOp { + Eq, + NotEq, + Regex, +} + +impl Serialize for MatchOp { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + let op = match self { + MatchOp::Eq => "=", + MatchOp::NotEq => "!=", + MatchOp::Regex => "=~", + }; + serializer.serialize_str(op) + } +} diff --git a/harmony/src/domain/topology/oberservability/mod.rs b/harmony/src/domain/topology/oberservability/mod.rs deleted file mode 100644 index 7f2ac952..00000000 --- a/harmony/src/domain/topology/oberservability/mod.rs +++ /dev/null @@ -1 +0,0 @@ -pub mod monitoring; diff --git a/harmony/src/domain/topology/oberservability/monitoring.rs b/harmony/src/domain/topology/oberservability/monitoring.rs deleted file mode 100644 index 78bb1419..00000000 --- a/harmony/src/domain/topology/oberservability/monitoring.rs +++ /dev/null @@ -1,101 +0,0 @@ -use std::{any::Any, collections::HashMap}; - -use async_trait::async_trait; -use kube::api::DynamicObject; -use log::debug; - -use crate::{ - data::Version, - interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome}, - inventory::Inventory, - topology::{Topology, installable::Installable}, -}; -use harmony_types::id::Id; - -#[async_trait] -pub trait AlertSender: Send + Sync + std::fmt::Debug { - fn name(&self) -> String; -} - -#[derive(Debug)] -pub struct AlertingInterpret { - pub sender: S, - pub 
receivers: Vec>>, - pub rules: Vec>>, - pub scrape_targets: Option>>>, -} - -#[async_trait] -impl, T: Topology> Interpret for AlertingInterpret { - async fn execute( - &self, - inventory: &Inventory, - topology: &T, - ) -> Result { - debug!("hit sender configure for AlertingInterpret"); - self.sender.configure(inventory, topology).await?; - for receiver in self.receivers.iter() { - receiver.install(&self.sender).await?; - } - for rule in self.rules.iter() { - debug!("installing rule: {:#?}", rule); - rule.install(&self.sender).await?; - } - if let Some(targets) = &self.scrape_targets { - for target in targets.iter() { - debug!("installing scrape_target: {:#?}", target); - target.install(&self.sender).await?; - } - } - self.sender.ensure_installed(inventory, topology).await?; - Ok(Outcome::success(format!( - "successfully installed alert sender {}", - self.sender.name() - ))) - } - - fn get_name(&self) -> InterpretName { - InterpretName::Alerting - } - - fn get_version(&self) -> Version { - todo!() - } - - fn get_status(&self) -> InterpretStatus { - todo!() - } - - fn get_children(&self) -> Vec { - todo!() - } -} - -#[async_trait] -pub trait AlertReceiver: std::fmt::Debug + Send + Sync { - async fn install(&self, sender: &S) -> Result; - fn name(&self) -> String; - fn clone_box(&self) -> Box>; - fn as_any(&self) -> &dyn Any; - fn as_alertmanager_receiver(&self) -> Result; -} - -#[derive(Debug)] -pub struct AlertManagerReceiver { - pub receiver_config: serde_json::Value, - // FIXME we should not leak k8s here. 
DynamicObject is k8s specific - pub additional_ressources: Vec, - pub route_config: serde_json::Value, -} - -#[async_trait] -pub trait AlertRule: std::fmt::Debug + Send + Sync { - async fn install(&self, sender: &S) -> Result; - fn clone_box(&self) -> Box>; -} - -#[async_trait] -pub trait ScrapeTarget: std::fmt::Debug + Send + Sync { - async fn install(&self, sender: &S) -> Result; - fn clone_box(&self) -> Box>; -} diff --git a/harmony/src/modules/application/features/monitoring.rs b/harmony/src/modules/application/features/monitoring.rs index 6f5bb214..1bb649fd 100644 --- a/harmony/src/modules/application/features/monitoring.rs +++ b/harmony/src/modules/application/features/monitoring.rs @@ -2,13 +2,15 @@ use crate::modules::application::{ Application, ApplicationFeature, InstallationError, InstallationOutcome, }; use crate::modules::monitoring::application_monitoring::application_monitoring_score::ApplicationMonitoringScore; -use crate::modules::monitoring::grafana::grafana::Grafana; -use crate::modules::monitoring::kube_prometheus::crd::crd_alertmanager_config::CRDPrometheus; use crate::modules::monitoring::kube_prometheus::crd::service_monitor::{ ServiceMonitor, ServiceMonitorSpec, }; +use crate::modules::monitoring::prometheus::Prometheus; +use crate::modules::monitoring::prometheus::helm::prometheus_config::PrometheusConfig; use crate::topology::MultiTargetTopology; use crate::topology::ingress::Ingress; +use crate::topology::monitoring::Observability; +use crate::topology::monitoring::{AlertReceiver, AlertRoute}; use crate::{ inventory::Inventory, modules::monitoring::{ @@ -17,10 +19,6 @@ use crate::{ score::Score, topology::{HelmCommand, K8sclient, Topology, tenant::TenantManager}, }; -use crate::{ - modules::prometheus::prometheus::PrometheusMonitoring, - topology::oberservability::monitoring::AlertReceiver, -}; use async_trait::async_trait; use base64::{Engine as _, engine::general_purpose}; use harmony_secret::SecretManager; @@ -30,12 +28,13 @@ use 
kube::api::ObjectMeta; use log::{debug, info}; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; -use std::sync::Arc; +use std::sync::{Arc, Mutex}; +//TODO test this #[derive(Debug, Clone)] pub struct Monitoring { pub application: Arc, - pub alert_receiver: Vec>>, + pub alert_receiver: Vec>>, } #[async_trait] @@ -46,8 +45,7 @@ impl< + TenantManager + K8sclient + MultiTargetTopology - + PrometheusMonitoring - + Grafana + + Observability + Ingress + std::fmt::Debug, > ApplicationFeature for Monitoring @@ -74,17 +72,15 @@ impl< }; let mut alerting_score = ApplicationMonitoringScore { - sender: CRDPrometheus { - namespace: namespace.clone(), - client: topology.k8s_client().await.unwrap(), - service_monitor: vec![app_service_monitor], + sender: Prometheus { + config: Arc::new(Mutex::new(PrometheusConfig::new())), }, application: self.application.clone(), receivers: self.alert_receiver.clone(), }; let ntfy = NtfyScore { namespace: namespace.clone(), - host: domain, + host: domain.clone(), }; ntfy.interpret(&Inventory::empty(), topology) .await @@ -105,20 +101,28 @@ impl< debug!("ntfy_default_auth_param: {ntfy_default_auth_param}"); + debug!("ntfy_default_auth_param: {ntfy_default_auth_param}"); let ntfy_receiver = WebhookReceiver { name: "ntfy-webhook".to_string(), url: Url::Url( url::Url::parse( format!( - "http://ntfy.{}.svc.cluster.local/rust-web-app?auth={ntfy_default_auth_param}", - namespace.clone() + "http://{domain}/{}?auth={ntfy_default_auth_param}", + __self.application.name() ) .as_str(), ) .unwrap(), ), + route: AlertRoute { + ..AlertRoute::default("ntfy-webhook".to_string()) + }, }; - + debug!( + "ntfy webhook receiver \n{:#?}\nntfy topic: {}", + ntfy_receiver.clone(), + self.application.name() + ); alerting_score.receivers.push(Box::new(ntfy_receiver)); alerting_score .interpret(&Inventory::empty(), topology) diff --git a/harmony/src/modules/application/features/rhob_monitoring.rs b/harmony/src/modules/application/features/rhob_monitoring.rs 
index 60c7aca9..3f33b394 100644 --- a/harmony/src/modules/application/features/rhob_monitoring.rs +++ b/harmony/src/modules/application/features/rhob_monitoring.rs @@ -3,11 +3,13 @@ use std::sync::Arc; use crate::modules::application::{ Application, ApplicationFeature, InstallationError, InstallationOutcome, }; -use crate::modules::monitoring::application_monitoring::rhobs_application_monitoring_score::ApplicationRHOBMonitoringScore; -use crate::modules::monitoring::kube_prometheus::crd::rhob_alertmanager_config::RHOBObservability; +use crate::modules::monitoring::red_hat_cluster_observability::RedHatClusterObservability; +use crate::modules::monitoring::red_hat_cluster_observability::redhat_cluster_observability::RedHatClusterObservabilityScore; use crate::topology::MultiTargetTopology; use crate::topology::ingress::Ingress; +use crate::topology::monitoring::Observability; +use crate::topology::monitoring::{AlertReceiver, AlertRoute}; use crate::{ inventory::Inventory, modules::monitoring::{ @@ -16,10 +18,6 @@ use crate::{ score::Score, topology::{HelmCommand, K8sclient, Topology, tenant::TenantManager}, }; -use crate::{ - modules::prometheus::prometheus::PrometheusMonitoring, - topology::oberservability::monitoring::AlertReceiver, -}; use async_trait::async_trait; use base64::{Engine as _, engine::general_purpose}; use harmony_types::net::Url; @@ -28,9 +26,10 @@ use log::{debug, info}; #[derive(Debug, Clone)] pub struct Monitoring { pub application: Arc, - pub alert_receiver: Vec>>, + pub alert_receiver: Vec>>, } +///TODO TEST this #[async_trait] impl< T: Topology @@ -41,7 +40,7 @@ impl< + MultiTargetTopology + Ingress + std::fmt::Debug - + PrometheusMonitoring, + + Observability, > ApplicationFeature for Monitoring { async fn ensure_installed( @@ -55,13 +54,14 @@ impl< .map(|ns| ns.name.clone()) .unwrap_or_else(|| self.application.name()); - let mut alerting_score = ApplicationRHOBMonitoringScore { - sender: RHOBObservability { + let mut alerting_score = 
RedHatClusterObservabilityScore { + sender: RedHatClusterObservability { namespace: namespace.clone(), - client: topology.k8s_client().await.unwrap(), + resource_selector: todo!(), }, - application: self.application.clone(), receivers: self.alert_receiver.clone(), + rules: vec![], + scrape_targets: None, }; let domain = topology .get_domain("ntfy") @@ -97,12 +97,15 @@ impl< url::Url::parse( format!( "http://{domain}/{}?auth={ntfy_default_auth_param}", - self.application.name() + __self.application.name() ) .as_str(), ) .unwrap(), ), + route: AlertRoute { + ..AlertRoute::default("ntfy-webhook".to_string()) + }, }; debug!( "ntfy webhook receiver \n{:#?}\nntfy topic: {}", diff --git a/harmony/src/modules/k8s/resource.rs b/harmony/src/modules/k8s/resource.rs index bff8183f..44620280 100644 --- a/harmony/src/modules/k8s/resource.rs +++ b/harmony/src/modules/k8s/resource.rs @@ -1,5 +1,4 @@ use async_trait::async_trait; -use k8s_openapi::ResourceScope; use kube::Resource; use log::info; use serde::{Serialize, de::DeserializeOwned}; @@ -29,7 +28,7 @@ impl K8sResourceScore { } impl< - K: Resource + K: Resource + std::fmt::Debug + Sync + DeserializeOwned @@ -61,7 +60,7 @@ pub struct K8sResourceInterpret { #[async_trait] impl< - K: Resource + K: Resource + Clone + std::fmt::Debug + DeserializeOwned diff --git a/harmony/src/modules/mod.rs b/harmony/src/modules/mod.rs index 70ecfdf6..caf4280b 100644 --- a/harmony/src/modules/mod.rs +++ b/harmony/src/modules/mod.rs @@ -20,7 +20,6 @@ pub mod okd; pub mod openbao; pub mod opnsense; pub mod postgresql; -pub mod prometheus; pub mod storage; pub mod tenant; pub mod tftp; diff --git a/harmony/src/modules/monitoring/alert_channel/discord_alert_channel.rs b/harmony/src/modules/monitoring/alert_channel/discord_alert_channel.rs index 94627896..cab3c300 100644 --- a/harmony/src/modules/monitoring/alert_channel/discord_alert_channel.rs +++ b/harmony/src/modules/monitoring/alert_channel/discord_alert_channel.rs @@ -1,99 +1,38 @@ -use 
std::any::Any; -use std::collections::{BTreeMap, HashMap}; - -use async_trait::async_trait; -use harmony_types::k8s_name::K8sName; -use k8s_openapi::api::core::v1::Secret; -use kube::Resource; -use kube::api::{DynamicObject, ObjectMeta}; -use log::{debug, trace}; +use crate::modules::monitoring::kube_prometheus::KubePrometheus; +use crate::modules::monitoring::okd::OpenshiftClusterAlertSender; +use crate::modules::monitoring::red_hat_cluster_observability::RedHatClusterObservability; +use crate::topology::monitoring::{AlertRoute, InstallOperation, ReceiverInstallPlan}; +use crate::{interpret::InterpretError, topology::monitoring::AlertReceiver}; +use harmony_types::net::Url; use serde::Serialize; use serde_json::json; -use serde_yaml::{Mapping, Value}; - -use crate::infra::kube::kube_resource_to_dynamic; -use crate::modules::monitoring::kube_prometheus::crd::crd_alertmanager_config::{ - AlertmanagerConfig, AlertmanagerConfigSpec, CRDPrometheus, -}; -use crate::modules::monitoring::kube_prometheus::crd::rhob_alertmanager_config::RHOBObservability; -use crate::modules::monitoring::okd::OpenshiftClusterAlertSender; -use crate::topology::oberservability::monitoring::AlertManagerReceiver; -use crate::{ - interpret::{InterpretError, Outcome}, - modules::monitoring::{ - kube_prometheus::{ - prometheus::{KubePrometheus, KubePrometheusReceiver}, - types::{AlertChannelConfig, AlertManagerChannelConfig}, - }, - prometheus::prometheus::{Prometheus, PrometheusReceiver}, - }, - topology::oberservability::monitoring::AlertReceiver, -}; -use harmony_types::net::Url; +use std::collections::BTreeMap; #[derive(Debug, Clone, Serialize)] -pub struct DiscordWebhook { - pub name: K8sName, +pub struct DiscordReceiver { + pub name: String, pub url: Url, - pub selectors: Vec>, + pub route: AlertRoute, } -impl DiscordWebhook { - fn get_receiver_config(&self) -> Result { - let secret_name = format!("{}-secret", self.name.clone()); - let webhook_key = format!("{}", self.url.clone()); +impl 
AlertReceiver for DiscordReceiver { + fn build(&self) -> Result { + let receiver_block = serde_yaml::to_value(json!({ + "name": self.name, + "discord_configs": [{ + "webhook_url": format!("{}", self.url), + "title": "{{ template \"discord.default.title\" . }}", + "message": "{{ template \"discord.default.message\" . }}" + }] + })) + .map_err(|e| InterpretError::new(e.to_string()))?; - let mut string_data = BTreeMap::new(); - string_data.insert("webhook-url".to_string(), webhook_key.clone()); - - let secret = Secret { - metadata: kube::core::ObjectMeta { - name: Some(secret_name.clone()), - ..Default::default() - }, - string_data: Some(string_data), - type_: Some("Opaque".to_string()), - ..Default::default() - }; - - let mut matchers: Vec = Vec::new(); - for selector in &self.selectors { - trace!("selector: {:#?}", selector); - for (k, v) in selector { - matchers.push(format!("{} = {}", k, v)); - } - } - - Ok(AlertManagerReceiver { - additional_ressources: vec![kube_resource_to_dynamic(&secret)?], - - receiver_config: json!({ - "name": self.name, - "discord_configs": [ - { - "webhook_url": self.url.clone(), - "title": "{{ template \"discord.default.title\" . }}", - "message": "{{ template \"discord.default.message\" . 
}}" - } - ] - }), - route_config: json!({ - "receiver": self.name, - "matchers": matchers, - - }), + Ok(ReceiverInstallPlan { + install_operation: None, + route: Some(self.route.clone()), + receiver: Some(receiver_block), }) } -} - -#[async_trait] -impl AlertReceiver for DiscordWebhook { - async fn install( - &self, - sender: &OpenshiftClusterAlertSender, - ) -> Result { - todo!() - } fn name(&self) -> String { self.name.clone().to_string() @@ -102,309 +41,77 @@ impl AlertReceiver for DiscordWebhook { fn clone_box(&self) -> Box> { Box::new(self.clone()) } - - fn as_any(&self) -> &dyn Any { - todo!() - } - - fn as_alertmanager_receiver(&self) -> Result { - self.get_receiver_config() - } } -#[async_trait] -impl AlertReceiver for DiscordWebhook { - fn as_alertmanager_receiver(&self) -> Result { - todo!() - } - - async fn install(&self, sender: &RHOBObservability) -> Result { - let ns = sender.namespace.clone(); - - let config = self.get_receiver_config()?; - for resource in config.additional_ressources.iter() { - todo!("can I apply a dynamicresource"); - // sender.client.apply(resource, Some(&ns)).await; - } - - let spec = crate::modules::monitoring::kube_prometheus::crd::rhob_alertmanager_config::AlertmanagerConfigSpec { - data: json!({ - "route": { - "receiver": self.name, - }, - "receivers": [ - config.receiver_config - ] - }), - }; - - let alertmanager_configs = crate::modules::monitoring::kube_prometheus::crd::rhob_alertmanager_config::AlertmanagerConfig { - metadata: ObjectMeta { - name: Some(self.name.clone().to_string()), - labels: Some(std::collections::BTreeMap::from([( - "alertmanagerConfig".to_string(), - "enabled".to_string(), - )])), - namespace: Some(sender.namespace.clone()), - ..Default::default() - }, - spec, - }; - debug!( - "alertmanager_configs yaml:\n{:#?}", - serde_yaml::to_string(&alertmanager_configs) - ); - debug!( - "alert manager configs: \n{:#?}", - alertmanager_configs.clone() - ); - - sender - .client - .apply(&alertmanager_configs, 
Some(&sender.namespace)) - .await?; - Ok(Outcome::success(format!( - "installed rhob-alertmanagerconfigs for {}", - self.name - ))) - } - - fn name(&self) -> String { - "webhook-receiver".to_string() - } - - fn clone_box(&self) -> Box> { - Box::new(self.clone()) - } - - fn as_any(&self) -> &dyn Any { - self - } -} - -#[async_trait] -impl AlertReceiver for DiscordWebhook { - fn as_alertmanager_receiver(&self) -> Result { - todo!() - } - async fn install(&self, sender: &CRDPrometheus) -> Result { - let ns = sender.namespace.clone(); +impl AlertReceiver for DiscordReceiver { + fn build(&self) -> Result { let secret_name = format!("{}-secret", self.name.clone()); let webhook_key = format!("{}", self.url.clone()); let mut string_data = BTreeMap::new(); string_data.insert("webhook-url".to_string(), webhook_key.clone()); - let secret = Secret { - metadata: kube::core::ObjectMeta { - name: Some(secret_name.clone()), - ..Default::default() - }, - string_data: Some(string_data), - type_: Some("Opaque".to_string()), - ..Default::default() - }; + let receiver_config = json!({ + "name": self.name, + "discordConfigs": [ + { + "apiURL": { + "key": "webhook-url", + "name": format!("{}-secret", self.name) + }, + "title": "{{ template \"discord.default.title\" . }}", + "message": "{{ template \"discord.default.message\" . }}" + } + ] + }); - let _ = sender.client.apply(&secret, Some(&ns)).await; - - let spec = AlertmanagerConfigSpec { - data: json!({ - "route": { - "receiver": self.name, - }, - "receivers": [ - { - "name": self.name, - "discordConfigs": [ - { - "apiURL": { - "name": secret_name, - "key": "webhook-url", - }, - "title": "{{ template \"discord.default.title\" . }}", - "message": "{{ template \"discord.default.message\" . 
}}" - } - ] - } - ] - }), - }; - - let alertmanager_configs = AlertmanagerConfig { - metadata: ObjectMeta { - name: Some(self.name.clone().to_string()), - labels: Some(std::collections::BTreeMap::from([( - "alertmanagerConfig".to_string(), - "enabled".to_string(), - )])), - namespace: Some(ns), - ..Default::default() - }, - spec, - }; - - sender - .client - .apply(&alertmanager_configs, Some(&sender.namespace)) - .await?; - Ok(Outcome::success(format!( - "installed crd-alertmanagerconfigs for {}", - self.name - ))) + Ok(ReceiverInstallPlan { + install_operation: Some(vec![InstallOperation::CreateSecret { + name: secret_name, + data: string_data, + }]), + route: Some(self.route.clone()), + receiver: Some( + serde_yaml::to_value(receiver_config) + .map_err(|e| InterpretError::new(e.to_string())) + .expect("failed to build yaml value"), + ), + }) } + fn name(&self) -> String { - "discord-webhook".to_string() + self.name.clone() } - fn clone_box(&self) -> Box> { + + fn clone_box(&self) -> Box> { Box::new(self.clone()) } - fn as_any(&self) -> &dyn Any { - self - } } -#[async_trait] -impl AlertReceiver for DiscordWebhook { - fn as_alertmanager_receiver(&self) -> Result { - todo!() - } - async fn install(&self, sender: &Prometheus) -> Result { - sender.install_receiver(self).await +impl AlertReceiver for DiscordReceiver { + fn build(&self) -> Result { + let receiver_block = serde_yaml::to_value(json!({ + "name": self.name, + "discord_configs": [{ + "webhook_url": format!("{}", self.url), + "title": "{{ template \"discord.default.title\" . }}", + "message": "{{ template \"discord.default.message\" . 
}}" + }] + })) + .map_err(|e| InterpretError::new(e.to_string()))?; + + Ok(ReceiverInstallPlan { + install_operation: None, + route: Some(self.route.clone()), + receiver: Some(receiver_block), + }) } + fn name(&self) -> String { - "discord-webhook".to_string() + self.name.clone() } - fn clone_box(&self) -> Box> { - Box::new(self.clone()) - } - fn as_any(&self) -> &dyn Any { - self - } -} -#[async_trait] -impl PrometheusReceiver for DiscordWebhook { - fn name(&self) -> String { - self.name.clone().to_string() - } - async fn configure_receiver(&self) -> AlertManagerChannelConfig { - self.get_config().await - } -} - -#[async_trait] -impl AlertReceiver for DiscordWebhook { - fn as_alertmanager_receiver(&self) -> Result { - todo!() - } - async fn install(&self, sender: &KubePrometheus) -> Result { - sender.install_receiver(self).await - } fn clone_box(&self) -> Box> { Box::new(self.clone()) } - fn name(&self) -> String { - "discord-webhook".to_string() - } - fn as_any(&self) -> &dyn Any { - self - } -} - -#[async_trait] -impl KubePrometheusReceiver for DiscordWebhook { - fn name(&self) -> String { - self.name.clone().to_string() - } - async fn configure_receiver(&self) -> AlertManagerChannelConfig { - self.get_config().await - } -} - -#[async_trait] -impl AlertChannelConfig for DiscordWebhook { - async fn get_config(&self) -> AlertManagerChannelConfig { - let channel_global_config = None; - let channel_receiver = self.alert_channel_receiver().await; - let channel_route = self.alert_channel_route().await; - - AlertManagerChannelConfig { - channel_global_config, - channel_receiver, - channel_route, - } - } -} - -impl DiscordWebhook { - async fn alert_channel_route(&self) -> serde_yaml::Value { - let mut route = Mapping::new(); - route.insert( - Value::String("receiver".to_string()), - Value::String(self.name.clone().to_string()), - ); - route.insert( - Value::String("matchers".to_string()), - Value::Sequence(vec![Value::String("alertname!=Watchdog".to_string())]), - ); - 
route.insert(Value::String("continue".to_string()), Value::Bool(true)); - Value::Mapping(route) - } - - async fn alert_channel_receiver(&self) -> serde_yaml::Value { - let mut receiver = Mapping::new(); - receiver.insert( - Value::String("name".to_string()), - Value::String(self.name.clone().to_string()), - ); - - let mut discord_config = Mapping::new(); - discord_config.insert( - Value::String("webhook_url".to_string()), - Value::String(self.url.to_string()), - ); - - receiver.insert( - Value::String("discord_configs".to_string()), - Value::Sequence(vec![Value::Mapping(discord_config)]), - ); - - Value::Mapping(receiver) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[tokio::test] - async fn discord_serialize_should_match() { - let discord_receiver = DiscordWebhook { - name: K8sName("test-discord".to_string()), - url: Url::Url(url::Url::parse("https://discord.i.dont.exist.com").unwrap()), - selectors: vec![], - }; - - let discord_receiver_receiver = - serde_yaml::to_string(&discord_receiver.alert_channel_receiver().await).unwrap(); - println!("receiver \n{:#}", discord_receiver_receiver); - let discord_receiver_receiver_yaml = r#"name: test-discord -discord_configs: -- webhook_url: https://discord.i.dont.exist.com/ -"# - .to_string(); - - let discord_receiver_route = - serde_yaml::to_string(&discord_receiver.alert_channel_route().await).unwrap(); - println!("route \n{:#}", discord_receiver_route); - let discord_receiver_route_yaml = r#"receiver: test-discord -matchers: -- alertname!=Watchdog -continue: true -"# - .to_string(); - - assert_eq!(discord_receiver_receiver, discord_receiver_receiver_yaml); - assert_eq!(discord_receiver_route, discord_receiver_route_yaml); - } } diff --git a/harmony/src/modules/monitoring/alert_channel/webhook_receiver.rs b/harmony/src/modules/monitoring/alert_channel/webhook_receiver.rs index a141df02..740bfb81 100644 --- a/harmony/src/modules/monitoring/alert_channel/webhook_receiver.rs +++ 
b/harmony/src/modules/monitoring/alert_channel/webhook_receiver.rs @@ -1,25 +1,13 @@ -use std::any::Any; - -use async_trait::async_trait; -use kube::api::ObjectMeta; -use log::debug; use serde::Serialize; use serde_json::json; -use serde_yaml::{Mapping, Value}; use crate::{ - interpret::{InterpretError, Outcome}, + interpret::InterpretError, modules::monitoring::{ - kube_prometheus::{ - crd::{ - crd_alertmanager_config::CRDPrometheus, rhob_alertmanager_config::RHOBObservability, - }, - prometheus::{KubePrometheus, KubePrometheusReceiver}, - types::{AlertChannelConfig, AlertManagerChannelConfig}, - }, - prometheus::prometheus::{Prometheus, PrometheusReceiver}, + kube_prometheus::KubePrometheus, okd::OpenshiftClusterAlertSender, prometheus::Prometheus, + red_hat_cluster_observability::RedHatClusterObservability, }, - topology::oberservability::monitoring::{AlertManagerReceiver, AlertReceiver}, + topology::monitoring::{AlertReceiver, AlertRoute, ReceiverInstallPlan}, }; use harmony_types::net::Url; @@ -27,281 +15,115 @@ use harmony_types::net::Url; pub struct WebhookReceiver { pub name: String, pub url: Url, -} - -#[async_trait] -impl AlertReceiver for WebhookReceiver { - fn as_alertmanager_receiver(&self) -> Result { - todo!() - } - async fn install(&self, sender: &RHOBObservability) -> Result { - let spec = crate::modules::monitoring::kube_prometheus::crd::rhob_alertmanager_config::AlertmanagerConfigSpec { - data: json!({ - "route": { - "receiver": self.name, - }, - "receivers": [ - { - "name": self.name, - "webhookConfigs": [ - { - "url": self.url, - "httpConfig": { - "tlsConfig": { - "insecureSkipVerify": true - } - } - } - ] - } - ] - }), - }; - - let alertmanager_configs = crate::modules::monitoring::kube_prometheus::crd::rhob_alertmanager_config::AlertmanagerConfig { - metadata: ObjectMeta { - name: Some(self.name.clone()), - labels: Some(std::collections::BTreeMap::from([( - "alertmanagerConfig".to_string(), - "enabled".to_string(), - )])), - namespace: 
Some(sender.namespace.clone()), - ..Default::default() - }, - spec, - }; - debug!( - "alert manager configs: \n{:#?}", - alertmanager_configs.clone() - ); - - sender - .client - .apply(&alertmanager_configs, Some(&sender.namespace)) - .await?; - Ok(Outcome::success(format!( - "installed rhob-alertmanagerconfigs for {}", - self.name - ))) - } - - fn name(&self) -> String { - "webhook-receiver".to_string() - } - - fn clone_box(&self) -> Box> { - Box::new(self.clone()) - } - - fn as_any(&self) -> &dyn Any { - self - } -} - -#[async_trait] -impl AlertReceiver for WebhookReceiver { - fn as_alertmanager_receiver(&self) -> Result { - todo!() - } - async fn install(&self, sender: &CRDPrometheus) -> Result { - let spec = crate::modules::monitoring::kube_prometheus::crd::crd_alertmanager_config::AlertmanagerConfigSpec { - data: json!({ - "route": { - "receiver": self.name, - }, - "receivers": [ - { - "name": self.name, - "webhookConfigs": [ - { - "url": self.url, - } - ] - } - ] - }), - }; - - let alertmanager_configs = crate::modules::monitoring::kube_prometheus::crd::crd_alertmanager_config::AlertmanagerConfig { - metadata: ObjectMeta { - name: Some(self.name.clone()), - labels: Some(std::collections::BTreeMap::from([( - "alertmanagerConfig".to_string(), - "enabled".to_string(), - )])), - namespace: Some(sender.namespace.clone()), - ..Default::default() - }, - spec, - }; - debug!( - "alert manager configs: \n{:#?}", - alertmanager_configs.clone() - ); - - sender - .client - .apply(&alertmanager_configs, Some(&sender.namespace)) - .await?; - Ok(Outcome::success(format!( - "installed crd-alertmanagerconfigs for {}", - self.name - ))) - } - - fn name(&self) -> String { - "webhook-receiver".to_string() - } - - fn clone_box(&self) -> Box> { - Box::new(self.clone()) - } - - fn as_any(&self) -> &dyn Any { - self - } -} - -#[async_trait] -impl AlertReceiver for WebhookReceiver { - fn as_alertmanager_receiver(&self) -> Result { - todo!() - } - async fn install(&self, sender: 
&Prometheus) -> Result { - sender.install_receiver(self).await - } - fn name(&self) -> String { - "webhook-receiver".to_string() - } - fn clone_box(&self) -> Box> { - Box::new(self.clone()) - } - fn as_any(&self) -> &dyn Any { - self - } -} - -#[async_trait] -impl PrometheusReceiver for WebhookReceiver { - fn name(&self) -> String { - self.name.clone() - } - async fn configure_receiver(&self) -> AlertManagerChannelConfig { - self.get_config().await - } -} - -#[async_trait] -impl AlertReceiver for WebhookReceiver { - fn as_alertmanager_receiver(&self) -> Result { - todo!() - } - async fn install(&self, sender: &KubePrometheus) -> Result { - sender.install_receiver(self).await - } - fn name(&self) -> String { - "webhook-receiver".to_string() - } - fn clone_box(&self) -> Box> { - Box::new(self.clone()) - } - fn as_any(&self) -> &dyn Any { - self - } -} - -#[async_trait] -impl KubePrometheusReceiver for WebhookReceiver { - fn name(&self) -> String { - self.name.clone() - } - async fn configure_receiver(&self) -> AlertManagerChannelConfig { - self.get_config().await - } -} - -#[async_trait] -impl AlertChannelConfig for WebhookReceiver { - async fn get_config(&self) -> AlertManagerChannelConfig { - let channel_global_config = None; - let channel_receiver = self.alert_channel_receiver().await; - let channel_route = self.alert_channel_route().await; - - AlertManagerChannelConfig { - channel_global_config, - channel_receiver, - channel_route, - } - } + pub route: AlertRoute, } impl WebhookReceiver { - async fn alert_channel_route(&self) -> serde_yaml::Value { - let mut route = Mapping::new(); - route.insert( - Value::String("receiver".to_string()), - Value::String(self.name.clone()), - ); - route.insert( - Value::String("matchers".to_string()), - Value::Sequence(vec![Value::String("alertname!=Watchdog".to_string())]), - ); - route.insert(Value::String("continue".to_string()), Value::Bool(true)); - Value::Mapping(route) + fn build_receiver(&self) -> serde_json::Value { + 
json!({ + "name": self.name, + "webhookConfigs": [ + { + "url": self.url, + "httpConfig": { + "tlsConfig": { + "insecureSkipVerify": true + } + } + } + ]}) } - async fn alert_channel_receiver(&self) -> serde_yaml::Value { - let mut receiver = Mapping::new(); - receiver.insert( - Value::String("name".to_string()), - Value::String(self.name.clone()), - ); - - let mut webhook_config = Mapping::new(); - webhook_config.insert( - Value::String("url".to_string()), - Value::String(self.url.to_string()), - ); - - receiver.insert( - Value::String("webhook_configs".to_string()), - Value::Sequence(vec![Value::Mapping(webhook_config)]), - ); - - Value::Mapping(receiver) + fn build_route(&self) -> serde_json::Value { + json!({ + "name": self.name}) } } -#[cfg(test)] -mod tests { - use super::*; - #[tokio::test] - async fn webhook_serialize_should_match() { - let webhook_receiver = WebhookReceiver { - name: "test-webhook".to_string(), - url: Url::Url(url::Url::parse("https://webhook.i.dont.exist.com").unwrap()), - }; +impl AlertReceiver for WebhookReceiver { + fn name(&self) -> String { + self.name.clone() + } - let webhook_receiver_receiver = - serde_yaml::to_string(&webhook_receiver.alert_channel_receiver().await).unwrap(); - println!("receiver \n{:#}", webhook_receiver_receiver); - let webhook_receiver_receiver_yaml = r#"name: test-webhook -webhook_configs: -- url: https://webhook.i.dont.exist.com/ -"# - .to_string(); + fn clone_box(&self) -> Box> { + Box::new(self.clone()) + } - let webhook_receiver_route = - serde_yaml::to_string(&webhook_receiver.alert_channel_route().await).unwrap(); - println!("route \n{:#}", webhook_receiver_route); - let webhook_receiver_route_yaml = r#"receiver: test-webhook -matchers: -- alertname!=Watchdog -continue: true -"# - .to_string(); + fn build(&self) -> Result { + let receiver = self.build_receiver(); + let receiver = + serde_yaml::to_value(receiver).map_err(|e| InterpretError::new(e.to_string()))?; - assert_eq!(webhook_receiver_receiver, 
webhook_receiver_receiver_yaml); - assert_eq!(webhook_receiver_route, webhook_receiver_route_yaml); + Ok(ReceiverInstallPlan { + install_operation: None, + route: Some(self.route.clone()), + receiver: Some(receiver), + }) + } +} + +impl AlertReceiver for WebhookReceiver { + fn name(&self) -> String { + self.name.clone() + } + + fn clone_box(&self) -> Box> { + Box::new(self.clone()) + } + + fn build(&self) -> Result { + let receiver = self.build_receiver(); + let receiver = + serde_yaml::to_value(receiver).map_err(|e| InterpretError::new(e.to_string()))?; + + Ok(ReceiverInstallPlan { + install_operation: None, + route: Some(self.route.clone()), + receiver: Some(receiver), + }) + } +} + +impl AlertReceiver for WebhookReceiver { + fn name(&self) -> String { + self.name.clone() + } + + fn clone_box(&self) -> Box> { + Box::new(self.clone()) + } + + fn build(&self) -> Result { + let receiver = self.build_receiver(); + let receiver = + serde_yaml::to_value(receiver).map_err(|e| InterpretError::new(e.to_string()))?; + + Ok(ReceiverInstallPlan { + install_operation: None, + route: Some(self.route.clone()), + receiver: Some(receiver), + }) + } +} + +impl AlertReceiver for WebhookReceiver { + fn name(&self) -> String { + self.name.clone() + } + + fn clone_box(&self) -> Box> { + Box::new(self.clone()) + } + + fn build(&self) -> Result { + let receiver = self.build_receiver(); + let receiver = + serde_yaml::to_value(receiver).map_err(|e| InterpretError::new(e.to_string()))?; + + Ok(ReceiverInstallPlan { + install_operation: None, + route: Some(self.route.clone()), + receiver: Some(receiver), + }) } } diff --git a/harmony/src/modules/prometheus/alerts/infra/dell_server.rs b/harmony/src/modules/monitoring/alert_rule/alerts/infra/dell_server.rs similarity index 100% rename from harmony/src/modules/prometheus/alerts/infra/dell_server.rs rename to harmony/src/modules/monitoring/alert_rule/alerts/infra/dell_server.rs diff --git a/harmony/src/modules/prometheus/alerts/infra/mod.rs 
b/harmony/src/modules/monitoring/alert_rule/alerts/infra/mod.rs similarity index 53% rename from harmony/src/modules/prometheus/alerts/infra/mod.rs rename to harmony/src/modules/monitoring/alert_rule/alerts/infra/mod.rs index 47fbc2f1..2b11bded 100644 --- a/harmony/src/modules/prometheus/alerts/infra/mod.rs +++ b/harmony/src/modules/monitoring/alert_rule/alerts/infra/mod.rs @@ -1 +1,2 @@ pub mod dell_server; +pub mod opnsense; diff --git a/harmony/src/modules/monitoring/alert_rule/alerts/infra/opnsense.rs b/harmony/src/modules/monitoring/alert_rule/alerts/infra/opnsense.rs new file mode 100644 index 00000000..c163a76d --- /dev/null +++ b/harmony/src/modules/monitoring/alert_rule/alerts/infra/opnsense.rs @@ -0,0 +1,15 @@ +use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule; + +pub fn high_http_error_rate() -> PrometheusAlertRule { + let expression = r#"( + sum(rate(http_requests_total{status=~"5.."}[5m])) by (job, route, service) + / + sum(rate(http_requests_total[5m])) by (job, route, service) +) > 0.05 and sum(rate(http_requests_total[5m])) by (job, route, service) > 10"#; + + PrometheusAlertRule::new("HighApplicationErrorRate", expression) + .for_duration("10m") + .label("severity", "warning") + .annotation("summary", "High HTTP error rate on {{ $labels.job }}") + .annotation("description", "Job {{ $labels.job }} (route {{ $labels.route }}) has an error rate > 5% over the last 10m.") +} diff --git a/harmony/src/modules/prometheus/alerts/k8s/deployment.rs b/harmony/src/modules/monitoring/alert_rule/alerts/k8s/deployment.rs similarity index 100% rename from harmony/src/modules/prometheus/alerts/k8s/deployment.rs rename to harmony/src/modules/monitoring/alert_rule/alerts/k8s/deployment.rs diff --git a/harmony/src/modules/prometheus/alerts/k8s/memory_usage.rs b/harmony/src/modules/monitoring/alert_rule/alerts/k8s/memory_usage.rs similarity index 100% rename from harmony/src/modules/prometheus/alerts/k8s/memory_usage.rs rename to 
harmony/src/modules/monitoring/alert_rule/alerts/k8s/memory_usage.rs diff --git a/harmony/src/modules/prometheus/alerts/k8s/mod.rs b/harmony/src/modules/monitoring/alert_rule/alerts/k8s/mod.rs similarity index 100% rename from harmony/src/modules/prometheus/alerts/k8s/mod.rs rename to harmony/src/modules/monitoring/alert_rule/alerts/k8s/mod.rs diff --git a/harmony/src/modules/prometheus/alerts/k8s/pod.rs b/harmony/src/modules/monitoring/alert_rule/alerts/k8s/pod.rs similarity index 100% rename from harmony/src/modules/prometheus/alerts/k8s/pod.rs rename to harmony/src/modules/monitoring/alert_rule/alerts/k8s/pod.rs diff --git a/harmony/src/modules/prometheus/alerts/k8s/pvc.rs b/harmony/src/modules/monitoring/alert_rule/alerts/k8s/pvc.rs similarity index 100% rename from harmony/src/modules/prometheus/alerts/k8s/pvc.rs rename to harmony/src/modules/monitoring/alert_rule/alerts/k8s/pvc.rs diff --git a/harmony/src/modules/prometheus/alerts/k8s/service.rs b/harmony/src/modules/monitoring/alert_rule/alerts/k8s/service.rs similarity index 100% rename from harmony/src/modules/prometheus/alerts/k8s/service.rs rename to harmony/src/modules/monitoring/alert_rule/alerts/k8s/service.rs diff --git a/harmony/src/modules/prometheus/alerts/mod.rs b/harmony/src/modules/monitoring/alert_rule/alerts/mod.rs similarity index 100% rename from harmony/src/modules/prometheus/alerts/mod.rs rename to harmony/src/modules/monitoring/alert_rule/alerts/mod.rs diff --git a/harmony/src/modules/monitoring/alert_rule/mod.rs b/harmony/src/modules/monitoring/alert_rule/mod.rs index 846c7696..cb7bba9e 100644 --- a/harmony/src/modules/monitoring/alert_rule/mod.rs +++ b/harmony/src/modules/monitoring/alert_rule/mod.rs @@ -1 +1,2 @@ +pub mod alerts; pub mod prometheus_alert_rule; diff --git a/harmony/src/modules/monitoring/alert_rule/prometheus_alert_rule.rs b/harmony/src/modules/monitoring/alert_rule/prometheus_alert_rule.rs index e001c68a..2cc7fceb 100644 --- 
a/harmony/src/modules/monitoring/alert_rule/prometheus_alert_rule.rs +++ b/harmony/src/modules/monitoring/alert_rule/prometheus_alert_rule.rs @@ -1,79 +1,13 @@ -use std::collections::{BTreeMap, HashMap}; +use std::collections::HashMap; -use async_trait::async_trait; use serde::Serialize; use crate::{ - interpret::{InterpretError, Outcome}, - modules::monitoring::{ - kube_prometheus::{ - prometheus::{KubePrometheus, KubePrometheusRule}, - types::{AlertGroup, AlertManagerAdditionalPromRules}, - }, - prometheus::prometheus::{Prometheus, PrometheusRule}, - }, - topology::oberservability::monitoring::AlertRule, + interpret::InterpretError, + modules::monitoring::{kube_prometheus::KubePrometheus, okd::OpenshiftClusterAlertSender}, + topology::monitoring::AlertRule, }; -#[async_trait] -impl AlertRule for AlertManagerRuleGroup { - async fn install(&self, sender: &KubePrometheus) -> Result { - sender.install_rule(self).await - } - fn clone_box(&self) -> Box> { - Box::new(self.clone()) - } -} - -#[async_trait] -impl AlertRule for AlertManagerRuleGroup { - async fn install(&self, sender: &Prometheus) -> Result { - sender.install_rule(self).await - } - fn clone_box(&self) -> Box> { - Box::new(self.clone()) - } -} - -#[async_trait] -impl PrometheusRule for AlertManagerRuleGroup { - fn name(&self) -> String { - self.name.clone() - } - async fn configure_rule(&self) -> AlertManagerAdditionalPromRules { - let mut additional_prom_rules = BTreeMap::new(); - - additional_prom_rules.insert( - self.name.clone(), - AlertGroup { - groups: vec![self.clone()], - }, - ); - AlertManagerAdditionalPromRules { - rules: additional_prom_rules, - } - } -} -#[async_trait] -impl KubePrometheusRule for AlertManagerRuleGroup { - fn name(&self) -> String { - self.name.clone() - } - async fn configure_rule(&self) -> AlertManagerAdditionalPromRules { - let mut additional_prom_rules = BTreeMap::new(); - - additional_prom_rules.insert( - self.name.clone(), - AlertGroup { - groups: vec![self.clone()], - }, 
- ); - AlertManagerAdditionalPromRules { - rules: additional_prom_rules, - } - } -} - impl AlertManagerRuleGroup { pub fn new(name: &str, rules: Vec) -> AlertManagerRuleGroup { AlertManagerRuleGroup { @@ -129,3 +63,55 @@ impl PrometheusAlertRule { self } } + +impl AlertRule for AlertManagerRuleGroup { + fn build_rule(&self) -> Result { + let name = self.name.clone(); + let mut rules: Vec = vec![]; + for rule in self.rules.clone() { + rules.push(rule.into()) + } + + let rule_groups = + vec![crate::modules::monitoring::okd::crd::alerting_rules::RuleGroup { name, rules }]; + + Ok(serde_json::to_value(rule_groups).map_err(|e| InterpretError::new(e.to_string()))?) + } + + fn name(&self) -> String { + self.name.clone() + } + + fn clone_box(&self) -> Box> { + Box::new(self.clone()) + } +} + +impl AlertRule for AlertManagerRuleGroup { + fn build_rule(&self) -> Result { + let name = self.name.clone(); + let mut rules: Vec< + crate::modules::monitoring::kube_prometheus::crd::crd_prometheus_rules::Rule, + > = vec![]; + for rule in self.rules.clone() { + rules.push(rule.into()) + } + + let rule_groups = vec![ + crate::modules::monitoring::kube_prometheus::crd::crd_prometheus_rules::RuleGroup { + name, + rules, + }, + ]; + + Ok(serde_json::to_value(rule_groups).map_err(|e| InterpretError::new(e.to_string()))?) 
+ } + + fn name(&self) -> String { + self.name.clone() + } + + fn clone_box(&self) -> Box> { + Box::new(self.clone()) + } +} diff --git a/harmony/src/modules/monitoring/application_monitoring/application_monitoring_score.rs b/harmony/src/modules/monitoring/application_monitoring/application_monitoring_score.rs index 8f6b6245..b72e1f43 100644 --- a/harmony/src/modules/monitoring/application_monitoring/application_monitoring_score.rs +++ b/harmony/src/modules/monitoring/application_monitoring/application_monitoring_score.rs @@ -5,32 +5,26 @@ use serde::Serialize; use crate::{ interpret::Interpret, - modules::{ - application::Application, - monitoring::{ - grafana::grafana::Grafana, kube_prometheus::crd::crd_alertmanager_config::CRDPrometheus, - }, - prometheus::prometheus::PrometheusMonitoring, - }, + modules::{application::Application, monitoring::prometheus::Prometheus}, score::Score, topology::{ K8sclient, Topology, - oberservability::monitoring::{AlertReceiver, AlertingInterpret, ScrapeTarget}, + monitoring::{AlertReceiver, AlertingInterpret, Observability, ScrapeTarget}, }, }; #[derive(Debug, Clone, Serialize)] pub struct ApplicationMonitoringScore { - pub sender: CRDPrometheus, + pub sender: Prometheus, pub application: Arc, - pub receivers: Vec>>, + pub receivers: Vec>>, } -impl + K8sclient + Grafana> Score - for ApplicationMonitoringScore -{ +impl + K8sclient> Score for ApplicationMonitoringScore { fn create_interpret(&self) -> Box> { debug!("creating alerting interpret"); + //TODO will need to use k8sclient to apply service monitors or find a way to pass + //them to the AlertingInterpret potentially via Sender Prometheus Box::new(AlertingInterpret { sender: self.sender.clone(), receivers: self.receivers.clone(), diff --git a/harmony/src/modules/monitoring/application_monitoring/rhobs_application_monitoring_score.rs b/harmony/src/modules/monitoring/application_monitoring/rhobs_application_monitoring_score.rs index 6f45c883..76aab06c 100644 --- 
a/harmony/src/modules/monitoring/application_monitoring/rhobs_application_monitoring_score.rs +++ b/harmony/src/modules/monitoring/application_monitoring/rhobs_application_monitoring_score.rs @@ -9,28 +9,27 @@ use crate::{ inventory::Inventory, modules::{ application::Application, - monitoring::kube_prometheus::crd::{ - crd_alertmanager_config::CRDPrometheus, rhob_alertmanager_config::RHOBObservability, - }, - prometheus::prometheus::PrometheusMonitoring, + monitoring::red_hat_cluster_observability::RedHatClusterObservability, }, score::Score, - topology::{PreparationOutcome, Topology, oberservability::monitoring::AlertReceiver}, + topology::{ + Topology, + monitoring::{AlertReceiver, AlertingInterpret, Observability}, + }, }; use harmony_types::id::Id; - #[derive(Debug, Clone, Serialize)] -pub struct ApplicationRHOBMonitoringScore { - pub sender: RHOBObservability, +pub struct ApplicationRedHatClusterMonitoringScore { + pub sender: RedHatClusterObservability, pub application: Arc, - pub receivers: Vec>>, + pub receivers: Vec>>, } -impl> Score - for ApplicationRHOBMonitoringScore +impl> Score + for ApplicationRedHatClusterMonitoringScore { fn create_interpret(&self) -> Box> { - Box::new(ApplicationRHOBMonitoringInterpret { + Box::new(ApplicationRedHatClusterMonitoringInterpret { score: self.clone(), }) } @@ -44,38 +43,28 @@ impl> Score } #[derive(Debug)] -pub struct ApplicationRHOBMonitoringInterpret { - score: ApplicationRHOBMonitoringScore, +pub struct ApplicationRedHatClusterMonitoringInterpret { + score: ApplicationRedHatClusterMonitoringScore, } #[async_trait] -impl> Interpret - for ApplicationRHOBMonitoringInterpret +impl> Interpret + for ApplicationRedHatClusterMonitoringInterpret { async fn execute( &self, inventory: &Inventory, topology: &T, ) -> Result { - let result = topology - .install_prometheus( - &self.score.sender, - inventory, - Some(self.score.receivers.clone()), - ) - .await; - - match result { - Ok(outcome) => match outcome { - 
PreparationOutcome::Success { details: _ } => { - Ok(Outcome::success("Prometheus installed".into())) - } - PreparationOutcome::Noop => { - Ok(Outcome::noop("Prometheus installation skipped".into())) - } - }, - Err(err) => Err(InterpretError::from(err)), - } + //TODO will need to use k8sclient to apply crd ServiceMonitor or find a way to pass + //them to the AlertingInterpret potentially via Sender RedHatClusterObservability + let alerting_interpret = AlertingInterpret { + sender: self.score.sender.clone(), + receivers: self.score.receivers.clone(), + rules: vec![], + scrape_targets: None, + }; + alerting_interpret.execute(inventory, topology).await } fn get_name(&self) -> InterpretName { diff --git a/harmony/src/modules/monitoring/grafana/grafana.rs b/harmony/src/modules/monitoring/grafana/grafana.rs index 5ab57c27..59ea900f 100644 --- a/harmony/src/modules/monitoring/grafana/grafana.rs +++ b/harmony/src/modules/monitoring/grafana/grafana.rs @@ -1,17 +1,41 @@ -use async_trait::async_trait; -use k8s_openapi::Resource; +use serde::Serialize; -use crate::{ - inventory::Inventory, - topology::{PreparationError, PreparationOutcome}, -}; +use crate::topology::monitoring::{AlertReceiver, AlertRule, AlertSender, ScrapeTarget}; -#[async_trait] -pub trait Grafana { - async fn ensure_grafana_operator( - &self, - inventory: &Inventory, - ) -> Result; - - async fn install_grafana(&self) -> Result; +#[derive(Debug, Clone, Serialize)] +pub struct Grafana { + pub namespace: String, +} + +impl AlertSender for Grafana { + fn name(&self) -> String { + "grafana".to_string() + } +} + +impl Serialize for Box> { + fn serialize(&self, _serializer: S) -> Result + where + S: serde::Serializer, + { + todo!() + } +} + +impl Serialize for Box> { + fn serialize(&self, _serializer: S) -> Result + where + S: serde::Serializer, + { + todo!() + } +} + +impl Serialize for Box> { + fn serialize(&self, _serializer: S) -> Result + where + S: serde::Serializer, + { + todo!() + } } diff --git 
a/harmony/src/modules/monitoring/grafana/grafana_alerting_score.rs b/harmony/src/modules/monitoring/grafana/grafana_alerting_score.rs new file mode 100644 index 00000000..09594f6c --- /dev/null +++ b/harmony/src/modules/monitoring/grafana/grafana_alerting_score.rs @@ -0,0 +1,32 @@ +use serde::Serialize; + +use crate::{ + modules::monitoring::grafana::grafana::Grafana, + score::Score, + topology::{ + Topology, + monitoring::{AlertReceiver, AlertRule, AlertingInterpret, Observability, ScrapeTarget}, + }, +}; + +#[derive(Clone, Debug, Serialize)] +pub struct GrafanaAlertingScore { + pub receivers: Vec>>, + pub rules: Vec>>, + pub scrape_targets: Option>>>, + pub sender: Grafana, +} + +impl> Score for GrafanaAlertingScore { + fn create_interpret(&self) -> Box> { + Box::new(AlertingInterpret { + sender: self.sender.clone(), + receivers: self.receivers.clone(), + rules: self.rules.clone(), + scrape_targets: self.scrape_targets.clone(), + }) + } + fn name(&self) -> String { + "HelmPrometheusAlertingScore".to_string() + } +} diff --git a/harmony/src/modules/monitoring/grafana/helm/helm_grafana.rs b/harmony/src/modules/monitoring/grafana/helm/helm_grafana.rs deleted file mode 100644 index c9ccacb0..00000000 --- a/harmony/src/modules/monitoring/grafana/helm/helm_grafana.rs +++ /dev/null @@ -1,28 +0,0 @@ -use harmony_macros::hurl; -use non_blank_string_rs::NonBlankString; -use std::{collections::HashMap, str::FromStr}; - -use crate::modules::helm::chart::{HelmChartScore, HelmRepository}; - -pub fn grafana_helm_chart_score(ns: &str, namespace_scope: bool) -> HelmChartScore { - let mut values_overrides = HashMap::new(); - values_overrides.insert( - NonBlankString::from_str("namespaceScope").unwrap(), - namespace_scope.to_string(), - ); - HelmChartScore { - namespace: Some(NonBlankString::from_str(ns).unwrap()), - release_name: NonBlankString::from_str("grafana-operator").unwrap(), - chart_name: NonBlankString::from_str("grafana/grafana-operator").unwrap(), - chart_version: 
None, - values_overrides: Some(values_overrides), - values_yaml: None, - create_namespace: true, - install_only: true, - repository: Some(HelmRepository::new( - "grafana".to_string(), - hurl!("https://grafana.github.io/helm-charts"), - true, - )), - } -} diff --git a/harmony/src/modules/monitoring/grafana/helm/mod.rs b/harmony/src/modules/monitoring/grafana/helm/mod.rs deleted file mode 100644 index 1d35fdf0..00000000 --- a/harmony/src/modules/monitoring/grafana/helm/mod.rs +++ /dev/null @@ -1 +0,0 @@ -pub mod helm_grafana; diff --git a/harmony/src/modules/monitoring/kube_prometheus/crd/crd_grafana.rs b/harmony/src/modules/monitoring/grafana/k8s/crd/crd_grafana.rs similarity index 98% rename from harmony/src/modules/monitoring/kube_prometheus/crd/crd_grafana.rs rename to harmony/src/modules/monitoring/grafana/k8s/crd/crd_grafana.rs index 386890e0..0f02a844 100644 --- a/harmony/src/modules/monitoring/kube_prometheus/crd/crd_grafana.rs +++ b/harmony/src/modules/monitoring/grafana/k8s/crd/crd_grafana.rs @@ -4,7 +4,7 @@ use kube::CustomResource; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; -use super::crd_prometheuses::LabelSelector; +use crate::modules::monitoring::kube_prometheus::crd::crd_prometheuses::LabelSelector; #[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)] #[kube( diff --git a/harmony/src/modules/monitoring/kube_prometheus/crd/grafana_default_dashboard.rs b/harmony/src/modules/monitoring/grafana/k8s/crd/grafana_default_dashboard.rs similarity index 100% rename from harmony/src/modules/monitoring/kube_prometheus/crd/grafana_default_dashboard.rs rename to harmony/src/modules/monitoring/grafana/k8s/crd/grafana_default_dashboard.rs diff --git a/harmony/src/modules/monitoring/grafana/k8s/crd/mod.rs b/harmony/src/modules/monitoring/grafana/k8s/crd/mod.rs new file mode 100644 index 00000000..bc9ac6de --- /dev/null +++ b/harmony/src/modules/monitoring/grafana/k8s/crd/mod.rs @@ -0,0 +1,3 @@ +pub mod crd_grafana; +pub mod 
grafana_default_dashboard; +pub mod rhob_grafana; diff --git a/harmony/src/modules/monitoring/kube_prometheus/crd/rhob_grafana.rs b/harmony/src/modules/monitoring/grafana/k8s/crd/rhob_grafana.rs similarity index 97% rename from harmony/src/modules/monitoring/kube_prometheus/crd/rhob_grafana.rs rename to harmony/src/modules/monitoring/grafana/k8s/crd/rhob_grafana.rs index 65efab9a..ce13f61d 100644 --- a/harmony/src/modules/monitoring/kube_prometheus/crd/rhob_grafana.rs +++ b/harmony/src/modules/monitoring/grafana/k8s/crd/rhob_grafana.rs @@ -4,7 +4,7 @@ use kube::CustomResource; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; -use crate::modules::monitoring::kube_prometheus::crd::rhob_prometheuses::LabelSelector; +use crate::modules::monitoring::red_hat_cluster_observability::crd::rhob_prometheuses::LabelSelector; #[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)] #[kube( diff --git a/harmony/src/modules/monitoring/kube_prometheus/crd/grafana_operator.rs b/harmony/src/modules/monitoring/grafana/k8s/helm/grafana_operator.rs similarity index 100% rename from harmony/src/modules/monitoring/kube_prometheus/crd/grafana_operator.rs rename to harmony/src/modules/monitoring/grafana/k8s/helm/grafana_operator.rs diff --git a/harmony/src/modules/monitoring/grafana/k8s/helm/mod.rs b/harmony/src/modules/monitoring/grafana/k8s/helm/mod.rs new file mode 100644 index 00000000..97edab97 --- /dev/null +++ b/harmony/src/modules/monitoring/grafana/k8s/helm/mod.rs @@ -0,0 +1 @@ +pub mod grafana_operator; diff --git a/harmony/src/modules/monitoring/grafana/k8s/mod.rs b/harmony/src/modules/monitoring/grafana/k8s/mod.rs new file mode 100644 index 00000000..107a5bd7 --- /dev/null +++ b/harmony/src/modules/monitoring/grafana/k8s/mod.rs @@ -0,0 +1,7 @@ +pub mod crd; +pub mod helm; +pub mod score_ensure_grafana_ready; +pub mod score_grafana_alert_receiver; +pub mod score_grafana_datasource; +pub mod score_grafana_rule; +pub mod score_install_grafana; diff 
--git a/harmony/src/modules/monitoring/grafana/k8s/score_ensure_grafana_ready.rs b/harmony/src/modules/monitoring/grafana/k8s/score_ensure_grafana_ready.rs new file mode 100644 index 00000000..81c3d396 --- /dev/null +++ b/harmony/src/modules/monitoring/grafana/k8s/score_ensure_grafana_ready.rs @@ -0,0 +1,54 @@ +use serde::Serialize; + +use crate::{ + interpret::Interpret, + modules::monitoring::grafana::grafana::Grafana, + score::Score, + topology::{K8sclient, Topology}, +}; + +#[derive(Debug, Clone, Serialize)] +pub struct GrafanaK8sEnsureReadyScore { + pub sender: Grafana, +} + +impl Score for GrafanaK8sEnsureReadyScore { + fn name(&self) -> String { + "GrafanaK8sEnsureReadyScore".to_string() + } + + fn create_interpret(&self) -> Box> { + todo!() + } +} +// async fn ensure_ready( +// &self, +// inventory: &Inventory, +// ) -> Result { +// debug!("ensure grafana operator"); +// let client = self.k8s_client().await.unwrap(); +// let grafana_gvk = GroupVersionKind { +// group: "grafana.integreatly.org".to_string(), +// version: "v1beta1".to_string(), +// kind: "Grafana".to_string(), +// }; +// let name = "grafanas.grafana.integreatly.org"; +// let ns = "grafana"; +// +// let grafana_crd = client +// .get_resource_json_value(name, Some(ns), &grafana_gvk) +// .await; +// match grafana_crd { +// Ok(_) => { +// return Ok(PreparationOutcome::Success { +// details: "Found grafana CRDs in cluster".to_string(), +// }); +// } +// +// Err(_) => { +// return self +// .install_grafana_operator(inventory, Some("grafana")) +// .await; +// } +// }; +// } diff --git a/harmony/src/modules/monitoring/grafana/k8s/score_grafana_alert_receiver.rs b/harmony/src/modules/monitoring/grafana/k8s/score_grafana_alert_receiver.rs new file mode 100644 index 00000000..9e4d15fa --- /dev/null +++ b/harmony/src/modules/monitoring/grafana/k8s/score_grafana_alert_receiver.rs @@ -0,0 +1,24 @@ +use serde::Serialize; + +use crate::{ + interpret::Interpret, + 
modules::monitoring::grafana::grafana::Grafana, + score::Score, + topology::{K8sclient, Topology, monitoring::AlertReceiver}, +}; + +#[derive(Debug, Clone, Serialize)] +pub struct GrafanaK8sReceiverScore { + pub sender: Grafana, + pub receiver: Box>, +} + +impl Score for GrafanaK8sReceiverScore { + fn name(&self) -> String { + "GrafanaK8sReceiverScore".to_string() + } + + fn create_interpret(&self) -> Box> { + todo!() + } +} diff --git a/harmony/src/modules/monitoring/grafana/k8s/score_grafana_datasource.rs b/harmony/src/modules/monitoring/grafana/k8s/score_grafana_datasource.rs new file mode 100644 index 00000000..71b723b1 --- /dev/null +++ b/harmony/src/modules/monitoring/grafana/k8s/score_grafana_datasource.rs @@ -0,0 +1,83 @@ +use serde::Serialize; + +use crate::{ + interpret::Interpret, + modules::monitoring::grafana::grafana::Grafana, + score::Score, + topology::{K8sclient, Topology, monitoring::ScrapeTarget}, +}; + +#[derive(Debug, Clone, Serialize)] +pub struct GrafanaK8sDatasourceScore { + pub sender: Grafana, + pub scrape_target: Box>, +} + +impl Score for GrafanaK8sDatasourceScore { + fn name(&self) -> String { + "GrafanaK8sDatasourceScore".to_string() + } + + fn create_interpret(&self) -> Box> { + todo!() + } +} + +// fn extract_and_normalize_token(&self, secret: &DynamicObject) -> Option { +// let token_b64 = secret +// .data +// .get("token") +// .or_else(|| secret.data.get("data").and_then(|d| d.get("token"))) +// .and_then(|v| v.as_str())?; +// +// let bytes = general_purpose::STANDARD.decode(token_b64).ok()?; +// +// let s = String::from_utf8(bytes).ok()?; +// +// let cleaned = s +// .trim_matches(|c: char| c.is_whitespace() || c == '\0') +// .to_string(); +// Some(cleaned) +// } +// fn build_grafana_datasource( +// &self, +// name: &str, +// ns: &str, +// label_selector: &LabelSelector, +// url: &str, +// token: &str, +// ) -> GrafanaDatasource { +// let mut json_data = BTreeMap::new(); +// json_data.insert("timeInterval".to_string(), 
"5s".to_string()); +// +// GrafanaDatasource { +// metadata: ObjectMeta { +// name: Some(name.to_string()), +// namespace: Some(ns.to_string()), +// ..Default::default() +// }, +// spec: GrafanaDatasourceSpec { +// instance_selector: label_selector.clone(), +// allow_cross_namespace_import: Some(true), +// values_from: None, +// datasource: GrafanaDatasourceConfig { +// access: "proxy".to_string(), +// name: name.to_string(), +// rype: "prometheus".to_string(), +// url: url.to_string(), +// database: None, +// json_data: Some(GrafanaDatasourceJsonData { +// time_interval: Some("60s".to_string()), +// http_header_name1: Some("Authorization".to_string()), +// tls_skip_verify: Some(true), +// oauth_pass_thru: Some(true), +// }), +// secure_json_data: Some(GrafanaDatasourceSecureJsonData { +// http_header_value1: Some(format!("Bearer {token}")), +// }), +// is_default: Some(false), +// editable: Some(true), +// }, +// }, +// } +// } diff --git a/harmony/src/modules/monitoring/grafana/k8s/score_grafana_rule.rs b/harmony/src/modules/monitoring/grafana/k8s/score_grafana_rule.rs new file mode 100644 index 00000000..1a3c19d1 --- /dev/null +++ b/harmony/src/modules/monitoring/grafana/k8s/score_grafana_rule.rs @@ -0,0 +1,67 @@ +use serde::Serialize; + +use crate::{ + interpret::Interpret, + modules::monitoring::grafana::grafana::Grafana, + score::Score, + topology::{K8sclient, Topology, monitoring::AlertRule}, +}; + +#[derive(Debug, Clone, Serialize)] +pub struct GrafanaK8sRuleScore { + pub sender: Grafana, + pub rule: Box>, +} + +impl Score for GrafanaK8sRuleScore { + fn name(&self) -> String { + "GrafanaK8sRuleScore".to_string() + } + + fn create_interpret(&self) -> Box> { + todo!() + } +} + +// kind: Secret +// apiVersion: v1 +// metadata: +// name: credentials +// namespace: grafana +// stringData: +// PROMETHEUS_USERNAME: root +// PROMETHEUS_PASSWORD: secret +// type: Opaque +// --- +// apiVersion: grafana.integreatly.org/v1beta1 +// kind: GrafanaDatasource +// metadata: 
+// name: grafanadatasource-sample +// spec: +// valuesFrom: +// - targetPath: "basicAuthUser" +// valueFrom: +// secretKeyRef: +// name: "credentials" +// key: "PROMETHEUS_USERNAME" +// - targetPath: "secureJsonData.basicAuthPassword" +// valueFrom: +// secretKeyRef: +// name: "credentials" +// key: "PROMETHEUS_PASSWORD" +// instanceSelector: +// matchLabels: +// dashboards: "grafana" +// datasource: +// name: prometheus +// type: prometheus +// access: proxy +// basicAuth: true +// url: http://prometheus-service:9090 +// isDefault: true +// basicAuthUser: ${PROMETHEUS_USERNAME} +// jsonData: +// "tlsSkipVerify": true +// "timeInterval": "5s" +// secureJsonData: +// "basicAuthPassword": ${PROMETHEUS_PASSWORD} # diff --git a/harmony/src/modules/monitoring/grafana/k8s/score_install_grafana.rs b/harmony/src/modules/monitoring/grafana/k8s/score_install_grafana.rs new file mode 100644 index 00000000..e72d000f --- /dev/null +++ b/harmony/src/modules/monitoring/grafana/k8s/score_install_grafana.rs @@ -0,0 +1,223 @@ +use async_trait::async_trait; +use harmony_types::id::Id; +use serde::Serialize; + +use crate::{ + data::Version, + interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome}, + inventory::Inventory, + modules::monitoring::grafana::grafana::Grafana, + score::Score, + topology::{HelmCommand, K8sclient, Topology}, +}; + +#[derive(Debug, Clone, Serialize)] +pub struct GrafanaK8sInstallScore { + pub sender: Grafana, +} + +impl Score for GrafanaK8sInstallScore { + fn name(&self) -> String { + "GrafanaK8sEnsureReadyScore".to_string() + } + + fn create_interpret(&self) -> Box> { + Box::new(GrafanaK8sInstallInterpret {}) + } +} + +#[derive(Debug, Clone, Serialize)] +pub struct GrafanaK8sInstallInterpret {} + +#[async_trait] +impl Interpret for GrafanaK8sInstallInterpret { + async fn execute( + &self, + inventory: &Inventory, + topology: &T, + ) -> Result { + todo!() + } + + fn get_name(&self) -> InterpretName { + 
InterpretName::Custom("GrafanaK8sInstallInterpret") + } + + fn get_version(&self) -> Version { + todo!() + } + + fn get_status(&self) -> InterpretStatus { + todo!() + } + + fn get_children(&self) -> Vec { + todo!() + } +} +// let score = grafana_operator_helm_chart_score(sender.namespace.clone()); +// +// score +// .create_interpret() +// .execute(inventory, self) +// .await +// .map_err(|e| PreparationError::new(e.to_string()))?; +// + +// +// fn build_grafana_dashboard( +// &self, +// ns: &str, +// label_selector: &LabelSelector, +// ) -> GrafanaDashboard { +// let graf_dashboard = GrafanaDashboard { +// metadata: ObjectMeta { +// name: Some(format!("grafana-dashboard-{}", ns)), +// namespace: Some(ns.to_string()), +// ..Default::default() +// }, +// spec: GrafanaDashboardSpec { +// resync_period: Some("30s".to_string()), +// instance_selector: label_selector.clone(), +// datasources: Some(vec![GrafanaDashboardDatasource { +// input_name: "DS_PROMETHEUS".to_string(), +// datasource_name: "thanos-openshift-monitoring".to_string(), +// }]), +// json: None, +// grafana_com: Some(GrafanaCom { +// id: 17406, +// revision: None, +// }), +// }, +// }; +// graf_dashboard +// } +// +// fn build_grafana(&self, ns: &str, labels: &BTreeMap) -> GrafanaCRD { +// let grafana = GrafanaCRD { +// metadata: ObjectMeta { +// name: Some(format!("grafana-{}", ns)), +// namespace: Some(ns.to_string()), +// labels: Some(labels.clone()), +// ..Default::default() +// }, +// spec: GrafanaSpec { +// config: None, +// admin_user: None, +// admin_password: None, +// ingress: None, +// persistence: None, +// resources: None, +// }, +// }; +// grafana +// } +// +// async fn build_grafana_ingress(&self, ns: &str) -> K8sIngressScore { +// let domain = self.get_domain(&format!("grafana-{}", ns)).await.unwrap(); +// let name = format!("{}-grafana", ns); +// let backend_service = format!("grafana-{}-service", ns); +// +// K8sIngressScore { +// name: fqdn::fqdn!(&name), +// host: 
fqdn::fqdn!(&domain), +// backend_service: fqdn::fqdn!(&backend_service), +// port: 3000, +// path: Some("/".to_string()), +// path_type: Some(PathType::Prefix), +// namespace: Some(fqdn::fqdn!(&ns)), +// ingress_class_name: Some("openshift-default".to_string()), +// } +// } +// #[async_trait] +// impl Grafana for K8sAnywhereTopology { +// async fn install_grafana(&self) -> Result { +// let ns = "grafana"; +// +// let mut label = BTreeMap::new(); +// +// label.insert("dashboards".to_string(), "grafana".to_string()); +// +// let label_selector = LabelSelector { +// match_labels: label.clone(), +// match_expressions: vec![], +// }; +// +// let client = self.k8s_client().await?; +// +// let grafana = self.build_grafana(ns, &label); +// +// client.apply(&grafana, Some(ns)).await?; +// //TODO change this to a ensure ready or something better than just a timeout +// client +// .wait_until_deployment_ready( +// "grafana-grafana-deployment", +// Some("grafana"), +// Some(Duration::from_secs(30)), +// ) +// .await?; +// +// let sa_name = "grafana-grafana-sa"; +// let token_secret_name = "grafana-sa-token-secret"; +// +// let sa_token_secret = self.build_sa_token_secret(token_secret_name, sa_name, ns); +// +// client.apply(&sa_token_secret, Some(ns)).await?; +// let secret_gvk = GroupVersionKind { +// group: "".to_string(), +// version: "v1".to_string(), +// kind: "Secret".to_string(), +// }; +// +// let secret = client +// .get_resource_json_value(token_secret_name, Some(ns), &secret_gvk) +// .await?; +// +// let token = format!( +// "Bearer {}", +// self.extract_and_normalize_token(&secret).unwrap() +// ); +// +// debug!("creating grafana clusterrole binding"); +// +// let clusterrolebinding = +// self.build_cluster_rolebinding(sa_name, "cluster-monitoring-view", ns); +// +// client.apply(&clusterrolebinding, Some(ns)).await?; +// +// debug!("creating grafana datasource crd"); +// +// let thanos_url = format!( +// "https://{}", +// 
self.get_domain("thanos-querier-openshift-monitoring") +// .await +// .unwrap() +// ); +// +// let thanos_openshift_datasource = self.build_grafana_datasource( +// "thanos-openshift-monitoring", +// ns, +// &label_selector, +// &thanos_url, +// &token, +// ); +// +// client.apply(&thanos_openshift_datasource, Some(ns)).await?; +// +// debug!("creating grafana dashboard crd"); +// let dashboard = self.build_grafana_dashboard(ns, &label_selector); +// +// client.apply(&dashboard, Some(ns)).await?; +// debug!("creating grafana ingress"); +// let grafana_ingress = self.build_grafana_ingress(ns).await; +// +// grafana_ingress +// .interpret(&Inventory::empty(), self) +// .await +// .map_err(|e| PreparationError::new(e.to_string()))?; +// +// Ok(PreparationOutcome::Success { +// details: "Installed grafana composants".to_string(), +// }) +// } +// } diff --git a/harmony/src/modules/monitoring/grafana/mod.rs b/harmony/src/modules/monitoring/grafana/mod.rs index 8dccab1d..3176b923 100644 --- a/harmony/src/modules/monitoring/grafana/mod.rs +++ b/harmony/src/modules/monitoring/grafana/mod.rs @@ -1,2 +1,3 @@ pub mod grafana; -pub mod helm; +pub mod grafana_alerting_score; +pub mod k8s; diff --git a/harmony/src/modules/monitoring/kube_prometheus/crd/crd_alertmanager_config.rs b/harmony/src/modules/monitoring/kube_prometheus/crd/crd_alertmanager_config.rs index a547dced..f544b603 100644 --- a/harmony/src/modules/monitoring/kube_prometheus/crd/crd_alertmanager_config.rs +++ b/harmony/src/modules/monitoring/kube_prometheus/crd/crd_alertmanager_config.rs @@ -1,91 +1,17 @@ -use std::sync::Arc; - -use async_trait::async_trait; use kube::CustomResource; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; -use crate::{ - interpret::InterpretError, - inventory::Inventory, - modules::{ - monitoring::{ - grafana::grafana::Grafana, kube_prometheus::crd::service_monitor::ServiceMonitor, - }, - prometheus::prometheus::PrometheusMonitoring, - }, - topology::{ - K8sclient, 
Topology, - installable::Installable, - oberservability::monitoring::{AlertReceiver, AlertSender, ScrapeTarget}, - }, -}; -use harmony_k8s::K8sClient; - -#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)] +#[derive(CustomResource, Serialize, Deserialize, Default, Debug, Clone, JsonSchema)] #[kube( group = "monitoring.coreos.com", - version = "v1alpha1", + version = "v1", kind = "AlertmanagerConfig", plural = "alertmanagerconfigs", - namespaced + namespaced, + derive = "Default" )] pub struct AlertmanagerConfigSpec { #[serde(flatten)] pub data: serde_json::Value, } - -#[derive(Debug, Clone, Serialize)] -pub struct CRDPrometheus { - pub namespace: String, - pub client: Arc, - pub service_monitor: Vec, -} - -impl AlertSender for CRDPrometheus { - fn name(&self) -> String { - "CRDAlertManager".to_string() - } -} - -impl Clone for Box> { - fn clone(&self) -> Self { - self.clone_box() - } -} - -impl Clone for Box> { - fn clone(&self) -> Self { - self.clone_box() - } -} - -impl Serialize for Box> { - fn serialize(&self, _serializer: S) -> Result - where - S: serde::Serializer, - { - todo!() - } -} - -#[async_trait] -impl + Grafana> Installable - for CRDPrometheus -{ - async fn configure(&self, inventory: &Inventory, topology: &T) -> Result<(), InterpretError> { - topology.ensure_grafana_operator(inventory).await?; - topology.ensure_prometheus_operator(self, inventory).await?; - Ok(()) - } - - async fn ensure_installed( - &self, - inventory: &Inventory, - topology: &T, - ) -> Result<(), InterpretError> { - topology.install_grafana().await?; - topology.install_prometheus(&self, inventory, None).await?; - Ok(()) - } -} diff --git a/harmony/src/modules/monitoring/kube_prometheus/crd/crd_default_rules.rs b/harmony/src/modules/monitoring/kube_prometheus/crd/crd_default_rules.rs index 014650cb..060dcc3a 100644 --- a/harmony/src/modules/monitoring/kube_prometheus/crd/crd_default_rules.rs +++ 
b/harmony/src/modules/monitoring/kube_prometheus/crd/crd_default_rules.rs @@ -1,4 +1,4 @@ -use crate::modules::prometheus::alerts::k8s::{ +use crate::modules::monitoring::alert_rule::alerts::k8s::{ deployment::alert_deployment_unavailable, pod::{alert_container_restarting, alert_pod_not_ready, pod_failed}, pvc::high_pvc_fill_rate_over_two_days, diff --git a/harmony/src/modules/monitoring/kube_prometheus/crd/crd_prometheus_rules.rs b/harmony/src/modules/monitoring/kube_prometheus/crd/crd_prometheus_rules.rs index 6d08456b..f3202564 100644 --- a/harmony/src/modules/monitoring/kube_prometheus/crd/crd_prometheus_rules.rs +++ b/harmony/src/modules/monitoring/kube_prometheus/crd/crd_prometheus_rules.rs @@ -6,13 +6,14 @@ use serde::{Deserialize, Serialize}; use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule; -#[derive(CustomResource, Debug, Serialize, Deserialize, Clone, JsonSchema)] +#[derive(CustomResource, Default, Debug, Serialize, Deserialize, Clone, JsonSchema)] #[kube( group = "monitoring.coreos.com", version = "v1", kind = "PrometheusRule", plural = "prometheusrules", - namespaced + namespaced, + derive = "Default" )] #[serde(rename_all = "camelCase")] pub struct PrometheusRuleSpec { diff --git a/harmony/src/modules/monitoring/kube_prometheus/crd/crd_scrape_config.rs b/harmony/src/modules/monitoring/kube_prometheus/crd/crd_scrape_config.rs index 24a28330..0973e118 100644 --- a/harmony/src/modules/monitoring/kube_prometheus/crd/crd_scrape_config.rs +++ b/harmony/src/modules/monitoring/kube_prometheus/crd/crd_scrape_config.rs @@ -1,23 +1,18 @@ -use std::net::IpAddr; +use std::collections::BTreeMap; -use async_trait::async_trait; use kube::CustomResource; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; -use crate::{ - modules::monitoring::kube_prometheus::crd::{ - crd_alertmanager_config::CRDPrometheus, crd_prometheuses::LabelSelector, - }, - topology::oberservability::monitoring::ScrapeTarget, -}; +use 
crate::modules::monitoring::kube_prometheus::crd::crd_prometheuses::LabelSelector; -#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)] +#[derive(CustomResource, Default, Serialize, Deserialize, Debug, Clone, JsonSchema)] #[kube( group = "monitoring.coreos.com", version = "v1alpha1", kind = "ScrapeConfig", plural = "scrapeconfigs", + derive = "Default", namespaced )] #[serde(rename_all = "camelCase")] @@ -70,8 +65,8 @@ pub struct ScrapeConfigSpec { #[serde(rename_all = "camelCase")] pub struct StaticConfig { pub targets: Vec, - - pub labels: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub labels: Option>, } /// Relabeling configuration for target or metric relabeling. diff --git a/harmony/src/modules/monitoring/kube_prometheus/crd/mod.rs b/harmony/src/modules/monitoring/kube_prometheus/crd/mod.rs index c8cb8546..ff6b2ee8 100644 --- a/harmony/src/modules/monitoring/kube_prometheus/crd/mod.rs +++ b/harmony/src/modules/monitoring/kube_prometheus/crd/mod.rs @@ -1,22 +1,9 @@ pub mod crd_alertmanager_config; pub mod crd_alertmanagers; pub mod crd_default_rules; -pub mod crd_grafana; pub mod crd_prometheus_rules; pub mod crd_prometheuses; pub mod crd_scrape_config; -pub mod grafana_default_dashboard; -pub mod grafana_operator; pub mod prometheus_operator; -pub mod rhob_alertmanager_config; -pub mod rhob_alertmanagers; -pub mod rhob_cluster_observability_operator; -pub mod rhob_default_rules; -pub mod rhob_grafana; -pub mod rhob_monitoring_stack; -pub mod rhob_prometheus_rules; -pub mod rhob_prometheuses; -pub mod rhob_role; -pub mod rhob_service_monitor; pub mod role; pub mod service_monitor; diff --git a/harmony/src/modules/monitoring/kube_prometheus/crd/rhob_alertmanager_config.rs b/harmony/src/modules/monitoring/kube_prometheus/crd/rhob_alertmanager_config.rs deleted file mode 100644 index 4e6e68e2..00000000 --- a/harmony/src/modules/monitoring/kube_prometheus/crd/rhob_alertmanager_config.rs +++ /dev/null @@ -1,48 +0,0 @@ -use 
std::sync::Arc; - -use kube::CustomResource; -use schemars::JsonSchema; -use serde::{Deserialize, Serialize}; - -use crate::topology::oberservability::monitoring::{AlertReceiver, AlertSender}; -use harmony_k8s::K8sClient; - -#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)] -#[kube( - group = "monitoring.rhobs", - version = "v1alpha1", - kind = "AlertmanagerConfig", - plural = "alertmanagerconfigs", - namespaced -)] -pub struct AlertmanagerConfigSpec { - #[serde(flatten)] - pub data: serde_json::Value, -} - -#[derive(Debug, Clone, Serialize)] -pub struct RHOBObservability { - pub namespace: String, - pub client: Arc, -} - -impl AlertSender for RHOBObservability { - fn name(&self) -> String { - "RHOBAlertManager".to_string() - } -} - -impl Clone for Box> { - fn clone(&self) -> Self { - self.clone_box() - } -} - -impl Serialize for Box> { - fn serialize(&self, _serializer: S) -> Result - where - S: serde::Serializer, - { - todo!() - } -} diff --git a/harmony/src/modules/monitoring/kube_prometheus/crd/rhob_cluster_observability_operator.rs b/harmony/src/modules/monitoring/kube_prometheus/crd/rhob_cluster_observability_operator.rs deleted file mode 100644 index bc7ad9fe..00000000 --- a/harmony/src/modules/monitoring/kube_prometheus/crd/rhob_cluster_observability_operator.rs +++ /dev/null @@ -1,22 +0,0 @@ -use std::str::FromStr; - -use non_blank_string_rs::NonBlankString; - -use crate::modules::helm::chart::HelmChartScore; -//TODO package chart or something for COO okd -pub fn rhob_cluster_observability_operator() -> HelmChartScore { - HelmChartScore { - namespace: None, - release_name: NonBlankString::from_str("").unwrap(), - chart_name: NonBlankString::from_str( - "oci://hub.nationtech.io/harmony/nt-prometheus-operator", - ) - .unwrap(), - chart_version: None, - values_overrides: None, - values_yaml: None, - create_namespace: true, - install_only: true, - repository: None, - } -} diff --git 
a/harmony/src/modules/monitoring/kube_prometheus/helm/kube_prometheus_helm_chart.rs b/harmony/src/modules/monitoring/kube_prometheus/helm/kube_prometheus_helm_chart.rs index 7d6aec89..3f38f0b1 100644 --- a/harmony/src/modules/monitoring/kube_prometheus/helm/kube_prometheus_helm_chart.rs +++ b/harmony/src/modules/monitoring/kube_prometheus/helm/kube_prometheus_helm_chart.rs @@ -1,20 +1,13 @@ use super::config::KubePrometheusConfig; -use log::debug; use non_blank_string_rs::NonBlankString; -use serde_yaml::{Mapping, Value}; use std::{ - collections::BTreeMap, str::FromStr, sync::{Arc, Mutex}, }; use crate::modules::{ helm::chart::HelmChartScore, - monitoring::kube_prometheus::types::{ - AlertGroup, AlertManager, AlertManagerAdditionalPromRules, AlertManagerConfig, - AlertManagerConfigSelector, AlertManagerRoute, AlertManagerSpec, AlertManagerValues, - ConfigReloader, Limits, PrometheusConfig, Requests, Resources, - }, + monitoring::kube_prometheus::types::{Limits, Requests, Resources}, }; pub fn kube_prometheus_helm_chart_score( @@ -66,7 +59,7 @@ pub fn kube_prometheus_helm_chart_score( } let _resource_section = resource_block(&resource_limit, 2); - let mut values = format!( + let values = format!( r#" global: rbac: @@ -281,131 +274,6 @@ prometheusOperator: "#, ); - let prometheus_config = - crate::modules::monitoring::kube_prometheus::types::PrometheusConfigValues { - prometheus: PrometheusConfig { - prometheus: bool::from_str(prometheus.as_str()).expect("couldn't parse bool"), - additional_service_monitors: config.additional_service_monitors.clone(), - }, - }; - let prometheus_config_yaml = - serde_yaml::to_string(&prometheus_config).expect("Failed to serialize YAML"); - - debug!( - "serialized prometheus config: \n {:#}", - prometheus_config_yaml - ); - values.push_str(&prometheus_config_yaml); - - // add required null receiver for prometheus alert manager - let mut null_receiver = Mapping::new(); - null_receiver.insert( - Value::String("receiver".to_string()), - 
Value::String("null".to_string()), - ); - null_receiver.insert( - Value::String("matchers".to_string()), - Value::Sequence(vec![Value::String("alertname!=Watchdog".to_string())]), - ); - null_receiver.insert(Value::String("continue".to_string()), Value::Bool(true)); - - //add alert channels - let mut alert_manager_channel_config = AlertManagerConfig { - global: Mapping::new(), - route: AlertManagerRoute { - routes: vec![Value::Mapping(null_receiver)], - }, - receivers: vec![serde_yaml::from_str("name: 'null'").unwrap()], - }; - for receiver in config.alert_receiver_configs.iter() { - if let Some(global) = receiver.channel_global_config.clone() { - alert_manager_channel_config - .global - .insert(global.0, global.1); - } - alert_manager_channel_config - .route - .routes - .push(receiver.channel_route.clone()); - alert_manager_channel_config - .receivers - .push(receiver.channel_receiver.clone()); - } - - let mut labels = BTreeMap::new(); - labels.insert("alertmanagerConfig".to_string(), "enabled".to_string()); - let alert_manager_config_selector = AlertManagerConfigSelector { - match_labels: labels, - }; - let alert_manager_values = AlertManagerValues { - alertmanager: AlertManager { - enabled: config.alert_manager, - config: alert_manager_channel_config, - alertmanager_spec: AlertManagerSpec { - resources: Resources { - limits: Limits { - memory: "100Mi".to_string(), - cpu: "100m".to_string(), - }, - requests: Requests { - memory: "100Mi".to_string(), - cpu: "100m".to_string(), - }, - }, - alert_manager_config_selector, - replicas: 2, - }, - init_config_reloader: ConfigReloader { - resources: Resources { - limits: Limits { - memory: "100Mi".to_string(), - cpu: "100m".to_string(), - }, - requests: Requests { - memory: "100Mi".to_string(), - cpu: "100m".to_string(), - }, - }, - }, - }, - }; - - let alert_manager_yaml = - serde_yaml::to_string(&alert_manager_values).expect("Failed to serialize YAML"); - debug!("serialized alert manager: \n {:#}", alert_manager_yaml); 
- values.push_str(&alert_manager_yaml); - - //format alert manager additional rules for helm chart - let mut merged_rules: BTreeMap = BTreeMap::new(); - - for additional_rule in config.alert_rules.clone() { - for (key, group) in additional_rule.rules { - merged_rules.insert(key, group); - } - } - - let merged_rules = AlertManagerAdditionalPromRules { - rules: merged_rules, - }; - - let mut alert_manager_additional_rules = serde_yaml::Mapping::new(); - let rules_value = serde_yaml::to_value(merged_rules).unwrap(); - - alert_manager_additional_rules.insert( - serde_yaml::Value::String("additionalPrometheusRulesMap".to_string()), - rules_value, - ); - - let alert_manager_additional_rules_yaml = - serde_yaml::to_string(&alert_manager_additional_rules).expect("Failed to serialize YAML"); - debug!( - "alert_rules_yaml:\n{:#}", - alert_manager_additional_rules_yaml - ); - - values.push_str(&alert_manager_additional_rules_yaml); - debug!("full values.yaml: \n {:#}", values); - HelmChartScore { namespace: Some(NonBlankString::from_str(&config.namespace.clone().unwrap()).unwrap()), release_name: NonBlankString::from_str("kube-prometheus").unwrap(), diff --git a/harmony/src/modules/monitoring/kube_prometheus/helm_prometheus_alert_score.rs b/harmony/src/modules/monitoring/kube_prometheus/kube_prometheus_alerting_score.rs similarity index 50% rename from harmony/src/modules/monitoring/kube_prometheus/helm_prometheus_alert_score.rs rename to harmony/src/modules/monitoring/kube_prometheus/kube_prometheus_alerting_score.rs index 468d3088..97f12d03 100644 --- a/harmony/src/modules/monitoring/kube_prometheus/helm_prometheus_alert_score.rs +++ b/harmony/src/modules/monitoring/kube_prometheus/kube_prometheus_alerting_score.rs @@ -2,36 +2,41 @@ use std::sync::{Arc, Mutex}; use serde::Serialize; -use super::{helm::config::KubePrometheusConfig, prometheus::KubePrometheus}; +use super::helm::config::KubePrometheusConfig; use crate::{ - 
modules::monitoring::kube_prometheus::types::ServiceMonitor, + modules::monitoring::kube_prometheus::{KubePrometheus, types::ServiceMonitor}, score::Score, topology::{ - HelmCommand, Topology, - oberservability::monitoring::{AlertReceiver, AlertRule, AlertingInterpret}, - tenant::TenantManager, + Topology, + monitoring::{AlertReceiver, AlertRule, AlertingInterpret, Observability, ScrapeTarget}, }, }; +//TODO untested #[derive(Clone, Debug, Serialize)] -pub struct HelmPrometheusAlertingScore { +pub struct KubePrometheusAlertingScore { pub receivers: Vec>>, pub rules: Vec>>, + pub scrape_targets: Option>>>, pub service_monitors: Vec, + pub config: Arc>, } -impl Score for HelmPrometheusAlertingScore { +impl> Score for KubePrometheusAlertingScore { fn create_interpret(&self) -> Box> { - let config = Arc::new(Mutex::new(KubePrometheusConfig::new())); - config + //TODO test that additional service monitor is added + self.config .try_lock() .expect("couldn't lock config") .additional_service_monitors = self.service_monitors.clone(); + Box::new(AlertingInterpret { - sender: KubePrometheus { config }, + sender: KubePrometheus { + config: self.config.clone(), + }, receivers: self.receivers.clone(), rules: self.rules.clone(), - scrape_targets: None, + scrape_targets: self.scrape_targets.clone(), }) } fn name(&self) -> String { diff --git a/harmony/src/modules/monitoring/kube_prometheus/mod.rs b/harmony/src/modules/monitoring/kube_prometheus/mod.rs index 122e9396..7fed749f 100644 --- a/harmony/src/modules/monitoring/kube_prometheus/mod.rs +++ b/harmony/src/modules/monitoring/kube_prometheus/mod.rs @@ -1,5 +1,71 @@ +use std::sync::{Arc, Mutex}; + +use async_trait::async_trait; +use serde::Serialize; + +use crate::{ + modules::monitoring::kube_prometheus::helm::config::KubePrometheusConfig, + topology::monitoring::{AlertReceiver, AlertRule, AlertSender, ScrapeTarget}, +}; + pub mod crd; pub mod helm; -pub mod helm_prometheus_alert_score; -pub mod prometheus; +pub mod 
kube_prometheus_alerting_score; +pub mod score_kube_prometheus_alert_receivers; +pub mod score_kube_prometheus_ensure_ready; +pub mod score_kube_prometheus_rule; +pub mod score_kube_prometheus_scrape_target; pub mod types; + +impl Serialize for Box> { + fn serialize(&self, _serializer: S) -> Result + where + S: serde::Serializer, + { + todo!() + } +} + +#[async_trait] +impl AlertSender for KubePrometheus { + fn name(&self) -> String { + "HelmKubePrometheus".to_string() + } +} + +#[derive(Clone, Debug, Serialize)] +pub struct KubePrometheus { + pub config: Arc>, +} + +impl Default for KubePrometheus { + fn default() -> Self { + Self::new() + } +} + +impl KubePrometheus { + pub fn new() -> Self { + Self { + config: Arc::new(Mutex::new(KubePrometheusConfig::new())), + } + } +} + +impl Serialize for Box> { + fn serialize(&self, _serializer: S) -> Result + where + S: serde::Serializer, + { + todo!() + } +} + +impl Serialize for Box> { + fn serialize(&self, _serializer: S) -> Result + where + S: serde::Serializer, + { + todo!() + } +} diff --git a/harmony/src/modules/monitoring/kube_prometheus/prometheus.rs b/harmony/src/modules/monitoring/kube_prometheus/prometheus.rs deleted file mode 100644 index 98970e6c..00000000 --- a/harmony/src/modules/monitoring/kube_prometheus/prometheus.rs +++ /dev/null @@ -1,167 +0,0 @@ -use std::sync::{Arc, Mutex}; - -use async_trait::async_trait; -use log::{debug, error}; -use serde::Serialize; - -use crate::{ - interpret::{InterpretError, Outcome}, - inventory::Inventory, - modules::monitoring::alert_rule::prometheus_alert_rule::AlertManagerRuleGroup, - score, - topology::{ - HelmCommand, Topology, - installable::Installable, - oberservability::monitoring::{AlertReceiver, AlertRule, AlertSender}, - tenant::TenantManager, - }, -}; - -use score::Score; - -use super::{ - helm::{ - config::KubePrometheusConfig, kube_prometheus_helm_chart::kube_prometheus_helm_chart_score, - }, - types::{AlertManagerAdditionalPromRules, 
AlertManagerChannelConfig}, -}; - -#[async_trait] -impl AlertSender for KubePrometheus { - fn name(&self) -> String { - "HelmKubePrometheus".to_string() - } -} - -#[async_trait] -impl Installable for KubePrometheus { - async fn configure(&self, _inventory: &Inventory, topology: &T) -> Result<(), InterpretError> { - self.configure_with_topology(topology).await; - Ok(()) - } - - async fn ensure_installed( - &self, - inventory: &Inventory, - topology: &T, - ) -> Result<(), InterpretError> { - self.install_prometheus(inventory, topology).await?; - Ok(()) - } -} - -#[derive(Debug)] -pub struct KubePrometheus { - pub config: Arc>, -} - -impl Default for KubePrometheus { - fn default() -> Self { - Self::new() - } -} - -impl KubePrometheus { - pub fn new() -> Self { - Self { - config: Arc::new(Mutex::new(KubePrometheusConfig::new())), - } - } - - pub async fn configure_with_topology(&self, topology: &T) { - let ns = topology - .get_tenant_config() - .await - .map(|cfg| cfg.name.clone()) - .unwrap_or_else(|| "monitoring".to_string()); - error!("This must be refactored, see comments in pr #74"); - debug!("NS: {}", ns); - self.config.lock().unwrap().namespace = Some(ns); - } - - pub async fn install_receiver( - &self, - prometheus_receiver: &dyn KubePrometheusReceiver, - ) -> Result { - let prom_receiver = prometheus_receiver.configure_receiver().await; - debug!( - "adding alert receiver to prometheus config: {:#?}", - &prom_receiver - ); - let mut config = self.config.lock().unwrap(); - - config.alert_receiver_configs.push(prom_receiver); - let prom_receiver_name = prometheus_receiver.name(); - debug!("installed alert receiver {}", &prom_receiver_name); - Ok(Outcome::success(format!( - "Sucessfully installed receiver {}", - prom_receiver_name - ))) - } - - pub async fn install_rule( - &self, - prometheus_rule: &AlertManagerRuleGroup, - ) -> Result { - let prometheus_rule = prometheus_rule.configure_rule().await; - let mut config = self.config.lock().unwrap(); - - 
config.alert_rules.push(prometheus_rule.clone()); - Ok(Outcome::success(format!( - "Successfully installed alert rule: {:#?},", - prometheus_rule - ))) - } - - pub async fn install_prometheus( - &self, - inventory: &Inventory, - topology: &T, - ) -> Result { - kube_prometheus_helm_chart_score(self.config.clone()) - .interpret(inventory, topology) - .await - } -} - -#[async_trait] -pub trait KubePrometheusReceiver: Send + Sync + std::fmt::Debug { - fn name(&self) -> String; - async fn configure_receiver(&self) -> AlertManagerChannelConfig; -} - -impl Serialize for Box> { - fn serialize(&self, _serializer: S) -> Result - where - S: serde::Serializer, - { - todo!() - } -} - -impl Clone for Box> { - fn clone(&self) -> Self { - self.clone_box() - } -} - -#[async_trait] -pub trait KubePrometheusRule: Send + Sync + std::fmt::Debug { - fn name(&self) -> String; - async fn configure_rule(&self) -> AlertManagerAdditionalPromRules; -} - -impl Serialize for Box> { - fn serialize(&self, _serializer: S) -> Result - where - S: serde::Serializer, - { - todo!() - } -} - -impl Clone for Box> { - fn clone(&self) -> Self { - self.clone_box() - } -} diff --git a/harmony/src/modules/monitoring/kube_prometheus/score_kube_prometheus_alert_receivers.rs b/harmony/src/modules/monitoring/kube_prometheus/score_kube_prometheus_alert_receivers.rs new file mode 100644 index 00000000..9fd80e80 --- /dev/null +++ b/harmony/src/modules/monitoring/kube_prometheus/score_kube_prometheus_alert_receivers.rs @@ -0,0 +1,61 @@ +use kube::api::ObjectMeta; +use serde::Serialize; + +use crate::{ + interpret::Interpret, + modules::{ + k8s::resource::K8sResourceScore, + monitoring::kube_prometheus::{ + KubePrometheus, + crd::crd_alertmanager_config::{AlertmanagerConfig, AlertmanagerConfigSpec}, + }, + }, + score::Score, + topology::{K8sclient, Topology, monitoring::AlertReceiver}, +}; + +#[derive(Debug, Clone, Serialize)] +pub struct KubePrometheusReceiverScore { + pub sender: KubePrometheus, + pub receiver: 
Box>, +} + +impl Score for KubePrometheusReceiverScore { + fn name(&self) -> String { + "KubePrometheusReceiverScore".to_string() + } + + fn create_interpret(&self) -> Box> { + let name = self.receiver.name(); + let namespace = self.sender.config.lock().unwrap().namespace.clone(); + let install_plan = self.receiver.build().expect("failed to build install plan"); + + let route = install_plan.route.expect(&format!( + "failed to build route for receveiver {}", + name.clone() + )); + + let route = serde_yaml::to_value(route).expect("failed to serialize route object to yaml"); + + let receiver = install_plan.receiver.expect(&format!( + "failed to build receiver path for receiver {}", + name.clone() + )); + + let data = serde_json::json!({ + "route": route, + "receivers": [receiver] + }); + + let alertmanager_config = AlertmanagerConfig { + metadata: ObjectMeta { + name: Some(name), + namespace: namespace.clone(), + ..Default::default() + }, + spec: AlertmanagerConfigSpec { data: data }, + }; + + K8sResourceScore::single(alertmanager_config, namespace).create_interpret() + } +} diff --git a/harmony/src/modules/monitoring/kube_prometheus/score_kube_prometheus_ensure_ready.rs b/harmony/src/modules/monitoring/kube_prometheus/score_kube_prometheus_ensure_ready.rs new file mode 100644 index 00000000..1c3a6665 --- /dev/null +++ b/harmony/src/modules/monitoring/kube_prometheus/score_kube_prometheus_ensure_ready.rs @@ -0,0 +1,80 @@ +use async_trait::async_trait; +use harmony_types::id::Id; +use serde::Serialize; + +use crate::{ + data::Version, + interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome}, + inventory::Inventory, + modules::monitoring::kube_prometheus::KubePrometheus, + score::Score, + topology::{K8sclient, Topology}, +}; + +#[derive(Clone, Debug, Serialize)] +pub struct KubePrometheusEnsureReadyScore { + pub sender: KubePrometheus, +} + +impl Score for KubePrometheusEnsureReadyScore { + fn name(&self) -> String { + 
"KubePrometheusEnsureReadyScore".to_string() + } + + fn create_interpret(&self) -> Box> { + Box::new(KubePrometheusEnsureReadyInterpret { + sender: self.sender.clone(), + }) + } +} + +#[derive(Clone, Debug, Serialize)] +pub struct KubePrometheusEnsureReadyInterpret { + pub sender: KubePrometheus, +} + +#[async_trait] +impl Interpret for KubePrometheusEnsureReadyInterpret { + async fn execute( + &self, + _inventory: &Inventory, + topology: &T, + ) -> Result { + let client = topology.k8s_client().await?; + let namespace = self + .sender + .config + .lock() + .unwrap() + .namespace + .clone() + .unwrap_or("default".to_string()); + + let prometheus_name = "kube-prometheues-kube-prometheus-operator"; + + client + .wait_until_deployment_ready(prometheus_name, Some(&namespace), None) + .await?; + + Ok(Outcome::success(format!( + "deployment: {} ready in ns: {}", + prometheus_name, namespace + ))) + } + + fn get_name(&self) -> InterpretName { + todo!() + } + + fn get_version(&self) -> Version { + todo!() + } + + fn get_status(&self) -> InterpretStatus { + todo!() + } + + fn get_children(&self) -> Vec { + todo!() + } +} diff --git a/harmony/src/modules/monitoring/kube_prometheus/score_kube_prometheus_rule.rs b/harmony/src/modules/monitoring/kube_prometheus/score_kube_prometheus_rule.rs new file mode 100644 index 00000000..86a5b42a --- /dev/null +++ b/harmony/src/modules/monitoring/kube_prometheus/score_kube_prometheus_rule.rs @@ -0,0 +1,46 @@ +use kube::api::ObjectMeta; +use serde::Serialize; + +use crate::{ + interpret::Interpret, + modules::{ + k8s::resource::K8sResourceScore, + monitoring::kube_prometheus::{ + KubePrometheus, + crd::crd_prometheus_rules::{PrometheusRule, PrometheusRuleSpec, RuleGroup}, + }, + }, + score::Score, + topology::{K8sclient, Topology, monitoring::AlertRule}, +}; + +#[derive(Debug, Clone, Serialize)] +pub struct KubePrometheusRuleScore { + pub sender: KubePrometheus, + pub rule: Box>, +} + +impl Score for KubePrometheusRuleScore { + fn 
name(&self) -> String { + "KubePrometheusRuleScore".to_string() + } + + fn create_interpret(&self) -> Box> { + let name = self.rule.name(); + let namespace = self.sender.config.lock().unwrap().namespace.clone(); + let groups: Vec = + serde_json::from_value(self.rule.build_rule().expect("failed to build alert rule")) + .expect("failed to serialize rule group"); + + let prometheus_rule = PrometheusRule { + metadata: ObjectMeta { + name: Some(name.clone()), + namespace: namespace.clone(), + ..Default::default() + }, + + spec: PrometheusRuleSpec { groups }, + }; + K8sResourceScore::single(prometheus_rule, namespace).create_interpret() + } +} diff --git a/harmony/src/modules/monitoring/kube_prometheus/score_kube_prometheus_scrape_target.rs b/harmony/src/modules/monitoring/kube_prometheus/score_kube_prometheus_scrape_target.rs new file mode 100644 index 00000000..9e34fb0f --- /dev/null +++ b/harmony/src/modules/monitoring/kube_prometheus/score_kube_prometheus_scrape_target.rs @@ -0,0 +1,61 @@ +use kube::api::ObjectMeta; +use serde::Serialize; + +use crate::{ + interpret::Interpret, + modules::{ + k8s::resource::K8sResourceScore, + monitoring::kube_prometheus::{ + KubePrometheus, + crd::crd_scrape_config::{ScrapeConfig, ScrapeConfigSpec, StaticConfig}, + }, + }, + score::Score, + topology::{K8sclient, Topology, monitoring::ScrapeTarget}, +}; + +#[derive(Debug, Clone, Serialize)] +pub struct KubePrometheusScrapeTargetScore { + pub sender: KubePrometheus, + pub scrape_target: Box>, +} + +impl Score for KubePrometheusScrapeTargetScore { + fn name(&self) -> String { + "KubePrometheusScrapeTargetScore".to_string() + } + + fn create_interpret(&self) -> Box> { + let name = self.scrape_target.name(); + let namespace = self.sender.config.lock().unwrap().namespace.clone(); + + let external_target = self + .scrape_target + .build_scrape_target() + .expect("failed to build external scrape target"); + + //TODO this may need to modified to include a scrapeConfigSelector label from the 
+ //prometheus operator + let labels = external_target.labels; + + let scrape_target = ScrapeConfig { + metadata: ObjectMeta { + name: Some(name.clone()), + namespace: namespace.clone(), + ..Default::default() + }, + spec: ScrapeConfigSpec { + static_configs: Some(vec![StaticConfig { + targets: vec![format!("{}:{}", external_target.ip, external_target.port)], + labels, + }]), + metrics_path: external_target.path, + scrape_interval: external_target.interval, + job_name: Some(name), + ..Default::default() + }, + }; + + K8sResourceScore::single(scrape_target, namespace).create_interpret() + } +} diff --git a/harmony/src/modules/monitoring/kube_prometheus/types.rs b/harmony/src/modules/monitoring/kube_prometheus/types.rs index abe5896a..673a710a 100644 --- a/harmony/src/modules/monitoring/kube_prometheus/types.rs +++ b/harmony/src/modules/monitoring/kube_prometheus/types.rs @@ -3,7 +3,7 @@ use std::collections::{BTreeMap, HashMap}; use async_trait::async_trait; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; -use serde_yaml::{Mapping, Sequence, Value}; +use serde_yaml::Value; use crate::modules::monitoring::alert_rule::prometheus_alert_rule::AlertManagerRuleGroup; @@ -12,36 +12,6 @@ pub trait AlertChannelConfig { async fn get_config(&self) -> AlertManagerChannelConfig; } -#[derive(Debug, Clone, Serialize)] -pub struct AlertManagerValues { - pub alertmanager: AlertManager, -} -#[derive(Debug, Clone, Serialize)] -#[serde(rename_all = "camelCase")] -pub struct AlertManager { - pub enabled: bool, - pub config: AlertManagerConfig, - pub alertmanager_spec: AlertManagerSpec, - pub init_config_reloader: ConfigReloader, -} - -#[derive(Debug, Clone, Serialize)] -pub struct ConfigReloader { - pub resources: Resources, -} - -#[derive(Debug, Clone, Serialize)] -pub struct AlertManagerConfig { - pub global: Mapping, - pub route: AlertManagerRoute, - pub receivers: Sequence, -} - -#[derive(Debug, Clone, Serialize)] -pub struct AlertManagerRoute { - pub routes: 
Sequence, -} - #[derive(Debug, Clone, Serialize)] pub struct AlertManagerChannelConfig { ///expecting an option that contains two values @@ -52,20 +22,6 @@ pub struct AlertManagerChannelConfig { pub channel_receiver: Value, } -#[derive(Debug, Clone, Serialize)] -#[serde(rename_all = "camelCase")] -pub struct AlertManagerSpec { - pub(crate) resources: Resources, - pub replicas: u32, - pub alert_manager_config_selector: AlertManagerConfigSelector, -} - -#[derive(Debug, Clone, Serialize)] -#[serde(rename_all = "camelCase")] -pub struct AlertManagerConfigSelector { - pub match_labels: BTreeMap, -} - #[derive(Debug, Clone, Serialize)] pub struct Resources { pub limits: Limits, diff --git a/harmony/src/modules/monitoring/mod.rs b/harmony/src/modules/monitoring/mod.rs index 7f07d5af..a9ace9fc 100644 --- a/harmony/src/modules/monitoring/mod.rs +++ b/harmony/src/modules/monitoring/mod.rs @@ -6,4 +6,5 @@ pub mod kube_prometheus; pub mod ntfy; pub mod okd; pub mod prometheus; +pub mod red_hat_cluster_observability; pub mod scrape_target; diff --git a/harmony/src/modules/monitoring/okd/cluster_monitoring.rs b/harmony/src/modules/monitoring/okd/cluster_monitoring.rs deleted file mode 100644 index 56fb59e0..00000000 --- a/harmony/src/modules/monitoring/okd/cluster_monitoring.rs +++ /dev/null @@ -1,270 +0,0 @@ -use base64::prelude::*; - -use async_trait::async_trait; -use harmony_types::id::Id; -use kube::api::DynamicObject; -use log::{debug, info, trace}; -use serde::Serialize; - -use crate::{ - data::Version, - interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome}, - inventory::Inventory, - modules::monitoring::okd::OpenshiftClusterAlertSender, - score::Score, - topology::{K8sclient, Topology, oberservability::monitoring::AlertReceiver}, -}; - -impl Clone for Box> { - fn clone(&self) -> Self { - self.clone_box() - } -} - -impl Serialize for Box> { - fn serialize(&self, _serializer: S) -> Result - where - S: serde::Serializer, - { - todo!() - } -} - 
-#[derive(Debug, Clone, Serialize)] -pub struct OpenshiftClusterAlertScore { - pub receivers: Vec>>, -} - -impl Score for OpenshiftClusterAlertScore { - fn name(&self) -> String { - "ClusterAlertScore".to_string() - } - - #[doc(hidden)] - fn create_interpret(&self) -> Box> { - Box::new(OpenshiftClusterAlertInterpret { - receivers: self.receivers.clone(), - }) - } -} - -#[derive(Debug)] -pub struct OpenshiftClusterAlertInterpret { - receivers: Vec>>, -} - -#[async_trait] -impl Interpret for OpenshiftClusterAlertInterpret { - async fn execute( - &self, - _inventory: &Inventory, - topology: &T, - ) -> Result { - let client = topology.k8s_client().await?; - let openshift_monitoring_namespace = "openshift-monitoring"; - - let mut alertmanager_main_secret: DynamicObject = client - .get_secret_json_value("alertmanager-main", Some(openshift_monitoring_namespace)) - .await?; - trace!("Got secret {alertmanager_main_secret:#?}"); - - let data: &mut serde_json::Value = &mut alertmanager_main_secret.data; - trace!("Alertmanager-main secret data {data:#?}"); - let data_obj = data - .get_mut("data") - .ok_or(InterpretError::new( - "Missing 'data' field in alertmanager-main secret.".to_string(), - ))? - .as_object_mut() - .ok_or(InterpretError::new( - "'data' field in alertmanager-main secret is expected to be an object ." - .to_string(), - ))?; - - let config_b64 = data_obj - .get("alertmanager.yaml") - .ok_or(InterpretError::new( - "Missing 'alertmanager.yaml' in alertmanager-main secret data".to_string(), - ))? 
- .as_str() - .unwrap_or(""); - trace!("Config base64 {config_b64}"); - - let config_bytes = BASE64_STANDARD.decode(config_b64).unwrap_or_default(); - - let mut am_config: serde_yaml::Value = - serde_yaml::from_str(&String::from_utf8(config_bytes).unwrap_or_default()) - .unwrap_or_default(); - - debug!("Current alertmanager config {am_config:#?}"); - - let existing_receivers_sequence = if let Some(receivers) = am_config.get_mut("receivers") { - match receivers.as_sequence_mut() { - Some(seq) => seq, - None => { - return Err(InterpretError::new(format!( - "Expected alertmanager config receivers to be a sequence, got {:?}", - receivers - ))); - } - } - } else { - &mut serde_yaml::Sequence::default() - }; - - let mut additional_resources = vec![]; - - for custom_receiver in &self.receivers { - let name = custom_receiver.name(); - let alertmanager_receiver = custom_receiver.as_alertmanager_receiver()?; - - let receiver_json_value = alertmanager_receiver.receiver_config; - - let receiver_yaml_string = - serde_json::to_string(&receiver_json_value).map_err(|e| { - InterpretError::new(format!("Failed to serialize receiver config: {}", e)) - })?; - - let receiver_yaml_value: serde_yaml::Value = - serde_yaml::from_str(&receiver_yaml_string).map_err(|e| { - InterpretError::new(format!("Failed to parse receiver config as YAML: {}", e)) - })?; - - if let Some(idx) = existing_receivers_sequence.iter().position(|r| { - r.get("name") - .and_then(|n| n.as_str()) - .map_or(false, |n| n == name) - }) { - info!("Replacing existing AlertManager receiver: {}", name); - existing_receivers_sequence[idx] = receiver_yaml_value; - } else { - debug!("Adding new AlertManager receiver: {}", name); - existing_receivers_sequence.push(receiver_yaml_value); - } - - additional_resources.push(alertmanager_receiver.additional_ressources); - } - - let existing_route_mapping = if let Some(route) = am_config.get_mut("route") { - match route.as_mapping_mut() { - Some(map) => map, - None => { - return 
Err(InterpretError::new(format!( - "Expected alertmanager config route to be a mapping, got {:?}", - route - ))); - } - } - } else { - &mut serde_yaml::Mapping::default() - }; - - let existing_route_sequence = if let Some(routes) = existing_route_mapping.get_mut("routes") - { - match routes.as_sequence_mut() { - Some(seq) => seq, - None => { - return Err(InterpretError::new(format!( - "Expected alertmanager config routes to be a sequence, got {:?}", - routes - ))); - } - } - } else { - &mut serde_yaml::Sequence::default() - }; - - for custom_receiver in &self.receivers { - let name = custom_receiver.name(); - let alertmanager_receiver = custom_receiver.as_alertmanager_receiver()?; - - let route_json_value = alertmanager_receiver.route_config; - let route_yaml_string = serde_json::to_string(&route_json_value).map_err(|e| { - InterpretError::new(format!("Failed to serialize route config: {}", e)) - })?; - - let route_yaml_value: serde_yaml::Value = serde_yaml::from_str(&route_yaml_string) - .map_err(|e| { - InterpretError::new(format!("Failed to parse route config as YAML: {}", e)) - })?; - - if let Some(idy) = existing_route_sequence.iter().position(|r| { - r.get("receiver") - .and_then(|n| n.as_str()) - .map_or(false, |n| n == name) - }) { - info!("Replacing existing AlertManager receiver: {}", name); - existing_route_sequence[idy] = route_yaml_value; - } else { - debug!("Adding new AlertManager receiver: {}", name); - existing_route_sequence.push(route_yaml_value); - } - } - debug!("Current alertmanager config {am_config:#?}"); - // TODO - // - save new version of alertmanager config - // - write additional ressources to the cluster - let am_config = serde_yaml::to_string(&am_config).map_err(|e| { - InterpretError::new(format!( - "Failed to serialize new alertmanager config to string : {e}" - )) - })?; - - let mut am_config_b64 = String::new(); - BASE64_STANDARD.encode_string(am_config, &mut am_config_b64); - - // TODO put update configmap value and save new value 
- data_obj.insert( - "alertmanager.yaml".to_string(), - serde_json::Value::String(am_config_b64), - ); - - // https://kubernetes.io/docs/reference/using-api/server-side-apply/#field-management - alertmanager_main_secret.metadata.managed_fields = None; - - trace!("Applying new alertmanager_main_secret {alertmanager_main_secret:#?}"); - client - .apply_dynamic( - &alertmanager_main_secret, - Some(openshift_monitoring_namespace), - true, - ) - .await?; - - let additional_resources = additional_resources.concat(); - trace!("Applying additional ressources for alert receivers {additional_resources:#?}"); - client - .apply_dynamic_many( - &additional_resources, - Some(openshift_monitoring_namespace), - true, - ) - .await?; - - Ok(Outcome::success(format!( - "Successfully configured {} cluster alert receivers: {}", - self.receivers.len(), - self.receivers - .iter() - .map(|r| r.name()) - .collect::>() - .join(", ") - ))) - } - - fn get_name(&self) -> InterpretName { - InterpretName::Custom("OpenshiftClusterAlertInterpret") - } - - fn get_version(&self) -> Version { - todo!() - } - - fn get_status(&self) -> InterpretStatus { - todo!() - } - - fn get_children(&self) -> Vec { - todo!() - } -} diff --git a/harmony/src/modules/monitoring/okd/crd/alerting_rules.rs b/harmony/src/modules/monitoring/okd/crd/alerting_rules.rs new file mode 100644 index 00000000..688198ea --- /dev/null +++ b/harmony/src/modules/monitoring/okd/crd/alerting_rules.rs @@ -0,0 +1,58 @@ +use std::collections::BTreeMap; + +use kube::CustomResource; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; + +use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule; + +#[derive(CustomResource, Debug, Serialize, Deserialize, Clone, JsonSchema, Default)] +#[kube( + group = "monitoring.openshift.io", + version = "v1", + kind = "AlertingRule", + plural = "alertingrules", + namespaced, + derive = "Default" +)] +#[serde(rename_all = "camelCase")] +pub struct AlertingRuleSpec { 
+ pub groups: serde_json::Value, +} + +#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] +pub struct RuleGroup { + pub name: String, + pub rules: Vec, +} + +#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct Rule { + #[serde(default, skip_serializing_if = "Option::is_none")] + pub alert: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub expr: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub for_: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub labels: Option>, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub annotations: Option>, +} + +impl From for Rule { + fn from(value: PrometheusAlertRule) -> Self { + Rule { + alert: Some(value.alert), + expr: Some(value.expr), + for_: value.r#for, + labels: Some(value.labels.into_iter().collect::>()), + annotations: Some(value.annotations.into_iter().collect::>()), + } + } +} diff --git a/harmony/src/modules/monitoring/okd/crd/mod.rs b/harmony/src/modules/monitoring/okd/crd/mod.rs new file mode 100644 index 00000000..a3b92bd0 --- /dev/null +++ b/harmony/src/modules/monitoring/okd/crd/mod.rs @@ -0,0 +1,3 @@ +pub mod alerting_rules; +pub mod scrape_target; +pub mod service_monitor; diff --git a/harmony/src/modules/monitoring/okd/crd/scrape_target.rs b/harmony/src/modules/monitoring/okd/crd/scrape_target.rs new file mode 100644 index 00000000..6a77f003 --- /dev/null +++ b/harmony/src/modules/monitoring/okd/crd/scrape_target.rs @@ -0,0 +1,72 @@ +use kube::CustomResource; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; +use std::collections::BTreeMap; + +#[derive(CustomResource, Debug, Serialize, Deserialize, Clone, JsonSchema, Default)] +#[kube( + group = "monitoring.coreos.com", + version = "v1alpha1", + kind = "ScrapeConfig", + plural = "scrapeconfigs", + namespaced, + derive = "Default" +)] +#[serde(rename_all = "camelCase")] +pub struct 
ScrapeConfigSpec { + #[serde(default, skip_serializing_if = "Option::is_none")] + pub job_name: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub metrics_path: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub scrape_interval: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub scrape_timeout: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub scheme: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub static_configs: Option>, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub relabelings: Option>, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub metric_relabelings: Option>, +} + +#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, Default)] +#[serde(rename_all = "camelCase")] +pub struct StaticConfig { + /// targets: ["host:port"] + pub targets: Vec, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub labels: Option>, +} + +#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, Default)] +#[serde(rename_all = "camelCase")] +pub struct RelabelConfig { + #[serde(default, skip_serializing_if = "Option::is_none")] + pub source_labels: Option>, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub separator: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub target_label: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub replacement: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub action: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub regex: Option, +} diff --git a/harmony/src/modules/monitoring/okd/crd/service_monitor.rs b/harmony/src/modules/monitoring/okd/crd/service_monitor.rs new file mode 100644 index 00000000..f60f1399 --- /dev/null +++ b/harmony/src/modules/monitoring/okd/crd/service_monitor.rs @@ -0,0 +1,97 @@ +use kube::CustomResource; +use 
schemars::JsonSchema; +use serde::{Deserialize, Serialize}; +use std::collections::BTreeMap; + +#[derive(CustomResource, Debug, Serialize, Deserialize, Clone, JsonSchema, Default)] +#[kube( + group = "monitoring.coreos.com", + version = "v1", + kind = "ServiceMonitor", + plural = "servicemonitors", + namespaced, + derive = "Default" +)] +#[serde(rename_all = "camelCase")] +pub struct ServiceMonitorSpec { + /// The label to use to retrieve the job name from. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub job_label: Option, + + /// TargetLabels transfers labels on the Kubernetes Service onto the target. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub target_labels: Option>, + + /// PodTargetLabels transfers labels on the Kubernetes Pod onto the target. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub pod_target_labels: Option>, + + /// A list of endpoints allowed as part of this ServiceMonitor. + pub endpoints: Vec, + + /// Selector to select Endpoints objects. + pub selector: LabelSelector, + + /// Selector to select which namespaces the Kubernetes Endpoints objects are discovered from. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub namespace_selector: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, Default)] +#[serde(rename_all = "camelCase")] +pub struct Endpoint { + /// Name of the service port this endpoint refers to. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub port: Option, + + /// HTTP path to scrape for metrics. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub path: Option, + + /// HTTP scheme to use for scraping. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub scheme: Option, + + /// Interval at which metrics should be scraped. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub interval: Option, + + /// Timeout after which the scrape is ended. 
+ #[serde(default, skip_serializing_if = "Option::is_none")] + pub scrape_timeout: Option, + + /// HonorLabels chooses the metric's labels on collisions with target labels. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub honor_labels: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, Default)] +#[serde(rename_all = "camelCase")] +pub struct LabelSelector { + #[serde(default, skip_serializing_if = "Option::is_none")] + pub match_labels: Option>, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub match_expressions: Option>, +} + +#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, Default)] +#[serde(rename_all = "camelCase")] +pub struct LabelSelectorRequirement { + pub key: String, + pub operator: String, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub values: Option>, +} + +#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, Default)] +#[serde(rename_all = "camelCase")] +pub struct NamespaceSelector { + /// Boolean describing whether all namespaces are selected in contrast to a list restricting them. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub any: Option, + + /// List of namespace names. 
+ #[serde(default, skip_serializing_if = "Option::is_none")] + pub match_names: Option>, +} diff --git a/harmony/src/modules/monitoring/okd/enable_user_workload.rs b/harmony/src/modules/monitoring/okd/enable_user_workload.rs deleted file mode 100644 index 2ed0fa41..00000000 --- a/harmony/src/modules/monitoring/okd/enable_user_workload.rs +++ /dev/null @@ -1,60 +0,0 @@ -use crate::{ - data::Version, - interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome}, - inventory::Inventory, - modules::monitoring::okd::config::Config, - score::Score, - topology::{K8sclient, Topology}, -}; -use async_trait::async_trait; -use harmony_types::id::Id; -use serde::Serialize; - -#[derive(Clone, Debug, Serialize)] -pub struct OpenshiftUserWorkloadMonitoring {} - -impl Score for OpenshiftUserWorkloadMonitoring { - fn name(&self) -> String { - "OpenshiftUserWorkloadMonitoringScore".to_string() - } - - fn create_interpret(&self) -> Box> { - Box::new(OpenshiftUserWorkloadMonitoringInterpret {}) - } -} - -#[derive(Clone, Debug, Serialize)] -pub struct OpenshiftUserWorkloadMonitoringInterpret {} - -#[async_trait] -impl Interpret for OpenshiftUserWorkloadMonitoringInterpret { - async fn execute( - &self, - _inventory: &Inventory, - topology: &T, - ) -> Result { - let client = topology.k8s_client().await.unwrap(); - Config::create_cluster_monitoring_config_cm(&client).await?; - Config::create_user_workload_monitoring_config_cm(&client).await?; - Config::verify_user_workload(&client).await?; - Ok(Outcome::success( - "successfully enabled user-workload-monitoring".to_string(), - )) - } - - fn get_name(&self) -> InterpretName { - InterpretName::Custom("OpenshiftUserWorkloadMonitoring") - } - - fn get_version(&self) -> Version { - todo!() - } - - fn get_status(&self) -> InterpretStatus { - todo!() - } - - fn get_children(&self) -> Vec { - todo!() - } -} diff --git a/harmony/src/modules/monitoring/okd/mod.rs b/harmony/src/modules/monitoring/okd/mod.rs index 
ac246c5f..62d00745 100644 --- a/harmony/src/modules/monitoring/okd/mod.rs +++ b/harmony/src/modules/monitoring/okd/mod.rs @@ -1,10 +1,17 @@ -use crate::topology::oberservability::monitoring::AlertSender; +use serde::Serialize; -pub mod cluster_monitoring; -pub(crate) mod config; -pub mod enable_user_workload; +use crate::topology::monitoring::{AlertReceiver, AlertRule, AlertSender, ScrapeTarget}; -#[derive(Debug)] +pub mod crd; +pub mod openshift_cluster_alerting_score; +pub mod score_enable_cluster_monitoring; +pub mod score_openshift_alert_rule; +pub mod score_openshift_receiver; +pub mod score_openshift_scrape_target; +pub mod score_user_workload; +pub mod score_verify_user_workload_monitoring; + +#[derive(Debug, Clone, Serialize)] pub struct OpenshiftClusterAlertSender; impl AlertSender for OpenshiftClusterAlertSender { @@ -12,3 +19,30 @@ impl AlertSender for OpenshiftClusterAlertSender { "OpenshiftClusterAlertSender".to_string() } } + +impl Serialize for Box> { + fn serialize(&self, _serializer: S) -> Result + where + S: serde::Serializer, + { + todo!() + } +} + +impl Serialize for Box> { + fn serialize(&self, _serializer: S) -> Result + where + S: serde::Serializer, + { + todo!() + } +} + +impl Serialize for Box> { + fn serialize(&self, _serializer: S) -> Result + where + S: serde::Serializer, + { + todo!() + } +} diff --git a/harmony/src/modules/monitoring/okd/openshift_cluster_alerting_score.rs b/harmony/src/modules/monitoring/okd/openshift_cluster_alerting_score.rs new file mode 100644 index 00000000..ee801bb8 --- /dev/null +++ b/harmony/src/modules/monitoring/okd/openshift_cluster_alerting_score.rs @@ -0,0 +1,36 @@ +use serde::Serialize; + +use crate::{ + interpret::Interpret, + modules::monitoring::okd::OpenshiftClusterAlertSender, + score::Score, + topology::{ + Topology, + monitoring::{AlertReceiver, AlertRule, AlertingInterpret, Observability, ScrapeTarget}, + }, +}; + +#[derive(Debug, Clone, Serialize)] +pub struct OpenshiftClusterAlertScore { + pub 
sender: OpenshiftClusterAlertSender, + pub receivers: Vec>>, + pub rules: Vec>>, + pub scrape_targets: Option>>>, +} + +impl> Score + for OpenshiftClusterAlertScore +{ + fn name(&self) -> String { + "OpenshiftClusterAlertScore".to_string() + } + + fn create_interpret(&self) -> Box> { + Box::new(AlertingInterpret { + sender: OpenshiftClusterAlertSender, + receivers: self.receivers.clone(), + rules: self.rules.clone(), + scrape_targets: self.scrape_targets.clone(), + }) + } +} diff --git a/harmony/src/modules/monitoring/okd/score_enable_cluster_monitoring.rs b/harmony/src/modules/monitoring/okd/score_enable_cluster_monitoring.rs new file mode 100644 index 00000000..0f6d1ba4 --- /dev/null +++ b/harmony/src/modules/monitoring/okd/score_enable_cluster_monitoring.rs @@ -0,0 +1,150 @@ +use std::{collections::BTreeMap, sync::Arc}; + +use async_trait::async_trait; +use harmony_k8s::K8sClient; +use harmony_types::id::Id; +use k8s_openapi::api::core::v1::ConfigMap; +use kube::api::{GroupVersionKind, ObjectMeta}; +use log::debug; +use serde::Serialize; + +use crate::{ + data::Version, + interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome}, + inventory::Inventory, + modules::k8s::resource::K8sResourceScore, + score::Score, + topology::{K8sclient, Topology}, +}; + +#[derive(Clone, Debug, Serialize)] +pub struct OpenshiftEnableClusterMonitoringScore {} + +impl Score for OpenshiftEnableClusterMonitoringScore { + fn name(&self) -> String { + "OpenshiftClusterMonitoringScore".to_string() + } + + fn create_interpret(&self) -> Box> { + Box::new(OpenshiftEnableClusterMonitoringInterpret {}) + } +} + +#[derive(Clone, Debug, Serialize)] +pub struct OpenshiftEnableClusterMonitoringInterpret {} + +#[async_trait] +impl Interpret for OpenshiftEnableClusterMonitoringInterpret { + async fn execute( + &self, + inventory: &Inventory, + topology: &T, + ) -> Result { + let namespace = "openshift-monitoring".to_string(); + let name = 
"cluster-monitoring-config".to_string(); + let client = topology.k8s_client().await?; + let enabled = self + .check_cluster_monitoring_enabled(client, &name, &namespace) + .await + .map_err(|e| InterpretError::new(e))?; + + debug!("enabled {:#?}", enabled); + + match enabled { + true => Ok(Outcome::success( + "Openshift Cluster Monitoring already enabled".to_string(), + )), + false => { + let mut data = BTreeMap::new(); + data.insert( + "config.yaml".to_string(), + r#" +enableUserWorkload: true +alertmanagerMain: + enableUserAlertmanagerConfig: true +"# + .to_string(), + ); + + let cm = ConfigMap { + metadata: ObjectMeta { + name: Some(name), + namespace: Some(namespace.clone()), + ..Default::default() + }, + data: Some(data), + ..Default::default() + }; + K8sResourceScore::single(cm, Some(namespace)) + .create_interpret() + .execute(inventory, topology) + .await?; + + Ok(Outcome::success( + "Successfully enabled Openshift Cluster Monitoring".to_string(), + )) + } + } + } + + fn get_name(&self) -> InterpretName { + InterpretName::Custom("OpenshiftEnableClusterMonitoringInterpret") + } + + fn get_version(&self) -> Version { + todo!() + } + + fn get_status(&self) -> InterpretStatus { + todo!() + } + + fn get_children(&self) -> Vec { + todo!() + } +} + +impl OpenshiftEnableClusterMonitoringInterpret { + async fn check_cluster_monitoring_enabled( + &self, + client: Arc, + name: &str, + namespace: &str, + ) -> Result { + let gvk = GroupVersionKind { + group: "".to_string(), + version: "v1".to_string(), + kind: "ConfigMap".to_string(), + }; + + let cm = match client + .get_resource_json_value(name, Some(namespace), &gvk) + .await + { + Ok(obj) => obj, + Err(_) => return Ok(false), + }; + + debug!("{:#?}", cm.data.pointer("/data/config.yaml")); + let config_yaml_str = match cm + .data + .pointer("/data/config.yaml") + .and_then(|v| v.as_str()) + { + Some(s) => s, + None => return Ok(false), + }; + + debug!("{:#?}", config_yaml_str); + let parsed_config: serde_yaml::Value 
= serde_yaml::from_str(config_yaml_str) + .map_err(|e| format!("Failed to parse nested YAML: {}", e))?; + + let enabled = parsed_config + .get("enableUserWorkload") + .and_then(|v| v.as_bool()) + .unwrap_or(false); + + debug!("{:#?}", enabled); + Ok(enabled) + } +} diff --git a/harmony/src/modules/monitoring/okd/score_openshift_alert_rule.rs b/harmony/src/modules/monitoring/okd/score_openshift_alert_rule.rs new file mode 100644 index 00000000..9c902c66 --- /dev/null +++ b/harmony/src/modules/monitoring/okd/score_openshift_alert_rule.rs @@ -0,0 +1,42 @@ +use kube::api::ObjectMeta; +use serde::Serialize; + +use crate::{ + interpret::Interpret, + modules::{ + k8s::resource::K8sResourceScore, + monitoring::okd::{ + OpenshiftClusterAlertSender, + crd::alerting_rules::{AlertingRule, AlertingRuleSpec}, + }, + }, + score::Score, + topology::{K8sclient, Topology, monitoring::AlertRule}, +}; + +#[derive(Debug, Clone, Serialize)] +pub struct OpenshiftAlertRuleScore { + pub rule: Box>, +} + +impl Score for OpenshiftAlertRuleScore { + fn name(&self) -> String { + "OpenshiftAlertingRuleScore".to_string() + } + + fn create_interpret(&self) -> Box> { + let namespace = "openshift-monitoring".to_string(); + let alerting_rule = AlertingRule { + metadata: ObjectMeta { + name: Some(self.rule.name()), + namespace: Some(namespace.clone()), + ..Default::default() + }, + spec: AlertingRuleSpec { + groups: self.rule.build_rule().unwrap(), + }, + }; + + K8sResourceScore::single(alerting_rule, Some(namespace)).create_interpret() + } +} diff --git a/harmony/src/modules/monitoring/okd/score_openshift_receiver.rs b/harmony/src/modules/monitoring/okd/score_openshift_receiver.rs new file mode 100644 index 00000000..92af4497 --- /dev/null +++ b/harmony/src/modules/monitoring/okd/score_openshift_receiver.rs @@ -0,0 +1,213 @@ +use async_trait::async_trait; +use base64::{Engine as _, prelude::BASE64_STANDARD}; +use harmony_types::id::Id; +use kube::api::DynamicObject; +use serde::Serialize; + +use 
crate::{ + data::Version, + interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome}, + inventory::Inventory, + modules::monitoring::okd::OpenshiftClusterAlertSender, + score::Score, + topology::{ + K8sclient, Topology, + monitoring::{AlertReceiver, AlertRoute, MatchOp}, + }, +}; + +#[derive(Debug, Clone, Serialize)] +pub struct OpenshiftReceiverScore { + pub receiver: Box>, +} + +impl Score for OpenshiftReceiverScore { + fn name(&self) -> String { + "OpenshiftAlertReceiverScore".to_string() + } + + fn create_interpret(&self) -> Box> { + Box::new(OpenshiftReceiverInterpret { + receiver: self.receiver.clone(), + }) + } +} + +#[derive(Debug)] +pub struct OpenshiftReceiverInterpret { + receiver: Box>, +} + +#[async_trait] +impl Interpret for OpenshiftReceiverInterpret { + async fn execute( + &self, + _inventory: &Inventory, + topology: &T, + ) -> Result { + let client = topology.k8s_client().await?; + let ns = "openshift-monitoring"; + + let mut am_secret: DynamicObject = client + .get_secret_json_value("alertmanager-main", Some(ns)) + .await?; + + let data = am_secret + .data + .get_mut("data") + .ok_or_else(|| { + InterpretError::new("Missing 'data' field in alertmanager-main secret".into()) + })? 
+ .as_object_mut() + .ok_or_else(|| InterpretError::new("'data' field must be a JSON object".into()))?; + + let config_b64 = data + .get("alertmanager.yaml") + .and_then(|v| v.as_str()) + .unwrap_or_default(); + + let config_bytes = BASE64_STANDARD.decode(config_b64).unwrap_or_default(); + + let mut am_config: serde_yaml::Value = serde_yaml::from_slice(&config_bytes) + .unwrap_or_else(|_| serde_yaml::Value::Mapping(serde_yaml::Mapping::new())); + + let name = self.receiver.name(); + let install_plan = self.receiver.build().expect("failed to build install plan"); + let receiver = install_plan.receiver.expect("unable to find receiver path"); + + let alert_route = install_plan + .route + .ok_or_else(|| InterpretError::new("missing route".into()))?; + + let route = self.serialize_route(&alert_route); + + let route = serde_yaml::to_value(route).map_err(|e| InterpretError::new(e.to_string()))?; + + if am_config.get("receivers").is_none() { + am_config["receivers"] = serde_yaml::Value::Sequence(vec![]); + } + if am_config.get("route").is_none() { + am_config["route"] = serde_yaml::Value::Mapping(serde_yaml::Mapping::new()); + } + if am_config["route"].get("routes").is_none() { + am_config["route"]["routes"] = serde_yaml::Value::Sequence(vec![]); + } + + { + let receivers_seq = am_config["receivers"].as_sequence_mut().unwrap(); + if let Some(idx) = receivers_seq + .iter() + .position(|r| r.get("name").and_then(|n| n.as_str()) == Some(&name)) + { + receivers_seq[idx] = receiver; + } else { + receivers_seq.push(receiver); + } + } + + { + let route_seq = am_config["route"]["routes"].as_sequence_mut().unwrap(); + if let Some(idx) = route_seq + .iter() + .position(|r| r.get("receiver").and_then(|n| n.as_str()) == Some(&name)) + { + route_seq[idx] = route; + } else { + route_seq.push(route); + } + } + + let yaml_str = + serde_yaml::to_string(&am_config).map_err(|e| InterpretError::new(e.to_string()))?; + + let mut yaml_b64 = String::new(); + + 
BASE64_STANDARD.encode_string(yaml_str, &mut yaml_b64); + data.insert( + "alertmanager.yaml".to_string(), + serde_json::Value::String(yaml_b64), + ); + am_secret.metadata.managed_fields = None; + + client.apply_dynamic(&am_secret, Some(ns), true).await?; + + Ok(Outcome::success(format!( + "Configured OpenShift cluster alert receiver: {}", + name + ))) + } + + fn get_name(&self) -> InterpretName { + InterpretName::Custom("OpenshiftAlertReceiverInterpret") + } + + fn get_version(&self) -> Version { + todo!() + } + + fn get_status(&self) -> InterpretStatus { + todo!() + } + + fn get_children(&self) -> Vec { + todo!() + } +} +impl OpenshiftReceiverInterpret { + fn serialize_route(&self, route: &AlertRoute) -> serde_yaml::Value { + // Convert matchers + let matchers: Vec = route + .matchers + .iter() + .map(|m| match m.operator { + MatchOp::Eq => format!("{} = {}", m.label, m.value), + MatchOp::NotEq => format!("{} != {}", m.label, m.value), + MatchOp::Regex => format!("{} =~ {}", m.label, m.value), + }) + .collect(); + + // Recursively convert children routes + let children: Vec = route + .children + .iter() + .map(|c| self.serialize_route(c)) + .collect(); + + // Build the YAML object for this route + let mut route_map = serde_yaml::Mapping::new(); + route_map.insert( + serde_yaml::Value::String("receiver".to_string()), + serde_yaml::Value::String(route.receiver.clone()), + ); + if !matchers.is_empty() { + route_map.insert( + serde_yaml::Value::String("matchers".to_string()), + serde_yaml::to_value(matchers).unwrap(), + ); + } + if !route.group_by.is_empty() { + route_map.insert( + serde_yaml::Value::String("group_by".to_string()), + serde_yaml::to_value(route.group_by.clone()).unwrap(), + ); + } + if let Some(ref interval) = route.repeat_interval { + route_map.insert( + serde_yaml::Value::String("repeat_interval".to_string()), + serde_yaml::Value::String(interval.clone()), + ); + } + route_map.insert( + serde_yaml::Value::String("continue".to_string()), + 
serde_yaml::Value::Bool(route.continue_matching), + ); + if !children.is_empty() { + route_map.insert( + serde_yaml::Value::String("routes".to_string()), + serde_yaml::Value::Sequence(children), + ); + } + + serde_yaml::Value::Mapping(route_map) + } +} diff --git a/harmony/src/modules/monitoring/okd/score_openshift_scrape_target.rs b/harmony/src/modules/monitoring/okd/score_openshift_scrape_target.rs new file mode 100644 index 00000000..44b0106c --- /dev/null +++ b/harmony/src/modules/monitoring/okd/score_openshift_scrape_target.rs @@ -0,0 +1,188 @@ +use std::collections::BTreeMap; + +use async_trait::async_trait; +use harmony_types::id::Id; +use k8s_openapi::{ + api::core::v1::{ + EndpointAddress, EndpointPort, EndpointSubset, Endpoints, Service, ServicePort, ServiceSpec, + }, + apimachinery::pkg::util::intstr::IntOrString, +}; +use kube::api::ObjectMeta; +use serde::Serialize; + +use crate::{ + data::Version, + interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome}, + inventory::Inventory, + modules::{ + k8s::resource::K8sResourceScore, + monitoring::okd::{ + OpenshiftClusterAlertSender, + crd::service_monitor::{Endpoint, LabelSelector, ServiceMonitor, ServiceMonitorSpec}, + }, + }, + score::Score, + topology::{ + K8sclient, Topology, + monitoring::{ExternalScrapeTarget, ScrapeTarget}, + }, +}; + +#[derive(Debug, Clone, Serialize)] +pub struct OpenshiftScrapeTargetScore { + pub scrape_target: Box>, +} + +impl Score for OpenshiftScrapeTargetScore { + fn name(&self) -> String { + "OpenshiftAlertingRuleScore".to_string() + } + + fn create_interpret(&self) -> Box> { + Box::new(OpenshiftScrapeTargetInterpret { + scrape_target: self.scrape_target.clone(), + }) + } +} + +#[derive(Debug, Clone, Serialize)] +pub struct OpenshiftScrapeTargetInterpret { + scrape_target: Box>, +} + +#[async_trait] +impl Interpret for OpenshiftScrapeTargetInterpret { + async fn execute( + &self, + inventory: &Inventory, + topology: &T, + ) -> Result { + let namespace 
= "openshift-monitoring".to_string(); + let name = self.scrape_target.name(); + let external_target = self + .scrape_target + .build_scrape_target() + .expect("failed to build scrape target ExternalScrapeTarget"); + + let (service, endpoints, service_monitor) = + self.to_k8s_resources(&name, &namespace, external_target); + + K8sResourceScore::single(service, Some(namespace.clone())) + .create_interpret() + .execute(inventory, topology) + .await?; + + K8sResourceScore::single(endpoints, Some(namespace.clone())) + .create_interpret() + .execute(inventory, topology) + .await?; + + K8sResourceScore::single(service_monitor, Some(namespace.clone())) + .create_interpret() + .execute(inventory, topology) + .await?; + + Ok(Outcome::success( + "Installed scrape target of Openshift".to_string(), + )) + } + + fn get_name(&self) -> InterpretName { + InterpretName::Custom("OpenshiftScrapeTargetInterpret") + } + + fn get_version(&self) -> Version { + todo!() + } + + fn get_status(&self) -> InterpretStatus { + todo!() + } + + fn get_children(&self) -> Vec { + todo!() + } +} + +impl OpenshiftScrapeTargetInterpret { + /// Maps the generic intent into the 3 required Kubernetes objects + pub fn to_k8s_resources( + &self, + name: &str, + namespace: &str, + external_target: ExternalScrapeTarget, + ) -> (Service, Endpoints, ServiceMonitor) { + let mut labels = external_target.labels.clone().unwrap_or(BTreeMap::new()); + + labels.insert("harmony/target-name".to_string(), name.to_string().clone()); + + let service = Service { + metadata: ObjectMeta { + name: Some(name.to_string().clone()), + namespace: Some(namespace.to_string()), + labels: Some(labels.clone()), + ..Default::default() + }, + spec: Some(ServiceSpec { + cluster_ip: Some("None".to_string()), // Headless + ports: Some(vec![ServicePort { + name: Some("metrics".to_string()), + port: external_target.port.clone(), + target_port: Some(IntOrString::Int(external_target.port)), + ..Default::default() + }]), + ..Default::default() + 
}), + ..Default::default() + }; + + let endpoints = Endpoints { + metadata: ObjectMeta { + name: Some(name.to_string().clone()), + namespace: Some(namespace.to_string()), + labels: Some(labels.clone()), + ..Default::default() + }, + subsets: Some(vec![EndpointSubset { + addresses: Some(vec![EndpointAddress { + ip: external_target.ip.to_string().clone(), + ..Default::default() + }]), + ports: Some(vec![EndpointPort { + name: Some("metrics".to_string()), + port: external_target.port, + ..Default::default() + }]), + ..Default::default() + }]), + }; + + let service_monitor = ServiceMonitor { + metadata: ObjectMeta { + name: Some(name.to_string().clone()), + namespace: Some(namespace.to_string()), + ..Default::default() + }, + spec: ServiceMonitorSpec { + job_label: Some("harmony/target-name".to_string()), + endpoints: vec![Endpoint { + port: Some("metrics".to_string()), + interval: external_target.interval.clone(), + path: external_target.path.clone(), + ..Default::default() + }], + selector: LabelSelector { + match_labels: Some(BTreeMap::from([( + "harmony/target-name".to_string(), + name.to_string().clone(), + )])), + ..Default::default() + }, + ..Default::default() + }, + }; + + (service, endpoints, service_monitor) + } +} diff --git a/harmony/src/modules/monitoring/okd/score_user_workload.rs b/harmony/src/modules/monitoring/okd/score_user_workload.rs new file mode 100644 index 00000000..aa1cf706 --- /dev/null +++ b/harmony/src/modules/monitoring/okd/score_user_workload.rs @@ -0,0 +1,158 @@ +use std::{collections::BTreeMap, sync::Arc}; + +use crate::{ + data::Version, + interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome}, + inventory::Inventory, + modules::k8s::resource::K8sResourceScore, + score::Score, + topology::{K8sclient, Topology}, +}; +use async_trait::async_trait; +use harmony_k8s::K8sClient; +use harmony_types::id::Id; +use k8s_openapi::api::core::v1::ConfigMap; +use kube::api::{GroupVersionKind, ObjectMeta}; +use log::debug; 
+use serde::Serialize; + +#[derive(Clone, Debug, Serialize)] +pub struct OpenshiftUserWorkloadMonitoring {} + +impl Score for OpenshiftUserWorkloadMonitoring { + fn name(&self) -> String { + "OpenshiftUserWorkloadMonitoringScore".to_string() + } + + fn create_interpret(&self) -> Box> { + Box::new(OpenshiftUserWorkloadMonitoringInterpret {}) + } +} + +#[derive(Clone, Debug, Serialize)] +pub struct OpenshiftUserWorkloadMonitoringInterpret {} + +#[async_trait] +impl Interpret for OpenshiftUserWorkloadMonitoringInterpret { + async fn execute( + &self, + inventory: &Inventory, + topology: &T, + ) -> Result { + let namespace = "openshift-user-workload-monitoring".to_string(); + let cm_name = "user-workload-monitoring-config".to_string(); + let client = topology.k8s_client().await?; + + let cm_enabled = self + .check_cluster_user_workload_monitoring_enabled(client, &cm_name, &namespace) + .await?; + + match cm_enabled { + true => Ok(Outcome::success( + "OpenshiftUserWorkloadMonitoringEnabled".to_string(), + )), + false => { + let mut data = BTreeMap::new(); + data.insert( + "config.yaml".to_string(), + r#" +alertmanager: + enabled: true + enableAlertmanagerConfig: true +"# + .to_string(), + ); + + let cm = ConfigMap { + metadata: ObjectMeta { + name: Some("user-workload-monitoring-config".to_string()), + namespace: Some(namespace.clone()), + ..Default::default() + }, + data: Some(data), + ..Default::default() + }; + + K8sResourceScore::single(cm, Some(namespace)) + .create_interpret() + .execute(inventory, topology) + .await + } + } + } + + fn get_name(&self) -> InterpretName { + InterpretName::Custom("OpenshiftUserWorkloadMonitoringInterpret") + } + + fn get_version(&self) -> Version { + todo!() + } + + fn get_status(&self) -> InterpretStatus { + todo!() + } + + fn get_children(&self) -> Vec { + todo!() + } +} + +impl OpenshiftUserWorkloadMonitoringInterpret { + async fn check_cluster_user_workload_monitoring_enabled( + &self, + client: Arc, + name: &str, + namespace: 
&str, + ) -> Result { + let gvk = GroupVersionKind { + group: "".to_string(), + version: "v1".to_string(), + kind: "ConfigMap".to_string(), + }; + + let cm = match client + .get_resource_json_value(name, Some(namespace), &gvk) + .await + { + Ok(obj) => obj, + Err(_) => return Ok(false), // CM doesn't exist? Treat as disabled. + }; + + debug!("{:#?}", cm.data.pointer("/data/config.yaml")); + let config_yaml_str = match cm + .data + .pointer("/data/config.yaml") + .and_then(|v| v.as_str()) + { + Some(s) => s, + None => return Ok(false), // Key missing? Treat as disabled. + }; + + debug!("{:#?}", config_yaml_str); + let parsed_config: serde_yaml::Value = serde_yaml::from_str(config_yaml_str) + .map_err(|e| format!("Failed to parse nested YAML: {}", e))?; + + let alert_manager_enabled = parsed_config + .get("alertmanager") + .and_then(|a| a.get("enableAlertmanagerConfig")) + .and_then(|v| v.as_bool()) + .unwrap_or(false); + + debug!("alertmanagerenabled: {:#?}", alert_manager_enabled); + + let enabled = parsed_config + .get("alertmanager") + .and_then(|enabled| enabled.get("enabled")) + .and_then(|v| v.as_bool()) + .unwrap_or(false); + + debug!("user workload monitoring enabled: {:#?}", enabled); + + if alert_manager_enabled && enabled == true { + Ok(true) + } else { + Ok(false) + } + } +} diff --git a/harmony/src/modules/monitoring/okd/score_verify_user_workload_monitoring.rs b/harmony/src/modules/monitoring/okd/score_verify_user_workload_monitoring.rs new file mode 100644 index 00000000..29786763 --- /dev/null +++ b/harmony/src/modules/monitoring/okd/score_verify_user_workload_monitoring.rs @@ -0,0 +1,70 @@ +use async_trait::async_trait; +use harmony_types::id::Id; +use serde::Serialize; + +use crate::{ + data::Version, + interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome}, + inventory::Inventory, + score::Score, + topology::{K8sclient, Topology}, +}; + +#[derive(Clone, Debug, Serialize)] +pub struct VerifyUserWorkload {} + +impl Score for 
VerifyUserWorkload { + fn name(&self) -> String { + "VerifyUserWorkload".to_string() + } + + fn create_interpret(&self) -> Box> { + Box::new(VerifyUserWorkloadInterpret {}) + } +} + +#[derive(Clone, Debug, Serialize)] +pub struct VerifyUserWorkloadInterpret {} + +#[async_trait] +impl Interpret for VerifyUserWorkloadInterpret { + async fn execute( + &self, + _inventory: &Inventory, + topology: &T, + ) -> Result { + let client = topology.k8s_client().await?; + let namespace = "openshift-user-workload-monitoring"; + let alertmanager_name = "alertmanager-user-workload-0"; + let prometheus_name = "prometheus-user-workload-0"; + + client + .wait_for_pod_ready(alertmanager_name, Some(namespace)) + .await?; + + client + .wait_for_pod_ready(prometheus_name, Some(namespace)) + .await?; + + Ok(Outcome::success(format!( + "pods: {}, {} ready in ns: {}", + alertmanager_name, prometheus_name, namespace + ))) + } + + fn get_name(&self) -> InterpretName { + InterpretName::Custom("VerifyUserWorkloadInterpret") + } + + fn get_version(&self) -> Version { + todo!() + } + + fn get_status(&self) -> InterpretStatus { + todo!() + } + + fn get_children(&self) -> Vec { + todo!() + } +} diff --git a/harmony/src/modules/monitoring/prometheus/helm/mod.rs b/harmony/src/modules/monitoring/prometheus/helm/mod.rs index 431fc6ca..99a01e51 100644 --- a/harmony/src/modules/monitoring/prometheus/helm/mod.rs +++ b/harmony/src/modules/monitoring/prometheus/helm/mod.rs @@ -1 +1,2 @@ +pub mod prometheus_config; pub mod prometheus_helm; diff --git a/harmony/src/modules/monitoring/prometheus/prometheus_config.rs b/harmony/src/modules/monitoring/prometheus/helm/prometheus_config.rs similarity index 95% rename from harmony/src/modules/monitoring/prometheus/prometheus_config.rs rename to harmony/src/modules/monitoring/prometheus/helm/prometheus_config.rs index 32c3fab5..031d1ac1 100644 --- a/harmony/src/modules/monitoring/prometheus/prometheus_config.rs +++ 
b/harmony/src/modules/monitoring/prometheus/helm/prometheus_config.rs @@ -1,8 +1,10 @@ +use serde::Serialize; + use crate::modules::monitoring::kube_prometheus::types::{ AlertManagerAdditionalPromRules, AlertManagerChannelConfig, ServiceMonitor, }; -#[derive(Debug)] +#[derive(Debug, Clone, Serialize)] pub struct PrometheusConfig { pub namespace: Option, pub default_rules: bool, diff --git a/harmony/src/modules/monitoring/prometheus/helm/prometheus_helm.rs b/harmony/src/modules/monitoring/prometheus/helm/prometheus_helm.rs index 611c500f..b0305a8d 100644 --- a/harmony/src/modules/monitoring/prometheus/helm/prometheus_helm.rs +++ b/harmony/src/modules/monitoring/prometheus/helm/prometheus_helm.rs @@ -3,9 +3,8 @@ use std::sync::{Arc, Mutex}; use non_blank_string_rs::NonBlankString; -use crate::modules::{ - helm::chart::HelmChartScore, monitoring::prometheus::prometheus_config::PrometheusConfig, -}; +use crate::modules::helm::chart::HelmChartScore; +use crate::modules::monitoring::prometheus::helm::prometheus_config::PrometheusConfig; pub fn prometheus_helm_chart_score(config: Arc>) -> HelmChartScore { let config = config.lock().unwrap(); @@ -30,6 +29,7 @@ server: fullnameOverride: prometheus-{ns} "# ); + HelmChartScore { namespace: Some(NonBlankString::from_str(&config.namespace.clone().unwrap()).unwrap()), release_name: NonBlankString::from_str("prometheus").unwrap(), diff --git a/harmony/src/modules/monitoring/prometheus/mod.rs b/harmony/src/modules/monitoring/prometheus/mod.rs index d1bda0e2..0cf651a6 100644 --- a/harmony/src/modules/monitoring/prometheus/mod.rs +++ b/harmony/src/modules/monitoring/prometheus/mod.rs @@ -1,4 +1,56 @@ +use std::sync::{Arc, Mutex}; + +use async_trait::async_trait; +use serde::Serialize; + +use crate::{ + modules::monitoring::prometheus::helm::prometheus_config::PrometheusConfig, + topology::monitoring::{AlertReceiver, AlertRule, AlertSender, ScrapeTarget}, +}; + pub mod helm; -#[allow(clippy::module_inception)] -pub mod prometheus; 
-pub mod prometheus_config; +pub mod prometheus_alerting_score; +pub mod score_prometheus_alert_receivers; +pub mod score_prometheus_ensure_ready; +pub mod score_prometheus_install; +pub mod score_prometheus_rule; +pub mod score_prometheus_scrape_target; + +#[derive(Debug, Clone, Serialize)] +pub struct Prometheus { + pub config: Arc>, +} + +#[async_trait] +impl AlertSender for Prometheus { + fn name(&self) -> String { + "Prometheus".to_string() + } +} + +impl Serialize for Box> { + fn serialize(&self, _serializer: S) -> Result + where + S: serde::Serializer, + { + todo!() + } +} + +impl Serialize for Box> { + fn serialize(&self, _serializer: S) -> Result + where + S: serde::Serializer, + { + todo!() + } +} + +impl Serialize for Box> { + fn serialize(&self, _serializer: S) -> Result + where + S: serde::Serializer, + { + todo!() + } +} diff --git a/harmony/src/modules/monitoring/prometheus/prometheus.rs b/harmony/src/modules/monitoring/prometheus/prometheus.rs deleted file mode 100644 index 2fe0d06c..00000000 --- a/harmony/src/modules/monitoring/prometheus/prometheus.rs +++ /dev/null @@ -1,194 +0,0 @@ -use std::sync::{Arc, Mutex}; - -use async_trait::async_trait; -use log::{debug, error}; -use serde::Serialize; - -use crate::{ - interpret::{InterpretError, Outcome}, - inventory::Inventory, - modules::monitoring::{ - alert_rule::prometheus_alert_rule::AlertManagerRuleGroup, - grafana::helm::helm_grafana::grafana_helm_chart_score, - kube_prometheus::types::{AlertManagerAdditionalPromRules, AlertManagerChannelConfig}, - }, - score::Score, - topology::{ - HelmCommand, Topology, - installable::Installable, - oberservability::monitoring::{AlertReceiver, AlertRule, AlertSender}, - tenant::TenantManager, - }, -}; - -use super::{ - helm::prometheus_helm::prometheus_helm_chart_score, prometheus_config::PrometheusConfig, -}; - -#[derive(Debug)] -pub struct Prometheus { - pub config: Arc>, -} - -#[async_trait] -impl AlertSender for Prometheus { - fn name(&self) -> String { - 
"Prometheus".to_string() - } -} - -impl Default for Prometheus { - fn default() -> Self { - Self::new() - } -} - -impl Prometheus { - pub fn new() -> Self { - Self { - config: Arc::new(Mutex::new(PrometheusConfig::new())), - } - } - pub async fn configure_with_topology(&self, topology: &T) { - let ns = topology - .get_tenant_config() - .await - .map(|cfg| cfg.name.clone()) - .unwrap_or_else(|| "monitoring".to_string()); - error!("This must be refactored, see comments in pr #74"); - debug!("NS: {}", ns); - self.config.lock().unwrap().namespace = Some(ns); - } - - pub async fn install_receiver( - &self, - prometheus_receiver: &dyn PrometheusReceiver, - ) -> Result { - let prom_receiver = prometheus_receiver.configure_receiver().await; - debug!( - "adding alert receiver to prometheus config: {:#?}", - &prom_receiver - ); - let mut config = self.config.lock().unwrap(); - - config.alert_receiver_configs.push(prom_receiver); - let prom_receiver_name = prometheus_receiver.name(); - debug!("installed alert receiver {}", &prom_receiver_name); - Ok(Outcome::success(format!( - "Sucessfully installed receiver {}", - prom_receiver_name - ))) - } - - pub async fn install_rule( - &self, - prometheus_rule: &AlertManagerRuleGroup, - ) -> Result { - let prometheus_rule = prometheus_rule.configure_rule().await; - let mut config = self.config.lock().unwrap(); - - config.alert_rules.push(prometheus_rule.clone()); - Ok(Outcome::success(format!( - "Successfully installed alert rule: {:#?},", - prometheus_rule - ))) - } - - pub async fn install_prometheus( - &self, - inventory: &Inventory, - topology: &T, - ) -> Result { - prometheus_helm_chart_score(self.config.clone()) - .interpret(inventory, topology) - .await - } - pub async fn install_grafana( - &self, - inventory: &Inventory, - topology: &T, - ) -> Result { - let namespace = { - let config = self.config.lock().unwrap(); - config.namespace.clone() - }; - - if let Some(ns) = namespace.as_deref() { - grafana_helm_chart_score(ns, false) 
- .interpret(inventory, topology) - .await - } else { - Err(InterpretError::new( - "could not install grafana, missing namespace".to_string(), - )) - } - } -} -#[async_trait] -impl Installable for Prometheus { - async fn configure(&self, _inventory: &Inventory, topology: &T) -> Result<(), InterpretError> { - self.configure_with_topology(topology).await; - Ok(()) - } - - async fn ensure_installed( - &self, - inventory: &Inventory, - topology: &T, - ) -> Result<(), InterpretError> { - self.install_prometheus(inventory, topology).await?; - - let install_grafana = { - let config = self.config.lock().unwrap(); - config.grafana - }; - - if install_grafana { - self.install_grafana(inventory, topology).await?; - } - - Ok(()) - } -} - -#[async_trait] -pub trait PrometheusReceiver: Send + Sync + std::fmt::Debug { - fn name(&self) -> String; - async fn configure_receiver(&self) -> AlertManagerChannelConfig; -} - -impl Serialize for Box> { - fn serialize(&self, _serializer: S) -> Result - where - S: serde::Serializer, - { - todo!() - } -} - -impl Clone for Box> { - fn clone(&self) -> Self { - self.clone_box() - } -} - -#[async_trait] -pub trait PrometheusRule: Send + Sync + std::fmt::Debug { - fn name(&self) -> String; - async fn configure_rule(&self) -> AlertManagerAdditionalPromRules; -} - -impl Serialize for Box> { - fn serialize(&self, _serializer: S) -> Result - where - S: serde::Serializer, - { - todo!() - } -} - -impl Clone for Box> { - fn clone(&self) -> Self { - self.clone_box() - } -} diff --git a/harmony/src/modules/monitoring/prometheus/prometheus_alerting_score.rs b/harmony/src/modules/monitoring/prometheus/prometheus_alerting_score.rs new file mode 100644 index 00000000..52d9d8cc --- /dev/null +++ b/harmony/src/modules/monitoring/prometheus/prometheus_alerting_score.rs @@ -0,0 +1,48 @@ +use std::sync::{Arc, Mutex}; + +use serde::Serialize; + +use crate::{ + modules::monitoring::{ + kube_prometheus::types::ServiceMonitor, + prometheus::{Prometheus, 
helm::prometheus_config::PrometheusConfig}, + }, + score::Score, + topology::{ + Topology, + monitoring::{AlertReceiver, AlertRule, AlertingInterpret, Observability, ScrapeTarget}, + }, +}; + +//TODO untested +#[derive(Clone, Debug, Serialize)] +pub struct PrometheusAlertingScore { + pub receivers: Vec>>, + pub rules: Vec>>, + pub scrape_targets: Option>>>, + pub service_monitors: Vec, + pub config: Arc>, +} + +impl> Score for PrometheusAlertingScore { + fn create_interpret(&self) -> Box> { + //TODO test that additional service monitor is added + self.config + .try_lock() + .expect("couldn't lock config") + .additional_service_monitors = self.service_monitors.clone(); + + Box::new(AlertingInterpret { + sender: Prometheus { + config: self.config.clone(), + }, + receivers: self.receivers.clone(), + rules: self.rules.clone(), + scrape_targets: self.scrape_targets.clone(), + }) + } + + fn name(&self) -> String { + "HelmPrometheusAlertingScore".to_string() + } +} diff --git a/harmony/src/modules/monitoring/prometheus/score_prometheus_alert_receivers.rs b/harmony/src/modules/monitoring/prometheus/score_prometheus_alert_receivers.rs new file mode 100644 index 00000000..161f03b6 --- /dev/null +++ b/harmony/src/modules/monitoring/prometheus/score_prometheus_alert_receivers.rs @@ -0,0 +1,60 @@ +use kube::api::ObjectMeta; +use serde::Serialize; + +use crate::{ + interpret::Interpret, + modules::{ + k8s::resource::K8sResourceScore, + monitoring::{ + kube_prometheus::crd::crd_alertmanager_config::{ + AlertmanagerConfig, AlertmanagerConfigSpec, + }, + prometheus::Prometheus, + }, + }, + score::Score, + topology::{K8sclient, Topology, monitoring::AlertReceiver}, +}; + +#[derive(Debug, Clone, Serialize)] +pub struct PrometheusReceiverScore { + pub sender: Prometheus, + pub receiver: Box>, +} + +impl Score for PrometheusReceiverScore { + fn name(&self) -> String { + "PrometheusReceiverScore".to_string() + } + + fn create_interpret(&self) -> Box> { + let name = self.receiver.name(); 
+ let namespace = self.sender.config.lock().unwrap().namespace.clone(); + + let install_plan = self.receiver.build().expect("failed to build install plan"); + + let route = install_plan.route; + + let route = serde_json::to_value(route).expect("failed to serialize to json"); + + let receiver = install_plan + .receiver + .expect("failed to find receiver mapping"); + + let data = serde_json::json!({ + "route": route, + "receivers": [receiver] + }); + + let alertmanager_config = AlertmanagerConfig { + metadata: ObjectMeta { + name: Some(name), + namespace: namespace.clone(), + ..Default::default() + }, + spec: AlertmanagerConfigSpec { data: data }, + }; + + K8sResourceScore::single(alertmanager_config, namespace).create_interpret() + } +} diff --git a/harmony/src/modules/monitoring/prometheus/score_prometheus_ensure_ready.rs b/harmony/src/modules/monitoring/prometheus/score_prometheus_ensure_ready.rs new file mode 100644 index 00000000..7bb14113 --- /dev/null +++ b/harmony/src/modules/monitoring/prometheus/score_prometheus_ensure_ready.rs @@ -0,0 +1,80 @@ +use async_trait::async_trait; +use harmony_types::id::Id; +use serde::Serialize; + +use crate::{ + data::Version, + interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome}, + inventory::Inventory, + modules::monitoring::prometheus::Prometheus, + score::Score, + topology::{K8sclient, Topology}, +}; + +#[derive(Clone, Debug, Serialize)] +pub struct PrometheusEnsureReadyScore { + pub sender: Prometheus, +} + +impl Score for PrometheusEnsureReadyScore { + fn name(&self) -> String { + "PrometheusEnsureReadyScore".to_string() + } + + fn create_interpret(&self) -> Box> { + Box::new(PrometheusEnsureReadyInterpret { + sender: self.sender.clone(), + }) + } +} + +#[derive(Clone, Debug, Serialize)] +pub struct PrometheusEnsureReadyInterpret { + pub sender: Prometheus, +} + +#[async_trait] +impl Interpret for PrometheusEnsureReadyInterpret { + async fn execute( + &self, + _inventory: &Inventory, + 
topology: &T, + ) -> Result { + let client = topology.k8s_client().await?; + let namespace = self + .sender + .config + .lock() + .unwrap() + .namespace + .clone() + .unwrap_or("default".to_string()); + + let prometheus_name = "prometheus-prometheus-operator"; + + client + .wait_until_deployment_ready(prometheus_name, Some(&namespace), None) + .await?; + + Ok(Outcome::success(format!( + "deployment: {} ready in ns: {}", + prometheus_name, namespace + ))) + } + + fn get_name(&self) -> InterpretName { + todo!() + } + + fn get_version(&self) -> Version { + todo!() + } + + fn get_status(&self) -> InterpretStatus { + todo!() + } + + fn get_children(&self) -> Vec { + todo!() + } +} diff --git a/harmony/src/modules/monitoring/prometheus/score_prometheus_install.rs b/harmony/src/modules/monitoring/prometheus/score_prometheus_install.rs new file mode 100644 index 00000000..39eeef2c --- /dev/null +++ b/harmony/src/modules/monitoring/prometheus/score_prometheus_install.rs @@ -0,0 +1,65 @@ +use async_trait::async_trait; +use harmony_types::id::Id; +use serde::Serialize; + +use crate::{ + data::Version, + interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome}, + inventory::Inventory, + modules::monitoring::prometheus::{ + Prometheus, helm::prometheus_helm::prometheus_helm_chart_score, + }, + score::Score, + topology::{HelmCommand, K8sclient, Topology}, +}; + +#[derive(Debug, Clone, Serialize)] +pub struct PrometheusInstallScore { + pub sender: Prometheus, +} + +impl Score for PrometheusInstallScore { + fn name(&self) -> String { + "PrometheusInstallScore".to_string() + } + + fn create_interpret(&self) -> Box> { + Box::new(PrometheusInstallInterpret { + score: self.clone(), + }) + } +} + +#[derive(Debug, Clone, Serialize)] +pub struct PrometheusInstallInterpret { + score: PrometheusInstallScore, +} + +#[async_trait] +impl Interpret for PrometheusInstallInterpret { + async fn execute( + &self, + inventory: &Inventory, + topology: &T, + ) -> Result { + 
//TODO add interpret to install service monitors CRD from sender Prometheus + let score = prometheus_helm_chart_score(self.score.sender.config.clone()); + score.create_interpret().execute(inventory, topology).await + } + + fn get_name(&self) -> InterpretName { + InterpretName::Custom("PrometheusInstallInterpret") + } + + fn get_version(&self) -> Version { + todo!() + } + + fn get_status(&self) -> InterpretStatus { + todo!() + } + + fn get_children(&self) -> Vec { + todo!() + } +} diff --git a/harmony/src/modules/monitoring/prometheus/score_prometheus_rule.rs b/harmony/src/modules/monitoring/prometheus/score_prometheus_rule.rs new file mode 100644 index 00000000..c9840c16 --- /dev/null +++ b/harmony/src/modules/monitoring/prometheus/score_prometheus_rule.rs @@ -0,0 +1,48 @@ +use kube::api::ObjectMeta; +use serde::Serialize; + +use crate::{ + interpret::Interpret, + modules::{ + k8s::resource::K8sResourceScore, + monitoring::{ + kube_prometheus::crd::crd_prometheus_rules::{ + PrometheusRule, PrometheusRuleSpec, RuleGroup, + }, + prometheus::Prometheus, + }, + }, + score::Score, + topology::{K8sclient, Topology, monitoring::AlertRule}, +}; + +#[derive(Debug, Clone, Serialize)] +pub struct PrometheusRuleScore { + pub sender: Prometheus, + pub rule: Box>, +} + +impl Score for PrometheusRuleScore { + fn name(&self) -> String { + "PrometheusRuleScore".to_string() + } + + fn create_interpret(&self) -> Box> { + let name = self.rule.name(); + let namespace = self.sender.config.lock().unwrap().namespace.clone(); + let groups: Vec = + serde_json::from_value(self.rule.build_rule().expect("failed to build alert rule")) + .expect("failed to serialize rule group"); + + let prometheus_rule = PrometheusRule { + metadata: ObjectMeta { + name: Some(name.clone()), + namespace: namespace.clone(), + ..Default::default() + }, + + spec: PrometheusRuleSpec { groups }, + }; + K8sResourceScore::single(prometheus_rule, namespace).create_interpret() + } +} diff --git 
a/harmony/src/modules/monitoring/prometheus/score_prometheus_scrape_target.rs b/harmony/src/modules/monitoring/prometheus/score_prometheus_scrape_target.rs new file mode 100644 index 00000000..df4db9eb --- /dev/null +++ b/harmony/src/modules/monitoring/prometheus/score_prometheus_scrape_target.rs @@ -0,0 +1,64 @@ +use kube::api::ObjectMeta; +use serde::Serialize; + +use crate::topology::monitoring::AlertRule; +use crate::{ + interpret::Interpret, + modules::{ + k8s::resource::K8sResourceScore, + monitoring::{ + kube_prometheus::crd::crd_scrape_config::{ + ScrapeConfig, ScrapeConfigSpec, StaticConfig, + }, + prometheus::Prometheus, + }, + }, + score::Score, + topology::{K8sclient, Topology, monitoring::ScrapeTarget}, +}; + +#[derive(Debug, Clone, Serialize)] +pub struct PrometheusScrapeTargetScore { + pub sender: Prometheus, + pub scrape_target: Box>, +} + +impl Score for PrometheusScrapeTargetScore { + fn name(&self) -> String { + "PrometheusScrapeTargetScore".to_string() + } + + fn create_interpret(&self) -> Box> { + let name = self.scrape_target.name(); + let namespace = self.sender.config.lock().unwrap().namespace.clone(); + + let external_target = self + .scrape_target + .build_scrape_target() + .expect("failed to build external scrape target"); + + //TODO this may need to modified to include a scrapeConfigSelector label from the + //prometheus operator + let labels = external_target.labels; + + let scrape_target = ScrapeConfig { + metadata: ObjectMeta { + name: Some(name.clone()), + namespace: namespace.clone(), + ..Default::default() + }, + spec: ScrapeConfigSpec { + static_configs: Some(vec![StaticConfig { + targets: vec![format!("{}:{}", external_target.ip, external_target.port)], + labels, + }]), + metrics_path: external_target.path, + scrape_interval: external_target.interval, + job_name: Some(name), + ..Default::default() + }, + }; + + K8sResourceScore::single(scrape_target, namespace).create_interpret() + } +} diff --git 
a/harmony/src/modules/monitoring/red_hat_cluster_observability/crd/mod.rs b/harmony/src/modules/monitoring/red_hat_cluster_observability/crd/mod.rs new file mode 100644 index 00000000..0842d910 --- /dev/null +++ b/harmony/src/modules/monitoring/red_hat_cluster_observability/crd/mod.rs @@ -0,0 +1,8 @@ +pub mod rhob_alertmanager_config; +pub mod rhob_alertmanagers; +pub mod rhob_default_rules; +pub mod rhob_monitoring_stack; +pub mod rhob_prometheus_rules; +pub mod rhob_prometheuses; +pub mod rhob_role; +pub mod rhob_service_monitor; diff --git a/harmony/src/modules/monitoring/red_hat_cluster_observability/crd/rhob_alertmanager_config.rs b/harmony/src/modules/monitoring/red_hat_cluster_observability/crd/rhob_alertmanager_config.rs new file mode 100644 index 00000000..0d56dda1 --- /dev/null +++ b/harmony/src/modules/monitoring/red_hat_cluster_observability/crd/rhob_alertmanager_config.rs @@ -0,0 +1,17 @@ +use kube::CustomResource; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; + +#[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema, Default)] +#[kube( + group = "monitoring.rhobs", + version = "v1alpha1", + kind = "AlertmanagerConfig", + plural = "alertmanagerconfigs", + namespaced, + derive = "Default" +)] +pub struct AlertmanagerConfigSpec { + #[serde(flatten)] + pub data: serde_json::Value, +} diff --git a/harmony/src/modules/monitoring/kube_prometheus/crd/rhob_alertmanagers.rs b/harmony/src/modules/monitoring/red_hat_cluster_observability/crd/rhob_alertmanagers.rs similarity index 93% rename from harmony/src/modules/monitoring/kube_prometheus/crd/rhob_alertmanagers.rs rename to harmony/src/modules/monitoring/red_hat_cluster_observability/crd/rhob_alertmanagers.rs index 44354673..22c3386a 100644 --- a/harmony/src/modules/monitoring/kube_prometheus/crd/rhob_alertmanagers.rs +++ b/harmony/src/modules/monitoring/red_hat_cluster_observability/crd/rhob_alertmanagers.rs @@ -2,7 +2,7 @@ use kube::CustomResource; use 
schemars::JsonSchema; use serde::{Deserialize, Serialize}; -use super::crd_prometheuses::LabelSelector; +use crate::modules::monitoring::red_hat_cluster_observability::crd::rhob_prometheuses::LabelSelector; /// Rust CRD for `Alertmanager` from Prometheus Operator #[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)] diff --git a/harmony/src/modules/monitoring/kube_prometheus/crd/rhob_default_rules.rs b/harmony/src/modules/monitoring/red_hat_cluster_observability/crd/rhob_default_rules.rs similarity index 86% rename from harmony/src/modules/monitoring/kube_prometheus/crd/rhob_default_rules.rs rename to harmony/src/modules/monitoring/red_hat_cluster_observability/crd/rhob_default_rules.rs index 459bd3fb..5adea7e4 100644 --- a/harmony/src/modules/monitoring/kube_prometheus/crd/rhob_default_rules.rs +++ b/harmony/src/modules/monitoring/red_hat_cluster_observability/crd/rhob_default_rules.rs @@ -1,11 +1,11 @@ -use crate::modules::{ - monitoring::kube_prometheus::crd::rhob_prometheus_rules::Rule, - prometheus::alerts::k8s::{ +use crate::modules::monitoring::{ + alert_rule::alerts::k8s::{ deployment::alert_deployment_unavailable, pod::{alert_container_restarting, alert_pod_not_ready, pod_failed}, pvc::high_pvc_fill_rate_over_two_days, service::alert_service_down, }, + red_hat_cluster_observability::crd::rhob_prometheus_rules::Rule, }; pub fn build_default_application_rules() -> Vec { diff --git a/harmony/src/modules/monitoring/kube_prometheus/crd/rhob_monitoring_stack.rs b/harmony/src/modules/monitoring/red_hat_cluster_observability/crd/rhob_monitoring_stack.rs similarity index 89% rename from harmony/src/modules/monitoring/kube_prometheus/crd/rhob_monitoring_stack.rs rename to harmony/src/modules/monitoring/red_hat_cluster_observability/crd/rhob_monitoring_stack.rs index be9ccc0f..99e72460 100644 --- a/harmony/src/modules/monitoring/kube_prometheus/crd/rhob_monitoring_stack.rs +++ 
b/harmony/src/modules/monitoring/red_hat_cluster_observability/crd/rhob_monitoring_stack.rs @@ -2,7 +2,7 @@ use kube::CustomResource; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; -use crate::modules::monitoring::kube_prometheus::crd::rhob_prometheuses::LabelSelector; +use crate::modules::monitoring::red_hat_cluster_observability::crd::rhob_prometheuses::LabelSelector; /// MonitoringStack CRD for monitoring.rhobs/v1alpha1 #[derive(CustomResource, Serialize, Deserialize, Debug, Clone, JsonSchema)] @@ -11,7 +11,8 @@ use crate::modules::monitoring::kube_prometheus::crd::rhob_prometheuses::LabelSe version = "v1alpha1", kind = "MonitoringStack", plural = "monitoringstacks", - namespaced + namespaced, + derive = "Default" )] #[serde(rename_all = "camelCase")] pub struct MonitoringStackSpec { diff --git a/harmony/src/modules/monitoring/kube_prometheus/crd/rhob_prometheus_rules.rs b/harmony/src/modules/monitoring/red_hat_cluster_observability/crd/rhob_prometheus_rules.rs similarity index 100% rename from harmony/src/modules/monitoring/kube_prometheus/crd/rhob_prometheus_rules.rs rename to harmony/src/modules/monitoring/red_hat_cluster_observability/crd/rhob_prometheus_rules.rs diff --git a/harmony/src/modules/monitoring/kube_prometheus/crd/rhob_prometheuses.rs b/harmony/src/modules/monitoring/red_hat_cluster_observability/crd/rhob_prometheuses.rs similarity index 96% rename from harmony/src/modules/monitoring/kube_prometheus/crd/rhob_prometheuses.rs rename to harmony/src/modules/monitoring/red_hat_cluster_observability/crd/rhob_prometheuses.rs index 18d3f57f..b93ecc76 100644 --- a/harmony/src/modules/monitoring/kube_prometheus/crd/rhob_prometheuses.rs +++ b/harmony/src/modules/monitoring/red_hat_cluster_observability/crd/rhob_prometheuses.rs @@ -4,8 +4,6 @@ use kube::CustomResource; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; -use crate::modules::monitoring::kube_prometheus::types::Operator; - #[derive(CustomResource, Serialize, 
Deserialize, Debug, Clone, JsonSchema)] #[kube( group = "monitoring.rhobs", @@ -93,6 +91,14 @@ pub struct LabelSelectorRequirement { pub values: Vec, } +#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] +pub enum Operator { + In, + NotIn, + Exists, + DoesNotExist, +} + impl Default for PrometheusSpec { fn default() -> Self { PrometheusSpec { diff --git a/harmony/src/modules/monitoring/kube_prometheus/crd/rhob_role.rs b/harmony/src/modules/monitoring/red_hat_cluster_observability/crd/rhob_role.rs similarity index 100% rename from harmony/src/modules/monitoring/kube_prometheus/crd/rhob_role.rs rename to harmony/src/modules/monitoring/red_hat_cluster_observability/crd/rhob_role.rs diff --git a/harmony/src/modules/monitoring/kube_prometheus/crd/rhob_service_monitor.rs b/harmony/src/modules/monitoring/red_hat_cluster_observability/crd/rhob_service_monitor.rs similarity index 100% rename from harmony/src/modules/monitoring/kube_prometheus/crd/rhob_service_monitor.rs rename to harmony/src/modules/monitoring/red_hat_cluster_observability/crd/rhob_service_monitor.rs diff --git a/harmony/src/modules/monitoring/red_hat_cluster_observability/mod.rs b/harmony/src/modules/monitoring/red_hat_cluster_observability/mod.rs new file mode 100644 index 00000000..5c28b7a7 --- /dev/null +++ b/harmony/src/modules/monitoring/red_hat_cluster_observability/mod.rs @@ -0,0 +1,55 @@ +use serde::Serialize; + +use crate::{ + modules::monitoring::red_hat_cluster_observability::crd::rhob_prometheuses::LabelSelector, + topology::monitoring::{AlertReceiver, AlertRule, AlertSender, ScrapeTarget}, +}; + +pub mod crd; +pub mod redhat_cluster_observability; +pub mod score_alert_receiver; +pub mod score_coo_monitoring_stack; +pub mod score_redhat_cluster_observability_operator; + +/// RedHat Cluster Observability Operator allows fine grained control of monitoring stack +/// solutions. This can be run in parallel with the default openshift-monitoring stack. 
+/// https://www.redhat.com/en/blog/introducing-cluster-observability-operator +#[derive(Debug, Clone, Serialize)] +pub struct RedHatClusterObservability { + ///namespace where the MonitoringStack CRD will be deployed + pub namespace: String, + /// Labels used to match resources the stack will monitor + pub resource_selector: Option, +} + +impl AlertSender for RedHatClusterObservability { + fn name(&self) -> String { + "RedHatClusterObs".to_string() + } +} + +impl Serialize for Box> { + fn serialize(&self, _serializer: S) -> Result + where + S: serde::Serializer, + { + todo!() + } +} + +impl Serialize for Box> { + fn serialize(&self, _serializer: S) -> Result + where + S: serde::Serializer, + { + todo!() + } +} +impl Serialize for Box> { + fn serialize(&self, _serializer: S) -> Result + where + S: serde::Serializer, + { + todo!() + } +} diff --git a/harmony/src/modules/monitoring/red_hat_cluster_observability/redhat_cluster_observability.rs b/harmony/src/modules/monitoring/red_hat_cluster_observability/redhat_cluster_observability.rs new file mode 100644 index 00000000..c8417a58 --- /dev/null +++ b/harmony/src/modules/monitoring/red_hat_cluster_observability/redhat_cluster_observability.rs @@ -0,0 +1,37 @@ +use serde::Serialize; + +use crate::{ + interpret::Interpret, + modules::monitoring::red_hat_cluster_observability::RedHatClusterObservability, + score::Score, + topology::{ + Topology, + monitoring::{AlertReceiver, AlertRule, AlertingInterpret, Observability, ScrapeTarget}, + }, +}; + +//TODO untested +#[derive(Debug, Clone, Serialize)] +pub struct RedHatClusterObservabilityScore { + pub sender: RedHatClusterObservability, + pub receivers: Vec>>, + pub rules: Vec>>, + pub scrape_targets: Option>>>, +} + +impl> Score + for RedHatClusterObservabilityScore +{ + fn name(&self) -> String { + "RedHatClusterObservabilityScore".to_string() + } + + fn create_interpret(&self) -> Box> { + Box::new(AlertingInterpret { + sender: self.sender.clone(), + receivers: 
self.receivers.clone(), + rules: self.rules.clone(), + scrape_targets: self.scrape_targets.clone(), + }) + } +} diff --git a/harmony/src/modules/monitoring/red_hat_cluster_observability/score_alert_receiver.rs b/harmony/src/modules/monitoring/red_hat_cluster_observability/score_alert_receiver.rs new file mode 100644 index 00000000..2ec56d2f --- /dev/null +++ b/harmony/src/modules/monitoring/red_hat_cluster_observability/score_alert_receiver.rs @@ -0,0 +1,128 @@ +use async_trait::async_trait; +use harmony_types::id::Id; +use k8s_openapi::api::core::v1::Secret; +use kube::api::ObjectMeta; +use serde::Serialize; + +use crate::{ + data::Version, + interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome}, + inventory::Inventory, + modules::{ + k8s::resource::K8sResourceScore, + monitoring::red_hat_cluster_observability::{ + RedHatClusterObservability, + crd::rhob_alertmanager_config::{AlertmanagerConfig, AlertmanagerConfigSpec}, + }, + }, + score::Score, + topology::{ + K8sclient, Topology, + monitoring::{AlertReceiver, InstallOperation}, + }, +}; + +#[derive(Debug, Clone, Serialize)] +pub struct RedHatClusterObservabilityReceiverScore { + pub sender: RedHatClusterObservability, + pub receiver: Box>, +} + +impl Score for RedHatClusterObservabilityReceiverScore { + fn name(&self) -> String { + "RedHatClusterObservabilityReceiverScore".to_string() + } + + fn create_interpret(&self) -> Box> { + Box::new(RedHatClusterObservabilityReceiverInterpret { + sender: self.sender.clone(), + receiver: self.receiver.clone(), + }) + } +} + +#[derive(Debug, Clone, Serialize)] +pub struct RedHatClusterObservabilityReceiverInterpret { + sender: RedHatClusterObservability, + pub receiver: Box>, +} + +#[async_trait] +impl Interpret for RedHatClusterObservabilityReceiverInterpret { + async fn execute( + &self, + inventory: &Inventory, + topology: &T, + ) -> Result { + let name = self.receiver.name(); + let namespace = self.sender.namespace.clone(); + + let client = 
topology.k8s_client().await?; + let install_plan = self.receiver.build()?; + + let install_operation = install_plan.install_operation.unwrap_or_default(); + + for operation in install_operation { + match operation { + InstallOperation::CreateSecret { name, data } => { + let secret = Secret { + metadata: ObjectMeta { + name: Some(name), + ..Default::default() + }, + string_data: Some(data), + type_: Some("Opaque".to_string()), + ..Default::default() + }; + + client.apply(&secret, Some(&namespace)).await?; + } + } + } + + let route = install_plan.route.unwrap(); + let route = serde_json::to_value(route) + .map_err(|e| InterpretError::new(e.to_string())) + .expect("failed to serialize alert route"); + let receiver = install_plan.receiver.unwrap(); + let data = serde_json::json!({ + "route": route, + "receivers": [receiver] + }); + + let spec = AlertmanagerConfigSpec { data }; + + let metadata = ObjectMeta { + name: Some(name), + namespace: Some(self.sender.namespace.clone()), + labels: Some(std::collections::BTreeMap::from([( + "alertmanagerConfig".to_string(), + "enabled".to_string(), + )])), + ..ObjectMeta::default() + }; + + let alert_manager_config = AlertmanagerConfig { metadata, spec }; + + K8sResourceScore::single(alert_manager_config, Some(self.sender.namespace.clone())) + .create_interpret() + .execute(inventory, topology) + .await + } + + fn get_name(&self) -> InterpretName { + InterpretName::Custom("RedHatClusterObsReceiverInterpret") + } + + fn get_version(&self) -> Version { + todo!() + } + + fn get_status(&self) -> InterpretStatus { + todo!() + } + + fn get_children(&self) -> Vec { + todo!() + } +} diff --git a/harmony/src/modules/monitoring/red_hat_cluster_observability/score_coo_monitoring_stack.rs b/harmony/src/modules/monitoring/red_hat_cluster_observability/score_coo_monitoring_stack.rs new file mode 100644 index 00000000..45caaf53 --- /dev/null +++ b/harmony/src/modules/monitoring/red_hat_cluster_observability/score_coo_monitoring_stack.rs @@ -0,0 
+1,45 @@ +use kube::api::ObjectMeta; +use serde::Serialize; + +use crate::{ + interpret::Interpret, + modules::{ + k8s::resource::K8sResourceScore, + monitoring::red_hat_cluster_observability::crd::{ + rhob_monitoring_stack::{MonitoringStack, MonitoringStackSpec}, + rhob_prometheuses::LabelSelector, + }, + }, + score::Score, + topology::{K8sclient, Topology}, +}; + +#[derive(Debug, Clone, Serialize)] +pub struct RedHatClusterObservabilityMonitoringStackScore { + pub namespace: String, + pub resource_selector: Option, +} + +impl Score for RedHatClusterObservabilityMonitoringStackScore { + fn name(&self) -> String { + "RedHatClusterObservabilityMonitoringStackScore".to_string() + } + + fn create_interpret(&self) -> Box> { + let metadata = ObjectMeta { + name: Some(format!("{}-monitoring", self.namespace.clone()).into()), + namespace: Some(self.namespace.clone()), + labels: Some([("monitoring-stack".into(), "true".into())].into()), + ..Default::default() + }; + let spec = MonitoringStackSpec { + log_level: Some("debug".into()), + retention: Some("1d".into()), + resource_selector: self.resource_selector.clone(), + }; + + let monitoring_stack = MonitoringStack { metadata, spec }; + + K8sResourceScore::single(monitoring_stack, Some(self.namespace.clone())).create_interpret() + } +} diff --git a/harmony/src/modules/monitoring/red_hat_cluster_observability/score_redhat_cluster_observability_operator.rs b/harmony/src/modules/monitoring/red_hat_cluster_observability/score_redhat_cluster_observability_operator.rs new file mode 100644 index 00000000..2c529e78 --- /dev/null +++ b/harmony/src/modules/monitoring/red_hat_cluster_observability/score_redhat_cluster_observability_operator.rs @@ -0,0 +1,113 @@ +use async_trait::async_trait; +use harmony_types::id::Id; +use kube::api::ObjectMeta; +use serde::Serialize; + +use crate::{ + data::Version, + interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome}, + inventory::Inventory, + modules::k8s::{ + 
apps::crd::{Subscription, SubscriptionSpec}, + resource::K8sResourceScore, + }, + score::Score, + topology::{K8sclient, Topology}, +}; + +#[derive(Debug, Clone, Serialize)] +pub struct RedHatClusterObservabilityOperatorScore { + pub namespace: String, + pub channel: String, + pub install_plan_approval: String, + pub source: String, + pub source_namespace: String, +} + +impl Default for RedHatClusterObservabilityOperatorScore { + fn default() -> Self { + Self { + namespace: "openshift-operators".to_string(), + channel: "stable".to_string(), + install_plan_approval: "Automatic".to_string(), + source: "community-operators".to_string(), + source_namespace: "openshift-marketplace".to_string(), + } + } +} + +impl RedHatClusterObservabilityOperatorScore { + pub fn new() -> Self { + Self { + ..Default::default() + } + } +} + +impl Score for RedHatClusterObservabilityOperatorScore { + fn name(&self) -> String { + "RedHatClusterObservabilityOperatorScore".to_string() + } + + fn create_interpret(&self) -> Box> { + Box::new(RedHatClusterObservabilityOperatorInterpret { + score: self.clone(), + }) + } +} + +#[derive(Debug, Clone, Serialize)] +pub struct RedHatClusterObservabilityOperatorInterpret { + score: RedHatClusterObservabilityOperatorScore, +} + +#[async_trait] +impl Interpret for RedHatClusterObservabilityOperatorInterpret { + async fn execute( + &self, + inventory: &Inventory, + topology: &T, + ) -> Result { + let metadata = ObjectMeta { + name: Some("observability-operator".to_string()), + namespace: Some(self.score.namespace.clone()), + ..ObjectMeta::default() + }; + + let spec = SubscriptionSpec { + channel: Some(self.score.channel.clone()), + config: None, + install_plan_approval: Some(self.score.install_plan_approval.clone()), + name: "observability-operator".to_string(), + source: self.score.source.clone(), + source_namespace: self.score.source_namespace.clone(), + starting_csv: None, + }; + + let subscription = Subscription { metadata, spec }; 
K8sResourceScore::single(subscription, Some(self.score.namespace.clone())) + .create_interpret() + .execute(inventory, topology) + .await?; + + Ok(Outcome::success( + "Installed RedhatClusterObservabilityOperator".to_string(), + )) + } + + fn get_name(&self) -> InterpretName { + InterpretName::Custom("RedHatClusterObservabilityInterpret") + } + + fn get_version(&self) -> Version { + todo!() + } + + fn get_status(&self) -> InterpretStatus { + todo!() + } + + fn get_children(&self) -> Vec { + todo!() + } +} diff --git a/harmony/src/modules/monitoring/scrape_target/mod.rs b/harmony/src/modules/monitoring/scrape_target/mod.rs index 74f47ad3..fe31ce9d 100644 --- a/harmony/src/modules/monitoring/scrape_target/mod.rs +++ b/harmony/src/modules/monitoring/scrape_target/mod.rs @@ -1 +1,2 @@ +pub mod prometheus_node_exporter; pub mod server; diff --git a/harmony/src/modules/monitoring/scrape_target/prometheus_node_exporter.rs b/harmony/src/modules/monitoring/scrape_target/prometheus_node_exporter.rs new file mode 100644 index 00000000..aa734ed7 --- /dev/null +++ b/harmony/src/modules/monitoring/scrape_target/prometheus_node_exporter.rs @@ -0,0 +1,70 @@ +use std::{collections::BTreeMap, net::IpAddr}; + +use harmony_macros::ip; +use serde::Serialize; + +use crate::{ + interpret::InterpretError, + modules::monitoring::okd::OpenshiftClusterAlertSender, + topology::monitoring::{ExternalScrapeTarget, ScrapeTarget}, +}; + +#[derive(Debug, Clone, Serialize)] +pub struct PrometheusNodeExporter { + pub job_name: String, + pub metrics_path: String, + pub scrape_interval: Option, + pub scrape_timeout: Option, + pub listen_address: IpAddr, + pub port: i32, + pub labels: Option>, +} + +impl Default for PrometheusNodeExporter { + fn default() -> Self { + Self { + job_name: "node-exporter".into(), + metrics_path: "/metrics".into(), + scrape_interval: None, + scrape_timeout: None, + listen_address: ip!("192.168.1.1"), + port: 9100, + labels: None, + } + } +} + +impl PrometheusNodeExporter { + 
fn new(job_name: String, metrics_path: String, listen_address: IpAddr, port: i32) -> Self { + Self { + job_name: job_name, + metrics_path: metrics_path, + scrape_interval: None, + scrape_timeout: None, + listen_address: listen_address, + port: port, + labels: None, + } + } +} + +impl ScrapeTarget for PrometheusNodeExporter { + fn build_scrape_target(&self) -> Result { + let external_target = ExternalScrapeTarget { + ip: self.listen_address.clone(), + port: self.port.clone(), + interval: self.scrape_interval.clone(), + path: Some(self.metrics_path.clone()), + labels: self.labels.clone(), + }; + Ok(external_target) + } + + fn name(&self) -> String { + self.job_name.clone() + } + + fn clone_box(&self) -> Box> { + Box::new(self.clone()) + } +} diff --git a/harmony/src/modules/monitoring/scrape_target/server.rs b/harmony/src/modules/monitoring/scrape_target/server.rs index 178e9140..b153e27e 100644 --- a/harmony/src/modules/monitoring/scrape_target/server.rs +++ b/harmony/src/modules/monitoring/scrape_target/server.rs @@ -1,16 +1,11 @@ use std::net::IpAddr; -use async_trait::async_trait; -use kube::api::ObjectMeta; use serde::Serialize; use crate::{ - interpret::{InterpretError, Outcome}, - modules::monitoring::kube_prometheus::crd::{ - crd_alertmanager_config::CRDPrometheus, - crd_scrape_config::{Params, RelabelConfig, ScrapeConfig, ScrapeConfigSpec, StaticConfig}, - }, - topology::oberservability::monitoring::ScrapeTarget, + interpret::InterpretError, + modules::monitoring::prometheus::Prometheus, + topology::monitoring::{ExternalScrapeTarget, ScrapeTarget}, }; #[derive(Debug, Clone, Serialize)] @@ -22,59 +17,56 @@ pub struct Server { pub domain: String, } -#[async_trait] -impl ScrapeTarget for Server { - async fn install(&self, sender: &CRDPrometheus) -> Result { - let scrape_config_spec = ScrapeConfigSpec { - static_configs: Some(vec![StaticConfig { - targets: vec![self.ip.to_string()], - labels: None, - }]), - scrape_interval: Some("2m".to_string()), - 
kubernetes_sd_configs: None, - http_sd_configs: None, - file_sd_configs: None, - dns_sd_configs: None, - params: Some(Params { - auth: Some(vec![self.auth.clone()]), - module: Some(vec![self.module.clone()]), - }), - consul_sd_configs: None, - relabel_configs: Some(vec![RelabelConfig { - action: None, - source_labels: Some(vec!["__address__".to_string()]), - separator: None, - target_label: Some("__param_target".to_string()), - regex: None, - replacement: Some(format!("snmp.{}:31080", self.domain.clone())), - modulus: None, - }]), - metric_relabel_configs: None, - metrics_path: Some("/snmp".to_string()), - scrape_timeout: Some("2m".to_string()), - job_name: Some(format!("snmp_exporter/cloud/{}", self.name.clone())), - scheme: None, - }; - - let scrape_config = ScrapeConfig { - metadata: ObjectMeta { - name: Some(self.name.clone()), - namespace: Some(sender.namespace.clone()), - ..Default::default() - }, - spec: scrape_config_spec, - }; - sender - .client - .apply(&scrape_config, Some(&sender.namespace.clone())) - .await?; - Ok(Outcome::success(format!( - "installed scrape target {}", - self.name.clone() - ))) +impl ScrapeTarget for Server { + fn build_scrape_target(&self) -> Result { + todo!() + // let scrape_config_spec = ScrapeConfigSpec { + // static_configs: Some(vec![StaticConfig { + // targets: vec![self.ip.to_string()], + // labels: None, + // }]), + // scrape_interval: Some("2m".to_string()), + // kubernetes_sd_configs: None, + // http_sd_configs: None, + // file_sd_configs: None, + // dns_sd_configs: None, + // params: Some(Params { + // auth: Some(vec![self.auth.clone()]), + // module: Some(vec![self.module.clone()]), + // }), + // consul_sd_configs: None, + // relabel_configs: Some(vec![RelabelConfig { + // action: None, + // source_labels: Some(vec!["__address__".to_string()]), + // separator: None, + // target_label: Some("__param_target".to_string()), + // regex: None, + // replacement: Some(format!("snmp.{}:31080", self.domain.clone())), + // 
modulus: None, + // }]), + // metric_relabel_configs: None, + // metrics_path: Some("/snmp".to_string()), + // scrape_timeout: Some("2m".to_string()), + // job_name: Some(format!("snmp_exporter/cloud/{}", self.name.clone())), + // scheme: None, + // }; + // + // let scrape_config = ScrapeConfig { + // metadata: ObjectMeta { + // name: Some(self.name.clone()), + // namespace: Some(sender.namespace.clone()), + // ..Default::default() + // }, + // spec: scrape_config_spec, + // }; + // Ok(serde_json::to_value(scrape_config).map_err(|e| InterpretError::new(e.to_string()))?) } - fn clone_box(&self) -> Box> { + fn clone_box(&self) -> Box> { Box::new(self.clone()) } + + fn name(&self) -> String { + todo!() + } } diff --git a/harmony/src/modules/okd/load_balancer.rs b/harmony/src/modules/okd/load_balancer.rs index e9455581..7e2bd170 100644 --- a/harmony/src/modules/okd/load_balancer.rs +++ b/harmony/src/modules/okd/load_balancer.rs @@ -8,7 +8,7 @@ use crate::{ score::Score, topology::{ BackendServer, HAClusterTopology, HealthCheck, HttpMethod, HttpStatusCode, LoadBalancer, - LoadBalancerService, LogicalHost, Router, SSL, Topology, + LoadBalancerService, Router, SSL, Topology, }, }; @@ -152,7 +152,7 @@ mod tests { use std::sync::{Arc, OnceLock}; use super::*; - use crate::topology::DummyInfra; + use crate::topology::{DummyInfra, LogicalHost}; use harmony_macros::ip; use harmony_types::net::IpAddress; diff --git a/harmony/src/modules/postgresql/score_connect.rs b/harmony/src/modules/postgresql/score_connect.rs index 4a7eabff..e14d7a4e 100644 --- a/harmony/src/modules/postgresql/score_connect.rs +++ b/harmony/src/modules/postgresql/score_connect.rs @@ -303,7 +303,7 @@ impl Interpret for PostgreSQLConnectio let port = self.get_port(&app_data)?; // Create test script - let script_path = self.create_test_script(temp_dir_path)?; + let _ = self.create_test_script(temp_dir_path)?; let ca_file_in_container = Path::new("/tmp").join(ca_file.file_name().unwrap()); let script_cmd = 
format!( diff --git a/harmony/src/modules/postgresql/score_k8s.rs b/harmony/src/modules/postgresql/score_k8s.rs index 5c4b8342..59aa8337 100644 --- a/harmony/src/modules/postgresql/score_k8s.rs +++ b/harmony/src/modules/postgresql/score_k8s.rs @@ -1,5 +1,3 @@ -use std::collections::BTreeMap; - use serde::Serialize; use crate::interpret::Interpret; diff --git a/harmony/src/modules/prometheus/mod.rs b/harmony/src/modules/prometheus/mod.rs deleted file mode 100644 index c4f25ba2..00000000 --- a/harmony/src/modules/prometheus/mod.rs +++ /dev/null @@ -1,5 +0,0 @@ -pub mod alerts; -pub mod k8s_prometheus_alerting_score; -#[allow(clippy::module_inception)] -pub mod prometheus; -pub mod rhob_alerting_score; diff --git a/harmony/src/modules/prometheus/prometheus.rs b/harmony/src/modules/prometheus/prometheus.rs deleted file mode 100644 index efb89da8..00000000 --- a/harmony/src/modules/prometheus/prometheus.rs +++ /dev/null @@ -1,25 +0,0 @@ -use async_trait::async_trait; - -use crate::{ - inventory::Inventory, - topology::{ - PreparationError, PreparationOutcome, - oberservability::monitoring::{AlertReceiver, AlertSender}, - }, -}; - -#[async_trait] -pub trait PrometheusMonitoring { - async fn install_prometheus( - &self, - sender: &S, - inventory: &Inventory, - receivers: Option>>>, - ) -> Result; - - async fn ensure_prometheus_operator( - &self, - sender: &S, - inventory: &Inventory, - ) -> Result; -} diff --git a/harmony_agent/deploy/src/main.rs b/harmony_agent/deploy/src/main.rs index 8baab66b..548f3745 100644 --- a/harmony_agent/deploy/src/main.rs +++ b/harmony_agent/deploy/src/main.rs @@ -1,12 +1,9 @@ use harmony::{ inventory::Inventory, - modules::{ - application::{ - ApplicationScore, - backend_app::{BackendApp, BuildCommand}, - features::{Monitoring, PackagingDeployment}, - }, - monitoring::alert_channel::discord_alert_channel::DiscordWebhook, + modules::application::{ + ApplicationScore, + backend_app::{BackendApp, BuildCommand}, + features::{Monitoring, 
PackagingDeployment}, }, topology::K8sAnywhereTopology, }; @@ -40,14 +37,14 @@ async fn main() { Box::new(PackagingDeployment { application: application.clone(), }), - Box::new(Monitoring { - application: application.clone(), - alert_receiver: vec![Box::new(DiscordWebhook { - name: K8sName("test-discord".to_string()), - url: hurl!("https://discord.doesnt.exist.com"), - selectors: vec![], - })], - }), + // Box::new(Monitoring { + // application: application.clone(), + // alert_receiver: vec![Box::new(DiscordWebhook { + // name: K8sName("test-discord".to_string()), + // url: hurl!("https://discord.doesnt.exist.com"), + // selectors: vec![], + // })], + // }), ], application, };