Compare commits
46 Commits
fix/monito
...
fix/refact
| Author | SHA1 | Date |
|---|---|---|
| | af6145afe3 | |
| | 701d86de69 | |
| | 6db7a780fa | |
| | 0df4e3cdee | |
| | 2a7fa466cc | |
| | f463cd1e94 | |
| | e1da7949ec | |
| | d0a1a73710 | |
| | bc2b328296 | |
| | a93896707f | |
| | 0e9b23a320 | |
| | f532ba2b40 | |
| | fafca31798 | |
| | 5412c34957 | |
| | 787cc8feab | |
| | ce041f495b | |
| | 55de206523 | |
| | 64893a84f5 | |
| | f941672662 | |
| | a98113dd40 | |
| | 5db1a31d33 | |
| | f5aac67af8 | |
| | d7e5bf11d5 | |
| | 2e1f1b8447 | |
| | 2b157ad7fd | |
| | a0c0905c3b | |
| | fe52f69473 | |
| | d8338ad12c | |
| | ac9fedf853 | |
| | fd3705e382 | |
| | 4840c7fdc2 | |
| | 20172a7801 | |
| | 6bb33c5845 | |
| | d9357adad3 | |
| | a25ca86bdf | |
| | 646c5e723e | |
| | 69c382e8c6 | |
| | dca764395d | |
| | 2738985edb | |
| | d9a21bf94b | |
| | 5c34d81d28 | |
| | 8f8bd34168 | |
| | b5e971b3b6 | |
| | a1c0e0e246 | |
| | d084cee8d5 | |
| | 63ef1c0ea7 | |
Cargo.lock (generated, 2604 lines changed)
File diff suppressed because it is too large
Cargo.toml (workspace)

```diff
@@ -2,7 +2,6 @@
 resolver = "2"
 members = [
-    "private_repos/*",
     "examples/*",
     "harmony",
     "harmony_types",
     "harmony_macros",
@@ -17,9 +16,9 @@ members = [
     "harmony_secret_derive",
     "harmony_secret",
     "adr/agent_discovery/mdns",
-    "brocade",
-    "harmony_agent",
-    "harmony_agent/deploy",
+    "brocade",
+    "harmony_agent",
+    "harmony_agent/deploy", "harmony_node_readiness", "harmony-k8s",
 ]
 
 [workspace.package]
@@ -38,6 +37,8 @@ tokio = { version = "1.40", features = [
     "macros",
     "rt-multi-thread",
 ] }
 tokio-retry = "0.3.0"
+tokio-util = "0.7.15"
 cidr = { features = ["serde"], version = "0.2" }
+russh = "0.45"
 russh-keys = "0.45"
```
adr/020-monitoring-alerting-architecture.md (new file, 318 lines)
# Architecture Decision Record: Monitoring and Alerting Architecture

Initial Author: Willem Rolleman, Jean-Gabriel Carrier

Initial Date: March 9, 2026

Last Updated Date: March 9, 2026

## Status

Accepted

Supersedes: [ADR-010](010-monitoring-and-alerting.md)

## Context

Harmony needs a unified approach to monitoring and alerting across different infrastructure targets:

1. **Cluster-level monitoring**: Administrators managing entire Kubernetes/OKD clusters need to define cluster-wide alerts, receivers, and scrape targets.
2. **Tenant-level monitoring**: Multi-tenant clusters where teams are confined to namespaces need monitoring scoped to their resources.
3. **Application-level monitoring**: Developers deploying applications want zero-config monitoring that "just works" for their services.

The monitoring landscape is fragmented:

- **OKD/OpenShift**: Built-in Prometheus with AlertmanagerConfig CRDs
- **KubePrometheus**: Helm-based stack with PrometheusRule CRDs
- **RHOB (Red Hat Observability)**: Operator-based with MonitoringStack CRDs
- **Standalone Prometheus**: Raw Prometheus deployments

Each system has different CRDs, different installation methods, and different configuration APIs.

## Decision

We implement a **trait-based architecture with compile-time capability verification** that provides:

1. **Type-safe abstractions** via parameterized traits: `AlertReceiver<S>`, `AlertRule<S>`, `ScrapeTarget<S>`
2. **Compile-time topology compatibility** via the `Observability<S>` capability bound
3. **Three levels of abstraction**: Cluster, Tenant, and Application monitoring
4. **Pre-built alert rules** as functions that return typed structs

### Core Traits
```rust
// domain/topology/monitoring.rs

/// Marker trait for systems that send alerts (Prometheus, etc.)
pub trait AlertSender: Send + Sync + std::fmt::Debug {
    fn name(&self) -> String;
}

/// Defines how a receiver (Discord, Slack, etc.) builds its configuration
/// for a specific sender type
pub trait AlertReceiver<S: AlertSender>: std::fmt::Debug + Send + Sync {
    fn build(&self) -> Result<ReceiverInstallPlan, InterpretError>;
    fn name(&self) -> String;
    fn clone_box(&self) -> Box<dyn AlertReceiver<S>>;
}

/// Defines how an alert rule builds its PrometheusRule configuration
pub trait AlertRule<S: AlertSender>: std::fmt::Debug + Send + Sync {
    fn build_rule(&self) -> Result<serde_json::Value, InterpretError>;
    fn name(&self) -> String;
    fn clone_box(&self) -> Box<dyn AlertRule<S>>;
}

/// Capability that topologies implement to support monitoring
pub trait Observability<S: AlertSender> {
    async fn install_alert_sender(&self, sender: &S, inventory: &Inventory)
        -> Result<PreparationOutcome, PreparationError>;
    async fn install_receivers(&self, sender: &S, inventory: &Inventory,
        receivers: Option<Vec<Box<dyn AlertReceiver<S>>>>) -> Result<...>;
    async fn install_rules(&self, sender: &S, inventory: &Inventory,
        rules: Option<Vec<Box<dyn AlertRule<S>>>>) -> Result<...>;
    async fn add_scrape_targets(&self, sender: &S, inventory: &Inventory,
        scrape_targets: Option<Vec<Box<dyn ScrapeTarget<S>>>>) -> Result<...>;
    async fn ensure_monitoring_installed(&self, sender: &S, inventory: &Inventory)
        -> Result<...>;
}
```
### Alert Sender Types

Each monitoring stack is a distinct `AlertSender`:

| Sender | Module | Use Case |
|--------|--------|----------|
| `OpenshiftClusterAlertSender` | `monitoring/okd/` | OKD/OpenShift built-in monitoring |
| `KubePrometheus` | `monitoring/kube_prometheus/` | Helm-deployed kube-prometheus-stack |
| `Prometheus` | `monitoring/prometheus/` | Standalone Prometheus via Helm |
| `RedHatClusterObservability` | `monitoring/red_hat_cluster_observability/` | RHOB operator |
| `Grafana` | `monitoring/grafana/` | Grafana-managed alerting |

### Three Levels of Monitoring

#### 1. Cluster-Level Monitoring

For cluster administrators. Full control over monitoring infrastructure.

```rust
// examples/okd_cluster_alerts/src/main.rs
OpenshiftClusterAlertScore {
    sender: OpenshiftClusterAlertSender,
    receivers: vec![Box::new(DiscordReceiver { ... })],
    rules: vec![Box::new(alert_rules)],
    scrape_targets: Some(vec![Box::new(external_exporters)]),
}
```

**Characteristics:**
- Cluster-scoped CRDs and resources
- Can add external scrape targets (outside cluster)
- Manages Alertmanager configuration
- Requires cluster-admin privileges

#### 2. Tenant-Level Monitoring

For teams confined to namespaces. The topology determines tenant context.

```rust
// The topology's Observability impl handles namespace scoping
impl Observability<KubePrometheus> for K8sAnywhereTopology {
    async fn install_rules(&self, sender: &KubePrometheus, ...) {
        // Topology knows if it's tenant-scoped
        let namespace = self.get_tenant_config().await
            .map(|t| t.name)
            .unwrap_or_else(|| "default".to_string());
        // Install rules in tenant namespace
    }
}
```
**Characteristics:**
- Namespace-scoped resources
- Cannot modify cluster-level monitoring config
- May have restricted receiver types
- Runtime validation of permissions (cannot be fully compile-time)

#### 3. Application-Level Monitoring

For developers. Zero-config, opinionated monitoring.

```rust
// modules/application/features/monitoring.rs
pub struct Monitoring {
    pub application: Arc<dyn Application>,
    pub alert_receiver: Vec<Box<dyn AlertReceiver<Prometheus>>>,
}

impl<T: Topology + Observability<Prometheus> + TenantManager + ...>
    ApplicationFeature<T> for Monitoring
{
    async fn ensure_installed(&self, topology: &T) -> Result<...> {
        // Auto-creates ServiceMonitor
        // Auto-installs Ntfy for notifications
        // Handles tenant namespace automatically
        // Wires up sensible defaults
    }
}
```

**Characteristics:**
- Automatic ServiceMonitor creation
- Opinionated notification channel (Ntfy)
- Tenant-aware via topology
- Minimal configuration required

## Rationale

### Why Generic Traits Instead of Unified Types?

Each monitoring stack (OKD, KubePrometheus, RHOB) has fundamentally different CRDs:

```rust
// OKD uses AlertmanagerConfig with a different structure
AlertmanagerConfig { spec: { receivers: [...] } }

// RHOB uses secret references for webhook URLs
MonitoringStack { spec: { alertmanagerConfig: { discordConfigs: [{ apiURL: { key: "..." } }] } } }

// KubePrometheus uses the Alertmanager CRD with different field names
Alertmanager { spec: { config: { receivers: [...] } } }
```

A unified type would either:
1. Be a lowest-common-denominator (loses stack-specific features)
2. Be a complex union type (hard to use, easy to misconfigure)

Generic traits let each stack express its configuration naturally while providing a consistent interface.

### Why Compile-Time Capability Bounds?

```rust
impl<T: Topology + Observability<OpenshiftClusterAlertSender>> Score<T>
    for OpenshiftClusterAlertScore { ... }
```

This fails at compile time if you try to use `OpenshiftClusterAlertScore` with a topology that doesn't support OKD monitoring. This prevents the "config-is-valid-but-platform-is-wrong" errors that Harmony was designed to eliminate.

### Why Not a MonitoringStack Abstraction (V2 Approach)?

The V2 approach proposed a unified `MonitoringStack` that hides sender selection:

```rust
// V2 approach - rejected
MonitoringStack::new(MonitoringApiVersion::V2CRD)
    .add_alert_channel(discord)
```

**Problems:**
1. Hides which sender you're using, losing compile-time guarantees
2. "Version selection" actually chooses between fundamentally different systems
3. Would need to handle all stack-specific features through a generic interface

The current approach is explicit: you choose `OpenshiftClusterAlertSender` and the compiler verifies compatibility.

### Why Runtime Validation for Tenants?

Tenant confinement is determined at runtime by the topology and K8s RBAC. We cannot know at compile time whether a user has cluster-admin or namespace-only access.

Options considered:
1. **Compile-time tenant markers** - Would require modeling the entire RBAC hierarchy in types. Over-engineering.
2. **Runtime validation** - Current approach. Fails with clear K8s permission errors if access is insufficient.
3. **No tenant support** - Would exclude a major use case.

Runtime validation is the pragmatic choice. The failure mode is clear (a K8s API error) and occurs early in execution.
> Note: we will eventually have compile-time validation for such things. Rust macros are powerful, and we could discover the actual capabilities we're dealing with, similar to the sqlx approach in its `query!` macros.
## Consequences

### Pros

1. **Type Safety**: Invalid configurations are caught at compile time
2. **Extensibility**: Adding a new monitoring stack requires implementing traits, not modifying core code
3. **Clear Separation**: Cluster/Tenant/Application levels have distinct entry points
4. **Reusable Rules**: Pre-built alert rules as functions (`high_pvc_fill_rate_over_two_days()`)
5. **CRD Accuracy**: Type definitions match actual Kubernetes CRDs exactly

### Cons

1. **Implementation Explosion**: `DiscordReceiver` implements `AlertReceiver<S>` for each sender type (3+ implementations)
2. **Learning Curve**: Understanding the trait hierarchy takes time
3. **clone_box Boilerplate**: Required for trait object cloning (3 lines per impl)

### Mitigations

- Implementation explosion is contained: each receiver type has O(senders) implementations, but receivers are rare compared to rules
- The learning curve is documented with examples at each level
- The clone_box boilerplate is minimal and copy-paste (see the sketch below)
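For readers unfamiliar with the idiom, here is a minimal, self-contained sketch of the `clone_box` pattern the cons and mitigations refer to. The type names are illustrative stand-ins, not Harmony's actual definitions:

```rust
// Illustrative stand-ins for the real Harmony traits.
#[derive(Clone, Debug)]
struct DiscordReceiver {
    name: String,
}

trait AlertReceiver: std::fmt::Debug {
    fn name(&self) -> String;
    fn clone_box(&self) -> Box<dyn AlertReceiver>;
}

impl AlertReceiver for DiscordReceiver {
    fn name(&self) -> String {
        self.name.clone()
    }
    // The "3 lines per impl" of boilerplate:
    fn clone_box(&self) -> Box<dyn AlertReceiver> {
        Box::new(self.clone())
    }
}

// A single blanket impl then makes every Box<dyn AlertReceiver> cloneable.
impl Clone for Box<dyn AlertReceiver> {
    fn clone(&self) -> Self {
        self.clone_box()
    }
}

fn main() {
    let receiver: Box<dyn AlertReceiver> = Box::new(DiscordReceiver { name: "ops".into() });
    let copy = receiver.clone(); // works because Box<dyn AlertReceiver> is now Clone
    println!("{}", copy.name());
}
```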
## Alternatives Considered

### Unified MonitoringStack Type

See "Why Not a MonitoringStack Abstraction" above. Rejected for losing compile-time safety.

### Helm-Only Approach

Use `HelmScore` directly for each monitoring deployment. Rejected because:
- No type safety for alert rules
- Cannot compose with application features
- No tenant awareness

### Separate Modules Per Use Case

Have `cluster_monitoring/`, `tenant_monitoring/`, `app_monitoring/` as separate modules. Rejected because:
- Massive code duplication
- No shared abstraction for receivers/rules
- Adding a feature requires three implementations

## Implementation Notes

### Module Structure

```
modules/monitoring/
├── mod.rs                          # Public exports
├── alert_channel/                  # Receivers (Discord, Webhook)
├── alert_rule/                     # Rules and pre-built alerts
│   ├── prometheus_alert_rule.rs
│   └── alerts/                     # Library of pre-built rules
│       ├── k8s/                    # K8s-specific (pvc, pod, memory)
│       └── infra/                  # Infrastructure (opnsense, dell)
├── okd/                            # OpenshiftClusterAlertSender
├── kube_prometheus/                # KubePrometheus
├── prometheus/                     # Prometheus
├── red_hat_cluster_observability/  # RHOB
├── grafana/                        # Grafana
├── application_monitoring/         # Application-level scores
└── scrape_target/                  # External scrape targets
```

### Adding a New Alert Sender

1. Create the sender type: `pub struct MySender; impl AlertSender for MySender { ... }`
2. Implement `Observability<MySender>` for topologies that support it
3. Create CRD types in a `crd/` subdirectory
4. Implement `AlertReceiver<MySender>` for existing receivers
5. Implement `AlertRule<MySender>` for `AlertManagerRuleGroup`
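A minimal sketch of steps 1 and 4, reusing the trait signatures from the Core Traits section. The `MySender` name and the body of `build` are illustrative assumptions, not part of the codebase:

```rust
// Step 1: the new sender type.
#[derive(Debug, Clone)]
pub struct MySender;

impl AlertSender for MySender {
    fn name(&self) -> String {
        "MySender".to_string()
    }
}

// Step 4: teach an existing receiver to emit MySender's configuration format
// (assumes DiscordReceiver implements Clone).
impl AlertReceiver<MySender> for DiscordReceiver {
    fn build(&self) -> Result<ReceiverInstallPlan, InterpretError> {
        // Translate the Discord webhook into MySender's receiver format here.
        todo!("map the webhook URL and routes into MySender's CRD")
    }

    fn name(&self) -> String {
        self.name.clone()
    }

    fn clone_box(&self) -> Box<dyn AlertReceiver<MySender>> {
        Box::new(self.clone())
    }
}
```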
### Adding a New Alert Rule

```rust
pub fn my_custom_alert() -> PrometheusAlertRule {
    PrometheusAlertRule::new("MyAlert", "up == 0")
        .for_duration("5m")
        .label("severity", "critical")
        .annotation("summary", "Service is down")
}
```

No trait implementation needed - `AlertManagerRuleGroup` already handles the conversion.
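To deploy the rule, wrap it in a rule group the same way the cluster examples do; `"my-rules"` is an arbitrary group name:

```rust
let group = AlertManagerRuleGroup::new("my-rules", vec![my_custom_alert()]);
```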
## Related ADRs

- [ADR-013](013-monitoring-notifications.md): Notification channel selection (ntfy)
- [ADR-011](011-multi-tenant-cluster.md): Multi-tenant cluster architecture
examples/monitoring_v2/Cargo.toml (new file, 21 lines)

```toml
[package]
name = "example-monitoring-v2"
edition = "2024"
version.workspace = true
readme.workspace = true
license.workspace = true

[dependencies]
harmony = { path = "../../harmony" }
harmony_cli = { path = "../../harmony_cli" }
harmony-k8s = { path = "../../harmony-k8s" }
harmony_types = { path = "../../harmony_types" }
kube = { workspace = true }
schemars = "0.8"
serde = { workspace = true, features = ["derive"] }
serde_json = { workspace = true }
serde_yaml = { workspace = true }
url = { workspace = true }
log = { workspace = true }
async-trait = { workspace = true }
k8s-openapi = { workspace = true }
```
examples/monitoring_v2/README.md (new file, 91 lines)

# Monitoring v2 - Improved Architecture

This example demonstrates the improved monitoring architecture that addresses the "WTF/minute" issues in the original design.

## Key Improvements

### 1. **Single AlertChannel Trait with Generic Sender**

The original design required 9-12 implementations for each alert channel (Discord, Webhook, etc.) - one for each sender type. The new design uses a single trait with generic sender parameterization:

```rust
pub trait AlertChannel<Sender: AlertSender> {
    async fn install_config(&self, sender: &Sender) -> Result<Outcome, InterpretError>;
    fn name(&self) -> String;
    fn as_any(&self) -> &dyn std::any::Any;
}
```

**Benefits:**
- One Discord implementation works with all sender types
- Type safety at compile time
- No runtime dispatch overhead

### 2. **MonitoringStack Abstraction**

Instead of manually selecting CRDPrometheus vs KubePrometheus vs RHOBObservability, you now have a unified MonitoringStack that handles versioning:

```rust
let monitoring_stack = MonitoringStack::new(MonitoringApiVersion::V2CRD)
    .set_namespace("monitoring")
    .add_alert_channel(discord_receiver)
    .set_scrape_targets(vec![...]);
```

**Benefits:**
- Single source of truth for monitoring configuration
- Easy to switch between monitoring versions
- Automatic version-specific configuration

### 3. **TenantMonitoringScore - True Composition**

The original monitoring_with_tenant example just put tenant and monitoring as separate items in a vec. The new design truly composes them:

```rust
let tenant_score = TenantMonitoringScore::new("test-tenant", monitoring_stack);
```

This creates a single score that:
- Has tenant context
- Has monitoring configuration
- Automatically installs monitoring scoped to the tenant namespace

**Benefits:**
- No more "two separate things" confusion
- Automatic tenant namespace scoping
- Clear ownership: the tenant owns its monitoring
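A minimal end-to-end sketch of running the composed score, assuming the `MonitoringStack` and `TenantMonitoringScore` APIs defined in this example's `src/main.rs` and the `harmony_cli::run` entry point used throughout the repository:

```rust
// Sketch only: types come from src/main.rs in this example.
let stack = MonitoringStack::new(MonitoringApiVersion::V2CRD)
    .set_namespace("monitoring");
let tenant_score = TenantMonitoringScore::new("test-tenant", stack);

harmony_cli::run(
    Inventory::autoload(),
    K8sAnywhereTopology::from_env(),
    vec![Box::new(tenant_score)],
    None,
)
.await
.unwrap();
```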
### 4. **Versioned Monitoring APIs**

Clear versioning makes it obvious which monitoring stack you're using:

```rust
pub enum MonitoringApiVersion {
    V1Helm, // Old Helm charts
    V2CRD,  // Current CRDs
    V3RHOB, // RHOB (future)
}
```

**Benefits:**
- No guessing which API version you're using
- Easy to migrate between versions
- Backward compatibility path

## Comparison

### Original Design (monitoring_with_tenant)
- Manual selection of each component
- Manual installation of both components
- Need to remember to pass both to harmony_cli::run
- Monitoring not scoped to tenant automatically

### New Design (monitoring_v2)
- Single composed score
- One score does it all

## Usage

```sh
cd examples/monitoring_v2
cargo run
```

## Migration Path

To migrate from the old design to the new:

1. Replace individual alert channel implementations with AlertChannel<Sender>
2. Use MonitoringStack instead of manual *Prometheus selection
3. Use TenantMonitoringScore instead of separate TenantScore + monitoring scores
4. Select the monitoring version via MonitoringApiVersion
examples/monitoring_v2/src/main.rs (new file, 343 lines)

```rust
use std::collections::HashMap;
use std::sync::{Arc, Mutex};

use log::debug;
use serde::{Deserialize, Serialize};
use serde_yaml::{Mapping, Value};

use harmony::data::Version;
use harmony::interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome};
use harmony::inventory::Inventory;
use harmony::score::Score;
use harmony::topology::{Topology, tenant::TenantManager};

use harmony_k8s::K8sClient;
use harmony_types::k8s_name::K8sName;
use harmony_types::net::Url;

pub trait AlertSender: Send + Sync + std::fmt::Debug {
    fn name(&self) -> String;
    fn namespace(&self) -> String;
}

#[derive(Debug)]
pub struct CRDPrometheus {
    pub namespace: String,
    pub client: Arc<K8sClient>,
}

impl AlertSender for CRDPrometheus {
    fn name(&self) -> String {
        "CRDPrometheus".to_string()
    }

    fn namespace(&self) -> String {
        self.namespace.clone()
    }
}

#[derive(Debug)]
pub struct RHOBObservability {
    pub namespace: String,
    pub client: Arc<K8sClient>,
}

impl AlertSender for RHOBObservability {
    fn name(&self) -> String {
        "RHOBObservability".to_string()
    }

    fn namespace(&self) -> String {
        self.namespace.clone()
    }
}

#[derive(Debug)]
pub struct KubePrometheus {
    pub config: Arc<Mutex<KubePrometheusConfig>>,
}

impl Default for KubePrometheus {
    fn default() -> Self {
        Self::new()
    }
}

impl KubePrometheus {
    pub fn new() -> Self {
        Self {
            config: Arc::new(Mutex::new(KubePrometheusConfig::new())),
        }
    }
}

impl AlertSender for KubePrometheus {
    fn name(&self) -> String {
        "KubePrometheus".to_string()
    }

    fn namespace(&self) -> String {
        self.config.lock().unwrap().namespace.clone().unwrap_or_else(|| "monitoring".to_string())
    }
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct KubePrometheusConfig {
    pub namespace: Option<String>,
    #[serde(skip)]
    pub alert_receiver_configs: Vec<AlertManagerChannelConfig>,
}

impl KubePrometheusConfig {
    pub fn new() -> Self {
        Self {
            namespace: None,
            alert_receiver_configs: Vec::new(),
        }
    }
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AlertManagerChannelConfig {
    pub channel_receiver: serde_yaml::Value,
    pub channel_route: serde_yaml::Value,
}

impl Default for AlertManagerChannelConfig {
    fn default() -> Self {
        Self {
            channel_receiver: serde_yaml::Value::Mapping(Default::default()),
            channel_route: serde_yaml::Value::Mapping(Default::default()),
        }
    }
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScrapeTargetConfig {
    pub service_name: String,
    pub port: String,
    pub path: String,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum MonitoringApiVersion {
    V1Helm,
    V2CRD,
    V3RHOB,
}

#[derive(Debug, Clone)]
pub struct MonitoringStack {
    pub version: MonitoringApiVersion,
    pub namespace: String,
    pub alert_channels: Vec<Arc<dyn AlertSender>>,
    pub scrape_targets: Vec<ScrapeTargetConfig>,
}

impl MonitoringStack {
    pub fn new(version: MonitoringApiVersion) -> Self {
        Self {
            version,
            namespace: "monitoring".to_string(),
            alert_channels: Vec::new(),
            scrape_targets: Vec::new(),
        }
    }

    pub fn set_namespace(mut self, namespace: &str) -> Self {
        self.namespace = namespace.to_string();
        self
    }

    pub fn add_alert_channel(mut self, channel: impl AlertSender + 'static) -> Self {
        self.alert_channels.push(Arc::new(channel));
        self
    }

    pub fn set_scrape_targets(mut self, targets: Vec<(&str, &str, String)>) -> Self {
        self.scrape_targets = targets
            .into_iter()
            .map(|(name, port, path)| ScrapeTargetConfig {
                service_name: name.to_string(),
                port: port.to_string(),
                path,
            })
            .collect();
        self
    }
}

pub trait AlertChannel<Sender: AlertSender> {
    fn install_config(&self, sender: &Sender);
    fn name(&self) -> String;
}

#[derive(Debug, Clone)]
pub struct DiscordWebhook {
    pub name: K8sName,
    pub url: Url,
    pub selectors: Vec<HashMap<String, String>>,
}

impl DiscordWebhook {
    fn get_config(&self) -> AlertManagerChannelConfig {
        let mut route = Mapping::new();
        route.insert(
            Value::String("receiver".to_string()),
            Value::String(self.name.to_string()),
        );
        route.insert(
            Value::String("matchers".to_string()),
            Value::Sequence(vec![Value::String("alertname!=Watchdog".to_string())]),
        );

        let mut receiver = Mapping::new();
        receiver.insert(
            Value::String("name".to_string()),
            Value::String(self.name.to_string()),
        );

        let mut discord_config = Mapping::new();
        discord_config.insert(
            Value::String("webhook_url".to_string()),
            Value::String(self.url.to_string()),
        );

        receiver.insert(
            Value::String("discord_configs".to_string()),
            Value::Sequence(vec![Value::Mapping(discord_config)]),
        );

        AlertManagerChannelConfig {
            channel_receiver: Value::Mapping(receiver),
            channel_route: Value::Mapping(route),
        }
    }
}

impl AlertChannel<CRDPrometheus> for DiscordWebhook {
    fn install_config(&self, sender: &CRDPrometheus) {
        debug!("Installing Discord webhook for CRDPrometheus in namespace: {}", sender.namespace());
        debug!("Config: {:?}", self.get_config());
        debug!("Installed!");
    }

    fn name(&self) -> String {
        "discord-webhook".to_string()
    }
}

impl AlertChannel<RHOBObservability> for DiscordWebhook {
    fn install_config(&self, sender: &RHOBObservability) {
        debug!("Installing Discord webhook for RHOBObservability in namespace: {}", sender.namespace());
        debug!("Config: {:?}", self.get_config());
        debug!("Installed!");
    }

    fn name(&self) -> String {
        "webhook-receiver".to_string()
    }
}

impl AlertChannel<KubePrometheus> for DiscordWebhook {
    fn install_config(&self, sender: &KubePrometheus) {
        debug!("Installing Discord webhook for KubePrometheus in namespace: {}", sender.namespace());
        // Lock the config once: taking the std::sync::Mutex a second time while
        // the first guard is still alive would deadlock.
        let mut config = sender.config.lock().unwrap();
        let ns = config.namespace.clone().unwrap_or_else(|| "monitoring".to_string());
        debug!("Namespace: {}", ns);
        config.alert_receiver_configs.push(self.get_config());
        debug!("Installed!");
    }
    fn name(&self) -> String {
        "discord-webhook".to_string()
    }
}

fn default_monitoring_stack() -> MonitoringStack {
    MonitoringStack::new(MonitoringApiVersion::V2CRD)
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TenantMonitoringScore {
    pub tenant_id: harmony_types::id::Id,
    pub tenant_name: String,
    #[serde(skip)]
    #[serde(default = "default_monitoring_stack")]
    pub monitoring_stack: MonitoringStack,
}

impl TenantMonitoringScore {
    pub fn new(tenant_name: &str, monitoring_stack: MonitoringStack) -> Self {
        Self {
            tenant_id: harmony_types::id::Id::default(),
            tenant_name: tenant_name.to_string(),
            monitoring_stack,
        }
    }
}

impl<T: Topology + TenantManager> Score<T> for TenantMonitoringScore {
    fn create_interpret(&self) -> Box<dyn Interpret<T>> {
        Box::new(TenantMonitoringInterpret {
            score: self.clone(),
        })
    }

    fn name(&self) -> String {
        format!("{} monitoring [TenantMonitoringScore]", self.tenant_name)
    }
}

#[derive(Debug)]
pub struct TenantMonitoringInterpret {
    pub score: TenantMonitoringScore,
}

#[async_trait::async_trait]
impl<T: Topology + TenantManager> Interpret<T> for TenantMonitoringInterpret {
    async fn execute(
        &self,
        _inventory: &Inventory,
        topology: &T,
    ) -> Result<Outcome, InterpretError> {
        let tenant_config = topology.get_tenant_config().await.unwrap();
        let tenant_ns = tenant_config.name.clone();

        match self.score.monitoring_stack.version {
            MonitoringApiVersion::V1Helm => {
                debug!("Installing Helm monitoring for tenant {}", tenant_ns);
            }
            MonitoringApiVersion::V2CRD => {
                debug!("Installing CRD monitoring for tenant {}", tenant_ns);
            }
            MonitoringApiVersion::V3RHOB => {
                debug!("Installing RHOB monitoring for tenant {}", tenant_ns);
            }
        }

        Ok(Outcome::success(format!(
            "Installed monitoring stack for tenant {} with version {:?}",
            self.score.tenant_name,
            self.score.monitoring_stack.version
        )))
    }

    fn get_name(&self) -> InterpretName {
        InterpretName::Custom("TenantMonitoringInterpret")
    }

    fn get_version(&self) -> Version {
        Version::from("1.0.0").unwrap()
    }

    fn get_status(&self) -> InterpretStatus {
        InterpretStatus::SUCCESS
    }

    fn get_children(&self) -> Vec<harmony_types::id::Id> {
        Vec::new()
    }
}
```
```diff
@@ -1,8 +1,7 @@
 use super::BrocadeClient;
 use crate::{
     BrocadeInfo, Error, ExecutionMode, InterSwitchLink, InterfaceInfo, MacAddressEntry,
-    PortChannelId, PortOperatingMode, SecurityLevel, parse_brocade_mac_address,
-    shell::BrocadeShell,
+    PortChannelId, PortOperatingMode, parse_brocade_mac_address, shell::BrocadeShell,
 };
 
 use async_trait::async_trait;
```
```diff
@@ -8,7 +8,7 @@ use regex::Regex;
 use crate::{
     BrocadeClient, BrocadeInfo, Error, ExecutionMode, InterSwitchLink, InterfaceInfo,
     InterfaceStatus, InterfaceType, MacAddressEntry, PortChannelId, PortOperatingMode,
-    SecurityLevel, parse_brocade_mac_address, shell::BrocadeShell,
+    parse_brocade_mac_address, shell::BrocadeShell,
 };
 
 #[derive(Debug)]
```
```diff
@@ -31,3 +31,16 @@ Ready to build your own components? These guides show you how.
 - [**Writing a Score**](./guides/writing-a-score.md): Learn how to create your own `Score` and `Interpret` logic to define a new desired state.
 - [**Writing a Topology**](./guides/writing-a-topology.md): Learn how to model a new environment (like AWS, GCP, or custom hardware) as a `Topology`.
 - [**Adding Capabilities**](./guides/adding-capabilities.md): See how to add a `Capability` to your custom `Topology`.
+- [**Coding Guide**](./coding-guide.md): Conventions and best practices for writing Harmony code.
+
+## 5. Module Documentation
+
+Deep dives into specific Harmony modules and features.
+
+- [**Monitoring and Alerting**](./monitoring.md): Comprehensive guide to cluster, tenant, and application-level monitoring with support for OKD, KubePrometheus, RHOB, and more.
+
+## 6. Architecture Decision Records
+
+Important architectural decisions are documented in the `adr/` directory:
+
+- [Full ADR Index](../adr/)
```
docs/monitoring.md (new file, 443 lines)
# Monitoring and Alerting in Harmony

Harmony provides a unified, type-safe approach to monitoring and alerting across Kubernetes, OpenShift, and bare-metal infrastructure. This guide explains the architecture and how to use it at different levels of abstraction.

## Overview

Harmony's monitoring module supports three distinct use cases:

| Level | Who Uses It | What It Provides |
|-------|-------------|------------------|
| **Cluster** | Cluster administrators | Full control over monitoring stack, cluster-wide alerts, external scrape targets |
| **Tenant** | Platform teams | Namespace-scoped monitoring in multi-tenant environments |
| **Application** | Application developers | Zero-config monitoring that "just works" |

Each level builds on the same underlying abstractions, ensuring consistency while providing appropriate complexity for each audience.

## Core Concepts

### AlertSender

An `AlertSender` represents the system that evaluates alert rules and sends notifications. Harmony supports multiple monitoring stacks:

| Sender | Description | Use When |
|--------|-------------|----------|
| `OpenshiftClusterAlertSender` | OKD/OpenShift built-in monitoring | Running on OKD/OpenShift |
| `KubePrometheus` | kube-prometheus-stack via Helm | Standard Kubernetes, need full stack |
| `Prometheus` | Standalone Prometheus | Custom Prometheus deployment |
| `RedHatClusterObservability` | RHOB operator | Red Hat managed clusters |
| `Grafana` | Grafana-managed alerting | Grafana as primary alerting layer |

### AlertReceiver

An `AlertReceiver` defines where alerts are sent (Discord, Slack, email, webhook, etc.). Receivers are parameterized by sender type because each monitoring stack has different configuration formats.

```rust
pub trait AlertReceiver<S: AlertSender> {
    fn build(&self) -> Result<ReceiverInstallPlan, InterpretError>;
    fn name(&self) -> String;
}
```

Built-in receivers:
- `DiscordReceiver` - Discord webhooks
- `WebhookReceiver` - Generic HTTP webhooks

### AlertRule

An `AlertRule` defines a Prometheus alert expression. Rules are also parameterized by sender to handle different CRD formats.

```rust
pub trait AlertRule<S: AlertSender> {
    fn build_rule(&self) -> Result<serde_json::Value, InterpretError>;
    fn name(&self) -> String;
}
```

### Observability Capability

Topologies implement `Observability<S>` to indicate they support a specific alert sender:

```rust
impl Observability<OpenshiftClusterAlertSender> for K8sAnywhereTopology {
    async fn install_receivers(&self, sender, inventory, receivers) { ... }
    async fn install_rules(&self, sender, inventory, rules) { ... }
    // ...
}
```

This provides **compile-time verification**: if you try to use `OpenshiftClusterAlertScore` with a topology that doesn't implement `Observability<OpenshiftClusterAlertSender>`, the code won't compile.

---

## Level 1: Cluster Monitoring

Cluster monitoring is for administrators who need full control over the monitoring infrastructure. This includes:

- Installing/managing the monitoring stack
- Configuring cluster-wide alert receivers
- Defining cluster-level alert rules
- Adding external scrape targets (e.g., bare-metal servers, firewalls)

### Example: OKD Cluster Alerts

```rust
use harmony::{
    modules::monitoring::{
        alert_channel::discord_alert_channel::DiscordReceiver,
        alert_rule::{alerts::k8s::pvc::high_pvc_fill_rate_over_two_days, prometheus_alert_rule::AlertManagerRuleGroup},
        okd::openshift_cluster_alerting_score::OpenshiftClusterAlertScore,
        scrape_target::prometheus_node_exporter::PrometheusNodeExporter,
    },
    topology::{K8sAnywhereTopology, monitoring::{AlertMatcher, AlertRoute, MatchOp}},
};

let severity_matcher = AlertMatcher {
    label: "severity".to_string(),
    operator: MatchOp::Eq,
    value: "critical".to_string(),
};

let rule_group = AlertManagerRuleGroup::new(
    "cluster-rules",
    vec![high_pvc_fill_rate_over_two_days()],
);

let external_exporter = PrometheusNodeExporter {
    job_name: "firewall".to_string(),
    metrics_path: "/metrics".to_string(),
    listen_address: ip!("192.168.1.1"),
    port: 9100,
    ..Default::default()
};

harmony_cli::run(
    Inventory::autoload(),
    K8sAnywhereTopology::from_env(),
    vec![Box::new(OpenshiftClusterAlertScore {
        sender: OpenshiftClusterAlertSender,
        receivers: vec![Box::new(DiscordReceiver {
            name: "critical-alerts".to_string(),
            url: hurl!("https://discord.com/api/webhooks/..."),
            route: AlertRoute {
                matchers: vec![severity_matcher],
                ..AlertRoute::default("critical-alerts".to_string())
            },
        })],
        rules: vec![Box::new(rule_group)],
        scrape_targets: Some(vec![Box::new(external_exporter)]),
    })],
    None,
).await?;
```

### What This Does

1. **Enables cluster monitoring** - Activates OKD's built-in Prometheus
2. **Enables user workload monitoring** - Allows namespace-scoped rules
3. **Configures Alertmanager** - Adds a Discord receiver with route matching
4. **Deploys alert rules** - Creates an `AlertingRule` CRD with the PVC fill rate alert
5. **Adds an external scrape target** - Configures Prometheus to scrape the firewall

### Compile-Time Safety

The `OpenshiftClusterAlertScore` requires:

```rust
impl<T: Topology + Observability<OpenshiftClusterAlertSender>> Score<T>
    for OpenshiftClusterAlertScore
```

If `K8sAnywhereTopology` didn't implement `Observability<OpenshiftClusterAlertSender>`, this code would fail to compile. You cannot accidentally deploy OKD alerts to a cluster that doesn't support them.
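To make the failure mode concrete, here is a hedged sketch with a hypothetical `BareTopology` type; the error text is indicative of what rustc reports for an unsatisfied trait bound, not verbatim compiler output:

```rust
// Hypothetical topology that implements Topology but NOT
// Observability<OpenshiftClusterAlertSender>.
#[derive(Debug)]
struct BareTopology;
impl Topology for BareTopology { /* ... */ }

// This line intentionally does not compile: Score<T> for
// OpenshiftClusterAlertScore is only implemented where
// T: Observability<OpenshiftClusterAlertSender>.
//
// error[E0277]: the trait bound
//   `BareTopology: Observability<OpenshiftClusterAlertSender>` is not satisfied
let score: Box<dyn Score<BareTopology>> =
    Box::new(OpenshiftClusterAlertScore { /* ... */ });
```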
---

## Level 2: Tenant Monitoring

In multi-tenant clusters, teams are often confined to specific namespaces. Tenant monitoring adapts to this constraint:

- Resources are deployed in the tenant's namespace
- Cannot modify cluster-level monitoring configuration
- The topology determines namespace context at runtime

### How It Works

The topology's `Observability` implementation handles tenant scoping:

```rust
impl Observability<KubePrometheus> for K8sAnywhereTopology {
    async fn install_rules(&self, sender, inventory, rules) {
        // Topology knows if it's tenant-scoped
        let namespace = self.get_tenant_config().await
            .map(|t| t.name)
            .unwrap_or_else(|| "monitoring".to_string());

        // Rules are installed in the appropriate namespace
        for rule in rules.unwrap_or_default() {
            let score = KubePrometheusRuleScore {
                sender: sender.clone(),
                rule,
                namespace: namespace.clone(), // Tenant namespace
            };
            score.create_interpret().execute(inventory, self).await?;
        }
    }
}
```

### Tenant vs Cluster Resources

| Resource | Cluster-Level | Tenant-Level |
|----------|---------------|--------------|
| Alertmanager config | Global receivers | Namespaced receivers (where supported) |
| PrometheusRules | Cluster-wide alerts | Namespace alerts only |
| ServiceMonitors | Any namespace | Own namespace only |
| External scrape targets | Can add | Cannot add (cluster config) |

### Runtime Validation

Tenant constraints are validated at runtime via Kubernetes RBAC. If a tenant-scoped deployment attempts cluster-level operations, it fails with a clear permission error from the Kubernetes API.

This cannot be fully compile-time because tenant context is determined by who's running the code and what permissions they have—information only available at runtime.

---

## Level 3: Application Monitoring

Application monitoring provides zero-config, opinionated monitoring for developers. Just add the `Monitoring` feature to your application and it works.

### Example

```rust
use harmony::modules::{
    application::{Application, ApplicationFeature},
    monitoring::alert_channel::webhook_receiver::WebhookReceiver,
};

// Define your application
let my_app = MyApplication::new();

// Add monitoring as a feature
let monitoring = Monitoring {
    application: Arc::new(my_app),
    alert_receiver: vec![], // Uses defaults
};

// Install with the application
my_app.add_feature(monitoring);
```

### What Application Monitoring Provides

1. **Automatic ServiceMonitor** - Creates a ServiceMonitor for your application's pods
2. **Ntfy Notification Channel** - Auto-installs and configures Ntfy for push notifications
3. **Tenant Awareness** - Automatically scopes to the correct namespace
4. **Sensible Defaults** - Pre-configured alert routes and receivers

### Under the Hood

```rust
impl<T: Topology + Observability<Prometheus> + TenantManager>
    ApplicationFeature<T> for Monitoring
{
    async fn ensure_installed(&self, topology: &T) -> Result<...> {
        // 1. Get tenant namespace (or use app name)
        let namespace = topology.get_tenant_config().await
            .map(|ns| ns.name.clone())
            .unwrap_or_else(|| self.application.name());

        // 2. Create ServiceMonitor for the app
        let app_service_monitor = ServiceMonitor {
            metadata: ObjectMeta {
                name: Some(self.application.name()),
                namespace: Some(namespace.clone()),
                ..Default::default()
            },
            spec: ServiceMonitorSpec::default(),
        };

        // 3. Install Ntfy for notifications
        let ntfy = NtfyScore { namespace, host };
        ntfy.interpret(&Inventory::empty(), topology).await?;

        // 4. Wire up webhook receiver to Ntfy
        let ntfy_receiver = WebhookReceiver { ... };

        // 5. Execute monitoring score
        alerting_score.interpret(&Inventory::empty(), topology).await?;
    }
}
```

---
## Pre-Built Alert Rules

Harmony provides a library of common alert rules in `modules/monitoring/alert_rule/alerts/`:

### Kubernetes Alerts (`alerts/k8s/`)

```rust
use harmony::modules::monitoring::alert_rule::alerts::k8s::{
    pod::pod_failed,
    pvc::high_pvc_fill_rate_over_two_days,
    memory_usage::alert_high_memory_usage,
};

let rules = AlertManagerRuleGroup::new("k8s-rules", vec![
    pod_failed(),
    high_pvc_fill_rate_over_two_days(),
    alert_high_memory_usage(),
]);
```

Available rules:
- `pod_failed()` - Pod in failed state
- `alert_container_restarting()` - Container restart loop
- `alert_pod_not_ready()` - Pod not ready for extended period
- `high_pvc_fill_rate_over_two_days()` - PVC will fill within 2 days
- `alert_high_memory_usage()` - Memory usage above threshold
- `alert_high_cpu_usage()` - CPU usage above threshold

### Infrastructure Alerts (`alerts/infra/`)

```rust
use harmony::modules::monitoring::alert_rule::alerts::infra::opnsense::high_http_error_rate;

let rules = AlertManagerRuleGroup::new("infra-rules", vec![
    high_http_error_rate(),
]);
```

### Creating Custom Rules

```rust
use harmony::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;

pub fn my_custom_alert() -> PrometheusAlertRule {
    PrometheusAlertRule::new("MyServiceDown", "up{job=\"my-service\"} == 0")
        .for_duration("5m")
        .label("severity", "critical")
        .annotation("summary", "My service is down")
        .annotation("description", "The my-service job has been down for more than 5 minutes")
}
```

---

## Alert Receivers

### Discord Webhook

```rust
use harmony::modules::monitoring::alert_channel::discord_alert_channel::DiscordReceiver;
use harmony::topology::monitoring::{AlertRoute, AlertMatcher, MatchOp};

let discord = DiscordReceiver {
    name: "ops-alerts".to_string(),
    url: hurl!("https://discord.com/api/webhooks/123456/abcdef"),
    route: AlertRoute {
        receiver: "ops-alerts".to_string(),
        matchers: vec![AlertMatcher {
            label: "severity".to_string(),
            operator: MatchOp::Eq,
            value: "critical".to_string(),
        }],
        group_by: vec!["alertname".to_string()],
        repeat_interval: Some("30m".to_string()),
        continue_matching: false,
        children: vec![],
    },
};
```

### Generic Webhook

```rust
use harmony::modules::monitoring::alert_channel::webhook_receiver::WebhookReceiver;

let webhook = WebhookReceiver {
    name: "custom-webhook".to_string(),
    url: hurl!("https://api.example.com/alerts"),
    route: AlertRoute::default("custom-webhook".to_string()),
};
```

---

## Adding a New Monitoring Stack

To add support for a new monitoring stack:

1. **Create the sender type** in `modules/monitoring/my_sender/mod.rs`:

```rust
#[derive(Debug, Clone)]
pub struct MySender;

impl AlertSender for MySender {
    fn name(&self) -> String { "MySender".to_string() }
}
```

2. **Define CRD types** in `modules/monitoring/my_sender/crd/`:

```rust
#[derive(CustomResource, Debug, Serialize, Deserialize, Clone)]
#[kube(group = "monitoring.example.com", version = "v1", kind = "MyAlertRule")]
pub struct MyAlertRuleSpec { ... }
```

3. **Implement Observability** in `domain/topology/k8s_anywhere/observability/my_sender.rs`:

```rust
impl Observability<MySender> for K8sAnywhereTopology {
    async fn install_receivers(&self, sender, inventory, receivers) { ... }
    async fn install_rules(&self, sender, inventory, rules) { ... }
    // ...
}
```

4. **Implement receiver conversions** for existing receivers:

```rust
impl AlertReceiver<MySender> for DiscordReceiver {
    fn build(&self) -> Result<ReceiverInstallPlan, InterpretError> {
        // Convert DiscordReceiver to MySender's format
    }
}
```

5. **Create score types**:

```rust
pub struct MySenderAlertScore {
    pub sender: MySender,
    pub receivers: Vec<Box<dyn AlertReceiver<MySender>>>,
    pub rules: Vec<Box<dyn AlertRule<MySender>>>,
}
```

---

## Architecture Principles

### Type Safety Over Flexibility

Each monitoring stack has distinct CRDs and configuration formats. Rather than a unified "MonitoringStack" type that loses stack-specific features, we use generic traits that provide type safety while allowing each stack to express its unique configuration.

### Compile-Time Capability Verification

The `Observability<S>` bound ensures you can't deploy OKD alerts to a KubePrometheus cluster. The compiler catches platform mismatches before deployment.

### Explicit Over Implicit

Monitoring stacks are chosen explicitly (`OpenshiftClusterAlertSender` vs `KubePrometheus`). There's no "auto-detection" that could lead to surprising behavior.

### Three Levels, One Foundation

Cluster, tenant, and application monitoring all use the same traits (`AlertSender`, `AlertReceiver`, `AlertRule`). The difference is in how scores are constructed and how topologies interpret them.

---

## Related Documentation

- [ADR-020: Monitoring and Alerting Architecture](../adr/020-monitoring-alerting-architecture.md)
- [ADR-013: Monitoring Notifications (ntfy)](../adr/013-monitoring-notifications.md)
- [ADR-011: Multi-Tenant Cluster Architecture](../adr/011-multi-tenant-cluster.md)
- [Coding Guide](coding-guide.md)
- [Core Concepts](concepts.md)
```diff
@@ -7,7 +7,7 @@ use harmony::{
         monitoring::alert_channel::webhook_receiver::WebhookReceiver,
         tenant::TenantScore,
     },
-    topology::{K8sAnywhereTopology, tenant::TenantConfig},
+    topology::{K8sAnywhereTopology, monitoring::AlertRoute, tenant::TenantConfig},
 };
 use harmony_types::id::Id;
 use harmony_types::net::Url;
@@ -33,9 +33,14 @@ async fn main() {
         service_port: 3000,
     });
 
+    let receiver_name = "sample-webhook-receiver".to_string();
+
     let webhook_receiver = WebhookReceiver {
-        name: "sample-webhook-receiver".to_string(),
+        name: receiver_name.clone(),
         url: Url::Url(url::Url::parse("https://webhook-doesnt-exist.com").unwrap()),
+        route: AlertRoute {
+            ..AlertRoute::default(receiver_name)
+        },
     };
 
     let app = ApplicationScore {
```
```diff
@@ -1,8 +1,8 @@
 use harmony::{
     inventory::Inventory,
     modules::cert_manager::{
-        capability::CertificateManagementConfig, score_cert_management::CertificateManagementScore,
-        score_certificate::CertificateScore, score_issuer::CertificateIssuerScore,
+        capability::CertificateManagementConfig, score_certificate::CertificateScore,
+        score_issuer::CertificateIssuerScore,
     },
     topology::K8sAnywhereTopology,
 };
```
```diff
@@ -10,9 +10,10 @@ publish = false
 harmony = { path = "../../harmony" }
 harmony_cli = { path = "../../harmony_cli" }
 harmony_types = { path = "../../harmony_types" }
-harmony_macros = { path = "../../harmony_macros" }
+harmony-k8s = { path = "../../harmony-k8s" }
 cidr.workspace = true
 tokio.workspace = true
+harmony_macros = { path = "../../harmony_macros" }
 log.workspace = true
 env_logger.workspace = true
 url.workspace = true
```
```diff
@@ -1,6 +1,6 @@
 use std::time::Duration;
 
-use harmony::topology::k8s::{DrainOptions, K8sClient};
+use harmony_k8s::{DrainOptions, K8sClient};
 use log::{info, trace};
 
 #[tokio::main]
```
```diff
@@ -10,9 +10,10 @@ publish = false
 harmony = { path = "../../harmony" }
 harmony_cli = { path = "../../harmony_cli" }
 harmony_types = { path = "../../harmony_types" }
-harmony_macros = { path = "../../harmony_macros" }
+harmony-k8s = { path = "../../harmony-k8s" }
 cidr.workspace = true
 tokio.workspace = true
+harmony_macros = { path = "../../harmony_macros" }
 log.workspace = true
 env_logger.workspace = true
 url.workspace = true
```
```diff
@@ -1,4 +1,4 @@
-use harmony::topology::k8s::{DrainOptions, K8sClient, NodeFile};
+use harmony_k8s::{K8sClient, NodeFile};
 use log::{info, trace};
 
 #[tokio::main]
```
examples/node_health/Cargo.toml (new file, 16 lines)

```toml
[package]
name = "example-node-health"
edition = "2024"
version.workspace = true
readme.workspace = true
license.workspace = true
publish = false

[dependencies]
harmony = { path = "../../harmony" }
harmony_cli = { path = "../../harmony_cli" }
harmony_types = { path = "../../harmony_types" }
tokio = { workspace = true }
harmony_macros = { path = "../../harmony_macros" }
log = { workspace = true }
env_logger = { workspace = true }
```
examples/node_health/src/main.rs (new file, 17 lines)

```rust
use harmony::{
    inventory::Inventory, modules::node_health::NodeHealthScore, topology::K8sAnywhereTopology,
};

#[tokio::main]
async fn main() {
    let node_health = NodeHealthScore {};

    harmony_cli::run(
        Inventory::autoload(),
        K8sAnywhereTopology::from_env(),
        vec![Box::new(node_health)],
        None,
    )
    .await
    .unwrap();
}
```
```diff
@@ -34,12 +34,12 @@
 
     let high_http_error_rate = high_http_error_rate();
 
-    let additional_rules = AlertManagerRuleGroup::new("", vec![high_http_error_rate]);
+    let additional_rules = AlertManagerRuleGroup::new("test-rule", vec![high_http_error_rate]);
 
     let scrape_target = PrometheusNodeExporter {
         job_name: "firewall".to_string(),
         metrics_path: "/metrics".to_string(),
-        listen_address: ip!("127.0.0.1"),
+        listen_address: ip!("192.168.1.1"),
         port: 9100,
         ..Default::default()
     };
```
```diff
@@ -1,63 +1,13 @@
-use std::str::FromStr;
-
 use harmony::{
-    inventory::Inventory,
-    modules::helm::chart::{HelmChartScore, HelmRepository, NonBlankString},
-    topology::K8sAnywhereTopology,
+    inventory::Inventory, modules::openbao::OpenbaoScore, topology::K8sAnywhereTopology,
 };
-use harmony_macros::hurl;
 
 #[tokio::main]
 async fn main() {
-    let values_yaml = Some(
-        r#"server:
-  standalone:
-    enabled: true
-    config: |
-      listener "tcp" {
-        tls_disable = true
-        address = "[::]:8200"
-        cluster_address = "[::]:8201"
-      }
-
-      storage "file" {
-        path = "/openbao/data"
-      }
-
-  service:
-    enabled: true
-
-  dataStorage:
-    enabled: true
-    size: 10Gi
-    storageClass: null
-    accessMode: ReadWriteOnce
-
-  auditStorage:
-    enabled: true
-    size: 10Gi
-    storageClass: null
-    accessMode: ReadWriteOnce"#
-            .to_string(),
-    );
-    let openbao = HelmChartScore {
-        namespace: Some(NonBlankString::from_str("openbao").unwrap()),
-        release_name: NonBlankString::from_str("openbao").unwrap(),
-        chart_name: NonBlankString::from_str("openbao/openbao").unwrap(),
-        chart_version: None,
-        values_overrides: None,
-        values_yaml,
-        create_namespace: true,
-        install_only: true,
-        repository: Some(HelmRepository::new(
-            "openbao".to_string(),
-            hurl!("https://openbao.github.io/openbao-helm"),
-            true,
-        )),
+    let openbao = OpenbaoScore {
+        host: "openbao.sebastien.sto1.nationtech.io".to_string(),
     };
 
     // TODO exec pod commands to initialize secret store if not already done
 
     harmony_cli::run(
         Inventory::autoload(),
         K8sAnywhereTopology::from_env(),
```
```diff
@@ -1,5 +1,3 @@
-use std::str::FromStr;
-
 use harmony::{
     inventory::Inventory,
     modules::{k8s::apps::OperatorHubCatalogSourceScore, postgresql::CloudNativePgOperatorScore},
@@ -9,7 +7,7 @@ use harmony::{
 #[tokio::main]
 async fn main() {
     let operatorhub_catalog = OperatorHubCatalogSourceScore::default();
-    let cnpg_operator = CloudNativePgOperatorScore::default();
+    let cnpg_operator = CloudNativePgOperatorScore::default_openshift();
 
     harmony_cli::run(
         Inventory::autoload(),
```
```diff
@@ -1,22 +1,13 @@
-use std::{
-    net::{IpAddr, Ipv4Addr},
-    sync::Arc,
-};
+use std::sync::Arc;
 
 use async_trait::async_trait;
 use cidr::Ipv4Cidr;
 use harmony::{
-    executors::ExecutorError,
-    hardware::{HostCategory, Location, PhysicalHost, SwitchGroup},
-    infra::opnsense::OPNSenseManagementInterface,
     inventory::Inventory,
     modules::opnsense::node_exporter::NodeExporterScore,
-    topology::{
-        HAClusterTopology, LogicalHost, PreparationError, PreparationOutcome, Topology,
-        UnmanagedRouter, node_exporter::NodeExporter,
-    },
+    topology::{PreparationError, PreparationOutcome, Topology, node_exporter::NodeExporter},
 };
-use harmony_macros::{ip, ipv4, mac_address};
+use harmony_macros::ip;
 
 #[derive(Debug)]
 struct OpnSenseTopology {
```
@@ -1,8 +1,7 @@
|
||||
use harmony::{
|
||||
inventory::Inventory,
|
||||
modules::postgresql::{
|
||||
K8sPostgreSQLScore, PostgreSQLConnectionScore, PublicPostgreSQLScore,
|
||||
capability::PostgreSQLConfig,
|
||||
PostgreSQLConnectionScore, PublicPostgreSQLScore, capability::PostgreSQLConfig,
|
||||
},
|
||||
topology::K8sAnywhereTopology,
|
||||
};
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use std::{collections::HashMap, path::PathBuf, sync::Arc};
|
||||
use std::{path::PathBuf, sync::Arc};
|
||||
|
||||
use harmony::{
|
||||
inventory::Inventory,
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use std::{collections::HashMap, path::PathBuf, sync::Arc};
|
||||
use std::{path::PathBuf, sync::Arc};
|
||||
|
||||
use harmony::{
|
||||
inventory::Inventory,
|
||||
@@ -35,9 +35,14 @@ async fn main() {
|
||||
},
|
||||
};
|
||||
|
||||
let receiver_name = "sample-webhook-receiver".to_string();
|
||||
|
||||
let webhook_receiver = WebhookReceiver {
|
||||
name: "sample-webhook-receiver".to_string(),
|
||||
name: receiver_name.clone(),
|
||||
url: hurl!("https://webhook-doesnt-exist.com"),
|
||||
route: AlertRoute {
|
||||
..AlertRoute::default(receiver_name)
|
||||
},
|
||||
};
|
||||
|
||||
let app = ApplicationScore {
|
||||
|
||||
14
examples/zitadel/Cargo.toml
Normal file
14
examples/zitadel/Cargo.toml
Normal file
@@ -0,0 +1,14 @@
|
||||
[package]
|
||||
name = "example-zitadel"
|
||||
edition = "2024"
|
||||
version.workspace = true
|
||||
readme.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
harmony = { path = "../../harmony" }
|
||||
harmony_cli = { path = "../../harmony_cli" }
|
||||
harmony_macros = { path = "../../harmony_macros" }
|
||||
harmony_types = { path = "../../harmony_types" }
|
||||
tokio.workspace = true
|
||||
url.workspace = true
|
||||
20
examples/zitadel/src/main.rs
Normal file
20
examples/zitadel/src/main.rs
Normal file
@@ -0,0 +1,20 @@
|
||||
use harmony::{
|
||||
inventory::Inventory, modules::zitadel::ZitadelScore, topology::K8sAnywhereTopology,
|
||||
};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() {
|
||||
let zitadel = ZitadelScore {
|
||||
host: "sso.sto1.nationtech.io".to_string(),
|
||||
zitadel_version: "v4.12.1".to_string(),
|
||||
};
|
||||
|
||||
harmony_cli::run(
|
||||
Inventory::autoload(),
|
||||
K8sAnywhereTopology::from_env(),
|
||||
vec![Box::new(zitadel)],
|
||||
None,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
BIN
examples/zitadel/zitadel-9.24.0.tgz
Normal file
BIN
examples/zitadel/zitadel-9.24.0.tgz
Normal file
Binary file not shown.
23  harmony-k8s/Cargo.toml  Normal file
@@ -0,0 +1,23 @@
[package]
name = "harmony-k8s"
edition = "2024"
version.workspace = true
readme.workspace = true
license.workspace = true

[dependencies]
kube.workspace = true
k8s-openapi.workspace = true
tokio.workspace = true
tokio-retry.workspace = true
serde.workspace = true
serde_json.workspace = true
serde_yaml.workspace = true
log.workspace = true
similar.workspace = true
reqwest.workspace = true
url.workspace = true
inquire.workspace = true

[dev-dependencies]
pretty_assertions.workspace = true

593  harmony-k8s/src/apply.rs  Normal file
@@ -0,0 +1,593 @@
use kube::{
    Client, Error, Resource,
    api::{
        Api, ApiResource, DynamicObject, GroupVersionKind, Patch, PatchParams, PostParams,
        ResourceExt,
    },
    core::ErrorResponse,
    discovery::Scope,
    error::DiscoveryError,
};
use log::{debug, error, trace, warn};
use serde::{Serialize, de::DeserializeOwned};
use serde_json::Value;
use similar::TextDiff;
use url::Url;

use crate::client::K8sClient;
use crate::helper;
use crate::types::WriteMode;

/// The field-manager token sent with every server-side apply request.
pub const FIELD_MANAGER: &str = "harmony-k8s";

// ── Private helpers ──────────────────────────────────────────────────────────

/// Serialise any `Serialize` payload to a [`DynamicObject`] via JSON.
fn to_dynamic<T: Serialize>(payload: &T) -> Result<DynamicObject, Error> {
    serde_json::from_value(serde_json::to_value(payload).map_err(Error::SerdeError)?)
        .map_err(Error::SerdeError)
}

/// Fetch the current resource, display a unified diff against `payload`, and
/// return `()`. All output goes to stdout (same behaviour as before).
///
/// A 404 is treated as "resource would be created" — not an error.
async fn show_dry_run<T: Serialize>(
    api: &Api<DynamicObject>,
    name: &str,
    payload: &T,
) -> Result<(), Error> {
    let new_yaml = serde_yaml::to_string(payload)
        .unwrap_or_else(|_| "Failed to serialize new resource".to_string());

    match api.get(name).await {
        Ok(current) => {
            println!("\nDry-run for resource: '{name}'");
            let mut current_val = serde_yaml::to_value(&current).unwrap_or(serde_yaml::Value::Null);
            if let Some(map) = current_val.as_mapping_mut() {
                map.remove(&serde_yaml::Value::String("status".to_string()));
            }
            let current_yaml = serde_yaml::to_string(&current_val)
                .unwrap_or_else(|_| "Failed to serialize current resource".to_string());

            if current_yaml == new_yaml {
                println!("No changes detected.");
            } else {
                println!("Changes detected:");
                let diff = TextDiff::from_lines(&current_yaml, &new_yaml);
                for change in diff.iter_all_changes() {
                    let sign = match change.tag() {
                        similar::ChangeTag::Delete => "-",
                        similar::ChangeTag::Insert => "+",
                        similar::ChangeTag::Equal => " ",
                    };
                    print!("{sign}{change}");
                }
            }
            Ok(())
        }
        Err(Error::Api(ErrorResponse { code: 404, .. })) => {
            println!("\nDry-run for new resource: '{name}'");
            println!("Resource does not exist. Would be created:");
            for line in new_yaml.lines() {
                println!("+{line}");
            }
            Ok(())
        }
        Err(e) => {
            error!("Failed to fetch resource '{name}' for dry-run: {e}");
            Err(e)
        }
    }
}

/// Execute the real (non-dry-run) apply, respecting [`WriteMode`].
async fn do_apply<T: Serialize + std::fmt::Debug>(
    api: &Api<DynamicObject>,
    name: &str,
    payload: &T,
    patch_params: &PatchParams,
    write_mode: &WriteMode,
) -> Result<DynamicObject, Error> {
    match write_mode {
        WriteMode::CreateOrUpdate => {
            // TODO refactor this arm to perform self.update and if fail with 404 self.create
            // This will avoid the repetition of the api.patch and api.create calls within this
            // function body. This makes the code more maintainable
            match api.patch(name, patch_params, &Patch::Apply(payload)).await {
                Ok(obj) => Ok(obj),
                Err(Error::Api(ErrorResponse { code: 404, .. })) => {
                    debug!("Resource '{name}' not found via SSA, falling back to POST");
                    let dyn_obj = to_dynamic(payload)?;
                    api.create(&PostParams::default(), &dyn_obj)
                        .await
                        .map_err(|e| {
                            error!("Failed to create '{name}': {e}");
                            e
                        })
                }
                Err(e) => {
                    error!("Failed to apply '{name}': {e}");
                    Err(e)
                }
            }
        }
        WriteMode::Create => {
            let dyn_obj = to_dynamic(payload)?;
            api.create(&PostParams::default(), &dyn_obj)
                .await
                .map_err(|e| {
                    error!("Failed to create '{name}': {e}");
                    e
                })
        }
        WriteMode::Update => match api.patch(name, patch_params, &Patch::Apply(payload)).await {
            Ok(obj) => Ok(obj),
            Err(Error::Api(ErrorResponse { code: 404, .. })) => Err(Error::Api(ErrorResponse {
                code: 404,
                message: format!("Resource '{name}' not found and WriteMode is UpdateOnly"),
                reason: "NotFound".to_string(),
                status: "Failure".to_string(),
            })),
            Err(e) => {
                error!("Failed to update '{name}': {e}");
                Err(e)
            }
        },
    }
}

// ── Public API ───────────────────────────────────────────────────────────────

impl K8sClient {
    /// Server-side apply: create if absent, update if present.
    /// Equivalent to `kubectl apply`.
    pub async fn apply<K>(&self, resource: &K, namespace: Option<&str>) -> Result<K, Error>
    where
        K: Resource + Clone + std::fmt::Debug + DeserializeOwned + Serialize,
        <K as Resource>::DynamicType: Default,
    {
        self.apply_with_strategy(resource, namespace, WriteMode::CreateOrUpdate)
            .await
    }

    /// POST only — returns an error if the resource already exists.
    pub async fn create<K>(&self, resource: &K, namespace: Option<&str>) -> Result<K, Error>
    where
        K: Resource + Clone + std::fmt::Debug + DeserializeOwned + Serialize,
        <K as Resource>::DynamicType: Default,
    {
        self.apply_with_strategy(resource, namespace, WriteMode::Create)
            .await
    }

    /// Server-side apply only — returns an error if the resource does not exist.
    pub async fn update<K>(&self, resource: &K, namespace: Option<&str>) -> Result<K, Error>
    where
        K: Resource + Clone + std::fmt::Debug + DeserializeOwned + Serialize,
        <K as Resource>::DynamicType: Default,
    {
        self.apply_with_strategy(resource, namespace, WriteMode::Update)
            .await
    }

    pub async fn apply_with_strategy<K>(
        &self,
        resource: &K,
        namespace: Option<&str>,
        write_mode: WriteMode,
    ) -> Result<K, Error>
    where
        K: Resource + Clone + std::fmt::Debug + DeserializeOwned + Serialize,
        <K as Resource>::DynamicType: Default,
    {
        debug!(
            "apply_with_strategy: {:?} ns={:?}",
            resource.meta().name,
            namespace
        );
        trace!("{:#}", serde_json::to_value(resource).unwrap_or_default());

        let dyntype = K::DynamicType::default();
        let gvk = GroupVersionKind {
            group: K::group(&dyntype).to_string(),
            version: K::version(&dyntype).to_string(),
            kind: K::kind(&dyntype).to_string(),
        };

        let discovery = self.discovery().await?;
        let (ar, caps) = discovery.resolve_gvk(&gvk).ok_or_else(|| {
            Error::Discovery(DiscoveryError::MissingResource(format!(
                "Cannot resolve GVK: {gvk:?}"
            )))
        })?;

        let effective_ns = if caps.scope == Scope::Cluster {
            None
        } else {
            namespace.or_else(|| resource.meta().namespace.as_deref())
        };

        let api: Api<DynamicObject> =
            get_dynamic_api(ar, caps, self.client.clone(), effective_ns, false);

        let name = resource
            .meta()
            .name
            .as_deref()
            .expect("Kubernetes resource must have a name");

        if self.dry_run {
            show_dry_run(&api, name, resource).await?;
            return Ok(resource.clone());
        }

        let patch_params = PatchParams::apply(FIELD_MANAGER);
        do_apply(&api, name, resource, &patch_params, &write_mode)
            .await
            .and_then(helper::dyn_to_typed)
    }

    /// Applies resources in order, one at a time
    pub async fn apply_many<K>(&self, resources: &[K], ns: Option<&str>) -> Result<Vec<K>, Error>
    where
        K: Resource + Clone + std::fmt::Debug + DeserializeOwned + Serialize,
        <K as Resource>::DynamicType: Default,
    {
        let mut result = Vec::new();
        for r in resources.iter() {
            let res = self.apply(r, ns).await;
            if res.is_err() {
                // NOTE: this may log sensitive data; downgrade to debug if needed.
                warn!(
                    "Failed to apply k8s resource: {}",
                    serde_json::to_string_pretty(r).map_err(Error::SerdeError)?
                );
            }
            result.push(res?);
        }
        Ok(result)
    }

    /// Apply a [`DynamicObject`] resource using server-side apply.
    pub async fn apply_dynamic(
        &self,
        resource: &DynamicObject,
        namespace: Option<&str>,
        force_conflicts: bool,
    ) -> Result<DynamicObject, Error> {
        trace!("apply_dynamic {resource:#?} ns={namespace:?} force={force_conflicts}");

        let discovery = self.discovery().await?;
        let type_meta = resource.types.as_ref().ok_or_else(|| {
            Error::BuildRequest(kube::core::request::Error::Validation(
                "DynamicObject must have types (apiVersion and kind)".to_string(),
            ))
        })?;

        let gvk = GroupVersionKind::try_from(type_meta).map_err(|_| {
            Error::BuildRequest(kube::core::request::Error::Validation(format!(
                "Invalid GVK in DynamicObject: {type_meta:?}"
            )))
        })?;

        let (ar, caps) = discovery.resolve_gvk(&gvk).ok_or_else(|| {
            Error::Discovery(DiscoveryError::MissingResource(format!(
                "Cannot resolve GVK: {gvk:?}"
            )))
        })?;

        let effective_ns = if caps.scope == Scope::Cluster {
            None
        } else {
            namespace.or_else(|| resource.metadata.namespace.as_deref())
        };

        let api = get_dynamic_api(ar, caps, self.client.clone(), effective_ns, false);
        let name = resource.metadata.name.as_deref().ok_or_else(|| {
            Error::BuildRequest(kube::core::request::Error::Validation(
                "DynamicObject must have metadata.name".to_string(),
            ))
        })?;

        debug!(
            "apply_dynamic kind={:?} name='{name}' ns={effective_ns:?}",
            resource.types.as_ref().map(|t| &t.kind),
        );

        // NOTE would be nice to improve cohesion between the dynamic and typed apis and avoid copy
        // pasting the dry_run and some more logic
        if self.dry_run {
            show_dry_run(&api, name, resource).await?;
            return Ok(resource.clone());
        }

        let mut patch_params = PatchParams::apply(FIELD_MANAGER);
        patch_params.force = force_conflicts;

        do_apply(
            &api,
            name,
            resource,
            &patch_params,
            &WriteMode::CreateOrUpdate,
        )
        .await
    }

    pub async fn apply_dynamic_many(
        &self,
        resources: &[DynamicObject],
        namespace: Option<&str>,
        force_conflicts: bool,
    ) -> Result<Vec<DynamicObject>, Error> {
        let mut result = Vec::new();
        for r in resources.iter() {
            result.push(self.apply_dynamic(r, namespace, force_conflicts).await?);
        }
        Ok(result)
    }

    pub async fn apply_yaml_many(
        &self,
        #[allow(clippy::ptr_arg)] yaml: &Vec<serde_yaml::Value>,
        ns: Option<&str>,
    ) -> Result<(), Error> {
        for y in yaml.iter() {
            self.apply_yaml(y, ns).await?;
        }
        Ok(())
    }

    pub async fn apply_yaml(
        &self,
        yaml: &serde_yaml::Value,
        ns: Option<&str>,
    ) -> Result<(), Error> {
        // NOTE wouldn't it be possible to parse this into a DynamicObject and simply call
        // apply_dynamic instead of reimplementing api interactions?
        let obj: DynamicObject =
            serde_yaml::from_value(yaml.clone()).expect("YAML must deserialise to DynamicObject");
        let name = obj.metadata.name.as_ref().expect("YAML must have a name");

        let api_version = yaml["apiVersion"].as_str().expect("missing apiVersion");
        let kind = yaml["kind"].as_str().expect("missing kind");

        let mut it = api_version.splitn(2, '/');
        let first = it.next().unwrap();
        let (g, v) = match it.next() {
            Some(second) => (first, second),
            None => ("", first),
        };

        let api_resource = ApiResource::from_gvk(&GroupVersionKind::gvk(g, v, kind));
        let namespace = ns.unwrap_or_else(|| {
            obj.metadata
                .namespace
                .as_deref()
                .expect("YAML must have a namespace when ns is not provided")
        });

        let api: Api<DynamicObject> =
            Api::namespaced_with(self.client.clone(), namespace, &api_resource);

        println!("Applying '{name}' in namespace '{namespace}'...");
        let patch_params = PatchParams::apply(FIELD_MANAGER);
        let result = api.patch(name, &patch_params, &Patch::Apply(&obj)).await?;
        println!("Successfully applied '{}'.", result.name_any());
        Ok(())
    }

    /// Equivalent to `kubectl apply -f <url>`.
    pub async fn apply_url(&self, url: Url, ns: Option<&str>) -> Result<(), Error> {
        let patch_params = PatchParams::apply(FIELD_MANAGER);
        let discovery = self.discovery().await?;

        let yaml = reqwest::get(url)
            .await
            .expect("Could not fetch URL")
            .text()
            .await
            .expect("Could not read response body");

        for doc in multidoc_deserialize(&yaml).expect("Failed to parse YAML from URL") {
            let obj: DynamicObject =
                serde_yaml::from_value(doc).expect("YAML document is not a valid object");
            let namespace = obj.metadata.namespace.as_deref().or(ns);
            let type_meta = obj.types.as_ref().expect("Object is missing TypeMeta");
            let gvk =
                GroupVersionKind::try_from(type_meta).expect("Object has invalid GroupVersionKind");
            let name = obj.name_any();

            if let Some((ar, caps)) = discovery.resolve_gvk(&gvk) {
                let api = get_dynamic_api(ar, caps, self.client.clone(), namespace, false);
                trace!(
                    "Applying {}:\n{}",
                    gvk.kind,
                    serde_yaml::to_string(&obj).unwrap_or_default()
                );
                let data: Value = serde_json::to_value(&obj).expect("serialisation failed");
                let _r = api.patch(&name, &patch_params, &Patch::Apply(data)).await?;
                debug!("Applied {} '{name}'", gvk.kind);
            } else {
                warn!("Skipping document with unknown GVK: {gvk:?}");
            }
        }
        Ok(())
    }

    /// Build a dynamic API client from a [`DynamicObject`]'s type metadata.
    pub(crate) fn get_api_for_dynamic_object(
        &self,
        object: &DynamicObject,
        ns: Option<&str>,
    ) -> Result<Api<DynamicObject>, Error> {
        let ar = object
            .types
            .as_ref()
            .and_then(|t| {
                let parts: Vec<&str> = t.api_version.split('/').collect();
                match parts.as_slice() {
                    [version] => Some(ApiResource::from_gvk(&GroupVersionKind::gvk(
                        "", version, &t.kind,
                    ))),
                    [group, version] => Some(ApiResource::from_gvk(&GroupVersionKind::gvk(
                        group, version, &t.kind,
                    ))),
                    _ => None,
                }
            })
            .ok_or_else(|| {
                Error::BuildRequest(kube::core::request::Error::Validation(format!(
                    "Invalid apiVersion in DynamicObject: {object:#?}"
                )))
            })?;

        Ok(match ns {
            Some(ns) => Api::namespaced_with(self.client.clone(), ns, &ar),
            None => Api::default_namespaced_with(self.client.clone(), &ar),
        })
    }
}

// ── Free functions ───────────────────────────────────────────────────────────

pub(crate) fn get_dynamic_api(
    resource: kube::api::ApiResource,
    capabilities: kube::discovery::ApiCapabilities,
    client: Client,
    ns: Option<&str>,
    all: bool,
) -> Api<DynamicObject> {
    if capabilities.scope == Scope::Cluster || all {
        Api::all_with(client, &resource)
    } else if let Some(namespace) = ns {
        Api::namespaced_with(client, namespace, &resource)
    } else {
        Api::default_namespaced_with(client, &resource)
    }
}

pub(crate) fn multidoc_deserialize(
    data: &str,
) -> Result<Vec<serde_yaml::Value>, serde_yaml::Error> {
    use serde::Deserialize;
    let mut docs = vec![];
    for de in serde_yaml::Deserializer::from_str(data) {
        docs.push(serde_yaml::Value::deserialize(de)?);
    }
    Ok(docs)
}

// ── Tests ────────────────────────────────────────────────────────────────────

#[cfg(test)]
mod apply_tests {
    use std::collections::BTreeMap;
    use std::time::{SystemTime, UNIX_EPOCH};

    use k8s_openapi::api::core::v1::ConfigMap;
    use k8s_openapi::apimachinery::pkg::apis::meta::v1::ObjectMeta;
    use kube::api::{DeleteParams, TypeMeta};

    use super::*;

    #[tokio::test]
    #[ignore = "requires kubernetes cluster"]
    async fn apply_creates_new_configmap() {
        let client = K8sClient::try_default().await.unwrap();
        let ns = "default";
        let name = format!(
            "test-cm-{}",
            SystemTime::now()
                .duration_since(UNIX_EPOCH)
                .unwrap()
                .as_millis()
        );

        let cm = ConfigMap {
            metadata: ObjectMeta {
                name: Some(name.clone()),
                namespace: Some(ns.to_string()),
                ..Default::default()
            },
            data: Some(BTreeMap::from([("key1".to_string(), "value1".to_string())])),
            ..Default::default()
        };

        assert!(client.apply(&cm, Some(ns)).await.is_ok());

        let api: Api<ConfigMap> = Api::namespaced(client.client.clone(), ns);
        let _ = api.delete(&name, &DeleteParams::default()).await;
    }

    #[tokio::test]
    #[ignore = "requires kubernetes cluster"]
    async fn apply_is_idempotent() {
        let client = K8sClient::try_default().await.unwrap();
        let ns = "default";
        let name = format!(
            "test-idem-{}",
            SystemTime::now()
                .duration_since(UNIX_EPOCH)
                .unwrap()
                .as_millis()
        );

        let cm = ConfigMap {
            metadata: ObjectMeta {
                name: Some(name.clone()),
                namespace: Some(ns.to_string()),
                ..Default::default()
            },
            data: Some(BTreeMap::from([("key".to_string(), "value".to_string())])),
            ..Default::default()
        };

        assert!(
            client.apply(&cm, Some(ns)).await.is_ok(),
            "first apply failed"
        );
        assert!(
            client.apply(&cm, Some(ns)).await.is_ok(),
            "second apply failed (not idempotent)"
        );

        let api: Api<ConfigMap> = Api::namespaced(client.client.clone(), ns);
        let _ = api.delete(&name, &DeleteParams::default()).await;
    }

    #[tokio::test]
    #[ignore = "requires kubernetes cluster"]
    async fn apply_dynamic_creates_new_resource() {
        let client = K8sClient::try_default().await.unwrap();
        let ns = "default";
        let name = format!(
            "test-dyn-{}",
            SystemTime::now()
                .duration_since(UNIX_EPOCH)
                .unwrap()
                .as_millis()
        );

        let obj = DynamicObject {
            types: Some(TypeMeta {
                api_version: "v1".to_string(),
                kind: "ConfigMap".to_string(),
            }),
            metadata: ObjectMeta {
                name: Some(name.clone()),
                namespace: Some(ns.to_string()),
                ..Default::default()
            },
            data: serde_json::json!({}),
        };

        let result = client.apply_dynamic(&obj, Some(ns), false).await;
        assert!(result.is_ok(), "apply_dynamic failed: {:?}", result.err());

        let api: Api<ConfigMap> = Api::namespaced(client.client.clone(), ns);
        let _ = api.delete(&name, &DeleteParams::default()).await;
    }
}
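
A minimal usage sketch of the typed apply API above, assuming a reachable cluster; the caller function, ConfigMap name, and namespace are illustrative, not part of the diff:

use harmony_k8s::{K8sClient, WriteMode};
use k8s_openapi::api::core::v1::ConfigMap;
use k8s_openapi::apimachinery::pkg::apis::meta::v1::ObjectMeta;

async fn demo_apply() -> Result<(), kube::Error> {
    let client = K8sClient::try_default().await?;
    let cm = ConfigMap {
        metadata: ObjectMeta {
            name: Some("demo-config".to_string()), // illustrative name
            namespace: Some("default".to_string()),
            ..Default::default()
        },
        ..Default::default()
    };
    // Create-or-update, like `kubectl apply` (server-side apply with POST fallback).
    client.apply(&cm, Some("default")).await?;
    // Update-only: fails with a 404-style error if the resource does not exist.
    client
        .apply_with_strategy(&cm, Some("default"), WriteMode::Update)
        .await?;
    Ok(())
}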
@@ -25,9 +25,9 @@
 //!
 //! ## Example
 //!
 //! ```rust,no_run
-//! use harmony::topology::k8s::{K8sClient, helper};
-//! use harmony::topology::KubernetesDistribution;
+//! use harmony_k8s::{K8sClient, helper};
+//! use harmony_k8s::KubernetesDistribution;
 //!
 //! async fn write_network_config(client: &K8sClient, node: &str) {
 //!     // Create a bundle with platform-specific RBAC
@@ -56,7 +56,7 @@ use kube::{Error, Resource, ResourceExt, api::DynamicObject};
 use serde::Serialize;
 use serde_json;

-use crate::domain::topology::k8s::K8sClient;
+use crate::K8sClient;

 /// A ResourceBundle represents a logical unit of work consisting of multiple
 /// Kubernetes resources that should be applied or deleted together.

99  harmony-k8s/src/client.rs  Normal file
@@ -0,0 +1,99 @@
use std::sync::Arc;

use kube::config::{KubeConfigOptions, Kubeconfig};
use kube::{Client, Config, Discovery, Error};
use log::error;
use serde::Serialize;
use tokio::sync::OnceCell;

use crate::types::KubernetesDistribution;

// TODO not cool, should use a proper configuration mechanism
// cli arg, env var, config file
fn read_dry_run_from_env() -> bool {
    std::env::var("DRY_RUN")
        .map(|v| v == "true" || v == "1")
        .unwrap_or(false)
}

#[derive(Clone)]
pub struct K8sClient {
    pub(crate) client: Client,
    /// When `true` no mutation is sent to the API server; diffs are printed
    /// to stdout instead. Initialised from the `DRY_RUN` environment variable.
    pub(crate) dry_run: bool,
    pub(crate) k8s_distribution: Arc<OnceCell<KubernetesDistribution>>,
    pub(crate) discovery: Arc<OnceCell<Discovery>>,
}

impl Serialize for K8sClient {
    fn serialize<S>(&self, _serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        todo!("K8sClient serialization is not meaningful; remove this impl if unused")
    }
}

impl std::fmt::Debug for K8sClient {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_fmt(format_args!(
            "K8sClient {{ namespace: {}, dry_run: {} }}",
            self.client.default_namespace(),
            self.dry_run,
        ))
    }
}

impl K8sClient {
    /// Create a client, reading `DRY_RUN` from the environment.
    pub fn new(client: Client) -> Self {
        Self {
            dry_run: read_dry_run_from_env(),
            client,
            k8s_distribution: Arc::new(OnceCell::new()),
            discovery: Arc::new(OnceCell::new()),
        }
    }

    /// Create a client that always operates in dry-run mode, regardless of
    /// the environment variable.
    pub fn new_dry_run(client: Client) -> Self {
        Self {
            dry_run: true,
            ..Self::new(client)
        }
    }

    /// Returns `true` if this client is operating in dry-run mode.
    pub fn is_dry_run(&self) -> bool {
        self.dry_run
    }

    pub async fn try_default() -> Result<Self, Error> {
        Ok(Self::new(Client::try_default().await?))
    }

    pub async fn from_kubeconfig(path: &str) -> Option<Self> {
        Self::from_kubeconfig_with_opts(path, &KubeConfigOptions::default()).await
    }

    pub async fn from_kubeconfig_with_context(path: &str, context: Option<String>) -> Option<Self> {
        let mut opts = KubeConfigOptions::default();
        opts.context = context;
        Self::from_kubeconfig_with_opts(path, &opts).await
    }

    pub async fn from_kubeconfig_with_opts(path: &str, opts: &KubeConfigOptions) -> Option<Self> {
        let k = match Kubeconfig::read_from(path) {
            Ok(k) => k,
            Err(e) => {
                error!("Failed to load kubeconfig from {path}: {e}");
                return None;
            }
        };
        Some(Self::new(
            Client::try_from(Config::from_custom_kubeconfig(k, opts).await.unwrap()).unwrap(),
        ))
    }
}
83  harmony-k8s/src/discovery.rs  Normal file
@@ -0,0 +1,83 @@
use std::time::Duration;

use kube::{Discovery, Error};
use log::{debug, error, info, trace, warn};
use tokio::sync::Mutex;
use tokio_retry::{Retry, strategy::ExponentialBackoff};

use crate::client::K8sClient;
use crate::types::KubernetesDistribution;

impl K8sClient {
    pub async fn get_apiserver_version(
        &self,
    ) -> Result<k8s_openapi::apimachinery::pkg::version::Info, Error> {
        self.client.clone().apiserver_version().await
    }

    /// Runs (and caches) Kubernetes API discovery with exponential-backoff retries.
    pub async fn discovery(&self) -> Result<&Discovery, Error> {
        let retry_strategy = ExponentialBackoff::from_millis(1000)
            .max_delay(Duration::from_secs(32))
            .take(6);

        let attempt = Mutex::new(0u32);
        Retry::spawn(retry_strategy, || async {
            let mut n = attempt.lock().await;
            *n += 1;
            match self
                .discovery
                .get_or_try_init(async || {
                    debug!("Running Kubernetes API discovery (attempt {})", *n);
                    let d = Discovery::new(self.client.clone()).run().await?;
                    debug!("Kubernetes API discovery completed");
                    Ok(d)
                })
                .await
            {
                Ok(d) => Ok(d),
                Err(e) => {
                    warn!("Kubernetes API discovery failed (attempt {}): {}", *n, e);
                    Err(e)
                }
            }
        })
        .await
        .map_err(|e| {
            error!("Kubernetes API discovery failed after all retries: {}", e);
            e
        })
    }

    /// Detect which Kubernetes distribution is running. Result is cached for
    /// the lifetime of the client.
    pub async fn get_k8s_distribution(&self) -> Result<KubernetesDistribution, Error> {
        self.k8s_distribution
            .get_or_try_init(async || {
                debug!("Detecting Kubernetes distribution");
                let api_groups = self.client.list_api_groups().await?;
                trace!("list_api_groups: {:?}", api_groups);

                let version = self.get_apiserver_version().await?;

                if api_groups
                    .groups
                    .iter()
                    .any(|g| g.name == "project.openshift.io")
                {
                    info!("Detected distribution: OpenshiftFamily");
                    return Ok(KubernetesDistribution::OpenshiftFamily);
                }

                if version.git_version.contains("k3s") {
                    info!("Detected distribution: K3sFamily");
                    return Ok(KubernetesDistribution::K3sFamily);
                }

                info!("Distribution not identified, using Default");
                Ok(KubernetesDistribution::Default)
            })
            .await
            .cloned()
    }
}
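
A sketch of how a caller might branch on the cached detection result above; the function and image names are placeholders, and a wildcard arm covers the non-OpenShift variants:

use harmony_k8s::{K8sClient, KubernetesDistribution};

async fn pick_tooling_image(client: &K8sClient) -> Result<&'static str, kube::Error> {
    // Cached by the OnceCell after the first detection, so repeated calls are cheap.
    Ok(match client.get_k8s_distribution().await? {
        KubernetesDistribution::OpenshiftFamily => "registry.example.com/tools-ocp", // placeholder
        _ => "registry.example.com/tools-generic", // placeholder
    })
}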
@@ -1,7 +1,7 @@
 use std::collections::BTreeMap;
 use std::time::Duration;

-use crate::topology::KubernetesDistribution;
+use crate::KubernetesDistribution;

 use super::bundle::ResourceBundle;
 use super::config::PRIVILEGED_POD_IMAGE;
@@ -10,8 +10,10 @@ use k8s_openapi::api::core::v1::{
 };
 use k8s_openapi::api::rbac::v1::{ClusterRoleBinding, RoleRef, Subject};
 use k8s_openapi::apimachinery::pkg::apis::meta::v1::ObjectMeta;
+use kube::api::DynamicObject;
+use kube::error::DiscoveryError;
 use log::{debug, error, info, warn};
+use serde::de::DeserializeOwned;

 #[derive(Debug)]
 pub struct PrivilegedPodConfig {
@@ -131,9 +133,9 @@ pub fn host_root_volume() -> (Volume, VolumeMount) {
 ///
 /// # Example
 ///
 /// ```rust,no_run
-/// # use harmony::topology::k8s::helper::{build_privileged_bundle, PrivilegedPodConfig};
-/// # use harmony::topology::KubernetesDistribution;
+/// use harmony_k8s::helper::{build_privileged_bundle, PrivilegedPodConfig};
+/// use harmony_k8s::KubernetesDistribution;
 /// let bundle = build_privileged_bundle(
 ///     PrivilegedPodConfig {
 ///         name: "network-setup".to_string(),
@@ -279,6 +281,16 @@ pub fn prompt_drain_timeout_action(
     }
 }

+/// JSON round-trip: DynamicObject → K
+///
+/// Safe because the DynamicObject was produced by the apiserver from a
+/// payload that was originally serialized from K, so the schema is identical.
+pub(crate) fn dyn_to_typed<K: DeserializeOwned>(obj: DynamicObject) -> Result<K, kube::Error> {
+    serde_json::to_value(obj)
+        .and_then(serde_json::from_value)
+        .map_err(kube::Error::SerdeError)
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;

13  harmony-k8s/src/lib.rs  Normal file
@@ -0,0 +1,13 @@
pub mod apply;
pub mod bundle;
pub mod client;
pub mod config;
pub mod discovery;
pub mod helper;
pub mod node;
pub mod pod;
pub mod resources;
pub mod types;

pub use client::K8sClient;
pub use types::{DrainOptions, KubernetesDistribution, NodeFile, ScopeResolver, WriteMode};

3  harmony-k8s/src/main.rs  Normal file
@@ -0,0 +1,3 @@
fn main() {
    println!("Hello, world!");
}
722  harmony-k8s/src/node.rs  Normal file
@@ -0,0 +1,722 @@
use std::collections::BTreeMap;
use std::time::{Duration, SystemTime, UNIX_EPOCH};

use k8s_openapi::api::core::v1::{
    ConfigMap, ConfigMapVolumeSource, Node, Pod, Volume, VolumeMount,
};
use k8s_openapi::apimachinery::pkg::apis::meta::v1::ObjectMeta;
use kube::{
    Error,
    api::{Api, DeleteParams, EvictParams, ListParams, PostParams},
    core::ErrorResponse,
    error::DiscoveryError,
};
use log::{debug, error, info, warn};
use tokio::time::sleep;

use crate::client::K8sClient;
use crate::helper::{self, PrivilegedPodConfig};
use crate::types::{DrainOptions, NodeFile};

impl K8sClient {
    pub async fn cordon_node(&self, node_name: &str) -> Result<(), Error> {
        Api::<Node>::all(self.client.clone())
            .cordon(node_name)
            .await?;
        Ok(())
    }

    pub async fn uncordon_node(&self, node_name: &str) -> Result<(), Error> {
        Api::<Node>::all(self.client.clone())
            .uncordon(node_name)
            .await?;
        Ok(())
    }

    pub async fn wait_for_node_ready(&self, node_name: &str) -> Result<(), Error> {
        self.wait_for_node_ready_with_timeout(node_name, Duration::from_secs(600))
            .await
    }

    async fn wait_for_node_ready_with_timeout(
        &self,
        node_name: &str,
        timeout: Duration,
    ) -> Result<(), Error> {
        let api: Api<Node> = Api::all(self.client.clone());
        let start = tokio::time::Instant::now();
        let poll = Duration::from_secs(5);
        loop {
            if start.elapsed() > timeout {
                return Err(Error::Discovery(DiscoveryError::MissingResource(format!(
                    "Node '{node_name}' did not become Ready within {timeout:?}"
                ))));
            }
            match api.get(node_name).await {
                Ok(node) => {
                    if node
                        .status
                        .as_ref()
                        .and_then(|s| s.conditions.as_ref())
                        .map(|conds| {
                            conds
                                .iter()
                                .any(|c| c.type_ == "Ready" && c.status == "True")
                        })
                        .unwrap_or(false)
                    {
                        debug!("Node '{node_name}' is Ready");
                        return Ok(());
                    }
                }
                Err(e) => debug!("Error polling node '{node_name}': {e}"),
            }
            sleep(poll).await;
        }
    }

    async fn wait_for_node_not_ready(
        &self,
        node_name: &str,
        timeout: Duration,
    ) -> Result<(), Error> {
        let api: Api<Node> = Api::all(self.client.clone());
        let start = tokio::time::Instant::now();
        let poll = Duration::from_secs(5);
        loop {
            if start.elapsed() > timeout {
                return Err(Error::Discovery(DiscoveryError::MissingResource(format!(
                    "Node '{node_name}' did not become NotReady within {timeout:?}"
                ))));
            }
            match api.get(node_name).await {
                Ok(node) => {
                    let is_ready = node
                        .status
                        .as_ref()
                        .and_then(|s| s.conditions.as_ref())
                        .map(|conds| {
                            conds
                                .iter()
                                .any(|c| c.type_ == "Ready" && c.status == "True")
                        })
                        .unwrap_or(false);
                    if !is_ready {
                        debug!("Node '{node_name}' is NotReady");
                        return Ok(());
                    }
                }
                Err(e) => debug!("Error polling node '{node_name}': {e}"),
            }
            sleep(poll).await;
        }
    }

    async fn list_pods_on_node(&self, node_name: &str) -> Result<Vec<Pod>, Error> {
        let api: Api<Pod> = Api::all(self.client.clone());
        Ok(api
            .list(&ListParams::default().fields(&format!("spec.nodeName={node_name}")))
            .await?
            .items)
    }

    fn is_mirror_pod(pod: &Pod) -> bool {
        pod.metadata
            .annotations
            .as_ref()
            .map(|a| a.contains_key("kubernetes.io/config.mirror"))
            .unwrap_or(false)
    }

    fn is_daemonset_pod(pod: &Pod) -> bool {
        pod.metadata
            .owner_references
            .as_ref()
            .map(|refs| refs.iter().any(|r| r.kind == "DaemonSet"))
            .unwrap_or(false)
    }

    fn has_emptydir_volume(pod: &Pod) -> bool {
        pod.spec
            .as_ref()
            .and_then(|s| s.volumes.as_ref())
            .map(|vols| vols.iter().any(|v| v.empty_dir.is_some()))
            .unwrap_or(false)
    }

    fn is_completed_pod(pod: &Pod) -> bool {
        pod.status
            .as_ref()
            .and_then(|s| s.phase.as_deref())
            .map(|phase| phase == "Succeeded" || phase == "Failed")
            .unwrap_or(false)
    }

    fn classify_pods_for_drain(
        pods: &[Pod],
        options: &DrainOptions,
    ) -> Result<(Vec<Pod>, Vec<String>), String> {
        let mut evictable = Vec::new();
        let mut skipped = Vec::new();
        let mut blocking = Vec::new();

        for pod in pods {
            let name = pod.metadata.name.as_deref().unwrap_or("<unknown>");
            let ns = pod.metadata.namespace.as_deref().unwrap_or("<unknown>");
            let qualified = format!("{ns}/{name}");

            if Self::is_mirror_pod(pod) {
                skipped.push(format!("{qualified} (mirror pod)"));
                continue;
            }
            if Self::is_completed_pod(pod) {
                skipped.push(format!("{qualified} (completed)"));
                continue;
            }
            if Self::is_daemonset_pod(pod) {
                if options.ignore_daemonsets {
                    skipped.push(format!("{qualified} (DaemonSet-managed)"));
                } else {
                    blocking.push(format!(
                        "{qualified} is managed by a DaemonSet (set ignore_daemonsets to skip)"
                    ));
                }
                continue;
            }
            if Self::has_emptydir_volume(pod) && !options.delete_emptydir_data {
                blocking.push(format!(
                    "{qualified} uses emptyDir volumes (set delete_emptydir_data to allow eviction)"
                ));
                continue;
            }
            evictable.push(pod.clone());
        }

        if !blocking.is_empty() {
            return Err(format!(
                "Cannot drain node — the following pods block eviction:\n - {}",
                blocking.join("\n - ")
            ));
        }
        Ok((evictable, skipped))
    }

    async fn evict_pod(&self, pod: &Pod) -> Result<(), Error> {
        let name = pod.metadata.name.as_deref().unwrap_or_default();
        let ns = pod.metadata.namespace.as_deref().unwrap_or_default();
        debug!("Evicting pod {ns}/{name}");
        Api::<Pod>::namespaced(self.client.clone(), ns)
            .evict(name, &EvictParams::default())
            .await
            .map(|_| ())
    }

    /// Drains a node: cordon → classify → evict & wait.
    pub async fn drain_node(&self, node_name: &str, options: &DrainOptions) -> Result<(), Error> {
        debug!("Cordoning '{node_name}'");
        self.cordon_node(node_name).await?;

        let pods = self.list_pods_on_node(node_name).await?;
        debug!("Found {} pod(s) on '{node_name}'", pods.len());

        let (evictable, skipped) =
            Self::classify_pods_for_drain(&pods, options).map_err(|msg| {
                error!("{msg}");
                Error::Discovery(DiscoveryError::MissingResource(msg))
            })?;

        for s in &skipped {
            info!("Skipping pod: {s}");
        }
        if evictable.is_empty() {
            info!("No pods to evict on '{node_name}'");
            return Ok(());
        }
        info!("Evicting {} pod(s) from '{node_name}'", evictable.len());

        let mut start = tokio::time::Instant::now();
        let poll = Duration::from_secs(5);
        let mut pending = evictable;

        loop {
            for pod in &pending {
                match self.evict_pod(pod).await {
                    Ok(()) => {}
                    Err(Error::Api(ErrorResponse { code: 404, .. })) => {}
                    Err(Error::Api(ErrorResponse { code: 429, .. })) => {
                        warn!(
                            "PDB blocked eviction of {}/{}; will retry",
                            pod.metadata.namespace.as_deref().unwrap_or(""),
                            pod.metadata.name.as_deref().unwrap_or("")
                        );
                    }
                    Err(e) => {
                        error!(
                            "Failed to evict {}/{}: {e}",
                            pod.metadata.namespace.as_deref().unwrap_or(""),
                            pod.metadata.name.as_deref().unwrap_or("")
                        );
                        return Err(e);
                    }
                }
            }

            sleep(poll).await;

            let mut still_present = Vec::new();
            for pod in pending {
                let ns = pod.metadata.namespace.as_deref().unwrap_or_default();
                let name = pod.metadata.name.as_deref().unwrap_or_default();
                match self.get_pod(name, Some(ns)).await? {
                    Some(_) => still_present.push(pod),
                    None => debug!("Pod {ns}/{name} evicted"),
                }
            }
            pending = still_present;

            if pending.is_empty() {
                break;
            }

            if start.elapsed() > options.timeout {
                match helper::prompt_drain_timeout_action(
                    node_name,
                    pending.len(),
                    options.timeout,
                )? {
                    helper::DrainTimeoutAction::Accept => break,
                    helper::DrainTimeoutAction::Retry => {
                        start = tokio::time::Instant::now();
                        continue;
                    }
                    helper::DrainTimeoutAction::Abort => {
                        return Err(Error::Discovery(DiscoveryError::MissingResource(format!(
                            "Drain aborted. {} pod(s) remaining on '{node_name}'",
                            pending.len()
                        ))));
                    }
                }
            }
            debug!("Waiting for {} pod(s) on '{node_name}'", pending.len());
        }

        debug!("'{node_name}' drained successfully");
        Ok(())
    }

    /// Safely reboots a node: drain → reboot → wait for Ready → uncordon.
    pub async fn reboot_node(
        &self,
        node_name: &str,
        drain_options: &DrainOptions,
        timeout: Duration,
    ) -> Result<(), Error> {
        info!("Starting reboot for '{node_name}'");
        let node_api: Api<Node> = Api::all(self.client.clone());

        let boot_id_before = node_api
            .get(node_name)
            .await?
            .status
            .as_ref()
            .and_then(|s| s.node_info.as_ref())
            .map(|ni| ni.boot_id.clone())
            .ok_or_else(|| {
                Error::Discovery(DiscoveryError::MissingResource(format!(
                    "Node '{node_name}' has no boot_id in status"
                )))
            })?;

        info!("Draining '{node_name}'");
        self.drain_node(node_name, drain_options).await?;

        let start = tokio::time::Instant::now();

        info!("Scheduling reboot for '{node_name}'");
        let reboot_cmd =
            "echo rebooting ; nohup bash -c 'sleep 5 && nsenter -t 1 -m -- systemctl reboot'";
        match self
            .run_privileged_command_on_node(node_name, reboot_cmd)
            .await
        {
            Ok(_) => debug!("Reboot command dispatched"),
            Err(e) => debug!("Reboot command error (expected if node began shutdown): {e}"),
        }

        info!("Waiting for '{node_name}' to begin shutdown");
        self.wait_for_node_not_ready(node_name, timeout.saturating_sub(start.elapsed()))
            .await?;

        if start.elapsed() > timeout {
            return Err(Error::Discovery(DiscoveryError::MissingResource(format!(
                "Timeout during reboot of '{node_name}' (shutdown phase)"
            ))));
        }

        info!("Waiting for '{node_name}' to come back online");
        self.wait_for_node_ready_with_timeout(node_name, timeout.saturating_sub(start.elapsed()))
            .await?;

        if start.elapsed() > timeout {
            return Err(Error::Discovery(DiscoveryError::MissingResource(format!(
                "Timeout during reboot of '{node_name}' (ready phase)"
            ))));
        }

        let boot_id_after = node_api
            .get(node_name)
            .await?
            .status
            .as_ref()
            .and_then(|s| s.node_info.as_ref())
            .map(|ni| ni.boot_id.clone())
            .ok_or_else(|| {
                Error::Discovery(DiscoveryError::MissingResource(format!(
                    "Node '{node_name}' has no boot_id after reboot"
                )))
            })?;

        if boot_id_before == boot_id_after {
            return Err(Error::Discovery(DiscoveryError::MissingResource(format!(
                "Node '{node_name}' did not actually reboot (boot_id unchanged: {boot_id_before})"
            ))));
        }

        info!("'{node_name}' rebooted ({boot_id_before} → {boot_id_after})");
        self.uncordon_node(node_name).await?;
        info!("'{node_name}' reboot complete ({:?})", start.elapsed());
        Ok(())
    }

    /// Write a set of files to a node's filesystem via a privileged ephemeral pod.
    pub async fn write_files_to_node(
        &self,
        node_name: &str,
        files: &[NodeFile],
    ) -> Result<String, Error> {
        let ns = self.client.default_namespace();
        let suffix = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .unwrap()
            .as_millis();
        let name = format!("harmony-k8s-writer-{suffix}");

        debug!("Writing {} file(s) to '{node_name}'", files.len());

        let mut data = BTreeMap::new();
        let mut script = String::from("set -e\n");
        for (i, file) in files.iter().enumerate() {
            let key = format!("f{i}");
            data.insert(key.clone(), file.content.clone());
            script.push_str(&format!("mkdir -p \"$(dirname \"/host{}\")\"\n", file.path));
            script.push_str(&format!("cp \"/payload/{key}\" \"/host{}\"\n", file.path));
            script.push_str(&format!("chmod {:o} \"/host{}\"\n", file.mode, file.path));
        }

        let cm = ConfigMap {
            metadata: ObjectMeta {
                name: Some(name.clone()),
                namespace: Some(ns.to_string()),
                ..Default::default()
            },
            data: Some(data),
            ..Default::default()
        };

        let cm_api: Api<ConfigMap> = Api::namespaced(self.client.clone(), ns);
        cm_api.create(&PostParams::default(), &cm).await?;
        debug!("Created ConfigMap '{name}'");

        let (host_vol, host_mount) = helper::host_root_volume();
        let payload_vol = Volume {
            name: "payload".to_string(),
            config_map: Some(ConfigMapVolumeSource {
                name: name.clone(),
                ..Default::default()
            }),
            ..Default::default()
        };
        let payload_mount = VolumeMount {
            name: "payload".to_string(),
            mount_path: "/payload".to_string(),
            ..Default::default()
        };

        let bundle = helper::build_privileged_bundle(
            PrivilegedPodConfig {
                name: name.clone(),
                namespace: ns.to_string(),
                node_name: node_name.to_string(),
                container_name: "writer".to_string(),
                command: vec!["/bin/bash".to_string(), "-c".to_string(), script],
                volumes: vec![payload_vol, host_vol],
                volume_mounts: vec![payload_mount, host_mount],
                host_pid: false,
                host_network: false,
            },
            &self.get_k8s_distribution().await?,
        );

        bundle.apply(self).await?;
        debug!("Created privileged pod bundle '{name}'");

        let result = self.wait_for_pod_completion(&name, ns).await;

        debug!("Cleaning up '{name}'");
        let _ = bundle.delete(self).await;
        let _ = cm_api.delete(&name, &DeleteParams::default()).await;

        result
    }

    /// Run a privileged command on a node via an ephemeral pod.
    pub async fn run_privileged_command_on_node(
        &self,
        node_name: &str,
        command: &str,
    ) -> Result<String, Error> {
        let namespace = self.client.default_namespace();
        let suffix = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .unwrap()
            .as_millis();
        let name = format!("harmony-k8s-cmd-{suffix}");

        debug!("Running privileged command on '{node_name}': {command}");

        let (host_vol, host_mount) = helper::host_root_volume();
        let bundle = helper::build_privileged_bundle(
            PrivilegedPodConfig {
                name: name.clone(),
                namespace: namespace.to_string(),
                node_name: node_name.to_string(),
                container_name: "runner".to_string(),
                command: vec![
                    "/bin/bash".to_string(),
                    "-c".to_string(),
                    command.to_string(),
                ],
                volumes: vec![host_vol],
                volume_mounts: vec![host_mount],
                host_pid: true,
                host_network: true,
            },
            &self.get_k8s_distribution().await?,
        );

        bundle.apply(self).await?;
        debug!("Privileged pod '{name}' created");

        let result = self.wait_for_pod_completion(&name, namespace).await;

        debug!("Cleaning up '{name}'");
        let _ = bundle.delete(self).await;

        result
    }
}

// ── Tests ────────────────────────────────────────────────────────────────────

#[cfg(test)]
mod tests {
    use k8s_openapi::api::core::v1::{EmptyDirVolumeSource, PodSpec, PodStatus, Volume};
    use k8s_openapi::apimachinery::pkg::apis::meta::v1::{ObjectMeta, OwnerReference};

    use super::*;

    fn base_pod(name: &str, ns: &str) -> Pod {
        Pod {
            metadata: ObjectMeta {
                name: Some(name.to_string()),
                namespace: Some(ns.to_string()),
                ..Default::default()
            },
            spec: Some(PodSpec::default()),
            status: Some(PodStatus {
                phase: Some("Running".to_string()),
                ..Default::default()
            }),
        }
    }

    fn mirror_pod(name: &str, ns: &str) -> Pod {
        let mut pod = base_pod(name, ns);
        pod.metadata.annotations = Some(std::collections::BTreeMap::from([(
            "kubernetes.io/config.mirror".to_string(),
            "abc123".to_string(),
        )]));
        pod
    }

    fn daemonset_pod(name: &str, ns: &str) -> Pod {
        let mut pod = base_pod(name, ns);
        pod.metadata.owner_references = Some(vec![OwnerReference {
            api_version: "apps/v1".to_string(),
            kind: "DaemonSet".to_string(),
            name: "some-ds".to_string(),
            uid: "uid-ds".to_string(),
            ..Default::default()
        }]);
        pod
    }

    fn emptydir_pod(name: &str, ns: &str) -> Pod {
        let mut pod = base_pod(name, ns);
        pod.spec = Some(PodSpec {
            volumes: Some(vec![Volume {
                name: "scratch".to_string(),
                empty_dir: Some(EmptyDirVolumeSource::default()),
                ..Default::default()
            }]),
            ..Default::default()
        });
        pod
    }

    fn completed_pod(name: &str, ns: &str, phase: &str) -> Pod {
        let mut pod = base_pod(name, ns);
        pod.status = Some(PodStatus {
            phase: Some(phase.to_string()),
            ..Default::default()
        });
        pod
    }

    fn default_opts() -> DrainOptions {
        DrainOptions::default()
    }

    // All test bodies are identical to the original — only the module path changed.

    #[test]
    fn empty_pod_list_returns_empty_vecs() {
        let (e, s) = K8sClient::classify_pods_for_drain(&[], &default_opts()).unwrap();
        assert!(e.is_empty());
        assert!(s.is_empty());
    }

    #[test]
    fn normal_pod_is_evictable() {
        let pods = vec![base_pod("web", "default")];
        let (e, s) = K8sClient::classify_pods_for_drain(&pods, &default_opts()).unwrap();
        assert_eq!(e.len(), 1);
        assert!(s.is_empty());
    }

    #[test]
    fn mirror_pod_is_skipped() {
        let pods = vec![mirror_pod("kube-apiserver", "kube-system")];
        let (e, s) = K8sClient::classify_pods_for_drain(&pods, &default_opts()).unwrap();
        assert!(e.is_empty());
        assert!(s[0].contains("mirror pod"));
    }

    #[test]
    fn completed_pods_are_skipped() {
        for phase in ["Succeeded", "Failed"] {
            let pods = vec![completed_pod("job", "batch", phase)];
            let (e, s) = K8sClient::classify_pods_for_drain(&pods, &default_opts()).unwrap();
            assert!(e.is_empty());
            assert!(s[0].contains("completed"));
        }
    }

    #[test]
    fn daemonset_skipped_when_ignored() {
        let pods = vec![daemonset_pod("fluentd", "logging")];
        let opts = DrainOptions {
            ignore_daemonsets: true,
            ..default_opts()
        };
        let (e, s) = K8sClient::classify_pods_for_drain(&pods, &opts).unwrap();
        assert!(e.is_empty());
        assert!(s[0].contains("DaemonSet-managed"));
    }

    #[test]
    fn daemonset_blocks_when_not_ignored() {
        let pods = vec![daemonset_pod("fluentd", "logging")];
        let opts = DrainOptions {
            ignore_daemonsets: false,
            ..default_opts()
        };
        let err = K8sClient::classify_pods_for_drain(&pods, &opts).unwrap_err();
        assert!(err.contains("DaemonSet") && err.contains("logging/fluentd"));
    }

    #[test]
    fn emptydir_blocks_without_flag() {
        let pods = vec![emptydir_pod("cache", "default")];
        let opts = DrainOptions {
            delete_emptydir_data: false,
            ..default_opts()
        };
        let err = K8sClient::classify_pods_for_drain(&pods, &opts).unwrap_err();
        assert!(err.contains("emptyDir") && err.contains("default/cache"));
    }

    #[test]
    fn emptydir_evictable_with_flag() {
        let pods = vec![emptydir_pod("cache", "default")];
        let opts = DrainOptions {
            delete_emptydir_data: true,
            ..default_opts()
        };
        let (e, s) = K8sClient::classify_pods_for_drain(&pods, &opts).unwrap();
        assert_eq!(e.len(), 1);
        assert!(s.is_empty());
    }

    #[test]
    fn multiple_blocking_all_reported() {
        let pods = vec![daemonset_pod("ds", "ns1"), emptydir_pod("ed", "ns2")];
        let opts = DrainOptions {
            ignore_daemonsets: false,
            delete_emptydir_data: false,
            ..default_opts()
        };
        let err = K8sClient::classify_pods_for_drain(&pods, &opts).unwrap_err();
        assert!(err.contains("ns1/ds") && err.contains("ns2/ed"));
    }

    #[test]
    fn mixed_pods_classified_correctly() {
        let pods = vec![
            base_pod("web", "default"),
            mirror_pod("kube-apiserver", "kube-system"),
            daemonset_pod("fluentd", "logging"),
            completed_pod("job", "batch", "Succeeded"),
            base_pod("api", "default"),
        ];
        let (e, s) = K8sClient::classify_pods_for_drain(&pods, &default_opts()).unwrap();
        let names: Vec<&str> = e
            .iter()
            .map(|p| p.metadata.name.as_deref().unwrap())
            .collect();
        assert_eq!(names, vec!["web", "api"]);
        assert_eq!(s.len(), 3);
    }

    #[test]
    fn mirror_checked_before_completed() {
        let mut pod = mirror_pod("static-etcd", "kube-system");
        pod.status = Some(PodStatus {
            phase: Some("Succeeded".to_string()),
            ..Default::default()
        });
        let (_, s) = K8sClient::classify_pods_for_drain(&[pod], &default_opts()).unwrap();
        assert!(s[0].contains("mirror pod"), "got: {}", s[0]);
    }

    #[test]
    fn completed_checked_before_daemonset() {
        let mut pod = daemonset_pod("collector", "monitoring");
        pod.status = Some(PodStatus {
            phase: Some("Failed".to_string()),
            ..Default::default()
        });
        let (_, s) = K8sClient::classify_pods_for_drain(&[pod], &default_opts()).unwrap();
        assert!(s[0].contains("completed"), "got: {}", s[0]);
    }
}
193 harmony-k8s/src/pod.rs Normal file
@@ -0,0 +1,193 @@
use std::time::Duration;

use k8s_openapi::api::core::v1::Pod;
use kube::{
    Error,
    api::{Api, AttachParams, ListParams},
    error::DiscoveryError,
    runtime::reflector::Lookup,
};
use log::debug;
use tokio::io::AsyncReadExt;
use tokio::time::sleep;

use crate::client::K8sClient;

impl K8sClient {
    pub async fn get_pod(&self, name: &str, namespace: Option<&str>) -> Result<Option<Pod>, Error> {
        let api: Api<Pod> = match namespace {
            Some(ns) => Api::namespaced(self.client.clone(), ns),
            None => Api::default_namespaced(self.client.clone()),
        };
        api.get_opt(name).await
    }

    pub async fn wait_for_pod_ready(
        &self,
        pod_name: &str,
        namespace: Option<&str>,
    ) -> Result<(), Error> {
        let mut elapsed = 0u64;
        let interval = 5u64;
        let timeout_secs = 120u64;
        loop {
            if let Some(p) = self.get_pod(pod_name, namespace).await? {
                if let Some(phase) = p.status.and_then(|s| s.phase) {
                    if phase.to_lowercase() == "running" {
                        return Ok(());
                    }
                }
            }
            if elapsed >= timeout_secs {
                return Err(Error::Discovery(DiscoveryError::MissingResource(format!(
                    "Pod '{}' in '{}' did not become ready within {timeout_secs}s",
                    pod_name,
                    namespace.unwrap_or("<default>"),
                ))));
            }
            sleep(Duration::from_secs(interval)).await;
            elapsed += interval;
        }
    }

    /// Polls a pod until it reaches `Succeeded` or `Failed`, then returns its
    /// logs. Used internally by node operations.
    pub(crate) async fn wait_for_pod_completion(
        &self,
        name: &str,
        namespace: &str,
    ) -> Result<String, Error> {
        let api: Api<Pod> = Api::namespaced(self.client.clone(), namespace);
        let poll_interval = Duration::from_secs(2);
        for _ in 0..60 {
            sleep(poll_interval).await;
            let p = api.get(name).await?;
            match p.status.and_then(|s| s.phase).as_deref() {
                Some("Succeeded") => {
                    let logs = api
                        .logs(name, &Default::default())
                        .await
                        .unwrap_or_default();
                    debug!("Pod {namespace}/{name} succeeded. Logs: {logs}");
                    return Ok(logs);
                }
                Some("Failed") => {
                    let logs = api
                        .logs(name, &Default::default())
                        .await
                        .unwrap_or_default();
                    debug!("Pod {namespace}/{name} failed. Logs: {logs}");
                    return Err(Error::Discovery(DiscoveryError::MissingResource(format!(
                        "Pod '{name}' failed.\n{logs}"
                    ))));
                }
                _ => {}
            }
        }
        Err(Error::Discovery(DiscoveryError::MissingResource(format!(
            "Timed out waiting for pod '{name}'"
        ))))
    }

    /// Execute a command in the first pod matching `{label}={name}`.
    pub async fn exec_app_capture_output(
        &self,
        name: String,
        label: String,
        namespace: Option<&str>,
        command: Vec<&str>,
    ) -> Result<String, String> {
        let api: Api<Pod> = match namespace {
            Some(ns) => Api::namespaced(self.client.clone(), ns),
            None => Api::default_namespaced(self.client.clone()),
        };
        let pod_list = api
            .list(&ListParams::default().labels(&format!("{label}={name}")))
            .await
            .expect("Failed to list pods");

        let pod_name = pod_list
            .items
            .first()
            .expect("No matching pod")
            .name()
            .expect("Pod has no name")
            .into_owned();

        match api
            .exec(
                &pod_name,
                command,
                &AttachParams::default().stdout(true).stderr(true),
            )
            .await
        {
            Err(e) => Err(e.to_string()),
            Ok(mut process) => {
                let status = process
                    .take_status()
                    .expect("No status handle")
                    .await
                    .expect("Status channel closed");

                if let Some(s) = status.status {
                    let mut buf = String::new();
                    if let Some(mut stdout) = process.stdout() {
                        stdout
                            .read_to_string(&mut buf)
                            .await
                            .map_err(|e| format!("Failed to read stdout: {e}"))?;
                    }
                    debug!("exec status: {} - {:?}", s, status.details);
                    if s == "Success" { Ok(buf) } else { Err(s) }
                } else {
                    Err("No inner status from pod exec".to_string())
                }
            }
        }
    }

    /// Execute a command in the first pod matching
    /// `app.kubernetes.io/name={name}`.
    pub async fn exec_app(
        &self,
        name: String,
        namespace: Option<&str>,
        command: Vec<&str>,
    ) -> Result<(), String> {
        let api: Api<Pod> = match namespace {
            Some(ns) => Api::namespaced(self.client.clone(), ns),
            None => Api::default_namespaced(self.client.clone()),
        };
        let pod_list = api
            .list(&ListParams::default().labels(&format!("app.kubernetes.io/name={name}")))
            .await
            .expect("Failed to list pods");

        let pod_name = pod_list
            .items
            .first()
            .expect("No matching pod")
            .name()
            .expect("Pod has no name")
            .into_owned();

        match api.exec(&pod_name, command, &AttachParams::default()).await {
            Err(e) => Err(e.to_string()),
            Ok(mut process) => {
                let status = process
                    .take_status()
                    .expect("No status handle")
                    .await
                    .expect("Status channel closed");

                if let Some(s) = status.status {
                    debug!("exec status: {} - {:?}", s, status.details);
                    if s == "Success" { Ok(()) } else { Err(s) }
                } else {
                    Err("No inner status from pod exec".to_string())
                }
            }
        }
    }
}
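
// NOTE: hedged usage sketch (not part of the diff) of the helpers above,
// assuming an already-constructed `client: K8sClient`; the pod, label, and
// namespace names are hypothetical:
//
//     // Block until a pod reports the Running phase (120s budget):
//     client.wait_for_pod_ready("ntfy-0", Some("monitoring")).await?;
//
//     // Run a command in the first pod labelled `app=my-app` and keep stdout:
//     let out = client
//         .exec_app_capture_output(
//             "my-app".to_string(),
//             "app".to_string(),
//             Some("default"),
//             vec!["cat", "/etc/hostname"],
//         )
//         .await?;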
316 harmony-k8s/src/resources.rs Normal file
@@ -0,0 +1,316 @@
use std::collections::HashMap;

use k8s_openapi::api::{
    apps::v1::Deployment,
    core::v1::{Node, ServiceAccount},
};
use k8s_openapi::apiextensions_apiserver::pkg::apis::apiextensions::v1::CustomResourceDefinition;
use kube::api::ApiResource;
use kube::{
    Error, Resource,
    api::{Api, DynamicObject, GroupVersionKind, ListParams, ObjectList},
    runtime::conditions,
    runtime::wait::await_condition,
};
use log::debug;
use serde::de::DeserializeOwned;
use serde_json::Value;
use std::time::Duration;

use crate::client::K8sClient;
use crate::types::ScopeResolver;

impl K8sClient {
    pub async fn has_healthy_deployment_with_label(
        &self,
        namespace: &str,
        label_selector: &str,
    ) -> Result<bool, Error> {
        let api: Api<Deployment> = Api::namespaced(self.client.clone(), namespace);
        let list = api
            .list(&ListParams::default().labels(label_selector))
            .await?;
        for d in list.items {
            let available = d
                .status
                .as_ref()
                .and_then(|s| s.available_replicas)
                .unwrap_or(0);
            if available > 0 {
                return Ok(true);
            }
            if let Some(conds) = d.status.as_ref().and_then(|s| s.conditions.as_ref()) {
                if conds
                    .iter()
                    .any(|c| c.type_ == "Available" && c.status == "True")
                {
                    return Ok(true);
                }
            }
        }
        Ok(false)
    }

    pub async fn list_namespaces_with_healthy_deployments(
        &self,
        label_selector: &str,
    ) -> Result<Vec<String>, Error> {
        let api: Api<Deployment> = Api::all(self.client.clone());
        let list = api
            .list(&ListParams::default().labels(label_selector))
            .await?;

        let mut healthy_ns: HashMap<String, bool> = HashMap::new();
        for d in list.items {
            let ns = match d.metadata.namespace.clone() {
                Some(n) => n,
                None => continue,
            };
            let available = d
                .status
                .as_ref()
                .and_then(|s| s.available_replicas)
                .unwrap_or(0);
            let is_healthy = if available > 0 {
                true
            } else {
                d.status
                    .as_ref()
                    .and_then(|s| s.conditions.as_ref())
                    .map(|c| {
                        c.iter()
                            .any(|c| c.type_ == "Available" && c.status == "True")
                    })
                    .unwrap_or(false)
            };
            if is_healthy {
                healthy_ns.insert(ns, true);
            }
        }
        Ok(healthy_ns.into_keys().collect())
    }

    pub async fn get_controller_service_account_name(
        &self,
        ns: &str,
    ) -> Result<Option<String>, Error> {
        let api: Api<Deployment> = Api::namespaced(self.client.clone(), ns);
        let list = api
            .list(&ListParams::default().labels("app.kubernetes.io/component=controller"))
            .await?;
        if let Some(dep) = list.items.first() {
            if let Some(sa) = dep
                .spec
                .as_ref()
                .and_then(|s| s.template.spec.as_ref())
                .and_then(|s| s.service_account_name.clone())
            {
                return Ok(Some(sa));
            }
        }
        Ok(None)
    }

    pub async fn list_clusterrolebindings_json(&self) -> Result<Vec<Value>, Error> {
        let gvk = GroupVersionKind::gvk("rbac.authorization.k8s.io", "v1", "ClusterRoleBinding");
        let ar = ApiResource::from_gvk(&gvk);
        let api: Api<DynamicObject> = Api::all_with(self.client.clone(), &ar);
        let list = api.list(&ListParams::default()).await?;
        Ok(list
            .items
            .into_iter()
            .map(|o| serde_json::to_value(&o).unwrap_or(Value::Null))
            .collect())
    }

    pub async fn is_service_account_cluster_wide(&self, sa: &str, ns: &str) -> Result<bool, Error> {
        let sa_user = format!("system:serviceaccount:{ns}:{sa}");
        for crb in self.list_clusterrolebindings_json().await? {
            if let Some(subjects) = crb.get("subjects").and_then(|s| s.as_array()) {
                for subj in subjects {
                    let kind = subj.get("kind").and_then(|v| v.as_str()).unwrap_or("");
                    let name = subj.get("name").and_then(|v| v.as_str()).unwrap_or("");
                    let subj_ns = subj.get("namespace").and_then(|v| v.as_str()).unwrap_or("");
                    if (kind == "ServiceAccount" && name == sa && subj_ns == ns)
                        || (kind == "User" && name == sa_user)
                    {
                        return Ok(true);
                    }
                }
            }
        }
        Ok(false)
    }

    pub async fn has_crd(&self, name: &str) -> Result<bool, Error> {
        let api: Api<CustomResourceDefinition> = Api::all(self.client.clone());
        let crds = api
            .list(&ListParams::default().fields(&format!("metadata.name={name}")))
            .await?;
        Ok(!crds.items.is_empty())
    }

    pub async fn service_account_api(&self, namespace: &str) -> Api<ServiceAccount> {
        Api::namespaced(self.client.clone(), namespace)
    }

    pub async fn get_resource_json_value(
        &self,
        name: &str,
        namespace: Option<&str>,
        gvk: &GroupVersionKind,
    ) -> Result<DynamicObject, Error> {
        let ar = ApiResource::from_gvk(gvk);
        let api: Api<DynamicObject> = match namespace {
            Some(ns) => Api::namespaced_with(self.client.clone(), ns, &ar),
            None => Api::default_namespaced_with(self.client.clone(), &ar),
        };
        api.get(name).await
    }

    pub async fn get_secret_json_value(
        &self,
        name: &str,
        namespace: Option<&str>,
    ) -> Result<DynamicObject, Error> {
        self.get_resource_json_value(
            name,
            namespace,
            &GroupVersionKind {
                group: String::new(),
                version: "v1".to_string(),
                kind: "Secret".to_string(),
            },
        )
        .await
    }

    pub async fn get_deployment(
        &self,
        name: &str,
        namespace: Option<&str>,
    ) -> Result<Option<Deployment>, Error> {
        let api: Api<Deployment> = match namespace {
            Some(ns) => {
                debug!("Getting namespaced deployment '{name}' in '{ns}'");
                Api::namespaced(self.client.clone(), ns)
            }
            None => {
                debug!("Getting deployment '{name}' in default namespace");
                Api::default_namespaced(self.client.clone())
            }
        };
        api.get_opt(name).await
    }

    pub async fn scale_deployment(
        &self,
        name: &str,
        namespace: Option<&str>,
        replicas: u32,
    ) -> Result<(), Error> {
        let api: Api<Deployment> = match namespace {
            Some(ns) => Api::namespaced(self.client.clone(), ns),
            None => Api::default_namespaced(self.client.clone()),
        };
        use kube::api::{Patch, PatchParams};
        use serde_json::json;
        let patch = json!({ "spec": { "replicas": replicas } });
        api.patch_scale(name, &PatchParams::default(), &Patch::Merge(&patch))
            .await?;
        Ok(())
    }

    pub async fn delete_deployment(
        &self,
        name: &str,
        namespace: Option<&str>,
    ) -> Result<(), Error> {
        let api: Api<Deployment> = match namespace {
            Some(ns) => Api::namespaced(self.client.clone(), ns),
            None => Api::default_namespaced(self.client.clone()),
        };
        api.delete(name, &kube::api::DeleteParams::default())
            .await?;
        Ok(())
    }

    pub async fn wait_until_deployment_ready(
        &self,
        name: &str,
        namespace: Option<&str>,
        timeout: Option<Duration>,
    ) -> Result<(), String> {
        let api: Api<Deployment> = match namespace {
            Some(ns) => Api::namespaced(self.client.clone(), ns),
            None => Api::default_namespaced(self.client.clone()),
        };
        let timeout = timeout.unwrap_or(Duration::from_secs(120));
        let establish = await_condition(api, name, conditions::is_deployment_completed());
        tokio::time::timeout(timeout, establish)
            .await
            .map(|_| ())
            .map_err(|_| "Timed out waiting for deployment".to_string())
    }

    /// Gets a single named resource, using the correct API scope for `K`.
    pub async fn get_resource<K>(
        &self,
        name: &str,
        namespace: Option<&str>,
    ) -> Result<Option<K>, Error>
    where
        K: Resource + Clone + std::fmt::Debug + DeserializeOwned,
        <K as Resource>::Scope: ScopeResolver<K>,
        <K as Resource>::DynamicType: Default,
    {
        let api: Api<K> =
            <<K as Resource>::Scope as ScopeResolver<K>>::get_api(&self.client, namespace);
        api.get_opt(name).await
    }

    pub async fn list_resources<K>(
        &self,
        namespace: Option<&str>,
        list_params: Option<ListParams>,
    ) -> Result<ObjectList<K>, Error>
    where
        K: Resource + Clone + std::fmt::Debug + DeserializeOwned,
        <K as Resource>::Scope: ScopeResolver<K>,
        <K as Resource>::DynamicType: Default,
    {
        let api: Api<K> =
            <<K as Resource>::Scope as ScopeResolver<K>>::get_api(&self.client, namespace);
        api.list(&list_params.unwrap_or_default()).await
    }

    pub async fn list_all_resources_with_labels<K>(&self, labels: &str) -> Result<Vec<K>, Error>
    where
        K: Resource + Clone + std::fmt::Debug + DeserializeOwned,
        <K as Resource>::DynamicType: Default,
    {
        Api::<K>::all(self.client.clone())
            .list(&ListParams::default().labels(labels))
            .await
            .map(|l| l.items)
    }

    pub async fn get_all_resource_in_all_namespace<K>(&self) -> Result<Vec<K>, Error>
    where
        K: Resource + Clone + std::fmt::Debug + DeserializeOwned,
        <K as Resource>::Scope: ScopeResolver<K>,
        <K as Resource>::DynamicType: Default,
    {
        Api::<K>::all(self.client.clone())
            .list(&Default::default())
            .await
            .map(|l| l.items)
    }

    pub async fn get_nodes(
        &self,
        list_params: Option<ListParams>,
    ) -> Result<ObjectList<Node>, Error> {
        self.list_resources(None, list_params).await
    }
}
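
// NOTE: hedged usage sketch (not part of the diff). The generic accessors
// above pick the cluster-scoped or namespaced `Api` at compile time via
// `ScopeResolver`, so callers can never request the wrong scope; resource
// names here are hypothetical:
//
//     use k8s_openapi::api::{apps::v1::Deployment, core::v1::Node};
//
//     // Node is cluster-scoped: the namespace argument is ignored.
//     let node = client.get_resource::<Node>("worker-1", None).await?;
//
//     // Deployment is namespaced: `None` falls back to the default namespace.
//     let deps = client
//         .list_resources::<Deployment>(Some("tenant-a"), None)
//         .await?;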
100 harmony-k8s/src/types.rs Normal file
@@ -0,0 +1,100 @@
use std::time::Duration;

use k8s_openapi::{ClusterResourceScope, NamespaceResourceScope};
use kube::{Api, Client, Resource};
use serde::Serialize;

/// Which Kubernetes distribution is running. Detected once at runtime via
/// [`crate::discovery::K8sClient::get_k8s_distribution`].
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
pub enum KubernetesDistribution {
    Default,
    OpenshiftFamily,
    K3sFamily,
}

/// A file to be written to a node's filesystem.
#[derive(Debug, Clone)]
pub struct NodeFile {
    /// Absolute path on the host where the file should be written.
    pub path: String,
    /// Content of the file.
    pub content: String,
    /// UNIX permissions (e.g. `0o600`).
    pub mode: u32,
}

/// Options controlling the behaviour of a [`crate::K8sClient::drain_node`] operation.
#[derive(Debug, Clone)]
pub struct DrainOptions {
    /// Evict pods that use `emptyDir` volumes (ephemeral data is lost).
    /// Equivalent to `kubectl drain --delete-emptydir-data`.
    pub delete_emptydir_data: bool,
    /// Silently skip DaemonSet-managed pods instead of blocking the drain.
    /// Equivalent to `kubectl drain --ignore-daemonsets`.
    pub ignore_daemonsets: bool,
    /// Maximum wall-clock time to wait for all evictions to complete.
    pub timeout: Duration,
}

impl Default for DrainOptions {
    fn default() -> Self {
        Self {
            delete_emptydir_data: false,
            ignore_daemonsets: true,
            timeout: Duration::from_secs(1),
        }
    }
}

impl DrainOptions {
    pub fn default_ignore_daemonset_delete_emptydir_data() -> Self {
        Self {
            delete_emptydir_data: true,
            ignore_daemonsets: true,
            ..Self::default()
        }
    }
}
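
// NOTE: hedged sketch (not part of the diff) of how these options are meant
// to be consumed by `drain_node`, whose exact signature is assumed from the
// doc comment above:
//
//     let opts = DrainOptions {
//         timeout: Duration::from_secs(300),
//         ..DrainOptions::default_ignore_daemonset_delete_emptydir_data()
//     };
//     client.drain_node("worker-1", &opts).await?;
//
// The one-second default `timeout` looks short for real evictions, so call
// sites will likely want to override it as above.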

/// Controls how [`crate::K8sClient::apply_with_strategy`] behaves when the
/// resource already exists (or does not).
pub enum WriteMode {
    /// Server-side apply; create if absent, update if present (default).
    CreateOrUpdate,
    /// POST only; return an error if the resource already exists.
    Create,
    /// Server-side apply only; return an error if the resource does not exist.
    Update,
}
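
// NOTE: hedged sketch of how `WriteMode` is meant to be consumed; the exact
// `apply_with_strategy` signature is assumed from the doc comment above:
//
//     // Fail if the ConfigMap already exists, instead of silently patching it:
//     client
//         .apply_with_strategy(&cm, Some("monitoring"), WriteMode::Create)
//         .await?;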

// ── Scope resolution trait ───────────────────────────────────────────────────

/// Resolves the correct [`kube::Api`] for a resource type based on its scope
/// (cluster-wide vs. namespace-scoped).
pub trait ScopeResolver<K: Resource> {
    fn get_api(client: &Client, ns: Option<&str>) -> Api<K>;
}

impl<K> ScopeResolver<K> for ClusterResourceScope
where
    K: Resource<Scope = ClusterResourceScope>,
    <K as Resource>::DynamicType: Default,
{
    fn get_api(client: &Client, _ns: Option<&str>) -> Api<K> {
        Api::all(client.clone())
    }
}

impl<K> ScopeResolver<K> for NamespaceResourceScope
where
    K: Resource<Scope = NamespaceResourceScope>,
    <K as Resource>::DynamicType: Default,
{
    fn get_api(client: &Client, ns: Option<&str>) -> Api<K> {
        match ns {
            Some(ns) => Api::namespaced(client.clone(), ns),
            None => Api::default_namespaced(client.clone()),
        }
    }
}
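
// NOTE: hedged illustration (not part of the diff) of the dispatch above. In
// k8s-openapi, `Node` has `Scope = ClusterResourceScope` and `Pod` has
// `Scope = NamespaceResourceScope`, so the two impls are selected
// automatically from the type alone:
//
//     use k8s_openapi::api::core::v1::{Node, Pod};
//
//     let nodes: Api<Node> = ClusterResourceScope::get_api(&client, None);
//     let pods: Api<Pod> = NamespaceResourceScope::get_api(&client, Some("default"));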
@@ -21,6 +21,8 @@ semver = "1.0.23"
serde.workspace = true
serde_json.workspace = true
tokio.workspace = true
tokio-retry.workspace = true
tokio-util.workspace = true
derive-new.workspace = true
log.workspace = true
env_logger.workspace = true
@@ -31,6 +33,7 @@ opnsense-config-xml = { path = "../opnsense-config-xml" }
harmony_macros = { path = "../harmony_macros" }
harmony_types = { path = "../harmony_types" }
harmony_execution = { path = "../harmony_execution" }
harmony-k8s = { path = "../harmony-k8s" }
uuid.workspace = true
url.workspace = true
kube = { workspace = true, features = ["derive"] }
@@ -60,7 +63,6 @@ temp-dir = "0.1.14"
dyn-clone = "1.0.19"
similar.workspace = true
futures-util = "0.3.31"
tokio-util = "0.7.15"
strum = { version = "0.27.1", features = ["derive"] }
tempfile.workspace = true
serde_with = "3.14.0"
@@ -80,7 +82,7 @@ sqlx.workspace = true
inquire.workspace = true
brocade = { path = "../brocade" }
option-ext = "0.2.0"
tokio-retry = "0.3.0"
rand.workspace = true

[dev-dependencies]
pretty_assertions.workspace = true
@@ -4,8 +4,6 @@ use std::error::Error;
use async_trait::async_trait;
use derive_new::new;

use crate::inventory::HostRole;

use super::{
    data::Version, executors::ExecutorError, inventory::Inventory, topology::PreparationError,
};

@@ -1,4 +1,5 @@
use async_trait::async_trait;
use harmony_k8s::K8sClient;
use harmony_macros::ip;
use harmony_types::{
    id::Id,
@@ -8,7 +9,7 @@ use harmony_types::{
use log::debug;
use log::info;

use crate::topology::PxeOptions;
use crate::topology::{HelmCommand, PxeOptions};
use crate::{data::FileContent, executors::ExecutorError, topology::node_exporter::NodeExporter};
use crate::{infra::network_manager::OpenShiftNmStateNetworkManager, topology::PortConfig};

@@ -16,9 +17,12 @@ use super::{
    DHCPStaticEntry, DhcpServer, DnsRecord, DnsRecordType, DnsServer, Firewall, HostNetworkConfig,
    HttpServer, IpAddress, K8sclient, LoadBalancer, LoadBalancerService, LogicalHost, NetworkError,
    NetworkManager, PreparationError, PreparationOutcome, Router, Switch, SwitchClient,
    SwitchError, TftpServer, Topology, k8s::K8sClient,
    SwitchError, TftpServer, Topology,
};
use std::{
    process::Command,
    sync::{Arc, OnceLock},
};
use std::sync::{Arc, OnceLock};

#[derive(Debug, Clone)]
pub struct HAClusterTopology {
@@ -52,6 +56,30 @@ impl Topology for HAClusterTopology {
    }
}

impl HelmCommand for HAClusterTopology {
    fn get_helm_command(&self) -> Command {
        let mut cmd = Command::new("helm");
        if let Some(k) = &self.kubeconfig {
            cmd.args(["--kubeconfig", k]);
        }

        // FIXME: we should support a kube context anywhere there is a k8s client.
        // This likely belongs in the k8s client itself and should be extracted to a
        // separate crate. Helm could well be a feature of that external k8s client;
        // the same goes for kustomize.
        //
        // if let Some(c) = &self.k8s_context {
        //     cmd.args(["--kube-context", c]);
        // }

        info!("Using helm command {cmd:?}");
        cmd
    }
}
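
// NOTE: hedged sketch (not part of the diff) of what call sites gain from
// this impl; the chart arguments are illustrative:
//
//     let status = topology
//         .get_helm_command()
//         .args(["upgrade", "--install", "grafana", "grafana/grafana", "-n", "monitoring"])
//         .status()
//         .expect("failed to spawn helm");
//
// Because the kubeconfig flag is baked in here, every Helm-based score runs
// against the same cluster the topology's K8sClient talks to.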

#[async_trait]
impl K8sclient for HAClusterTopology {
    async fn k8s_client(&self) -> Result<Arc<K8sClient>, String> {

File diff suppressed because it is too large
@@ -1,6 +1,8 @@
use std::{collections::BTreeMap, process::Command, sync::Arc};

use async_trait::async_trait;
use base64::{Engine, engine::general_purpose};
use harmony_k8s::{K8sClient, KubernetesDistribution};
use harmony_types::rfc1123::Rfc1123Name;
use k8s_openapi::api::{
    core::v1::{Pod, Secret},
@@ -36,7 +38,6 @@ use crate::{
use super::super::{
    DeploymentTarget, HelmCommand, K8sclient, MultiTargetTopology, PreparationError,
    PreparationOutcome, Topology,
    k8s::K8sClient,
    tenant::{
        TenantConfig, TenantManager,
        k8s::K8sTenantManager,
@@ -53,13 +54,6 @@ struct K8sState {
    message: String,
}

#[derive(Debug, Clone, Serialize)]
pub enum KubernetesDistribution {
    OpenshiftFamily,
    K3sFamily,
    Default,
}

#[derive(Debug, Clone)]
enum K8sSource {
    LocalK3d,
@@ -64,26 +64,28 @@ impl Observability<RedHatClusterObservability> for K8sAnywhereTopology {
        inventory: &Inventory,
        receivers: Option<Vec<Box<dyn AlertReceiver<RedHatClusterObservability>>>>,
    ) -> Result<PreparationOutcome, PreparationError> {
        if let Some(receivers) = receivers {
            for receiver in receivers {
                info!("Installing receiver {}", receiver.name());
                let receiver_score = RedHatClusterObservabilityReceiverScore {
                    receiver,
                    sender: sender.clone(),
                };
                receiver_score
                    .create_interpret()
                    .execute(inventory, self)
                    .await
                    .map_err(|e| PreparationError::new(e.to_string()))?;
            }
            Ok(PreparationOutcome::Success {
                details: "Successfully installed receivers for OpenshiftClusterMonitoring"
                    .to_string(),
            })
        } else {
            Ok(PreparationOutcome::Noop)
        let receivers = match receivers {
            Some(r) if !r.is_empty() => r,
            _ => return Ok(PreparationOutcome::Noop),
        };

        for receiver in receivers {
            info!("Installing receiver {}", receiver.name());

            let receiver_score = RedHatClusterObservabilityReceiverScore {
                receiver,
                sender: sender.clone(),
            };
            receiver_score
                .create_interpret()
                .execute(inventory, self)
                .await
                .map_err(|e| PreparationError::new(e.to_string()))?;
        }

        Ok(PreparationOutcome::Success {
            details: "Successfully installed receivers for OpenshiftClusterMonitoring".to_string(),
        })
    }

    async fn install_rules(
@@ -1,7 +1,6 @@
use async_trait::async_trait;

use crate::{
    interpret::Outcome,
    inventory::Inventory,
    modules::postgresql::{
        K8sPostgreSQLScore,

@@ -16,7 +16,6 @@ pub mod tenant;
use derive_new::new;
pub use k8s_anywhere::*;
pub use localhost::*;
pub mod k8s;
mod load_balancer;
pub mod router;
mod tftp;

@@ -64,8 +64,7 @@ pub trait Observability<S: AlertSender> {
/// Defines the entity that receives the alerts from a sender. For example Discord, Slack, etc.
///
pub trait AlertReceiver<S: AlertSender>: std::fmt::Debug + Send + Sync {
    fn build_route(&self) -> Result<serde_yaml::Value, InterpretError>;
    fn build_receiver(&self) -> Result<serde_yaml::Value, InterpretError>;
    fn build(&self) -> Result<ReceiverInstallPlan, InterpretError>;
    fn name(&self) -> String;
    fn clone_box(&self) -> Box<dyn AlertReceiver<S>>;
}
@@ -176,6 +175,29 @@ impl<S: AlertSender> Clone for Box<dyn ScrapeTarget<S>> {
    }
}

pub struct ReceiverInstallPlan {
    pub install_operation: Option<Vec<InstallOperation>>,
    pub route: Option<AlertRoute>,
    pub receiver: Option<serde_yaml::Value>,
}

impl Default for ReceiverInstallPlan {
    fn default() -> Self {
        Self {
            install_operation: None,
            route: None,
            receiver: None,
        }
    }
}

pub enum InstallOperation {
    CreateSecret {
        name: String,
        data: BTreeMap<String, String>,
    },
}
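
// NOTE: hedged sketch (not part of the diff) of a receiver returning a plan
// with a pre-install step; field values are illustrative. Since every field
// is an Option, the struct could also simply derive Default.
//
//     let plan = ReceiverInstallPlan {
//         install_operation: Some(vec![InstallOperation::CreateSecret {
//             name: "discord-webhook-secret".to_string(),
//             data: BTreeMap::from([("webhook-url".to_string(), url.to_string())]),
//         }]),
//         route: Some(route),
//         receiver: Some(receiver_yaml),
//     };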

/// Generic routing that can map to various alert sender backends.
#[derive(Debug, Clone, Serialize)]
pub struct AlertRoute {
@@ -9,6 +9,7 @@ use std::{
use async_trait::async_trait;
use brocade::PortOperatingMode;
use derive_new::new;
use harmony_k8s::K8sClient;
use harmony_types::{
    id::Id,
    net::{IpAddress, MacAddress},
@@ -18,7 +19,7 @@ use serde::Serialize;

use crate::executors::ExecutorError;

use super::{LogicalHost, k8s::K8sClient};
use super::LogicalHost;

#[derive(Debug)]
pub struct DHCPStaticEntry {
@@ -1,10 +1,8 @@
use std::sync::Arc;

use crate::{
    executors::ExecutorError,
    topology::k8s::{ApplyStrategy, K8sClient},
};
use crate::executors::ExecutorError;
use async_trait::async_trait;
use harmony_k8s::K8sClient;
use k8s_openapi::{
    api::{
        core::v1::{LimitRange, Namespace, ResourceQuota},
@@ -14,7 +12,7 @@ use k8s_openapi::{
    },
    apimachinery::pkg::util::intstr::IntOrString,
};
use kube::{Resource, api::DynamicObject};
use kube::Resource;
use log::debug;
use serde::de::DeserializeOwned;
use serde_json::json;
@@ -59,7 +57,6 @@ impl K8sTenantManager {
    ) -> Result<K, ExecutorError>
    where
        <K as kube::Resource>::DynamicType: Default,
        <K as kube::Resource>::Scope: ApplyStrategy<K>,
    {
        self.apply_labels(&mut resource, config);
        self.k8s_client
@@ -5,6 +5,7 @@ use std::{

use askama::Template;
use async_trait::async_trait;
use harmony_k8s::{DrainOptions, K8sClient, NodeFile};
use harmony_types::id::Id;
use k8s_openapi::api::core::v1::Node;
use kube::{
@@ -15,10 +16,7 @@ use log::{debug, info, warn};

use crate::{
    modules::okd::crd::nmstate,
    topology::{
        HostNetworkConfig, NetworkError, NetworkManager,
        k8s::{DrainOptions, K8sClient, NodeFile},
    },
    topology::{HostNetworkConfig, NetworkError, NetworkManager},
};

/// NetworkManager bond configuration template
@@ -1,5 +1,5 @@
use async_trait::async_trait;
use log::{debug, info, trace};
use log::{debug, info};
use serde::Serialize;
use std::path::PathBuf;
@@ -1,4 +1,5 @@
use async_trait::async_trait;
use harmony_k8s::K8sClient;
use harmony_macros::hurl;
use log::{debug, info, trace, warn};
use non_blank_string_rs::NonBlankString;
@@ -14,7 +15,7 @@ use crate::{
        helm::chart::{HelmChartScore, HelmRepository},
    },
    score::Score,
    topology::{HelmCommand, K8sclient, Topology, ingress::Ingress, k8s::K8sClient},
    topology::{HelmCommand, K8sclient, Topology, ingress::Ingress},
};
use harmony_types::id::Id;
@@ -9,8 +9,8 @@ use crate::modules::monitoring::prometheus::Prometheus;
use crate::modules::monitoring::prometheus::helm::prometheus_config::PrometheusConfig;
use crate::topology::MultiTargetTopology;
use crate::topology::ingress::Ingress;
use crate::topology::monitoring::AlertReceiver;
use crate::topology::monitoring::Observability;
use crate::topology::monitoring::{AlertReceiver, AlertRoute};
use crate::{
    inventory::Inventory,
    modules::monitoring::{
@@ -30,6 +30,7 @@ use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use std::sync::{Arc, Mutex};

// TODO: test this
#[derive(Debug, Clone)]
pub struct Monitoring {
    pub application: Arc<dyn Application>,
@@ -79,7 +80,7 @@ impl<
        };
        let ntfy = NtfyScore {
            namespace: namespace.clone(),
            host: domain,
            host: domain.clone(),
        };
        ntfy.interpret(&Inventory::empty(), topology)
            .await
@@ -100,26 +101,33 @@ impl<

        debug!("ntfy_default_auth_param: {ntfy_default_auth_param}");

        debug!("ntfy_default_auth_param: {ntfy_default_auth_param}");
        let ntfy_receiver = WebhookReceiver {
            name: "ntfy-webhook".to_string(),
            url: Url::Url(
                url::Url::parse(
                    format!(
                        "http://ntfy.{}.svc.cluster.local/rust-web-app?auth={ntfy_default_auth_param}",
                        namespace.clone()
                        "http://{domain}/{}?auth={ntfy_default_auth_param}",
                        __self.application.name()
                    )
                    .as_str(),
                )
                .unwrap(),
            ),
            route: AlertRoute {
                ..AlertRoute::default("ntfy-webhook".to_string())
            },
        };

        todo!();
        // alerting_score.receivers.push(Box::new(ntfy_receiver));
        // alerting_score
        //     .interpret(&Inventory::empty(), topology)
        //     .await
        //     .map_err(|e| e.to_string())?;
        debug!(
            "ntfy webhook receiver \n{:#?}\nntfy topic: {}",
            ntfy_receiver.clone(),
            self.application.name()
        );
        alerting_score.receivers.push(Box::new(ntfy_receiver));
        alerting_score
            .interpret(&Inventory::empty(), topology)
            .await
            .map_err(|e| e.to_string())?;

        Ok(InstallationOutcome::success())
    }
@@ -8,8 +8,8 @@ use crate::modules::monitoring::red_hat_cluster_observability::RedHatClusterObse
use crate::modules::monitoring::red_hat_cluster_observability::redhat_cluster_observability::RedHatClusterObservabilityScore;
use crate::topology::MultiTargetTopology;
use crate::topology::ingress::Ingress;
use crate::topology::monitoring::AlertReceiver;
use crate::topology::monitoring::Observability;
use crate::topology::monitoring::{AlertReceiver, AlertRoute};
use crate::{
    inventory::Inventory,
    modules::monitoring::{
@@ -97,12 +97,15 @@ impl<
                url::Url::parse(
                    format!(
                        "http://{domain}/{}?auth={ntfy_default_auth_param}",
                        self.application.name()
                        __self.application.name()
                    )
                    .as_str(),
                )
                .unwrap(),
            ),
            route: AlertRoute {
                ..AlertRoute::default("ntfy-webhook".to_string())
            },
        };
        debug!(
            "ntfy webhook receiver \n{:#?}\nntfy topic: {}",
@@ -1,8 +1,9 @@
use std::sync::Arc;

use harmony_k8s::K8sClient;
use log::{debug, info};

use crate::{interpret::InterpretError, topology::k8s::K8sClient};
use crate::interpret::InterpretError;

#[derive(Clone, Debug, PartialEq, Eq)]
pub enum ArgoScope {
@@ -44,6 +44,12 @@ pub struct BrocadeSwitchAuth {
    pub password: String,
}

impl BrocadeSwitchAuth {
    pub fn user_pass(username: String, password: String) -> Self {
        Self { username, password }
    }
}

#[derive(Secret, Clone, Debug, JsonSchema, Serialize, Deserialize)]
pub struct BrocadeSnmpAuth {
    pub username: String,
@@ -1,3 +1,4 @@
use harmony_k8s::K8sClient;
use std::sync::Arc;

use async_trait::async_trait;
@@ -11,7 +12,7 @@ use crate::{
    interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome},
    inventory::Inventory,
    score::Score,
    topology::{K8sclient, Topology, k8s::K8sClient},
    topology::{K8sclient, Topology},
};

#[derive(Clone, Debug, Serialize)]

@@ -54,6 +54,12 @@ pub enum HarmonyDiscoveryStrategy {
    SUBNET { cidr: cidr::Ipv4Cidr, port: u16 },
}

impl Default for HarmonyDiscoveryStrategy {
    fn default() -> Self {
        HarmonyDiscoveryStrategy::MDNS
    }
}

#[async_trait]
impl<T: Topology> Interpret<T> for DiscoverInventoryAgentInterpret {
    async fn execute(
@@ -3,7 +3,8 @@ use std::sync::Arc;
use async_trait::async_trait;
use log::warn;

use crate::topology::{FailoverTopology, K8sclient, k8s::K8sClient};
use crate::topology::{FailoverTopology, K8sclient};
use harmony_k8s::K8sClient;

#[async_trait]
impl<T: K8sclient> K8sclient for FailoverTopology<T> {
@@ -1,5 +1,4 @@
use async_trait::async_trait;
use k8s_openapi::NamespaceResourceScope;
use kube::Resource;
use log::info;
use serde::{Serialize, de::DeserializeOwned};
@@ -9,7 +8,7 @@ use crate::{
    interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome},
    inventory::Inventory,
    score::Score,
    topology::{K8sclient, Topology, k8s::ApplyStrategy},
    topology::{K8sclient, Topology},
};
use harmony_types::id::Id;

@@ -42,7 +41,6 @@ impl<
> Score<T> for K8sResourceScore<K>
where
    <K as kube::Resource>::DynamicType: Default,
    <K as kube::Resource>::Scope: ApplyStrategy<K>,
{
    fn create_interpret(&self) -> Box<dyn Interpret<T>> {
        Box::new(K8sResourceInterpret {
@@ -74,7 +72,6 @@ impl<
> Interpret<T> for K8sResourceInterpret<K>
where
    <K as kube::Resource>::DynamicType: Default,
    <K as kube::Resource>::Scope: ApplyStrategy<K>,
{
    async fn execute(
        &self,
@@ -111,7 +108,7 @@ where
        topology
            .k8s_client()
            .await
            .expect("Environment should provide enough information to instanciate a client")
            .map_err(|e| InterpretError::new(format!("Failed to get k8s client : {e}")))?
            .apply_many(&self.score.resource, self.score.namespace.as_deref())
            .await?;
@@ -15,9 +15,12 @@ pub mod load_balancer;
pub mod monitoring;
pub mod nats;
pub mod network;
pub mod node_health;
pub mod okd;
pub mod openbao;
pub mod opnsense;
pub mod postgresql;
pub mod storage;
pub mod tenant;
pub mod tftp;
pub mod zitadel;
@@ -1,14 +1,12 @@
use std::collections::BTreeMap;

use crate::modules::monitoring::kube_prometheus::KubePrometheus;
use crate::modules::monitoring::okd::OpenshiftClusterAlertSender;
use crate::modules::monitoring::red_hat_cluster_observability::RedHatClusterObservability;
use crate::topology::monitoring::{AlertRoute, MatchOp};
use crate::topology::monitoring::{AlertRoute, InstallOperation, ReceiverInstallPlan};
use crate::{interpret::InterpretError, topology::monitoring::AlertReceiver};
use harmony_types::net::Url;
use k8s_openapi::api::core::v1::Secret;
use serde::Serialize;
use serde_json::json;
use std::collections::BTreeMap;

#[derive(Debug, Clone, Serialize)]
pub struct DiscordReceiver {
@@ -18,27 +16,7 @@ pub struct DiscordReceiver {
}

impl AlertReceiver<OpenshiftClusterAlertSender> for DiscordReceiver {
    fn build_route(&self) -> Result<serde_yaml::Value, InterpretError> {
        let matchers: Vec<String> = self
            .route
            .matchers
            .iter()
            .map(|m| match m.operator {
                MatchOp::Eq => format!("{} = {}", m.label, m.value),
                MatchOp::NotEq => format!("{} != {}", m.label, m.value),
                MatchOp::Regex => format!("{} =~ {}", m.label, m.value),
            })
            .collect();

        let route_block = serde_yaml::to_value(json!({
            "receiver": self.name,
            "matchers": matchers,
        }))
        .unwrap();
        Ok(route_block)
    }

    fn build_receiver(&self) -> Result<serde_yaml::Value, InterpretError> {
    fn build(&self) -> Result<ReceiverInstallPlan, InterpretError> {
        let receiver_block = serde_yaml::to_value(json!({
            "name": self.name,
            "discord_configs": [{
@@ -48,7 +26,12 @@ impl AlertReceiver<OpenshiftClusterAlertSender> for DiscordReceiver {
            }]
        }))
        .map_err(|e| InterpretError::new(e.to_string()))?;
        Ok(receiver_block)

        Ok(ReceiverInstallPlan {
            install_operation: None,
            route: Some(self.route.clone()),
            receiver: Some(receiver_block),
        })
    }

    fn name(&self) -> String {
@@ -61,29 +44,13 @@ impl AlertReceiver<OpenshiftClusterAlertSender> for DiscordReceiver {
}

impl AlertReceiver<RedHatClusterObservability> for DiscordReceiver {
    fn build_route(&self) -> Result<serde_yaml::Value, InterpretError> {
        serde_yaml::to_value(&self.route).map_err(|e| InterpretError::new(e.to_string()))
    }

    fn build_receiver(&self) -> Result<serde_yaml::Value, InterpretError> {
        //FIXME this secret needs to be applied so that the discord Configs for RedHatCO
        //CRD AlertmanagerConfigs can access the URL
    fn build(&self) -> Result<ReceiverInstallPlan, InterpretError> {
        let secret_name = format!("{}-secret", self.name.clone());
        let webhook_key = format!("{}", self.url.clone());

        let mut string_data = BTreeMap::new();
        string_data.insert("webhook-url".to_string(), webhook_key.clone());

        let secret = Secret {
            metadata: kube::core::ObjectMeta {
                name: Some(secret_name.clone()),
                ..Default::default()
            },
            string_data: Some(string_data),
            type_: Some("Opaque".to_string()),
            ..Default::default()
        };

        let receiver_config = json!({
            "name": self.name,
            "discordConfigs": [
@@ -97,7 +64,19 @@ impl AlertReceiver<RedHatClusterObservability> for DiscordReceiver {
            }
        ]
        });
        serde_yaml::to_value(receiver_config).map_err(|e| InterpretError::new(e.to_string()))

        Ok(ReceiverInstallPlan {
            install_operation: Some(vec![InstallOperation::CreateSecret {
                name: secret_name,
                data: string_data,
            }]),
            route: Some(self.route.clone()),
            receiver: Some(
                serde_yaml::to_value(receiver_config)
                    .map_err(|e| InterpretError::new(e.to_string()))
                    .expect("failed to build yaml value"),
            ),
        })
    }

    fn name(&self) -> String {
@@ -110,11 +89,7 @@ impl AlertReceiver<RedHatClusterObservability> for DiscordReceiver {
}

impl AlertReceiver<KubePrometheus> for DiscordReceiver {
    fn build_route(&self) -> Result<serde_yaml::Value, InterpretError> {
        serde_yaml::to_value(self.route.clone()).map_err(|e| InterpretError::new(e.to_string()))
    }

    fn build_receiver(&self) -> Result<serde_yaml::Value, InterpretError> {
    fn build(&self) -> Result<ReceiverInstallPlan, InterpretError> {
        let receiver_block = serde_yaml::to_value(json!({
            "name": self.name,
            "discord_configs": [{
@@ -124,7 +99,12 @@ impl AlertReceiver<KubePrometheus> for DiscordReceiver {
            }]
        }))
        .map_err(|e| InterpretError::new(e.to_string()))?;
        Ok(receiver_block)

        Ok(ReceiverInstallPlan {
            install_operation: None,
            route: Some(self.route.clone()),
            receiver: Some(receiver_block),
        })
    }

    fn name(&self) -> String {
@@ -7,7 +7,7 @@ use crate::{
        kube_prometheus::KubePrometheus, okd::OpenshiftClusterAlertSender, prometheus::Prometheus,
        red_hat_cluster_observability::RedHatClusterObservability,
    },
    topology::monitoring::AlertReceiver,
    topology::monitoring::{AlertReceiver, AlertRoute, ReceiverInstallPlan},
};
use harmony_types::net::Url;

@@ -15,6 +15,7 @@ use harmony_types::net::Url;
pub struct WebhookReceiver {
    pub name: String,
    pub url: Url,
    pub route: AlertRoute,
}

impl WebhookReceiver {
@@ -40,16 +41,6 @@ impl WebhookReceiver {
}

impl AlertReceiver<OpenshiftClusterAlertSender> for WebhookReceiver {
    fn build_receiver(&self) -> Result<serde_yaml::Value, InterpretError> {
        let receiver = self.build_receiver();
        serde_yaml::to_value(receiver).map_err(|e| InterpretError::new(e.to_string()))
    }

    fn build_route(&self) -> Result<serde_yaml::Value, InterpretError> {
        let route = self.build_route();
        serde_yaml::to_value(route).map_err(|e| InterpretError::new(e.to_string()))
    }

    fn name(&self) -> String {
        self.name.clone()
    }
@@ -57,19 +48,21 @@ impl AlertReceiver<OpenshiftClusterAlertSender> for WebhookReceiver {
    fn clone_box(&self) -> Box<dyn AlertReceiver<OpenshiftClusterAlertSender>> {
        Box::new(self.clone())
    }

    fn build(&self) -> Result<crate::topology::monitoring::ReceiverInstallPlan, InterpretError> {
        let receiver = self.build_receiver();
        let receiver =
            serde_yaml::to_value(receiver).map_err(|e| InterpretError::new(e.to_string()))?;

        Ok(ReceiverInstallPlan {
            install_operation: None,
            route: Some(self.route.clone()),
            receiver: Some(receiver),
        })
    }
}

impl AlertReceiver<RedHatClusterObservability> for WebhookReceiver {
    fn build_receiver(&self) -> Result<serde_yaml::Value, InterpretError> {
        let receiver = self.build_receiver();
        serde_yaml::to_value(receiver).map_err(|e| InterpretError::new(e.to_string()))
    }

    fn build_route(&self) -> Result<serde_yaml::Value, InterpretError> {
        let route = self.build_route();
        serde_yaml::to_value(route).map_err(|e| InterpretError::new(e.to_string()))
    }

    fn name(&self) -> String {
        self.name.clone()
    }
@@ -77,19 +70,21 @@ impl AlertReceiver<RedHatClusterObservability> for WebhookReceiver {
    fn clone_box(&self) -> Box<dyn AlertReceiver<RedHatClusterObservability>> {
        Box::new(self.clone())
    }

    fn build(&self) -> Result<crate::topology::monitoring::ReceiverInstallPlan, InterpretError> {
        let receiver = self.build_receiver();
        let receiver =
            serde_yaml::to_value(receiver).map_err(|e| InterpretError::new(e.to_string()))?;

        Ok(ReceiverInstallPlan {
            install_operation: None,
            route: Some(self.route.clone()),
            receiver: Some(receiver),
        })
    }
}

impl AlertReceiver<KubePrometheus> for WebhookReceiver {
    fn build_receiver(&self) -> Result<serde_yaml::Value, InterpretError> {
        let receiver = self.build_receiver();
        serde_yaml::to_value(receiver).map_err(|e| InterpretError::new(e.to_string()))
    }

    fn build_route(&self) -> Result<serde_yaml::Value, InterpretError> {
        let route = self.build_route();
        serde_yaml::to_value(route).map_err(|e| InterpretError::new(e.to_string()))
    }

    fn name(&self) -> String {
        self.name.clone()
    }
@@ -97,19 +92,21 @@ impl AlertReceiver<KubePrometheus> for WebhookReceiver {
    fn clone_box(&self) -> Box<dyn AlertReceiver<KubePrometheus>> {
        Box::new(self.clone())
    }

    fn build(&self) -> Result<crate::topology::monitoring::ReceiverInstallPlan, InterpretError> {
        let receiver = self.build_receiver();
        let receiver =
            serde_yaml::to_value(receiver).map_err(|e| InterpretError::new(e.to_string()))?;

        Ok(ReceiverInstallPlan {
            install_operation: None,
            route: Some(self.route.clone()),
            receiver: Some(receiver),
        })
    }
}

impl AlertReceiver<Prometheus> for WebhookReceiver {
    fn build_receiver(&self) -> Result<serde_yaml::Value, InterpretError> {
        let receiver = self.build_receiver();
        serde_yaml::to_value(receiver).map_err(|e| InterpretError::new(e.to_string()))
    }

    fn build_route(&self) -> Result<serde_yaml::Value, InterpretError> {
        let route = self.build_route();
        serde_yaml::to_value(route).map_err(|e| InterpretError::new(e.to_string()))
    }

    fn name(&self) -> String {
        self.name.clone()
    }
@@ -117,4 +114,16 @@ impl AlertReceiver<Prometheus> for WebhookReceiver {
    fn clone_box(&self) -> Box<dyn AlertReceiver<Prometheus>> {
        Box::new(self.clone())
    }

    fn build(&self) -> Result<crate::topology::monitoring::ReceiverInstallPlan, InterpretError> {
        let receiver = self.build_receiver();
        let receiver =
            serde_yaml::to_value(receiver).map_err(|e| InterpretError::new(e.to_string()))?;

        Ok(ReceiverInstallPlan {
            install_operation: None,
            route: Some(self.route.clone()),
            receiver: Some(receiver),
        })
    }
}
@@ -1,13 +1,6 @@
use async_trait::async_trait;
use serde::Serialize;

use crate::{
    inventory::Inventory,
    topology::{
        PreparationError, PreparationOutcome,
        monitoring::{AlertReceiver, AlertRule, AlertSender, ScrapeTarget},
    },
};
use crate::topology::monitoring::{AlertReceiver, AlertRule, AlertSender, ScrapeTarget};

#[derive(Debug, Clone, Serialize)]
pub struct Grafana {

@@ -4,7 +4,7 @@ use crate::{
    modules::monitoring::grafana::grafana::Grafana,
    score::Score,
    topology::{
        HelmCommand, Topology,
        Topology,
        monitoring::{AlertReceiver, AlertRule, AlertingInterpret, Observability, ScrapeTarget},
    },
};
@@ -1,10 +1,14 @@
use async_trait::async_trait;
use harmony_types::id::Id;
use serde::Serialize;

use crate::{
    interpret::Interpret,
    data::Version,
    interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome},
    inventory::Inventory,
    modules::monitoring::grafana::grafana::Grafana,
    score::Score,
    topology::{K8sclient, Topology},
    topology::{HelmCommand, K8sclient, Topology},
};

#[derive(Debug, Clone, Serialize)]
@@ -12,12 +16,42 @@ pub struct GrafanaK8sInstallScore {
    pub sender: Grafana,
}

impl<T: Topology + K8sclient> Score<T> for GrafanaK8sInstallScore {
impl<T: Topology + K8sclient + HelmCommand> Score<T> for GrafanaK8sInstallScore {
    fn name(&self) -> String {
        "GrafanaK8sEnsureReadyScore".to_string()
    }

    fn create_interpret(&self) -> Box<dyn Interpret<T>> {
        Box::new(GrafanaK8sInstallInterpret {})
    }
}

#[derive(Debug, Clone, Serialize)]
pub struct GrafanaK8sInstallInterpret {}

#[async_trait]
impl<T: Topology + K8sclient + HelmCommand> Interpret<T> for GrafanaK8sInstallInterpret {
    async fn execute(
        &self,
        inventory: &Inventory,
        topology: &T,
    ) -> Result<Outcome, InterpretError> {
        todo!()
    }

    fn get_name(&self) -> InterpretName {
        InterpretName::Custom("GrafanaK8sInstallInterpret")
    }

    fn get_version(&self) -> Version {
        todo!()
    }

    fn get_status(&self) -> InterpretStatus {
        todo!()
    }

    fn get_children(&self) -> Vec<Id> {
        todo!()
    }
}
@@ -28,11 +28,16 @@ impl<T: Topology + K8sclient> Score<T> for KubePrometheusReceiverScore {
    fn create_interpret(&self) -> Box<dyn Interpret<T>> {
        let name = self.receiver.name();
        let namespace = self.sender.config.lock().unwrap().namespace.clone();
        let route = self.receiver.build_route().expect(&format!(
        let install_plan = self.receiver.build().expect("failed to build install plan");

        let route = install_plan.route.expect(&format!(
            "failed to build route for receiver {}",
            name.clone()
        ));
        let receiver = self.receiver.build_receiver().expect(&format!(

        let route = serde_yaml::to_value(route).expect("failed to serialize route object to yaml");

        let receiver = install_plan.receiver.expect(&format!(
            "failed to build receiver path for receiver {}",
            name.clone()
        ));
@@ -3,7 +3,7 @@ use std::collections::{BTreeMap, HashMap};
use async_trait::async_trait;
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use serde_yaml::{Mapping, Sequence, Value};
use serde_yaml::Value;

use crate::modules::monitoring::alert_rule::prometheus_alert_rule::AlertManagerRuleGroup;

@@ -11,8 +11,9 @@ use crate::{
    inventory::Inventory,
    modules::monitoring::ntfy::helm::ntfy_helm_chart::ntfy_helm_chart_score,
    score::Score,
    topology::{HelmCommand, K8sclient, MultiTargetTopology, Topology, k8s::K8sClient},
    topology::{HelmCommand, K8sclient, MultiTargetTopology, Topology},
};
use harmony_k8s::K8sClient;
use harmony_types::id::Id;

#[derive(Debug, Clone, Serialize)]
88 harmony/src/modules/monitoring/okd/config.rs Normal file
@@ -0,0 +1,88 @@
use std::{collections::BTreeMap, sync::Arc};

use crate::interpret::{InterpretError, Outcome};
use harmony_k8s::K8sClient;
use k8s_openapi::api::core::v1::ConfigMap;
use kube::api::ObjectMeta;

pub(crate) struct Config;

impl Config {
    pub async fn create_cluster_monitoring_config_cm(
        client: &Arc<K8sClient>,
    ) -> Result<Outcome, InterpretError> {
        let mut data = BTreeMap::new();
        data.insert(
            "config.yaml".to_string(),
            r#"
enableUserWorkload: true
alertmanagerMain:
  enableUserAlertmanagerConfig: true
"#
            .to_string(),
        );

        let cm = ConfigMap {
            metadata: ObjectMeta {
                name: Some("cluster-monitoring-config".to_string()),
                namespace: Some("openshift-monitoring".to_string()),
                ..Default::default()
            },
            data: Some(data),
            ..Default::default()
        };
        client.apply(&cm, Some("openshift-monitoring")).await?;

        Ok(Outcome::success(
            "updated cluster-monitoring-config-map".to_string(),
        ))
    }

    pub async fn create_user_workload_monitoring_config_cm(
        client: &Arc<K8sClient>,
    ) -> Result<Outcome, InterpretError> {
        let mut data = BTreeMap::new();
        data.insert(
            "config.yaml".to_string(),
            r#"
alertmanager:
  enabled: true
  enableAlertmanagerConfig: true
"#
            .to_string(),
        );
        let cm = ConfigMap {
            metadata: ObjectMeta {
                name: Some("user-workload-monitoring-config".to_string()),
                namespace: Some("openshift-user-workload-monitoring".to_string()),
                ..Default::default()
            },
            data: Some(data),
            ..Default::default()
        };
        client
            .apply(&cm, Some("openshift-user-workload-monitoring"))
            .await?;

        Ok(Outcome::success(
            "updated openshift-user-monitoring-config-map".to_string(),
        ))
    }

    pub async fn verify_user_workload(client: &Arc<K8sClient>) -> Result<Outcome, InterpretError> {
        let namespace = "openshift-user-workload-monitoring";
        let alertmanager_name = "alertmanager-user-workload-0";
        let prometheus_name = "prometheus-user-workload-0";
        client
            .wait_for_pod_ready(alertmanager_name, Some(namespace))
            .await?;
        client
            .wait_for_pod_ready(prometheus_name, Some(namespace))
            .await?;

        Ok(Outcome::success(format!(
            "pods: {}, {} ready in ns: {}",
            alertmanager_name, prometheus_name, namespace
        )))
    }
}
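
// NOTE: hedged sketch (not part of the diff) of the intended call sequence for
// `Config`, with error handling elided. This mirrors the documented OpenShift
// flow: enable user workload monitoring, then wait for its Alertmanager and
// Prometheus pods to come up.
//
//     Config::create_cluster_monitoring_config_cm(&client).await?;
//     Config::create_user_workload_monitoring_config_cm(&client).await?;
//     Config::verify_user_workload(&client).await?;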
@@ -1,6 +1,7 @@
use std::{collections::BTreeMap, sync::Arc};

use async_trait::async_trait;
use harmony_k8s::K8sClient;
use harmony_types::id::Id;
use k8s_openapi::api::core::v1::ConfigMap;
use kube::api::{GroupVersionKind, ObjectMeta};
@@ -13,7 +14,7 @@ use crate::{
    inventory::Inventory,
    modules::k8s::resource::K8sResourceScore,
    score::Score,
    topology::{K8sclient, Topology, k8s::K8sClient},
    topology::{K8sclient, Topology},
};

#[derive(Clone, Debug, Serialize)]
@@ -10,7 +10,10 @@ use crate::{
    inventory::Inventory,
    modules::monitoring::okd::OpenshiftClusterAlertSender,
    score::Score,
-   topology::{K8sclient, Topology, monitoring::AlertReceiver},
+   topology::{
+       K8sclient, Topology,
+       monitoring::{AlertReceiver, AlertRoute, MatchOp},
+   },
};

#[derive(Debug, Clone, Serialize)]
@@ -69,8 +72,16 @@ impl<T: Topology + K8sclient> Interpret<T> for OpenshiftReceiverInterpret {
            .unwrap_or_else(|_| serde_yaml::Value::Mapping(serde_yaml::Mapping::new()));

        let name = self.receiver.name();
-       let receiver = self.receiver.build_receiver()?;
-       let route = self.receiver.build_route().unwrap();
+       let install_plan = self.receiver.build().expect("failed to build install plan");
+       let receiver = install_plan.receiver.expect("unable to find receiver path");
+
+       let alert_route = install_plan
+           .route
+           .ok_or_else(|| InterpretError::new("missing route".into()))?;
+
+       let route = self.serialize_route(&alert_route);
+
+       let route = serde_yaml::to_value(route).map_err(|e| InterpretError::new(e.to_string()))?;

        if am_config.get("receivers").is_none() {
            am_config["receivers"] = serde_yaml::Value::Sequence(vec![]);
@@ -142,3 +153,61 @@ impl<T: Topology + K8sclient> Interpret<T> for OpenshiftReceiverInterpret {
        todo!()
    }
}
impl OpenshiftReceiverInterpret {
    fn serialize_route(&self, route: &AlertRoute) -> serde_yaml::Value {
        // Convert matchers
        let matchers: Vec<String> = route
            .matchers
            .iter()
            .map(|m| match m.operator {
                MatchOp::Eq => format!("{} = {}", m.label, m.value),
                MatchOp::NotEq => format!("{} != {}", m.label, m.value),
                MatchOp::Regex => format!("{} =~ {}", m.label, m.value),
            })
            .collect();

        // Recursively convert children routes
        let children: Vec<serde_yaml::Value> = route
            .children
            .iter()
            .map(|c| self.serialize_route(c))
            .collect();

        // Build the YAML object for this route
        let mut route_map = serde_yaml::Mapping::new();
        route_map.insert(
            serde_yaml::Value::String("receiver".to_string()),
            serde_yaml::Value::String(route.receiver.clone()),
        );
        if !matchers.is_empty() {
            route_map.insert(
                serde_yaml::Value::String("matchers".to_string()),
                serde_yaml::to_value(matchers).unwrap(),
            );
        }
        if !route.group_by.is_empty() {
            route_map.insert(
                serde_yaml::Value::String("group_by".to_string()),
                serde_yaml::to_value(route.group_by.clone()).unwrap(),
            );
        }
        if let Some(ref interval) = route.repeat_interval {
            route_map.insert(
                serde_yaml::Value::String("repeat_interval".to_string()),
                serde_yaml::Value::String(interval.clone()),
            );
        }
        route_map.insert(
            serde_yaml::Value::String("continue".to_string()),
            serde_yaml::Value::Bool(route.continue_matching),
        );
        if !children.is_empty() {
            route_map.insert(
                serde_yaml::Value::String("routes".to_string()),
                serde_yaml::Value::Sequence(children),
            );
        }

        serde_yaml::Value::Mapping(route_map)
    }
}
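For reference, this is the kind of Alertmanager YAML `serialize_route` produces for a single-level route. The field values are illustrative, and the `Matcher` struct name is an assumption (only its `label`, `operator`, and `value` fields are visible in the diff):

```rust
// Hypothetical input, names inferred from serialize_route above.
let route = AlertRoute {
    receiver: "ntfy".to_string(),
    matchers: vec![Matcher {
        label: "severity".to_string(),
        operator: MatchOp::Eq,
        value: "critical".to_string(),
    }],
    group_by: vec!["alertname".to_string()],
    repeat_interval: Some("4h".to_string()),
    continue_matching: false,
    children: vec![],
};
// serialize_route(&route) then yields YAML equivalent to:
//   receiver: ntfy
//   matchers:
//     - severity = critical
//   group_by:
//     - alertname
//   repeat_interval: 4h
//   continue: false
```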
@@ -6,9 +6,10 @@ use crate::{
    inventory::Inventory,
    modules::k8s::resource::K8sResourceScore,
    score::Score,
-   topology::{K8sclient, Topology, k8s::K8sClient},
+   topology::{K8sclient, Topology},
};
use async_trait::async_trait;
use harmony_k8s::K8sClient;
use harmony_types::id::Id;
use k8s_openapi::api::core::v1::ConfigMap;
use kube::api::{GroupVersionKind, ObjectMeta};
@@ -30,15 +30,16 @@ impl<T: Topology + K8sclient> Score<T> for PrometheusReceiverScore {
    fn create_interpret(&self) -> Box<dyn Interpret<T>> {
        let name = self.receiver.name();
        let namespace = self.sender.config.lock().unwrap().namespace.clone();
-       let route = self.receiver.build_route().expect(&format!(
-           "failed to build route for receveiver {}",
-           name.clone()
-       ));
-
-       let receiver = self.receiver.build_receiver().expect(&format!(
-           "failed to build receiver path for receiver {}",
-           name.clone()
-       ));
+       let install_plan = self.receiver.build().expect("failed to build install plan");
+
+       let route = install_plan.route;
+
+       let route = serde_json::to_value(route).expect("failed to serialize to json");
+
+       let receiver = install_plan
+           .receiver
+           .expect("failed to find receiver mapping");

        let data = serde_json::json!({
            "route": route,
@@ -1,5 +1,6 @@
use async_trait::async_trait;
use harmony_types::id::Id;
use k8s_openapi::api::core::v1::Secret;
use kube::api::ObjectMeta;
use serde::Serialize;

@@ -15,7 +16,10 @@ use crate::{
        },
    },
    score::Score,
-   topology::{K8sclient, Topology, monitoring::AlertReceiver},
+   topology::{
+       K8sclient, Topology,
+       monitoring::{AlertReceiver, InstallOperation},
+   },
};

#[derive(Debug, Clone, Serialize)]
@@ -51,9 +55,36 @@ impl<T: Topology + K8sclient> Interpret<T> for RedHatClusterObservabilityReceive
        topology: &T,
    ) -> Result<Outcome, InterpretError> {
        let name = self.receiver.name();
-       let route = self.receiver.build_route()?;
-       let receiver = self.receiver.build_receiver()?;
        let namespace = self.sender.namespace.clone();

        let client = topology.k8s_client().await?;
+       let install_plan = self.receiver.build()?;
+
+       let install_operation = install_plan.install_operation.unwrap_or_default();
+
+       for operation in install_operation {
+           match operation {
+               InstallOperation::CreateSecret { name, data } => {
+                   let secret = Secret {
+                       metadata: ObjectMeta {
+                           name: Some(name),
+                           ..Default::default()
+                       },
+                       string_data: Some(data),
+                       type_: Some("Opaque".to_string()),
+                       ..Default::default()
+                   };
+
+                   client.apply(&secret, Some(&namespace)).await?;
+               }
+           }
+       }
+
+       let route = install_plan.route.unwrap();
+       let route = serde_json::to_value(route)
+           .map_err(|e| InterpretError::new(e.to_string()))
+           .expect("failed to serialize alert route");
+       let receiver = install_plan.receiver.unwrap();
        let data = serde_json::json!({
            "route": route,
            "receivers": [receiver]
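All three receiver interprets in this change now consume a single `build()` result instead of separate `build_route()`/`build_receiver()` calls. The diff never shows the plan type itself; the following is a hypothetical sketch of its shape, inferred purely from the fields accessed above (`receiver`, `route`, `install_operation` — only `InstallOperation` is confirmed by the imports):

```rust
// Hypothetical: the InstallPlan name and field types are inferred from
// usage in this diff, not confirmed by the source.
pub struct InstallPlan {
    // Receiver mapping merged into the Alertmanager config.
    pub receiver: Option<serde_json::Value>,
    // Routing tree pointing at that receiver.
    pub route: Option<AlertRoute>,
    // Side effects to run first, e.g. credentials secrets
    // (InstallOperation::CreateSecret above).
    pub install_operation: Option<Vec<InstallOperation>>,
}
```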
@@ -27,7 +27,7 @@ impl Default for PrometheusNodeExporter {
            metrics_path: "/metrics".into(),
            scrape_interval: None,
            scrape_timeout: None,
-           listen_address: ip!("127.0.0.1"),
+           listen_address: ip!("192.168.1.1"),
            port: 9100,
            labels: None,
        }
@@ -1,6 +1,7 @@
use std::{collections::BTreeMap, str::FromStr};

use async_trait::async_trait;
use harmony_k8s::KubernetesDistribution;
use harmony_macros::hurl;
use harmony_secret::{Secret, SecretManager};
use harmony_types::id::Id;
@@ -25,7 +26,7 @@ use crate::{
        },
    },
    score::Score,
-   topology::{HelmCommand, K8sclient, KubernetesDistribution, TlsRouter, Topology},
+   topology::{HelmCommand, K8sclient, TlsRouter, Topology},
};

#[derive(Debug, Clone, Serialize)]
260
harmony/src/modules/node_health/mod.rs
Normal file
@@ -0,0 +1,260 @@
use async_trait::async_trait;
use harmony_types::id::Id;
use k8s_openapi::api::{
    apps::v1::{DaemonSet, DaemonSetSpec},
    core::v1::{
        Container, ContainerPort, EnvVar, EnvVarSource, Namespace, ObjectFieldSelector, PodSpec,
        PodTemplateSpec, ResourceRequirements, ServiceAccount, Toleration,
    },
    rbac::v1::{ClusterRole, ClusterRoleBinding, PolicyRule, Role, RoleBinding, RoleRef, Subject},
};
use k8s_openapi::apimachinery::pkg::api::resource::Quantity;
use k8s_openapi::apimachinery::pkg::apis::meta::v1::LabelSelector;
use kube::api::ObjectMeta;
use serde::Serialize;
use std::collections::BTreeMap;

use crate::{
    data::Version,
    interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome},
    inventory::Inventory,
    modules::k8s::resource::K8sResourceScore,
    score::Score,
    topology::{K8sclient, Topology},
};

#[derive(Clone, Debug, Serialize)]
pub struct NodeHealthScore {}

impl<T: Topology + K8sclient> Score<T> for NodeHealthScore {
    fn name(&self) -> String {
        format!("NodeHealthScore")
    }

    #[doc(hidden)]
    fn create_interpret(&self) -> Box<dyn Interpret<T>> {
        Box::new(NodeHealthInterpret {})
    }
}

#[derive(Debug, Clone)]
pub struct NodeHealthInterpret {}

#[async_trait]
impl<T: Topology + K8sclient> Interpret<T> for NodeHealthInterpret {
    async fn execute(
        &self,
        inventory: &Inventory,
        topology: &T,
    ) -> Result<Outcome, InterpretError> {
        let namespace_name = "harmony-node-healthcheck".to_string();

        // Namespace
        let mut labels = BTreeMap::new();
        labels.insert("name".to_string(), namespace_name.clone());

        let namespace = Namespace {
            metadata: ObjectMeta {
                name: Some(namespace_name.clone()),
                labels: Some(labels),
                ..ObjectMeta::default()
            },
            ..Namespace::default()
        };

        // ServiceAccount
        let service_account_name = "node-healthcheck-sa".to_string();
        let service_account = ServiceAccount {
            metadata: ObjectMeta {
                name: Some(service_account_name.clone()),
                namespace: Some(namespace_name.clone()),
                ..ObjectMeta::default()
            },
            ..ServiceAccount::default()
        };

        // ClusterRole
        let cluster_role = ClusterRole {
            metadata: ObjectMeta {
                name: Some("node-healthcheck-role".to_string()),
                ..ObjectMeta::default()
            },
            rules: Some(vec![PolicyRule {
                api_groups: Some(vec!["".to_string()]),
                resources: Some(vec!["nodes".to_string()]),
                verbs: vec!["get".to_string(), "list".to_string()],
                ..PolicyRule::default()
            }]),
            ..ClusterRole::default()
        };

        // Role
        let role = Role {
            metadata: ObjectMeta {
                name: Some("allow-hostnetwork-scc".to_string()),
                namespace: Some(namespace_name.clone()),
                ..ObjectMeta::default()
            },
            rules: Some(vec![PolicyRule {
                api_groups: Some(vec!["security.openshift.io".to_string()]),
                resources: Some(vec!["securitycontextconstraints".to_string()]),
                resource_names: Some(vec!["hostnetwork".to_string()]),
                verbs: vec!["use".to_string()],
                ..PolicyRule::default()
            }]),
            ..Role::default()
        };

        // RoleBinding
        let role_binding = RoleBinding {
            metadata: ObjectMeta {
                name: Some("node-status-querier-scc-binding".to_string()),
                namespace: Some(namespace_name.clone()),
                ..ObjectMeta::default()
            },
            subjects: Some(vec![Subject {
                kind: "ServiceAccount".to_string(),
                name: service_account_name.clone(),
                namespace: Some(namespace_name.clone()),
                ..Subject::default()
            }]),
            role_ref: RoleRef {
                api_group: "rbac.authorization.k8s.io".to_string(),
                kind: "Role".to_string(),
                name: "allow-hostnetwork-scc".to_string(),
            },
        };

        // ClusterRoleBinding
        let cluster_role_binding = ClusterRoleBinding {
            metadata: ObjectMeta {
                name: Some("read-nodes-binding".to_string()),
                ..ObjectMeta::default()
            },
            subjects: Some(vec![Subject {
                kind: "ServiceAccount".to_string(),
                name: service_account_name.clone(),
                namespace: Some(namespace_name.clone()),
                ..Subject::default()
            }]),
            role_ref: RoleRef {
                api_group: "rbac.authorization.k8s.io".to_string(),
                kind: "ClusterRole".to_string(),
                name: "node-healthcheck-role".to_string(),
            },
        };

        // DaemonSet
        let mut daemonset_labels = BTreeMap::new();
        daemonset_labels.insert("app".to_string(), "node-healthcheck".to_string());

        let daemon_set = DaemonSet {
            metadata: ObjectMeta {
                name: Some("node-healthcheck".to_string()),
                namespace: Some(namespace_name.clone()),
                labels: Some(daemonset_labels.clone()),
                ..ObjectMeta::default()
            },
            spec: Some(DaemonSetSpec {
                selector: LabelSelector {
                    match_labels: Some(daemonset_labels.clone()),
                    ..LabelSelector::default()
                },
                template: PodTemplateSpec {
                    metadata: Some(ObjectMeta {
                        labels: Some(daemonset_labels),
                        ..ObjectMeta::default()
                    }),
                    spec: Some(PodSpec {
                        service_account_name: Some(service_account_name.clone()),
                        host_network: Some(true),
                        tolerations: Some(vec![Toleration {
                            operator: Some("Exists".to_string()),
                            ..Toleration::default()
                        }]),
                        containers: vec![Container {
                            name: "checker".to_string(),
                            image: Some(
                                "hub.nationtech.io/harmony/harmony-node-readiness-endpoint:latest"
                                    .to_string(),
                            ),
                            env: Some(vec![EnvVar {
                                name: "NODE_NAME".to_string(),
                                value_from: Some(EnvVarSource {
                                    field_ref: Some(ObjectFieldSelector {
                                        field_path: "spec.nodeName".to_string(),
                                        ..ObjectFieldSelector::default()
                                    }),
                                    ..EnvVarSource::default()
                                }),
                                ..EnvVar::default()
                            }]),
                            ports: Some(vec![ContainerPort {
                                container_port: 25001,
                                host_port: Some(25001),
                                name: Some("health-port".to_string()),
                                ..ContainerPort::default()
                            }]),
                            resources: Some(ResourceRequirements {
                                requests: Some({
                                    let mut requests = BTreeMap::new();
                                    requests.insert("cpu".to_string(), Quantity("10m".to_string()));
                                    requests
                                        .insert("memory".to_string(), Quantity("50Mi".to_string()));
                                    requests
                                }),
                                ..ResourceRequirements::default()
                            }),
                            ..Container::default()
                        }],
                        ..PodSpec::default()
                    }),
                },
                ..DaemonSetSpec::default()
            }),
            ..DaemonSet::default()
        };

        K8sResourceScore::single(namespace, None)
            .interpret(inventory, topology)
            .await?;
        K8sResourceScore::single(service_account, Some(namespace_name.clone()))
            .interpret(inventory, topology)
            .await?;
        K8sResourceScore::single(cluster_role, None)
            .interpret(inventory, topology)
            .await?;
        K8sResourceScore::single(role, Some(namespace_name.clone()))
            .interpret(inventory, topology)
            .await?;
        K8sResourceScore::single(role_binding, Some(namespace_name.clone()))
            .interpret(inventory, topology)
            .await?;
        K8sResourceScore::single(cluster_role_binding, None)
            .interpret(inventory, topology)
            .await?;
        K8sResourceScore::single(daemon_set, Some(namespace_name.clone()))
            .interpret(inventory, topology)
            .await?;

        Ok(Outcome::success(
            "Harmony node health successfully deployed".to_string(),
        ))
    }

    fn get_name(&self) -> InterpretName {
        InterpretName::Custom("NodeHealth")
    }

    fn get_version(&self) -> Version {
        todo!()
    }

    fn get_status(&self) -> InterpretStatus {
        todo!()
    }

    fn get_children(&self) -> Vec<Id> {
        todo!()
    }
}
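`NodeHealthScore` needs no configuration; the namespace, RBAC, and DaemonSet are all hardcoded above. A minimal sketch of running it, with `inventory` and a `Topology + K8sclient` topology assumed in scope as in the other interprets in this diff:

```rust
// Sketch only; `topology` and `inventory` come from the caller.
let interpret = NodeHealthInterpret {};
let _outcome = interpret.execute(&inventory, &topology).await?;
// On success: "Harmony node health successfully deployed"
```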
@@ -8,7 +8,7 @@ use crate::{
    score::Score,
    topology::{
        BackendServer, HAClusterTopology, HealthCheck, HttpMethod, HttpStatusCode, LoadBalancer,
-       LoadBalancerService, SSL, Topology,
+       LoadBalancerService, Router, SSL, Topology,
    },
};
@@ -23,17 +23,45 @@ pub struct OKDLoadBalancerScore {
    load_balancer_score: LoadBalancerScore,
}

/// OKD Load Balancer Score configuration
///
/// This module configures the load balancer for OKD (OpenShift Kubernetes Distribution)
/// bare metal installations.
///
/// # Backend Server Configuration
///
/// For ports 80 and 443 (ingress traffic), the load balancer includes both control plane
/// and worker nodes in the backend pool. This is consistent with OKD's requirement that
/// ingress traffic should be load balanced across all nodes that may run ingress router pods.
///
/// For ports 22623 (Ignition API) and 6443 (Kubernetes API), only control plane nodes
/// are included as backends, as these services are control plane specific.
///
/// # References
///
/// - [OKD Bare Metal Installation - External Load Balancer Configuration]
///   (<https://docs.okd.io/latest/installing/installing_bare_metal/ipi/ipi-install-installation-workflow.html#nw-osp-configuring-external-load-balancer_ipi-install-installation-workflow>)
///
/// # Example
///
/// ```ignore
/// use harmony::topology::HAClusterTopology;
/// use harmony::modules::okd::OKDLoadBalancerScore;
///
/// let topology: HAClusterTopology = /* get topology from your infrastructure */;
/// let score = OKDLoadBalancerScore::new(&topology);
/// ```
impl OKDLoadBalancerScore {
    pub fn new(topology: &HAClusterTopology) -> Self {
        let public_ip = topology.router.get_gateway();
        let public_services = vec![
            LoadBalancerService {
-               backend_servers: Self::control_plane_to_backend_server(topology, 80),
+               backend_servers: Self::nodes_to_backend_server(topology, 80),
                listening_port: SocketAddr::new(public_ip, 80),
                health_check: Some(HealthCheck::TCP(None)),
            },
            LoadBalancerService {
-               backend_servers: Self::control_plane_to_backend_server(topology, 443),
+               backend_servers: Self::nodes_to_backend_server(topology, 443),
                listening_port: SocketAddr::new(public_ip, 443),
                health_check: Some(HealthCheck::TCP(None)),
            },
@@ -41,12 +69,12 @@ impl OKDLoadBalancerScore {

        let private_services = vec![
            LoadBalancerService {
-               backend_servers: Self::control_plane_to_backend_server(topology, 80),
+               backend_servers: Self::nodes_to_backend_server(topology, 80),
                listening_port: SocketAddr::new(public_ip, 80),
                health_check: Some(HealthCheck::TCP(None)),
            },
            LoadBalancerService {
-               backend_servers: Self::control_plane_to_backend_server(topology, 443),
+               backend_servers: Self::nodes_to_backend_server(topology, 443),
                listening_port: SocketAddr::new(public_ip, 443),
                health_check: Some(HealthCheck::TCP(None)),
            },
@@ -74,6 +102,11 @@ impl OKDLoadBalancerScore {
        }
    }

    /// Creates backend servers list for control plane nodes only
    ///
    /// Use this for control plane-specific services like:
    /// - Port 22623: Ignition API (machine configuration during bootstrap)
    /// - Port 6443: Kubernetes API server
    fn control_plane_to_backend_server(
        topology: &HAClusterTopology,
        port: u16,
@@ -87,6 +120,216 @@ impl OKDLoadBalancerScore {
        })
        .collect()
    }

    /// Creates backend servers list for all nodes (control plane + workers)
    ///
    /// Use this for ingress traffic that should be distributed across all nodes:
    /// - Port 80: HTTP ingress traffic
    /// - Port 443: HTTPS ingress traffic
    ///
    /// In OKD, ingress router pods can run on any node, so both control plane
    /// and worker nodes should be included in the load balancer backend pool.
    fn nodes_to_backend_server(topology: &HAClusterTopology, port: u16) -> Vec<BackendServer> {
        let mut nodes = Vec::new();
        for cp in &topology.control_plane {
            nodes.push(BackendServer {
                address: cp.ip.to_string(),
                port,
            });
        }
        for worker in &topology.workers {
            nodes.push(BackendServer {
                address: worker.ip.to_string(),
                port,
            });
        }
        nodes
    }
}
#[cfg(test)]
mod tests {
    use std::sync::{Arc, OnceLock};

    use super::*;
    use crate::topology::{DummyInfra, LogicalHost};
    use harmony_macros::ip;
    use harmony_types::net::IpAddress;

    fn create_test_topology() -> HAClusterTopology {
        let router = Arc::new(DummyRouter {
            gateway: ip!("192.168.1.1"),
        });

        HAClusterTopology {
            domain_name: "test.example.com".to_string(),
            router,
            load_balancer: Arc::new(DummyInfra),
            firewall: Arc::new(DummyInfra),
            dhcp_server: Arc::new(DummyInfra),
            tftp_server: Arc::new(DummyInfra),
            http_server: Arc::new(DummyInfra),
            dns_server: Arc::new(DummyInfra),
            node_exporter: Arc::new(DummyInfra),
            switch_client: Arc::new(DummyInfra),
            bootstrap_host: LogicalHost {
                ip: ip!("192.168.1.100"),
                name: "bootstrap".to_string(),
            },
            control_plane: vec![
                LogicalHost {
                    ip: ip!("192.168.1.10"),
                    name: "control-plane-0".to_string(),
                },
                LogicalHost {
                    ip: ip!("192.168.1.11"),
                    name: "control-plane-1".to_string(),
                },
                LogicalHost {
                    ip: ip!("192.168.1.12"),
                    name: "control-plane-2".to_string(),
                },
            ],
            workers: vec![
                LogicalHost {
                    ip: ip!("192.168.1.20"),
                    name: "worker-0".to_string(),
                },
                LogicalHost {
                    ip: ip!("192.168.1.21"),
                    name: "worker-1".to_string(),
                },
            ],
            kubeconfig: None,
            network_manager: OnceLock::new(),
        }
    }

    struct DummyRouter {
        gateway: IpAddress,
    }

    impl Router for DummyRouter {
        fn get_gateway(&self) -> IpAddress {
            self.gateway
        }
        fn get_cidr(&self) -> cidr::Ipv4Cidr {
            let ipv4 = match self.gateway {
                IpAddress::V4(ip) => ip,
                IpAddress::V6(_) => panic!("IPv6 not supported"),
            };
            cidr::Ipv4Cidr::new(ipv4, 24).unwrap()
        }
        fn get_host(&self) -> LogicalHost {
            LogicalHost {
                ip: self.gateway,
                name: "router".to_string(),
            }
        }
    }

    #[test]
    fn test_nodes_to_backend_server_includes_control_plane_and_workers() {
        let topology = create_test_topology();

        let backend_servers = OKDLoadBalancerScore::nodes_to_backend_server(&topology, 80);

        assert_eq!(backend_servers.len(), 5);

        let addresses: Vec<&str> = backend_servers.iter().map(|s| s.address.as_str()).collect();
        assert!(addresses.contains(&"192.168.1.10"));
        assert!(addresses.contains(&"192.168.1.11"));
        assert!(addresses.contains(&"192.168.1.12"));
        assert!(addresses.contains(&"192.168.1.20"));
        assert!(addresses.contains(&"192.168.1.21"));
    }

    #[test]
    fn test_control_plane_to_backend_server_only_includes_control_plane() {
        let topology = create_test_topology();

        let backend_servers = OKDLoadBalancerScore::control_plane_to_backend_server(&topology, 80);

        assert_eq!(backend_servers.len(), 3);

        let addresses: Vec<&str> = backend_servers.iter().map(|s| s.address.as_str()).collect();
        assert!(addresses.contains(&"192.168.1.10"));
        assert!(addresses.contains(&"192.168.1.11"));
        assert!(addresses.contains(&"192.168.1.12"));
        assert!(!addresses.contains(&"192.168.1.20"));
        assert!(!addresses.contains(&"192.168.1.21"));
    }

    #[test]
    fn test_public_services_include_all_nodes_on_port_80_and_443() {
        let topology = create_test_topology();
        let score = OKDLoadBalancerScore::new(&topology);

        let public_service_80 = score
            .load_balancer_score
            .public_services
            .iter()
            .find(|s| s.listening_port.port() == 80)
            .expect("Public service on port 80 not found");

        let public_service_443 = score
            .load_balancer_score
            .public_services
            .iter()
            .find(|s| s.listening_port.port() == 443)
            .expect("Public service on port 443 not found");

        assert_eq!(public_service_80.backend_servers.len(), 5);
        assert_eq!(public_service_443.backend_servers.len(), 5);
    }

    #[test]
    fn test_private_service_port_22623_only_control_plane() {
        let topology = create_test_topology();
        let score = OKDLoadBalancerScore::new(&topology);

        let private_service_22623 = score
            .load_balancer_score
            .private_services
            .iter()
            .find(|s| s.listening_port.port() == 22623)
            .expect("Private service on port 22623 not found");

        assert_eq!(private_service_22623.backend_servers.len(), 3);
    }

    #[test]
    fn test_private_service_port_6443_only_control_plane() {
        let topology = create_test_topology();
        let score = OKDLoadBalancerScore::new(&topology);

        let private_service_6443 = score
            .load_balancer_score
            .private_services
            .iter()
            .find(|s| s.listening_port.port() == 6443)
            .expect("Private service on port 6443 not found");

        assert_eq!(private_service_6443.backend_servers.len(), 3);
        assert!(
            matches!(
                private_service_6443.health_check,
                Some(HealthCheck::HTTP(_, _, _, _))
            ),
            "Expected HTTP health check for port 6443"
        );
    }

    #[test]
    fn test_all_backend_servers_have_correct_port() {
        let topology = create_test_topology();

        let backend_servers = OKDLoadBalancerScore::nodes_to_backend_server(&topology, 443);

        for server in backend_servers {
            assert_eq!(server.port, 443);
        }
    }
}

impl<T: Topology + LoadBalancer> Score<T> for OKDLoadBalancerScore {
88
harmony/src/modules/openbao/mod.rs
Normal file
@@ -0,0 +1,88 @@
use std::str::FromStr;

use harmony_macros::hurl;
use non_blank_string_rs::NonBlankString;
use serde::Serialize;

use crate::{
    interpret::Interpret,
    modules::helm::chart::{HelmChartScore, HelmRepository},
    score::Score,
    topology::{HelmCommand, K8sclient, Topology},
};

#[derive(Debug, Serialize, Clone)]
pub struct OpenbaoScore {
    /// Host used for external access (ingress)
    pub host: String,
}

impl<T: Topology + K8sclient + HelmCommand> Score<T> for OpenbaoScore {
    fn name(&self) -> String {
        "OpenbaoScore".to_string()
    }

    #[doc(hidden)]
    fn create_interpret(&self) -> Box<dyn Interpret<T>> {
        // TODO exec pod commands to initialize secret store if not already done
        let host = &self.host;

        let values_yaml = Some(format!(
            r#"global:
  openshift: true
server:
  standalone:
    enabled: true
    config: |
      ui = true

      listener "tcp" {{
        tls_disable = true
        address = "[::]:8200"
        cluster_address = "[::]:8201"
      }}

      storage "file" {{
        path = "/openbao/data"
      }}

  service:
    enabled: true

  ingress:
    enabled: true
    hosts:
      - host: {host}
  dataStorage:
    enabled: true
    size: 10Gi
    storageClass: null
    accessMode: ReadWriteOnce

  auditStorage:
    enabled: true
    size: 10Gi
    storageClass: null
    accessMode: ReadWriteOnce
ui:
  enabled: true"#
        ));

        HelmChartScore {
            namespace: Some(NonBlankString::from_str("openbao").unwrap()),
            release_name: NonBlankString::from_str("openbao").unwrap(),
            chart_name: NonBlankString::from_str("openbao/openbao").unwrap(),
            chart_version: None,
            values_overrides: None,
            values_yaml,
            create_namespace: true,
            install_only: false,
            repository: Some(HelmRepository::new(
                "openbao".to_string(),
                hurl!("https://openbao.github.io/openbao-helm"),
                true,
            )),
        }
        .create_interpret()
    }
}
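A minimal sketch of deploying the chart through this score; the host value is illustrative, and the topology is assumed to satisfy the `Topology + K8sclient + HelmCommand` bound declared above:

```rust
// Sketch only: the host is an example, not a value from this repository.
let openbao = OpenbaoScore {
    host: "openbao.apps.example.com".to_string(),
};
let interpret = openbao.create_interpret();
interpret.execute(&inventory, &topology).await?;
```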
@@ -1,3 +1,5 @@
use std::collections::BTreeMap;

use kube::{CustomResource, api::ObjectMeta};
use serde::{Deserialize, Serialize};

@@ -13,9 +15,14 @@ use serde::{Deserialize, Serialize};
#[serde(rename_all = "camelCase")]
pub struct ClusterSpec {
    pub instances: u32,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub image_name: Option<String>,
    pub storage: Storage,
    pub bootstrap: Bootstrap,
    /// This must be set to None if you want cnpg to generate a superuser secret
    #[serde(skip_serializing_if = "Option::is_none")]
    pub superuser_secret: Option<BTreeMap<String, String>>,
    pub enable_superuser_access: bool,
}

impl Default for Cluster {
@@ -34,6 +41,8 @@ impl Default for ClusterSpec {
            image_name: None,
            storage: Storage::default(),
            bootstrap: Bootstrap::default(),
            superuser_secret: None,
            enable_superuser_access: false,
        }
    }
}
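The new fields let callers point CNPG at a pre-existing superuser Secret instead of a generated one. A hedged sketch, reusing the `"name"` key from the commented-out example later in this diff (the secret name itself is illustrative):

```rust
use std::collections::BTreeMap;

// Sketch: reference an existing Secret by name; leave superuser_secret
// as None (the default) to let cnpg generate one instead.
let spec = ClusterSpec {
    instances: 3,
    superuser_secret: Some(BTreeMap::from([(
        "name".to_string(),
        "my-cluster-superuser".to_string(),
    )])),
    enable_superuser_access: true,
    ..ClusterSpec::default()
};
```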
|
||||
@@ -20,7 +20,7 @@ use crate::topology::{K8sclient, Topology};
|
||||
/// # Usage
|
||||
/// ```
|
||||
/// use harmony::modules::postgresql::CloudNativePgOperatorScore;
|
||||
/// let score = CloudNativePgOperatorScore::default();
|
||||
/// let score = CloudNativePgOperatorScore::default_openshift();
|
||||
/// ```
|
||||
///
|
||||
/// Or, you can take control of most relevant fiedls this way :
|
||||
@@ -52,8 +52,8 @@ pub struct CloudNativePgOperatorScore {
|
||||
pub source_namespace: String,
|
||||
}
|
||||
|
||||
impl Default for CloudNativePgOperatorScore {
|
||||
fn default() -> Self {
|
||||
impl CloudNativePgOperatorScore {
|
||||
pub fn default_openshift() -> Self {
|
||||
Self {
|
||||
namespace: "openshift-operators".to_string(),
|
||||
channel: "stable-v1".to_string(),
|
||||
@@ -68,7 +68,7 @@ impl CloudNativePgOperatorScore {
|
||||
pub fn new(namespace: &str) -> Self {
|
||||
Self {
|
||||
namespace: namespace.to_string(),
|
||||
..Default::default()
|
||||
..Self::default_openshift()
|
||||
}
|
||||
}
|
||||
}
|
||||
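The rename from `Default::default()` to an explicit `default_openshift()` makes the OpenShift-specific defaults (the `openshift-operators` namespace in particular) visible at the call site, while `new` keeps those defaults and overrides only the namespace:

```rust
// Both constructors target OpenShift; only the namespace differs.
let stock = CloudNativePgOperatorScore::default_openshift();
let custom = CloudNativePgOperatorScore::new("cnpg-system");
assert_eq!(custom.channel, stock.channel);
```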
|
||||
@@ -303,7 +303,7 @@ impl<T: Topology + K8sclient + Send + Sync> Interpret<T> for PostgreSQLConnectio
|
||||
let port = self.get_port(&app_data)?;
|
||||
|
||||
// Create test script
|
||||
let script_path = self.create_test_script(temp_dir_path)?;
|
||||
let _ = self.create_test_script(temp_dir_path)?;
|
||||
|
||||
let ca_file_in_container = Path::new("/tmp").join(ca_file.file_name().unwrap());
|
||||
let script_cmd = format!(
|
||||
|
||||
@@ -66,6 +66,11 @@ impl<T: Topology + K8sclient> Score<T> for K8sPostgreSQLScore {
                    owner: "app".to_string(),
                },
            },
            // superuser_secret: Some(BTreeMap::from([(
            //     "name".to_string(),
            //     format!("{}-superuser", self.config.cluster_name.clone()),
            // )])),
            enable_superuser_access: true,
            ..ClusterSpec::default()
        };
576
harmony/src/modules/prometheus/k8s_prometheus_alerting_score.rs
Normal file
@@ -0,0 +1,576 @@
use std::fs;
use std::{collections::BTreeMap, sync::Arc};
use tempfile::tempdir;

use async_trait::async_trait;
use kube::api::ObjectMeta;
use log::{debug, info};
use serde::Serialize;
use std::process::Command;

use crate::modules::monitoring::kube_prometheus::crd::crd_alertmanager_config::CRDPrometheus;
use crate::modules::monitoring::kube_prometheus::crd::crd_default_rules::build_default_application_rules;
use crate::modules::monitoring::kube_prometheus::crd::crd_grafana::{
    Grafana, GrafanaDashboard, GrafanaDashboardSpec, GrafanaDatasource, GrafanaDatasourceConfig,
    GrafanaDatasourceJsonData, GrafanaDatasourceSpec, GrafanaSpec,
};
use crate::modules::monitoring::kube_prometheus::crd::crd_prometheus_rules::{
    PrometheusRule, PrometheusRuleSpec, RuleGroup,
};
use crate::modules::monitoring::kube_prometheus::crd::grafana_default_dashboard::build_default_dashboard;
use crate::modules::monitoring::kube_prometheus::crd::service_monitor::{
    ServiceMonitor, ServiceMonitorSpec,
};
use crate::topology::oberservability::monitoring::AlertReceiver;
use crate::topology::{K8sclient, Topology};
use crate::{
    data::Version,
    interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome},
    inventory::Inventory,
    modules::monitoring::kube_prometheus::crd::{
        crd_alertmanagers::{Alertmanager, AlertmanagerSpec},
        crd_prometheuses::{
            AlertmanagerEndpoints, LabelSelector, Prometheus, PrometheusSpec,
            PrometheusSpecAlerting,
        },
        role::{build_prom_role, build_prom_rolebinding, build_prom_service_account},
    },
    score::Score,
};
use harmony_k8s::K8sClient;
use harmony_types::id::Id;

use super::prometheus::PrometheusMonitoring;

#[derive(Clone, Debug, Serialize)]
pub struct K8sPrometheusCRDAlertingScore {
    pub sender: CRDPrometheus,
    pub receivers: Vec<Box<dyn AlertReceiver<CRDPrometheus>>>,
    pub service_monitors: Vec<ServiceMonitor>,
    pub prometheus_rules: Vec<RuleGroup>,
}

impl<T: Topology + K8sclient + PrometheusMonitoring<CRDPrometheus>> Score<T>
    for K8sPrometheusCRDAlertingScore
{
    fn create_interpret(&self) -> Box<dyn crate::interpret::Interpret<T>> {
        Box::new(K8sPrometheusCRDAlertingInterpret {
            sender: self.sender.clone(),
            receivers: self.receivers.clone(),
            service_monitors: self.service_monitors.clone(),
            prometheus_rules: self.prometheus_rules.clone(),
        })
    }

    fn name(&self) -> String {
        "prometheus alerting [CRDAlertingScore]".into()
    }
}

#[derive(Clone, Debug)]
pub struct K8sPrometheusCRDAlertingInterpret {
    pub sender: CRDPrometheus,
    pub receivers: Vec<Box<dyn AlertReceiver<CRDPrometheus>>>,
    pub service_monitors: Vec<ServiceMonitor>,
    pub prometheus_rules: Vec<RuleGroup>,
}

#[async_trait]
impl<T: Topology + K8sclient + PrometheusMonitoring<CRDPrometheus>> Interpret<T>
    for K8sPrometheusCRDAlertingInterpret
{
    async fn execute(
        &self,
        _inventory: &Inventory,
        topology: &T,
    ) -> Result<Outcome, InterpretError> {
        let client = topology.k8s_client().await.unwrap();
        self.ensure_grafana_operator().await?;
        self.install_prometheus(&client).await?;
        self.install_alert_manager(&client).await?;
        self.install_client_kube_metrics().await?;
        self.install_grafana(&client).await?;
        self.install_receivers(&self.sender, &self.receivers)
            .await?;
        self.install_rules(&self.prometheus_rules, &client).await?;
        self.install_monitors(self.service_monitors.clone(), &client)
            .await?;
        Ok(Outcome::success(
            "K8s monitoring components installed".to_string(),
        ))
    }

    fn get_name(&self) -> InterpretName {
        InterpretName::K8sPrometheusCrdAlerting
    }

    fn get_version(&self) -> Version {
        todo!()
    }

    fn get_status(&self) -> InterpretStatus {
        todo!()
    }

    fn get_children(&self) -> Vec<Id> {
        todo!()
    }
}

impl K8sPrometheusCRDAlertingInterpret {
    async fn crd_exists(&self, crd: &str) -> bool {
        let status = Command::new("sh")
            .args(["-c", &format!("kubectl get crd -A | grep -i {crd}")])
            .status()
            .map_err(|e| InterpretError::new(format!("could not connect to cluster: {}", e)))
            .unwrap();

        status.success()
    }

    async fn install_chart(
        &self,
        chart_path: String,
        chart_name: String,
    ) -> Result<(), InterpretError> {
        let temp_dir =
            tempdir().map_err(|e| InterpretError::new(format!("Tempdir error: {}", e)))?;
        let temp_path = temp_dir.path().to_path_buf();
        debug!("Using temp directory: {}", temp_path.display());
        let chart = format!("{}/{}", chart_path, chart_name);
        let pull_output = Command::new("helm")
            .args(["pull", &chart, "--destination", temp_path.to_str().unwrap()])
            .output()
            .map_err(|e| InterpretError::new(format!("Helm pull error: {}", e)))?;

        if !pull_output.status.success() {
            return Err(InterpretError::new(format!(
                "Helm pull failed: {}",
                String::from_utf8_lossy(&pull_output.stderr)
            )));
        }

        let tgz_path = fs::read_dir(&temp_path)
            .unwrap()
            .filter_map(|entry| {
                let entry = entry.ok()?;
                let path = entry.path();
                if path.extension()? == "tgz" {
                    Some(path)
                } else {
                    None
                }
            })
            .next()
            .ok_or_else(|| InterpretError::new("Could not find pulled Helm chart".into()))?;

        debug!("Installing chart from: {}", tgz_path.display());

        let install_output = Command::new("helm")
            .args([
                "upgrade",
                "--install",
                &chart_name,
                tgz_path.to_str().unwrap(),
                "--namespace",
                &self.sender.namespace.clone(),
                "--create-namespace",
                "--wait",
                "--atomic",
            ])
            .output()
            .map_err(|e| InterpretError::new(format!("Helm install error: {}", e)))?;

        if !install_output.status.success() {
            return Err(InterpretError::new(format!(
                "Helm install failed: {}",
                String::from_utf8_lossy(&install_output.stderr)
            )));
        }

        debug!(
            "Installed chart {}/{} in namespace: {}",
            &chart_path,
            &chart_name,
            self.sender.namespace.clone()
        );
        Ok(())
    }

    async fn ensure_grafana_operator(&self) -> Result<Outcome, InterpretError> {
        let _ = Command::new("helm")
            .args([
                "repo",
                "add",
                "grafana-operator",
                "https://grafana.github.io/helm-charts",
            ])
            .output()
            .unwrap();

        let _ = Command::new("helm")
            .args(["repo", "update"])
            .output()
            .unwrap();

        let output = Command::new("helm")
            .args([
                "install",
                "grafana-operator",
                "grafana-operator/grafana-operator",
                "--namespace",
                &self.sender.namespace.clone(),
                "--create-namespace",
                "--set",
                "namespaceScope=true",
            ])
            .output()
            .unwrap();

        if !output.status.success() {
            return Err(InterpretError::new(format!(
                "helm install failed:\nstdout: {}\nstderr: {}",
                String::from_utf8_lossy(&output.stdout),
                String::from_utf8_lossy(&output.stderr)
            )));
        }

        Ok(Outcome::success(format!(
            "installed grafana operator in ns {}",
            self.sender.namespace.clone()
        )))
    }

    async fn install_prometheus(&self, client: &Arc<K8sClient>) -> Result<Outcome, InterpretError> {
        debug!(
            "installing crd-prometheuses in namespace {}",
            self.sender.namespace.clone()
        );
        debug!("building role/rolebinding/serviceaccount for crd-prometheus");
        let rolename = format!("{}-prom", self.sender.namespace.clone());
        let sa_name = format!("{}-prom-sa", self.sender.namespace.clone());
        let role = build_prom_role(rolename.clone(), self.sender.namespace.clone());
        let rolebinding = build_prom_rolebinding(
            rolename.clone(),
            self.sender.namespace.clone(),
            sa_name.clone(),
        );
        let sa = build_prom_service_account(sa_name.clone(), self.sender.namespace.clone());
        let prom_spec = PrometheusSpec {
            alerting: Some(PrometheusSpecAlerting {
                alertmanagers: Some(vec![AlertmanagerEndpoints {
                    name: Some("alertmanager-operated".into()),
                    namespace: Some(self.sender.namespace.clone()),
                    port: Some("web".into()),
                    scheme: Some("http".into()),
                }]),
            }),
            service_account_name: sa_name.clone(),
            service_monitor_namespace_selector: Some(LabelSelector {
                match_labels: BTreeMap::from([(
                    "kubernetes.io/metadata.name".to_string(),
                    self.sender.namespace.clone(),
                )]),
                match_expressions: vec![],
            }),
            service_monitor_selector: Some(LabelSelector {
                match_labels: BTreeMap::from([("client".to_string(), "prometheus".to_string())]),
                ..Default::default()
            }),

            service_discovery_role: Some("Endpoints".into()),

            pod_monitor_selector: Some(LabelSelector {
                match_labels: BTreeMap::from([("client".to_string(), "prometheus".to_string())]),
                ..Default::default()
            }),

            rule_selector: Some(LabelSelector {
                match_labels: BTreeMap::from([("role".to_string(), "prometheus-rule".to_string())]),
                ..Default::default()
            }),

            rule_namespace_selector: Some(LabelSelector {
                match_labels: BTreeMap::from([(
                    "kubernetes.io/metadata.name".to_string(),
                    self.sender.namespace.clone(),
                )]),
                match_expressions: vec![],
            }),
        };
        let prom = Prometheus {
            metadata: ObjectMeta {
                name: Some(self.sender.namespace.clone()),
                labels: Some(std::collections::BTreeMap::from([
                    ("alertmanagerConfig".to_string(), "enabled".to_string()),
                    ("client".to_string(), "prometheus".to_string()),
                ])),
                namespace: Some(self.sender.namespace.clone()),
                ..Default::default()
            },
            spec: prom_spec,
        };
        client
            .apply(&role, Some(&self.sender.namespace.clone()))
            .await
            .map_err(|e| InterpretError::new(e.to_string()))?;
        info!(
            "installed prometheus role: {:#?} in ns {:#?}",
            role.metadata.name.unwrap(),
            role.metadata.namespace.unwrap()
        );
        client
            .apply(&rolebinding, Some(&self.sender.namespace.clone()))
            .await
            .map_err(|e| InterpretError::new(e.to_string()))?;
        info!(
            "installed prometheus rolebinding: {:#?} in ns {:#?}",
            rolebinding.metadata.name.unwrap(),
            rolebinding.metadata.namespace.unwrap()
        );
        client
            .apply(&sa, Some(&self.sender.namespace.clone()))
            .await
            .map_err(|e| InterpretError::new(e.to_string()))?;
        info!(
            "installed prometheus service account: {:#?} in ns {:#?}",
            sa.metadata.name.unwrap(),
            sa.metadata.namespace.unwrap()
        );
        client
            .apply(&prom, Some(&self.sender.namespace.clone()))
            .await
            .map_err(|e| InterpretError::new(e.to_string()))?;
        info!(
            "installed prometheus: {:#?} in ns {:#?}",
            &prom.metadata.name.clone().unwrap(),
            &prom.metadata.namespace.clone().unwrap()
        );

        Ok(Outcome::success(format!(
            "successfully deployed crd-prometheus {:#?}",
            prom
        )))
    }

    async fn install_alert_manager(
        &self,
        client: &Arc<K8sClient>,
    ) -> Result<Outcome, InterpretError> {
        let am = Alertmanager {
            metadata: ObjectMeta {
                name: Some(self.sender.namespace.clone()),
                labels: Some(std::collections::BTreeMap::from([(
                    "alertmanagerConfig".to_string(),
                    "enabled".to_string(),
                )])),
                namespace: Some(self.sender.namespace.clone()),
                ..Default::default()
            },
            spec: AlertmanagerSpec::default(),
        };
        client
            .apply(&am, Some(&self.sender.namespace.clone()))
            .await
            .map_err(|e| InterpretError::new(e.to_string()))?;
        Ok(Outcome::success(format!(
            "successfully deployed alertmanager {:#?}",
            am.metadata.name
        )))
    }
    async fn install_monitors(
        &self,
        mut monitors: Vec<ServiceMonitor>,
        client: &Arc<K8sClient>,
    ) -> Result<Outcome, InterpretError> {
        let default_service_monitor = ServiceMonitor {
            metadata: ObjectMeta {
                name: Some(self.sender.namespace.clone()),
                labels: Some(std::collections::BTreeMap::from([
                    ("alertmanagerConfig".to_string(), "enabled".to_string()),
                    ("client".to_string(), "prometheus".to_string()),
                    (
                        "app.kubernetes.io/name".to_string(),
                        "kube-state-metrics".to_string(),
                    ),
                ])),
                namespace: Some(self.sender.namespace.clone()),
                ..Default::default()
            },
            spec: ServiceMonitorSpec::default(),
        };
        monitors.push(default_service_monitor);
        for monitor in monitors.iter() {
            client
                .apply(monitor, Some(&self.sender.namespace.clone()))
                .await
                .map_err(|e| InterpretError::new(e.to_string()))?;
        }
        Ok(Outcome::success(
            "successfully deployed service monitors".to_string(),
        ))
    }

    async fn install_rules(
        &self,
        #[allow(clippy::ptr_arg)] rules: &Vec<RuleGroup>,
        client: &Arc<K8sClient>,
    ) -> Result<Outcome, InterpretError> {
        let mut prom_rule_spec = PrometheusRuleSpec {
            groups: rules.clone(),
        };

        let default_rules_group = RuleGroup {
            name: "default-rules".to_string(),
            rules: build_default_application_rules(),
        };

        prom_rule_spec.groups.push(default_rules_group);
        let prom_rules = PrometheusRule {
            metadata: ObjectMeta {
                name: Some(self.sender.namespace.clone()),
                labels: Some(std::collections::BTreeMap::from([
                    ("alertmanagerConfig".to_string(), "enabled".to_string()),
                    ("role".to_string(), "prometheus-rule".to_string()),
                ])),
                namespace: Some(self.sender.namespace.clone()),
                ..Default::default()
            },
            spec: prom_rule_spec,
        };
        client
            .apply(&prom_rules, Some(&self.sender.namespace.clone()))
            .await
            .map_err(|e| InterpretError::new(e.to_string()))?;
        Ok(Outcome::success(format!(
            "successfully deployed rules {:#?}",
            prom_rules.metadata.name
        )))
    }

    async fn install_client_kube_metrics(&self) -> Result<Outcome, InterpretError> {
        self.install_chart(
            "oci://hub.nationtech.io/harmony".to_string(),
            "nt-kube-metrics".to_string(),
        )
        .await?;
        Ok(Outcome::success(format!(
            "Installed client kube metrics in ns {}",
            &self.sender.namespace.clone()
        )))
    }

    async fn install_grafana(&self, client: &Arc<K8sClient>) -> Result<Outcome, InterpretError> {
        let mut label = BTreeMap::new();
        label.insert("dashboards".to_string(), "grafana".to_string());
        let labels = LabelSelector {
            match_labels: label.clone(),
            match_expressions: vec![],
        };
        let namespace = self.sender.namespace.clone();
        let json_data = GrafanaDatasourceJsonData {
            time_interval: Some("5s".to_string()),
            http_header_name1: None,
            tls_skip_verify: Some(true),
            oauth_pass_thru: Some(true),
        };
        let json = build_default_dashboard(&namespace);

        let graf_data_source = GrafanaDatasource {
            metadata: ObjectMeta {
                name: Some(format!(
                    "grafana-datasource-{}",
                    self.sender.namespace.clone()
                )),
                namespace: Some(self.sender.namespace.clone()),
                ..Default::default()
            },
            spec: GrafanaDatasourceSpec {
                instance_selector: labels.clone(),
                allow_cross_namespace_import: Some(false),
                datasource: GrafanaDatasourceConfig {
                    access: "proxy".to_string(),
                    database: Some("prometheus".to_string()),
                    json_data: Some(json_data),
                    // this is fragile
                    name: format!("prometheus-{}-0", self.sender.namespace.clone()),
                    r#type: "prometheus".to_string(),
                    url: format!(
                        "http://prometheus-operated.{}.svc.cluster.local:9090",
                        self.sender.namespace.clone()
                    ),
                    secure_json_data: None,
                    is_default: None,
                    editable: None,
                },
                values_from: None,
            },
        };

        client
            .apply(&graf_data_source, Some(&self.sender.namespace.clone()))
            .await
            .map_err(|e| InterpretError::new(e.to_string()))?;

        let graf_dashboard = GrafanaDashboard {
            metadata: ObjectMeta {
                name: Some(format!(
                    "grafana-dashboard-{}",
                    self.sender.namespace.clone()
                )),
                namespace: Some(self.sender.namespace.clone()),
                ..Default::default()
            },
            spec: GrafanaDashboardSpec {
                resync_period: Some("30s".to_string()),
                instance_selector: labels.clone(),
                json: Some(json),
                grafana_com: None,
                datasources: None,
            },
        };

        client
            .apply(&graf_dashboard, Some(&self.sender.namespace.clone()))
            .await
            .map_err(|e| InterpretError::new(e.to_string()))?;

        let grafana = Grafana {
            metadata: ObjectMeta {
                name: Some(format!("grafana-{}", self.sender.namespace.clone())),
                namespace: Some(self.sender.namespace.clone()),
                labels: Some(label.clone()),
                ..Default::default()
            },
            spec: GrafanaSpec {
                config: None,
                admin_user: None,
                admin_password: None,
                ingress: None,
                persistence: None,
                resources: None,
            },
        };
        client
            .apply(&grafana, Some(&self.sender.namespace.clone()))
            .await
            .map_err(|e| InterpretError::new(e.to_string()))?;
        Ok(Outcome::success(format!(
            "successfully deployed grafana instance {:#?}",
            grafana.metadata.name
        )))
    }

    async fn install_receivers(
        &self,
        sender: &CRDPrometheus,
        receivers: &Vec<Box<dyn AlertReceiver<CRDPrometheus>>>,
    ) -> Result<Outcome, InterpretError> {
        for receiver in receivers.iter() {
            receiver.install(sender).await.map_err(|err| {
                InterpretError::new(format!("failed to install receiver: {}", err))
            })?;
        }
        Ok(Outcome::success("successfully deployed receivers".into()))
    }
}
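A minimal sketch of assembling this score; the `CRDPrometheus` sender construction is not shown in this diff beyond its `namespace` field, so it is assumed to exist in scope, and the topology is assumed to satisfy the declared `Topology + K8sclient + PrometheusMonitoring<CRDPrometheus>` bound:

```rust
// Sketch only: receivers, monitors, and rules come from the caller;
// the default ServiceMonitor and rule group are appended automatically.
let score = K8sPrometheusCRDAlertingScore {
    sender,                   // a CRDPrometheus for the target namespace
    receivers: vec![],        // e.g. ntfy or webhook receivers
    service_monitors: vec![], // extra ServiceMonitor CRs
    prometheus_rules: vec![], // extra RuleGroups
};
let interpret = score.create_interpret();
interpret.execute(&inventory, &topology).await?;
```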
528
harmony/src/modules/prometheus/rhob_alerting_score.rs
Normal file
528
harmony/src/modules/prometheus/rhob_alerting_score.rs
Normal file
@@ -0,0 +1,528 @@
|
||||
use fqdn::fqdn;
|
||||
use std::fs;
|
||||
use std::{collections::BTreeMap, sync::Arc};
|
||||
use tempfile::tempdir;
|
||||
|
||||
use async_trait::async_trait;
|
||||
use kube::api::ObjectMeta;
|
||||
use log::{debug, info};
|
||||
use serde::Serialize;
|
||||
use std::process::Command;
|
||||
|
||||
use crate::modules::k8s::ingress::{K8sIngressScore, PathType};
|
||||
use crate::modules::monitoring::kube_prometheus::crd::grafana_default_dashboard::build_default_dashboard;
|
||||
use crate::modules::monitoring::kube_prometheus::crd::rhob_alertmanager_config::RHOBObservability;
|
||||
use crate::modules::monitoring::kube_prometheus::crd::rhob_grafana::{
|
||||
Grafana, GrafanaDashboard, GrafanaDashboardSpec, GrafanaDatasource, GrafanaDatasourceConfig,
|
||||
GrafanaDatasourceSpec, GrafanaSpec,
|
||||
};
|
||||
use crate::modules::monitoring::kube_prometheus::crd::rhob_monitoring_stack::{
|
||||
MonitoringStack, MonitoringStackSpec,
|
||||
};
|
||||
use crate::modules::monitoring::kube_prometheus::crd::rhob_prometheus_rules::{
|
||||
PrometheusRule, PrometheusRuleSpec, RuleGroup,
|
||||
};
|
||||
use crate::modules::monitoring::kube_prometheus::crd::rhob_prometheuses::LabelSelector;
|
||||
|
||||
use crate::modules::monitoring::kube_prometheus::crd::rhob_service_monitor::{
|
||||
ServiceMonitor, ServiceMonitorSpec,
|
||||
};
|
||||
use crate::score::Score;
|
||||
use crate::topology::ingress::Ingress;
|
||||
use crate::topology::oberservability::monitoring::AlertReceiver;
|
||||
use crate::topology::{K8sclient, Topology};
|
||||
use crate::{
|
||||
data::Version,
|
||||
interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome},
|
||||
inventory::Inventory,
|
||||
};
|
||||
use harmony_k8s::K8sClient;
|
||||
use harmony_types::id::Id;
|
||||
|
||||
use super::prometheus::PrometheusMonitoring;
|
||||
|
||||
#[derive(Clone, Debug, Serialize)]
|
||||
pub struct RHOBAlertingScore {
|
||||
pub sender: RHOBObservability,
|
||||
pub receivers: Vec<Box<dyn AlertReceiver<RHOBObservability>>>,
|
||||
pub service_monitors: Vec<ServiceMonitor>,
|
||||
pub prometheus_rules: Vec<RuleGroup>,
|
||||
}

impl<T: Topology + K8sclient + Ingress + PrometheusMonitoring<RHOBObservability>> Score<T>
    for RHOBAlertingScore
{
    fn create_interpret(&self) -> Box<dyn crate::interpret::Interpret<T>> {
        Box::new(RHOBAlertingInterpret {
            sender: self.sender.clone(),
            receivers: self.receivers.clone(),
            service_monitors: self.service_monitors.clone(),
            prometheus_rules: self.prometheus_rules.clone(),
        })
    }

    fn name(&self) -> String {
        "RHOB alerting [RHOBAlertingScore]".into()
    }
}

#[derive(Clone, Debug)]
pub struct RHOBAlertingInterpret {
    pub sender: RHOBObservability,
    pub receivers: Vec<Box<dyn AlertReceiver<RHOBObservability>>>,
    pub service_monitors: Vec<ServiceMonitor>,
    pub prometheus_rules: Vec<RuleGroup>,
}

#[async_trait]
impl<T: Topology + K8sclient + Ingress + PrometheusMonitoring<RHOBObservability>> Interpret<T>
    for RHOBAlertingInterpret
{
    async fn execute(
        &self,
        inventory: &Inventory,
        topology: &T,
    ) -> Result<Outcome, InterpretError> {
        let client = topology
            .k8s_client()
            .await
            .map_err(|e| InterpretError::new(format!("failed to get k8s client: {e}")))?;
        self.ensure_grafana_operator().await?;
        self.install_prometheus(inventory, topology, &client)
            .await?;
        self.install_client_kube_metrics().await?;
        self.install_grafana(inventory, topology, &client).await?;
        self.install_receivers(&self.sender, &self.receivers)
            .await?;
        self.install_rules(&self.prometheus_rules, &client).await?;
        self.install_monitors(self.service_monitors.clone(), &client)
            .await?;
        Ok(Outcome::success(
            "K8s monitoring components installed".to_string(),
        ))
    }

    fn get_name(&self) -> InterpretName {
        InterpretName::RHOBAlerting
    }

    fn get_version(&self) -> Version {
        todo!()
    }

    fn get_status(&self) -> InterpretStatus {
        todo!()
    }

    fn get_children(&self) -> Vec<Id> {
        todo!()
    }
}

impl RHOBAlertingInterpret {
    async fn crd_exists(&self, crd: &str) -> bool {
        let status = Command::new("sh")
            .args(["-c", &format!("kubectl get crd -A | grep -i {crd}")])
            .status()
            .map_err(|e| InterpretError::new(format!("could not connect to cluster: {}", e)))
            .unwrap();

        status.success()
    }

    async fn install_chart(
        &self,
        chart_path: String,
        chart_name: String,
    ) -> Result<(), InterpretError> {
        let temp_dir =
            tempdir().map_err(|e| InterpretError::new(format!("Tempdir error: {}", e)))?;
        let temp_path = temp_dir.path().to_path_buf();
        debug!("Using temp directory: {}", temp_path.display());
        let chart = format!("{}/{}", chart_path, chart_name);
        let pull_output = Command::new("helm")
            .args(["pull", &chart, "--destination", temp_path.to_str().unwrap()])
            .output()
            .map_err(|e| InterpretError::new(format!("Helm pull error: {}", e)))?;

        if !pull_output.status.success() {
            return Err(InterpretError::new(format!(
                "Helm pull failed: {}",
                String::from_utf8_lossy(&pull_output.stderr)
            )));
        }

        let tgz_path = fs::read_dir(&temp_path)
            .unwrap()
            .filter_map(|entry| {
                let entry = entry.ok()?;
                let path = entry.path();
                if path.extension()? == "tgz" {
                    Some(path)
                } else {
                    None
                }
            })
            .next()
            .ok_or_else(|| InterpretError::new("Could not find pulled Helm chart".into()))?;

        debug!("Installing chart from: {}", tgz_path.display());

        let install_output = Command::new("helm")
            .args([
                "upgrade",
                "--install",
                &chart_name,
                tgz_path.to_str().unwrap(),
                "--namespace",
                &self.sender.namespace.clone(),
                "--create-namespace",
                "--wait",
                "--atomic",
            ])
            .output()
            .map_err(|e| InterpretError::new(format!("Helm install error: {}", e)))?;

        if !install_output.status.success() {
            return Err(InterpretError::new(format!(
                "Helm install failed: {}",
                String::from_utf8_lossy(&install_output.stderr)
            )));
        }

        debug!(
            "Installed chart {}/{} in namespace: {}",
            &chart_path,
            &chart_name,
            self.sender.namespace.clone()
        );
        Ok(())
    }

    async fn ensure_grafana_operator(&self) -> Result<Outcome, InterpretError> {
        let _ = Command::new("helm")
            .args([
                "repo",
                "add",
                "grafana-operator",
                "https://grafana.github.io/helm-charts",
            ])
            .output()
            .unwrap();

        let _ = Command::new("helm")
            .args(["repo", "update"])
            .output()
            .unwrap();

        let output = Command::new("helm")
            .args([
                "upgrade",
                "--install",
                "grafana-operator",
                "grafana-operator/grafana-operator",
                "--namespace",
                &self.sender.namespace.clone(),
                "--create-namespace",
                "--set",
                "namespaceScope=true",
            ])
            .output()
            .unwrap();

        if !output.status.success() {
            return Err(InterpretError::new(format!(
                "helm upgrade --install failed:\nstdout: {}\nstderr: {}",
                String::from_utf8_lossy(&output.stdout),
                String::from_utf8_lossy(&output.stderr)
            )));
        }

        Ok(Outcome::success(format!(
            "installed grafana operator in ns {}",
            self.sender.namespace.clone()
        )))
    }

    async fn install_prometheus<T: Topology + K8sclient + Ingress>(
        &self,
        inventory: &Inventory,
        topology: &T,
        client: &Arc<K8sClient>,
    ) -> Result<Outcome, InterpretError> {
        debug!(
            "installing crd-prometheuses in namespace {}",
            self.sender.namespace.clone()
        );
        debug!("building role/rolebinding/serviceaccount for crd-prometheus");

        let stack = MonitoringStack {
            metadata: ObjectMeta {
                name: Some(format!("{}-monitoring", self.sender.namespace.clone()).into()),
                namespace: Some(self.sender.namespace.clone()),
                labels: Some([("monitoring-stack".into(), "true".into())].into()),
                ..Default::default()
            },
            spec: MonitoringStackSpec {
                log_level: Some("debug".into()),
                retention: Some("1d".into()),
                resource_selector: Some(LabelSelector {
                    match_labels: Default::default(),
                    match_expressions: vec![],
                }),
            },
        };

        client
            .apply(&stack, Some(&self.sender.namespace.clone()))
            .await
            .map_err(|e| InterpretError::new(e.to_string()))?;

        let alert_manager_domain = topology
            .get_domain(&format!("alert-manager-{}", self.sender.namespace.clone()))
            .await?;
        let name = format!("{}-alert-manager", self.sender.namespace.clone());
        let backend_service = "alertmanager-operated".to_string();
        let namespace = self.sender.namespace.clone();
        let alert_manager_ingress = K8sIngressScore {
            name: fqdn!(&name),
            host: fqdn!(&alert_manager_domain),
            backend_service: fqdn!(&backend_service),
            port: 9093,
            path: Some("/".to_string()),
            path_type: Some(PathType::Prefix),
            namespace: Some(fqdn!(&namespace)),
            ingress_class_name: Some("openshift-default".to_string()),
        };

        let prometheus_domain = topology
            .get_domain(&format!("prometheus-{}", self.sender.namespace.clone()))
            .await?;
        let name = format!("{}-prometheus", self.sender.namespace.clone());
        let backend_service = "prometheus-operated".to_string();
        let prometheus_ingress = K8sIngressScore {
            name: fqdn!(&name),
            host: fqdn!(&prometheus_domain),
            backend_service: fqdn!(&backend_service),
            port: 9090,
            path: Some("/".to_string()),
            path_type: Some(PathType::Prefix),
            namespace: Some(fqdn!(&namespace)),
            ingress_class_name: Some("openshift-default".to_string()),
        };

        alert_manager_ingress.interpret(inventory, topology).await?;
        prometheus_ingress.interpret(inventory, topology).await?;
        info!("installed rhob monitoring stack");
        Ok(Outcome::success(format!(
            "successfully deployed rhob-prometheus {:#?}",
            stack
        )))
    }

    async fn install_monitors(
        &self,
        mut monitors: Vec<ServiceMonitor>,
        client: &Arc<K8sClient>,
    ) -> Result<Outcome, InterpretError> {
        let default_service_monitor = ServiceMonitor {
            metadata: ObjectMeta {
                name: Some(self.sender.namespace.clone()),
                labels: Some(std::collections::BTreeMap::from([
                    ("alertmanagerConfig".to_string(), "enabled".to_string()),
                    ("client".to_string(), "prometheus".to_string()),
                    (
                        "app.kubernetes.io/name".to_string(),
                        "kube-state-metrics".to_string(),
                    ),
                ])),
                namespace: Some(self.sender.namespace.clone()),
                ..Default::default()
            },
            spec: ServiceMonitorSpec::default(),
        };
        monitors.push(default_service_monitor);
        for monitor in monitors.iter() {
            client
                .apply(monitor, Some(&self.sender.namespace.clone()))
                .await
                .map_err(|e| InterpretError::new(e.to_string()))?;
        }
        Ok(Outcome::success(
            "successfully deployed service monitors".to_string(),
        ))
    }

    async fn install_rules(
        &self,
        #[allow(clippy::ptr_arg)] rules: &Vec<RuleGroup>,
        client: &Arc<K8sClient>,
    ) -> Result<Outcome, InterpretError> {
        let mut prom_rule_spec = PrometheusRuleSpec {
            groups: rules.clone(),
        };

        let default_rules_group = RuleGroup {
            name: "default-rules".to_string(),
            rules: crate::modules::monitoring::kube_prometheus::crd::rhob_default_rules::build_default_application_rules(),
        };

        prom_rule_spec.groups.push(default_rules_group);
        let prom_rules = PrometheusRule {
            metadata: ObjectMeta {
                name: Some(self.sender.namespace.clone()),
                labels: Some(std::collections::BTreeMap::from([
                    ("alertmanagerConfig".to_string(), "enabled".to_string()),
                    ("role".to_string(), "prometheus-rule".to_string()),
                ])),
                namespace: Some(self.sender.namespace.clone()),
                ..Default::default()
            },
            spec: prom_rule_spec,
        };
        client
            .apply(&prom_rules, Some(&self.sender.namespace.clone()))
            .await
            .map_err(|e| InterpretError::new(e.to_string()))?;
        Ok(Outcome::success(format!(
            "successfully deployed rules {:#?}",
            prom_rules.metadata.name
        )))
    }

    async fn install_client_kube_metrics(&self) -> Result<Outcome, InterpretError> {
        self.install_chart(
            "oci://hub.nationtech.io/harmony".to_string(),
            "nt-kube-metrics".to_string(),
        )
        .await?;
        Ok(Outcome::success(format!(
            "Installed client kube metrics in ns {}",
            &self.sender.namespace.clone()
        )))
    }

    async fn install_grafana<T: Topology + K8sclient + Ingress>(
        &self,
        inventory: &Inventory,
        topology: &T,
        client: &Arc<K8sClient>,
    ) -> Result<Outcome, InterpretError> {
        let mut label = BTreeMap::new();
        label.insert("dashboards".to_string(), "grafana".to_string());
        let labels = LabelSelector {
            match_labels: label.clone(),
            match_expressions: vec![],
        };
        let mut json_data = BTreeMap::new();
        json_data.insert("timeInterval".to_string(), "5s".to_string());
        let namespace = self.sender.namespace.clone();

        let json = build_default_dashboard(&namespace);

        let graf_data_source = GrafanaDatasource {
            metadata: ObjectMeta {
                name: Some(format!(
                    "grafana-datasource-{}",
                    self.sender.namespace.clone()
                )),
                namespace: Some(self.sender.namespace.clone()),
                ..Default::default()
            },
            spec: GrafanaDatasourceSpec {
                instance_selector: labels.clone(),
                allow_cross_namespace_import: Some(false),
                datasource: GrafanaDatasourceConfig {
                    access: "proxy".to_string(),
                    database: Some("prometheus".to_string()),
                    json_data: Some(json_data),
                    // This is fragile: the name must match the generated
                    // Prometheus instance naming scheme.
                    name: format!("prometheus-{}-0", self.sender.namespace.clone()),
                    r#type: "prometheus".to_string(),
                    url: format!(
                        "http://prometheus-operated.{}.svc.cluster.local:9090",
                        self.sender.namespace.clone()
                    ),
                },
            },
        };

        client
            .apply(&graf_data_source, Some(&self.sender.namespace.clone()))
            .await
            .map_err(|e| InterpretError::new(e.to_string()))?;

        let graf_dashboard = GrafanaDashboard {
            metadata: ObjectMeta {
                name: Some(format!(
                    "grafana-dashboard-{}",
                    self.sender.namespace.clone()
                )),
                namespace: Some(self.sender.namespace.clone()),
                ..Default::default()
            },
            spec: GrafanaDashboardSpec {
                resync_period: Some("30s".to_string()),
                instance_selector: labels.clone(),
                json,
            },
        };

        client
            .apply(&graf_dashboard, Some(&self.sender.namespace.clone()))
            .await
            .map_err(|e| InterpretError::new(e.to_string()))?;

        let grafana = Grafana {
            metadata: ObjectMeta {
                name: Some(format!("grafana-{}", self.sender.namespace.clone())),
                namespace: Some(self.sender.namespace.clone()),
                labels: Some(label.clone()),
                ..Default::default()
            },
            spec: GrafanaSpec {
                config: None,
                admin_user: None,
                admin_password: None,
                ingress: None,
                persistence: None,
                resources: None,
            },
        };
        client
            .apply(&grafana, Some(&self.sender.namespace.clone()))
            .await
            .map_err(|e| InterpretError::new(e.to_string()))?;
        let domain = topology
            .get_domain(&format!("grafana-{}", self.sender.namespace.clone()))
            .await?;
        let name = format!("{}-grafana", self.sender.namespace.clone());
        let backend_service = format!("grafana-{}-service", self.sender.namespace.clone());
        let grafana_ingress = K8sIngressScore {
            name: fqdn!(&name),
            host: fqdn!(&domain),
            backend_service: fqdn!(&backend_service),
            port: 3000,
            path: Some("/".to_string()),
            path_type: Some(PathType::Prefix),
            namespace: Some(fqdn!(&namespace)),
            ingress_class_name: Some("openshift-default".to_string()),
        };

        grafana_ingress.interpret(inventory, topology).await?;
        Ok(Outcome::success(format!(
            "successfully deployed grafana instance {:#?}",
            grafana.metadata.name
        )))
    }

    async fn install_receivers(
        &self,
        sender: &RHOBObservability,
        receivers: &Vec<Box<dyn AlertReceiver<RHOBObservability>>>,
    ) -> Result<Outcome, InterpretError> {
        for receiver in receivers.iter() {
            receiver.install(sender).await.map_err(|err| {
                InterpretError::new(format!("failed to install receiver: {}", err))
            })?;
        }
        Ok(Outcome::success("successfully deployed receivers".into()))
    }
}

@@ -4,6 +4,7 @@ use std::{
 };

 use async_trait::async_trait;
+use harmony_k8s::K8sClient;
 use log::{debug, warn};
 use serde::{Deserialize, Serialize};
 use tokio::time::sleep;
@@ -13,7 +14,7 @@ use crate::{
     interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome},
     inventory::Inventory,
     score::Score,
-    topology::{K8sclient, Topology, k8s::K8sClient},
+    topology::{K8sclient, Topology},
 };
 use harmony_types::id::Id;

@@ -9,8 +9,9 @@ use crate::{
     interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome},
     inventory::Inventory,
     score::Score,
-    topology::{K8sclient, Topology, k8s::K8sClient},
+    topology::{K8sclient, Topology},
 };
+use harmony_k8s::K8sClient;
 use harmony_types::id::Id;

 #[derive(Clone, Debug, Serialize)]
518 harmony/src/modules/zitadel/mod.rs Normal file
@@ -0,0 +1,518 @@
use k8s_openapi::api::core::v1::Namespace;
use k8s_openapi::apimachinery::pkg::apis::meta::v1::ObjectMeta;
use k8s_openapi::{ByteString, api::core::v1::Secret};
use kube::{Error as KubeError, core::ErrorResponse};
use rand::distr::Distribution;
use rand::{Rng, rng, seq::SliceRandom};
use std::collections::BTreeMap;
use std::str::FromStr;

use async_trait::async_trait;
use harmony_macros::hurl;
use harmony_types::id::Id;
use harmony_types::storage::StorageSize;
use log::{debug, error, info, trace, warn};
use non_blank_string_rs::NonBlankString;
use serde::Serialize;

use crate::{
    data::Version,
    interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome},
    inventory::Inventory,
    modules::helm::chart::{HelmChartScore, HelmRepository},
    modules::k8s::resource::K8sResourceScore,
    modules::postgresql::capability::{PostgreSQL, PostgreSQLClusterRole, PostgreSQLConfig},
    score::Score,
    topology::{HelmCommand, K8sclient, Topology},
};

const NAMESPACE: &str = "zitadel";
const PG_CLUSTER_NAME: &str = "zitadel-pg";
const MASTERKEY_SECRET_NAME: &str = "zitadel-masterkey";

/// Opinionated Zitadel deployment score.
///
/// Deploys a PostgreSQL cluster (via the [`PostgreSQL`] trait) and the Zitadel
/// Helm chart into the same namespace. Intended as a central multi-tenant IdP
/// with SSO for OKD/OpenShift, OpenBao, Harbor, Grafana, Nextcloud, Ente
/// Photos, and others.
///
/// # Ingress annotations
/// No controller-specific ingress annotations are set by default. On
/// OKD/OpenShift, the ingress should request TLS so the generated Route is
/// edge-terminated instead of HTTP-only. Optional cert-manager annotations are
/// included for clusters that have cert-manager installed; clusters without
/// cert-manager will ignore them.
/// Add or adjust annotations via `values_overrides` depending on your
/// distribution:
/// - NGINX: `nginx.ingress.kubernetes.io/backend-protocol: GRPC`
/// - OpenShift HAProxy: `route.openshift.io/termination: edge`
/// - AWS ALB: set `ingress.controller: aws`
///
/// # Database credentials
/// CNPG creates a `<cluster>-superuser` secret with key `password`. Because
/// `envVarsSecret` injects secret keys verbatim as env var names and the CNPG
/// key (`password`) does not match ZITADEL's expected name
/// (`ZITADEL_DATABASE_POSTGRES_USER_PASSWORD`), individual `env` entries with
/// `valueFrom.secretKeyRef` are used instead. For environments with an
/// External Secrets Operator or similar, create a dedicated secret with the
/// correct ZITADEL env var names and switch to `envVarsSecret`.
#[derive(Debug, Serialize, Clone)]
pub struct ZitadelScore {
    /// External domain (e.g. `"auth.example.com"`).
    pub host: String,
    pub zitadel_version: String,
}
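
// Hypothetical usage sketch (the version tag is an assumption, not part of this
// diff); the score only needs the external domain and a Zitadel image tag:
//
//     let zitadel = ZitadelScore {
//         host: "auth.example.com".to_string(),
//         zitadel_version: "v2.65.0".to_string(),
//     };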

impl<T: Topology + K8sclient + HelmCommand + PostgreSQL> Score<T> for ZitadelScore {
    fn name(&self) -> String {
        "ZitadelScore".to_string()
    }

    #[doc(hidden)]
    fn create_interpret(&self) -> Box<dyn Interpret<T>> {
        Box::new(ZitadelInterpret {
            host: self.host.clone(),
            zitadel_version: self.zitadel_version.clone(),
        })
    }
}

// ---------------------------------------------------------------------------

#[derive(Debug, Clone)]
struct ZitadelInterpret {
    host: String,
    zitadel_version: String,
}

#[async_trait]
impl<T: Topology + K8sclient + HelmCommand + PostgreSQL> Interpret<T> for ZitadelInterpret {
    async fn execute(
        &self,
        inventory: &Inventory,
        topology: &T,
    ) -> Result<Outcome, InterpretError> {
        info!(
            "[Zitadel] Starting full deployment — namespace: '{NAMESPACE}', host: '{}'",
            self.host
        );

        info!("Creating namespace {NAMESPACE} if it does not exist");
        K8sResourceScore::single(
            Namespace {
                metadata: ObjectMeta {
                    name: Some(NAMESPACE.to_string()),
                    ..Default::default()
                },
                ..Default::default()
            },
            None,
        )
        .interpret(inventory, topology)
        .await?;

        // --- Step 1: PostgreSQL -------------------------------------------

        let pg_config = PostgreSQLConfig {
            cluster_name: PG_CLUSTER_NAME.to_string(),
            instances: 2,
            storage_size: StorageSize::gi(10),
            role: PostgreSQLClusterRole::Primary,
            namespace: NAMESPACE.to_string(),
        };

        debug!(
            "[Zitadel] Deploying PostgreSQL cluster '{}' — instances: {}, storage: 10Gi, namespace: '{}'",
            pg_config.cluster_name, pg_config.instances, pg_config.namespace
        );

        topology.deploy(&pg_config).await.map_err(|e| {
            let msg = format!(
                "[Zitadel] PostgreSQL deployment failed for '{}': {e}",
                pg_config.cluster_name
            );
            error!("{msg}");
            InterpretError::new(msg)
        })?;

        info!(
            "[Zitadel] PostgreSQL cluster '{}' deployed",
            pg_config.cluster_name
        );

        // --- Step 2: Resolve internal DB endpoint -------------------------

        debug!(
            "[Zitadel] Resolving internal endpoint for cluster '{}'",
            pg_config.cluster_name
        );

        let endpoint = topology.get_endpoint(&pg_config).await.map_err(|e| {
            let msg = format!(
                "[Zitadel] Failed to resolve endpoint for cluster '{}': {e}",
                pg_config.cluster_name
            );
            error!("{msg}");
            InterpretError::new(msg)
        })?;

        info!(
            "[Zitadel] DB endpoint resolved — host: '{}', port: {}",
            endpoint.host, endpoint.port
        );

        // The CNPG-managed superuser secret contains 'password', 'username',
        // 'host', 'port', 'dbname', 'uri'. We reference 'password' directly
        // via env.valueFrom.secretKeyRef because CNPG's key names do not
        // match ZITADEL's required env var names.
        let pg_user_secret = format!("{PG_CLUSTER_NAME}-app");
        let pg_superuser_secret = format!("{PG_CLUSTER_NAME}-superuser");
        let db_host = &endpoint.host;
        let db_port = endpoint.port;
        let host = &self.host;

        debug!("[Zitadel] DB credentials source — secret: '{pg_user_secret}', key: 'password'");
        debug!(
            "[Zitadel] DB credentials source — superuser secret: '{pg_superuser_secret}', key: 'password'"
        );

        // Zitadel requires at least one uppercase letter, one lowercase letter,
        // one digit, and one symbol, so generate a password that guarantees each.
        fn generate_secure_password(length: usize) -> String {
            const ALPHA_UPPER: &[u8] = b"ABCDEFGHIJKLMNOPQRSTUVWXYZ";
            const ALPHA_LOWER: &[u8] = b"abcdefghijklmnopqrstuvwxyz";
            const DIGITS: &[u8] = b"0123456789";
            const SYMBOLS: &[u8] = b"!@#$%^&*()_+-=[]{}|;:',.<>?/";

            let mut rng = rand::rng();
            let uniform_alpha_upper = rand::distr::Uniform::new(0, ALPHA_UPPER.len())
                .expect("Failed to create distribution");
            let uniform_alpha_lower = rand::distr::Uniform::new(0, ALPHA_LOWER.len())
                .expect("Failed to create distribution");
            let uniform_digits =
                rand::distr::Uniform::new(0, DIGITS.len()).expect("Failed to create distribution");
            let uniform_symbols =
                rand::distr::Uniform::new(0, SYMBOLS.len()).expect("Failed to create distribution");

            let mut chars: Vec<char> = Vec::with_capacity(length);

            // Ensure at least one of each: upper, lower, digit, symbol
            chars.push(ALPHA_UPPER[uniform_alpha_upper.sample(&mut rng)] as char);
            chars.push(ALPHA_LOWER[uniform_alpha_lower.sample(&mut rng)] as char);
            chars.push(DIGITS[uniform_digits.sample(&mut rng)] as char);
            chars.push(SYMBOLS[uniform_symbols.sample(&mut rng)] as char);

            // Fill remaining with random from all categories
            let all_chars: Vec<u8> = [ALPHA_UPPER, ALPHA_LOWER, DIGITS, SYMBOLS].concat();

            let uniform_all = rand::distr::Uniform::new(0, all_chars.len())
                .expect("Failed to create distribution");

            for _ in 0..(length - 4) {
                chars.push(all_chars[uniform_all.sample(&mut rng)] as char);
            }

            // Shuffle so the guaranteed characters are not always first
            let mut shuffled = chars;
            shuffled.shuffle(&mut rng);

            shuffled.iter().collect()
        }

        let admin_password = generate_secure_password(16);

        // --- Step 3: Create masterkey secret ------------------------------

        debug!(
            "[Zitadel] Creating masterkey secret '{}' in namespace '{}'",
            MASTERKEY_SECRET_NAME, NAMESPACE
        );

        // Masterkey for symmetric encryption — must be exactly 32 ASCII bytes (alphanumeric only).
        let masterkey = rng()
            .sample_iter(&rand::distr::Alphanumeric)
            .take(32)
            .map(char::from)
            .collect::<String>();

        debug!(
            "[Zitadel] Generated masterkey material for secret '{}' in namespace '{}'",
            MASTERKEY_SECRET_NAME, NAMESPACE
        );

        let mut masterkey_data: BTreeMap<String, ByteString> = BTreeMap::new();
        masterkey_data.insert("masterkey".to_string(), ByteString(masterkey.into()));

        let masterkey_secret = Secret {
            metadata: ObjectMeta {
                name: Some(MASTERKEY_SECRET_NAME.to_string()),
                namespace: Some(NAMESPACE.to_string()),
                ..ObjectMeta::default()
            },
            data: Some(masterkey_data),
            ..Secret::default()
        };

        match topology
            .k8s_client()
            .await
            .map_err(|e| InterpretError::new(format!("Failed to get k8s client: {e}")))?
            .create(&masterkey_secret, Some(NAMESPACE))
            .await
        {
            Ok(_) => {
                info!(
                    "[Zitadel] Masterkey secret '{}' created",
                    MASTERKEY_SECRET_NAME
                );
            }
            Err(KubeError::Api(ErrorResponse { code: 409, .. })) => {
                info!(
                    "[Zitadel] Masterkey secret '{}' already exists, leaving it untouched",
                    MASTERKEY_SECRET_NAME
                );
            }
            Err(other) => {
                let msg = format!(
                    "[Zitadel] Failed to create masterkey secret '{}': {other}",
                    MASTERKEY_SECRET_NAME
                );
                error!("{msg}");
                return Err(InterpretError::new(msg));
            }
        };

        debug!(
            "[Zitadel] Masterkey secret '{}' is in place",
            MASTERKEY_SECRET_NAME
        );

        // --- Step 4: Build Helm values ------------------------------------

        warn!(
            "[Zitadel] Applying TLS-enabled ingress defaults for OKD/OpenShift. \
             cert-manager annotations are included as optional hints and are \
             ignored on clusters without cert-manager."
        );

        let values_yaml = format!(
            r#"image:
  tag: {zitadel_version}
zitadel:
  masterkeySecretName: "{MASTERKEY_SECRET_NAME}"
  configmapConfig:
    ExternalDomain: "{host}"
    ExternalSecure: true
    FirstInstance:
      Org:
        Human:
          UserName: "admin"
          Password: "{admin_password}"
          FirstName: "Zitadel"
          LastName: "Admin"
          Email: "admin@zitadel.example.com"
          PasswordChangeRequired: true
    TLS:
      Enabled: false
    Database:
      Postgres:
        Host: "{db_host}"
        Port: {db_port}
        Database: zitadel
        MaxOpenConns: 20
        MaxIdleConns: 10
        User:
          Username: postgres
          SSL:
            Mode: require
        Admin:
          Username: postgres
          SSL:
            Mode: require
# Directly import credentials from the postgres secret
# TODO: use a less privileged postgres user
env:
  - name: ZITADEL_DATABASE_POSTGRES_USER_USERNAME
    valueFrom:
      secretKeyRef:
        name: "{pg_superuser_secret}"
        key: user
  - name: ZITADEL_DATABASE_POSTGRES_USER_PASSWORD
    valueFrom:
      secretKeyRef:
        name: "{pg_superuser_secret}"
        key: password
  - name: ZITADEL_DATABASE_POSTGRES_ADMIN_USERNAME
    valueFrom:
      secretKeyRef:
        name: "{pg_superuser_secret}"
        key: user
  - name: ZITADEL_DATABASE_POSTGRES_ADMIN_PASSWORD
    valueFrom:
      secretKeyRef:
        name: "{pg_superuser_secret}"
        key: password
# Security context for OpenShift restricted PSA compliance
podSecurityContext:
  runAsNonRoot: true
  runAsUser: null
  fsGroup: null
  seccompProfile:
    type: RuntimeDefault
securityContext:
  allowPrivilegeEscalation: false
  capabilities:
    drop:
      - ALL
  runAsNonRoot: true
  runAsUser: null
  fsGroup: null
  seccompProfile:
    type: RuntimeDefault
# Init job security context (runs before main deployment)
initJob:
  podSecurityContext:
    runAsNonRoot: true
    runAsUser: null
    fsGroup: null
    seccompProfile:
      type: RuntimeDefault
  securityContext:
    allowPrivilegeEscalation: false
    capabilities:
      drop:
        - ALL
    runAsNonRoot: true
    runAsUser: null
    fsGroup: null
    seccompProfile:
      type: RuntimeDefault
# Setup job security context
setupJob:
  podSecurityContext:
    runAsNonRoot: true
    runAsUser: null
    fsGroup: null
    seccompProfile:
      type: RuntimeDefault
  securityContext:
    allowPrivilegeEscalation: false
    capabilities:
      drop:
        - ALL
    runAsNonRoot: true
    runAsUser: null
    fsGroup: null
    seccompProfile:
      type: RuntimeDefault
ingress:
  enabled: true
  annotations:
    cert-manager.io/cluster-issuer: letsencrypt-prod
    route.openshift.io/termination: edge
  hosts:
    - host: "{host}"
      paths:
        - path: /
          pathType: Prefix
  tls:
    - hosts:
        - "{host}"
      secretName: "{host}-tls"

login:
  enabled: true
  podSecurityContext:
    runAsNonRoot: true
    runAsUser: null
    fsGroup: null
    seccompProfile:
      type: RuntimeDefault
  securityContext:
    allowPrivilegeEscalation: false
    capabilities:
      drop:
        - ALL
    runAsNonRoot: true
    runAsUser: null
    fsGroup: null
    seccompProfile:
      type: RuntimeDefault
  ingress:
    enabled: true
    annotations:
      cert-manager.io/cluster-issuer: letsencrypt-prod
      route.openshift.io/termination: edge
    hosts:
      - host: "{host}"
        paths:
          - path: /ui/v2/login
            pathType: Prefix
    tls:
      - hosts:
          - "{host}"
        secretName: "{host}-tls""#,
            zitadel_version = self.zitadel_version
        );

        trace!("[Zitadel] Helm values YAML:\n{values_yaml}");

        // --- Step 5: Deploy Helm chart ------------------------------------

        info!(
            "[Zitadel] Deploying Helm chart 'zitadel/zitadel' as release 'zitadel' in namespace '{NAMESPACE}'"
        );

        let result = HelmChartScore {
            namespace: Some(NonBlankString::from_str(NAMESPACE).unwrap()),
            release_name: NonBlankString::from_str("zitadel").unwrap(),
            chart_name: NonBlankString::from_str("zitadel/zitadel").unwrap(),
            chart_version: None,
            values_overrides: None,
            values_yaml: Some(values_yaml),
            create_namespace: true,
            install_only: false,
            repository: Some(HelmRepository::new(
                "zitadel".to_string(),
                hurl!("https://charts.zitadel.com"),
                true,
            )),
        }
        .interpret(inventory, topology)
        .await;

        match &result {
            Ok(_) => info!(
                "[Zitadel] Helm chart deployed successfully\n\n\
                 ===== ZITADEL DEPLOYMENT COMPLETE =====\n\
                 Login URL: https://{host}\n\
                 Username: admin@zitadel.{host}\n\
                 Password: {admin_password}\n\n\
                 IMPORTANT: The password is saved in ConfigMap 'zitadel-config-yaml'\n\
                 and must be changed on first login. Save the credentials in a\n\
                 secure location after changing them.\n\
                 ========================================="
            ),
            Err(e) => error!("[Zitadel] Helm chart deployment failed: {e}"),
        }

        result
    }

    fn get_name(&self) -> InterpretName {
        InterpretName::Custom("Zitadel")
    }

    fn get_version(&self) -> Version {
        todo!()
    }

    fn get_status(&self) -> InterpretStatus {
        todo!()
    }

    fn get_children(&self) -> Vec<Id> {
        vec![]
    }
}
17 harmony_node_readiness/Cargo.toml Normal file
@@ -0,0 +1,17 @@
[package]
name = "harmony-node-readiness-endpoint"
version = "0.1.0"
edition = "2024"

[dependencies]
actix-web = "4"
kube.workspace = true
k8s-openapi.workspace = true
serde.workspace = true
serde_json.workspace = true
env_logger.workspace = true
log.workspace = true
tokio.workspace = true
reqwest.workspace = true
chrono.workspace = true
tower = "0.5.3"
13 harmony_node_readiness/Dockerfile Normal file
@@ -0,0 +1,13 @@
FROM debian:13-slim

# RUN apt-get update && apt-get install -y --no-install-recommends \
#     ca-certificates \
#     && rm -rf /var/lib/apt/lists/*

COPY harmony-node-readiness-endpoint /usr/local/bin/harmony-node-readiness-endpoint

ENV RUST_LOG=info

EXPOSE 25001

CMD ["harmony-node-readiness-endpoint"]
197 harmony_node_readiness/README.md Normal file
@@ -0,0 +1,197 @@
# harmony-node-readiness-endpoint

**A lightweight, standalone Rust service for Kubernetes node health checking.**

Designed for **bare-metal Kubernetes clusters** with external load balancers (HAProxy, OPNsense, F5, etc.).

Exposes a simple HTTP endpoint (`/health`) on each node:

- **200 OK** — node is healthy and ready to receive traffic
- **503 Service Unavailable** — node should be removed from the load balancer pool
- **500 Internal Server Error** — misconfiguration (e.g. `NODE_NAME` not set)

This project is **not dependent on Harmony**, but is commonly used as part of Harmony bare-metal Kubernetes deployments.

## Why this project exists

In bare-metal environments, external load balancers often rely on pod-level or router-level checks that can lag behind the authoritative Kubernetes `Node.status.conditions[Ready]`.
This service reports that authoritative node condition directly, with fast reaction time.

## Available checks

| Check name | Description | Status |
|--------------------|-------------------------------------------------------------|-------------------|
| `node_ready` | Queries `Node.status.conditions[Ready]` via Kubernetes API | Implemented |
| `okd_router_1936` | Probes OpenShift router `/healthz/ready` on port 1936 | Implemented |
| `filesystem_ro` | Detects read-only mounts via `/proc/mounts` | To be implemented |
| `kubelet` | Local probe to kubelet `/healthz` (port 10248) | To be implemented |
| `container_runtime`| Socket check + runtime status | To be implemented |
| `disk_pressure` | Threshold checks on key filesystems | To be implemented |
| `network` | DNS resolution + gateway connectivity | To be implemented |
| `custom_conditions`| Reacts to extra conditions (NPD, etc.) | To be implemented |

All checks are combined with logical **AND** — any single failure results in 503.

## Behavior

### `node_ready` check — fail-open design

The `node_ready` check queries the Kubernetes API server to read `Node.status.conditions[Ready]`.
Because this service runs on the node it is checking, there are scenarios where the API server is temporarily
unreachable (e.g. during a control-plane restart). To avoid incorrectly draining a healthy node in such cases,
the check is **fail-open**: it passes (reports ready) whenever the Kubernetes API is unavailable.

| Situation | Result | HTTP status |
|------------------------------------------------------|----------------------|-------------|
| `Node.conditions[Ready] == True` | Pass | 200 |
| `Node.conditions[Ready] == False` | Fail | 503 |
| `Ready` condition absent | Fail | 503 |
| API server unreachable or timed out (1 s timeout) | Pass (assumes ready) | 200 |
| Kubernetes client initialization failed | Pass (assumes ready) | 200 |
| `NODE_NAME` env var not set | Hard error | 500 |

A warning is logged whenever the API is unavailable and the check falls back to assuming ready.

### `okd_router_1936` check

Sends `GET http://127.0.0.1:1936/healthz/ready` with a 5-second timeout.
Returns pass on any 2xx response, fail otherwise.

### Unknown check names

Requesting an unknown check name (e.g. `check=bogus`) results in that check returning `passed: false`
with reason `"Unknown check: bogus"`, and the overall response is 503.

## How it works

### Node name discovery

The service reads the `NODE_NAME` environment variable, which must be injected via the Kubernetes Downward API:

```yaml
env:
  - name: NODE_NAME
    valueFrom:
      fieldRef:
        fieldPath: spec.nodeName
```

### Kubernetes API authentication

- Uses standard **in-cluster configuration** — no external credentials needed.
- The ServiceAccount token and CA certificate are automatically mounted at `/var/run/secrets/kubernetes.io/serviceaccount/`.
- Requires only minimal RBAC: `get` and `list` on the `nodes` resource (see `deploy/resources.yaml`).
- Connect and write timeouts are set to **1 second** to keep checks fast.

## Deploy

All Kubernetes resources (Namespace, ServiceAccount, ClusterRole, ClusterRoleBinding, and an OpenShift SCC RoleBinding for `hostnetwork`) are in a single file.

```bash
kubectl apply -f deploy/resources.yaml
kubectl apply -f deploy/daemonset.yaml
```

The DaemonSet uses `hostNetwork: true` and `hostPort: 25001`, so the endpoint is reachable directly on the node's IP at port 25001.
It tolerates all taints, ensuring it runs even on nodes marked unschedulable.
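
Once the DaemonSet is running, the endpoint can be verified directly against any node. A minimal sketch, assuming a placeholder node IP (`192.0.2.11`) and that `jq` is available:

```bash
# Query the readiness endpoint the same way the load balancer will
curl -s http://192.0.2.11:25001/health | jq .
```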

### Configure your external load balancer

**Example for HAProxy / OPNsense:**
- Check type: **HTTP**
- URI: `/health`
- Port: `25001` (configurable via `LISTEN_PORT` env var)
- Interval: 5–10 s
- Rise: 2
- Fall: 3
- Expect: `2xx`
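
As a concrete illustration of the settings above, here is a minimal HAProxy sketch; the backend name, server names, and addresses are placeholders, not part of this project:

```haproxy
backend okd_ingress
    mode tcp
    option httpchk GET /health
    http-check expect status 200
    # Traffic flows to 443 while health is probed on port 25001
    server worker-1 192.0.2.11:443 check port 25001 inter 5s rise 2 fall 3
    server worker-2 192.0.2.12:443 check port 25001 inter 5s rise 2 fall 3
```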

## Endpoint usage

### Query parameter

Use the `check` query parameter to select which checks to run (comma-separated).
When omitted, only `node_ready` runs.

| Request | Checks run |
|------------------------------------------------|-----------------------------------|
| `GET /health` | `node_ready` |
| `GET /health?check=okd_router_1936` | `okd_router_1936` only |
| `GET /health?check=node_ready,okd_router_1936` | `node_ready` and `okd_router_1936`|

> **Note:** specifying `check=` replaces the default. Include `node_ready` explicitly if you need it alongside other checks.
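
For example (the node IP is a placeholder):

```bash
# Default: runs only node_ready
curl -i http://192.0.2.11:25001/health

# The check parameter replaces the default, so list node_ready explicitly
curl -i "http://192.0.2.11:25001/health?check=node_ready,okd_router_1936"
```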

### Response format

```json
{
  "status": "ready" | "not-ready",
  "checks": [
    {
      "name": "<check-name>",
      "passed": true | false,
      "reason": "<failure reason, omitted on success>",
      "duration_ms": 42
    }
  ],
  "total_duration_ms": 42
}
```

**Healthy node (default)**
```http
HTTP/1.1 200 OK

{
  "status": "ready",
  "checks": [{ "name": "node_ready", "passed": true, "duration_ms": 42 }],
  "total_duration_ms": 42
}
```

**Unhealthy node**
```http
HTTP/1.1 503 Service Unavailable

{
  "status": "not-ready",
  "checks": [
    { "name": "node_ready", "passed": false, "reason": "KubeletNotReady", "duration_ms": 35 }
  ],
  "total_duration_ms": 35
}
```

**API server unreachable (fail-open)**
```http
HTTP/1.1 200 OK

{
  "status": "ready",
  "checks": [{ "name": "node_ready", "passed": true, "duration_ms": 1001 }],
  "total_duration_ms": 1001
}
```
*(A warning is logged: `Kubernetes API appears to be down … Assuming node is ready.`)*

## Configuration

| Env var | Default | Description |
|---------------|----------|--------------------------------------|
| `NODE_NAME` | required | Node name, injected via Downward API |
| `LISTEN_PORT` | `25001` | TCP port the HTTP server binds to |
| `RUST_LOG` | — | Log level (e.g. `info`, `debug`) |

## Development

```bash
# Run locally
NODE_NAME=my-test-node cargo run

# Run tests
cargo test
```

---

*Minimal, auditable, and built for production bare-metal Kubernetes environments.*
13 harmony_node_readiness/build-docker.sh Executable file
@@ -0,0 +1,13 @@
#!/bin/bash

# TODO
# This is meant to be run on a machine with harmony development tools installed (cargo, etc)

DOCKER_TAG="${DOCKER_TAG:-dev}"

cargo build --release

cp ../target/release/harmony-node-readiness-endpoint .

docker build . -t hub.nationtech.io/harmony/harmony-node-readiness-endpoint:${DOCKER_TAG}
36 harmony_node_readiness/deploy/daemonset.yaml Normal file
@@ -0,0 +1,36 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: node-healthcheck
  namespace: harmony-node-healthcheck
spec:
  selector:
    matchLabels:
      app: node-healthcheck
  template:
    metadata:
      labels:
        app: node-healthcheck
    spec:
      serviceAccountName: node-healthcheck-sa
      hostNetwork: true
      # This ensures the pod runs even if the node is already "unschedulable"
      # so it can report the status correctly.
      tolerations:
        - operator: Exists
      containers:
        - name: checker
          image: hub.nationtech.io/harmony/harmony-node-readiness-endpoint:latest
          env:
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
          ports:
            - containerPort: 25001
              hostPort: 25001
              name: health-port
          resources:
            requests:
              cpu: 10m
              memory: 50Mi
64 harmony_node_readiness/deploy/resources.yaml Normal file
@@ -0,0 +1,64 @@
apiVersion: v1
kind: Namespace
metadata:
  name: harmony-node-healthcheck
  labels:
    name: harmony-node-healthcheck

---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: node-healthcheck-sa
  namespace: harmony-node-healthcheck

---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: node-healthcheck-role
rules:
  - apiGroups: [""]
    resources: ["nodes"]
    verbs: ["get", "list"]

---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: allow-hostnetwork-scc
  namespace: harmony-node-healthcheck
rules:
  - apiGroups: ["security.openshift.io"]
    resources: ["securitycontextconstraints"]
    resourceNames: ["hostnetwork"]
    verbs: ["use"]

---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: node-status-querier-scc-binding
  namespace: harmony-node-healthcheck
subjects:
  - kind: ServiceAccount
    name: node-healthcheck-sa
    namespace: harmony-node-healthcheck
roleRef:
  kind: Role
  name: allow-hostnetwork-scc
  apiGroup: rbac.authorization.k8s.io

---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: read-nodes-binding
subjects:
  - kind: ServiceAccount
    name: node-healthcheck-sa
    namespace: harmony-node-healthcheck
roleRef:
  kind: ClusterRole
  name: node-healthcheck-role
  apiGroup: rbac.authorization.k8s.io
282 harmony_node_readiness/src/main.rs Normal file
@@ -0,0 +1,282 @@
use actix_web::{App, HttpResponse, HttpServer, Responder, get, web};
use k8s_openapi::api::core::v1::Node;
use kube::{Api, Client, Config};

use log::{debug, error, info, warn};
use reqwest;
use serde::{Deserialize, Serialize};
use std::env;
use std::time::{Duration, Instant};
use tokio::task::JoinSet;

const K8S_CLIENT_TIMEOUT: Duration = Duration::from_secs(1);

#[derive(Serialize, Deserialize)]
struct HealthStatus {
    status: String,
    checks: Vec<CheckResult>,
    total_duration_ms: u128,
}

#[derive(Serialize, Deserialize)]
struct CheckResult {
    name: String,
    passed: bool,
    #[serde(skip_serializing_if = "Option::is_none")]
    reason: Option<String>,
    duration_ms: u128,
}

#[derive(Serialize, Deserialize)]
struct HealthError {
    status: String,
    error: String,
}

#[derive(Deserialize)]
struct HealthQuery {
    #[serde(rename = "check")]
    checks: Option<String>,
}

/// Check if the node's Ready condition is true via the Kubernetes API
async fn check_node_ready(client: Client, node_name: &str) -> Result<(), String> {
    let nodes: Api<Node> = Api::all(client);

    let node = match nodes.get(node_name).await {
        Ok(n) => n,
        Err(e) => {
            warn!(
                "Kubernetes API appears to be down, unreachable, or timed out for node '{}': {}. Assuming node is ready.",
                node_name, e
            );
            return Ok(());
        }
    };

    let conditions = node.status.and_then(|s| s.conditions).unwrap_or_default();

    for condition in conditions {
        if condition.type_ == "Ready" {
            let is_ready = condition.status == "True";
            let reason = condition
                .reason
                .clone()
                .unwrap_or_else(|| "Unknown".to_string());

            if !is_ready {
                return Err(reason);
            }
            return Ok(());
        }
    }

    Err("Ready condition not found".to_string())
}

/// Check the OKD router health endpoint on port 1936
async fn check_okd_router_1936() -> Result<(), String> {
    debug!("Checking okd router 1936");
    let client = reqwest::Client::builder()
        .timeout(std::time::Duration::from_secs(5))
        .build()
        .map_err(|e| format!("Failed to build HTTP client: {}", e))?;

    let response = client
        .get("http://127.0.0.1:1936/healthz/ready")
        .send()
        .await
        .map_err(|e| format!("Failed to connect to OKD router: {}", e))?;

    debug!("okd router 1936 response status {}", response.status());

    if response.status().is_success() {
        Ok(())
    } else {
        Err(format!("OKD router returned status: {}", response.status()))
    }
}

/// Parse comma-separated check names from the query parameter
fn parse_checks(checks_param: Option<&str>) -> Vec<String> {
    match checks_param {
        None => vec!["node_ready".to_string()],
        Some(s) if s.is_empty() => vec!["node_ready".to_string()],
        Some(s) => s.split(',').map(|c| c.trim().to_string()).collect(),
    }
}

/// Run a single health check by name and return the result
async fn run_check(check_name: &str, client: Option<Client>, node_name: &str) -> CheckResult {
    let start = Instant::now();

    let result = match check_name {
        "node_ready" => match client {
            Some(c) => check_node_ready(c, node_name).await,
            None => {
                warn!(
                    "Kubernetes client not available for node '{}'. Assuming node is ready.",
                    node_name
                );
                Ok(())
            }
        },
        "okd_router_1936" => check_okd_router_1936().await,
        _ => Err(format!("Unknown check: {}", check_name)),
    };

    let duration_ms = start.elapsed().as_millis();

    match result {
        Ok(()) => CheckResult {
            name: check_name.to_string(),
            passed: true,
            reason: None,
            duration_ms,
        },
        Err(reason) => CheckResult {
            name: check_name.to_string(),
            passed: false,
            reason: Some(reason),
            duration_ms,
        },
    }
}

#[get("/health")]
async fn health(query: web::Query<HealthQuery>) -> impl Responder {
    let node_name = match env::var("NODE_NAME") {
        Ok(name) => name,
        Err(_) => {
            error!("NODE_NAME environment variable not set");
            return HttpResponse::InternalServerError().json(HealthError {
                status: "error".to_string(),
                error: "NODE_NAME environment variable not set".to_string(),
            });
        }
    };

    // Parse requested checks from query parameter
    let requested_checks = parse_checks(query.checks.as_deref());

    // Check if the node_ready check requires a Kubernetes client
    let needs_k8s_client = requested_checks.contains(&"node_ready".to_string());

    // Initialize the Kubernetes client only if needed
    let k8s_client = if needs_k8s_client {
        match Config::infer().await {
            Ok(mut config) => {
                config.write_timeout = Some(K8S_CLIENT_TIMEOUT);
                config.connect_timeout = Some(K8S_CLIENT_TIMEOUT);
                Some(Client::try_from(config).map_err(|e| e.to_string()))
            }
            Err(e) => {
                warn!(
                    "Failed to infer Kubernetes config for node '{}': {}. Assuming node_ready is healthy.",
                    node_name, e
                );
                None
            }
        }
        .and_then(|result| match result {
            Ok(client) => Some(client),
            Err(e) => {
                warn!(
                    "Failed to create Kubernetes client for node '{}': {}. Assuming node_ready is healthy.",
                    node_name, e
                );
                None
            }
        })
    } else {
        None
    };

    // Run all requested checks in parallel
    let start = Instant::now();
    let mut join_set = JoinSet::new();
    debug!("Running checks {requested_checks:?}");

    for check_name in requested_checks {
        let client = k8s_client.clone();
        let node_name = node_name.clone();
        join_set.spawn(async move { run_check(&check_name, client, &node_name).await });
    }
    let mut check_results = Vec::new();
    while let Some(result) = join_set.join_next().await {
        match result {
            Ok(check) => check_results.push(check),
            Err(e) => error!("Check task failed: {}", e),
        }
    }
    let total_duration_ms = start.elapsed().as_millis();

    // Determine overall status
    let all_passed = check_results.iter().all(|c| c.passed);

    if all_passed {
        info!(
            "All health checks passed for node '{}' in {}ms",
            node_name, total_duration_ms
        );
        HttpResponse::Ok().json(HealthStatus {
            status: "ready".to_string(),
            checks: check_results,
            total_duration_ms,
        })
    } else {
        let failed_checks: Vec<&str> = check_results
            .iter()
            .filter(|c| !c.passed)
            .map(|c| c.name.as_str())
            .collect();
        warn!(
            "Health checks failed for node '{}' in {}ms: {:?}",
            node_name, total_duration_ms, failed_checks
        );
        HttpResponse::ServiceUnavailable().json(HealthStatus {
            status: "not-ready".to_string(),
            checks: check_results,
            total_duration_ms,
        })
    }
}

#[actix_web::main]
async fn main() -> std::io::Result<()> {
    env_logger::init();

    let port = env::var("LISTEN_PORT").unwrap_or_else(|_| "25001".to_string());
    let port = port
        .parse::<u16>()
        .unwrap_or_else(|_| panic!("Invalid port number: {}", port));
    let bind_addr = format!("0.0.0.0:{}", port);

    info!("Starting harmony-node-readiness-endpoint on {}", bind_addr);

    HttpServer::new(|| App::new().service(health))
        .workers(3)
        .bind(&bind_addr)?
        .run()
        .await
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn parse_checks_defaults_to_node_ready() {
        assert_eq!(parse_checks(None), vec!["node_ready"]);
        assert_eq!(parse_checks(Some("")), vec!["node_ready"]);
    }

    #[test]
    fn parse_checks_splits_and_trims_values() {
        assert_eq!(
            parse_checks(Some("node_ready, okd_router_1936 ")),
            vec!["node_ready", "okd_router_1936"]
        );
    }
}
@@ -21,6 +21,7 @@ http.workspace = true
 inquire.workspace = true
 interactive-parse = "0.1.5"
 schemars = "0.8"
+vaultrs = "0.7.4"

 [dev-dependencies]
 pretty_assertions.workspace = true