doc: monitoring module documentation

fix: Finish merging k8s refactoring
chore: Fix some warnings
2026-03-09 18:33:35 -04:00 · 2026-03-09 17:20:03 -04:00 · 2026-03-09 17:17:12 -04:00 · 2026-03-09 17:12:39 -04:00 · 2026-03-07 23:05:09 +00:00 · 2026-03-07 17:56:26 -05:00
323 changed files with 21295 additions and 7744 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -1,2 +1,6 @@
 target/
-Dockerfile
+Dockerfile
+.git
+data
+target
+demos
--- a/.gitignore
+++ b/.gitignore
@@ -24,3 +24,8 @@ Cargo.lock

 # MSVC Windows builds of rustc generate these, which store debugging information
 *.pdb
+
+.harmony_generated
+
+# Useful to create ignore folders for temp files and notes
+ignore
--- a/.sqlx/query-24f719d57144ecf4daa55f0aa5836c165872d70164401c0388e8d625f1b72d7b.json
+++ b/.sqlx/query-24f719d57144ecf4daa55f0aa5836c165872d70164401c0388e8d625f1b72d7b.json
@@ -0,0 +1,26 @@
+{
+  "db_name": "SQLite",
+  "query": "SELECT host_id, installation_device FROM host_role_mapping WHERE role = ?",
+  "describe": {
+    "columns": [
+      {
+        "name": "host_id",
+        "ordinal": 0,
+        "type_info": "Text"
+      },
+      {
+        "name": "installation_device",
+        "ordinal": 1,
+        "type_info": "Text"
+      }
+    ],
+    "parameters": {
+      "Right": 1
+    },
+    "nullable": [
+      false,
+      true
+    ]
+  },
+  "hash": "24f719d57144ecf4daa55f0aa5836c165872d70164401c0388e8d625f1b72d7b"
+}
--- a/.sqlx/query-2ea29df2326f7c84bd4100ad510a3fd4878dc2e217dc83f9bf45a402dfd62a91.json
+++ b/.sqlx/query-2ea29df2326f7c84bd4100ad510a3fd4878dc2e217dc83f9bf45a402dfd62a91.json
@@ -1,20 +0,0 @@
-{
-  "db_name": "SQLite",
-  "query": "SELECT host_id FROM host_role_mapping WHERE role = ?",
-  "describe": {
-    "columns": [
-      {
-        "name": "host_id",
-        "ordinal": 0,
-        "type_info": "Text"
-      }
-    ],
-    "parameters": {
-      "Right": 1
-    },
-    "nullable": [
-      false
-    ]
-  },
-  "hash": "2ea29df2326f7c84bd4100ad510a3fd4878dc2e217dc83f9bf45a402dfd62a91"
-}
--- a/.sqlx/query-6fcc29cfdbdf3b2cee94a4844e227f09b245dd8f079832a9a7b774151cb03af6.json
+++ b/.sqlx/query-6fcc29cfdbdf3b2cee94a4844e227f09b245dd8f079832a9a7b774151cb03af6.json
@@ -1,12 +1,12 @@
 {
  "db_name": "SQLite",
-  "query": "\n        INSERT INTO host_role_mapping (host_id, role)\n        VALUES (?, ?)\n        ",
+  "query": "\n        INSERT INTO host_role_mapping (host_id, role, installation_device)\n        VALUES (?, ?, ?)\n        ",
  "describe": {
    "columns": [],
    "parameters": {
-      "Right": 2
+      "Right": 3
    },
    "nullable": []
  },
-  "hash": "df7a7c9cfdd0972e2e0ce7ea444ba8bc9d708a4fb89d5593a0be2bbebde62aff"
+  "hash": "6fcc29cfdbdf3b2cee94a4844e227f09b245dd8f079832a9a7b774151cb03af6"
 }
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -2,11 +2,11 @@
 resolver = "2"
 members = [
  "private_repos/*",
-  "examples/*",
  "harmony",
  "harmony_types",
  "harmony_macros",
  "harmony_tui",
+  "harmony_execution",
  "opnsense-config",
  "opnsense-config-xml",
  "harmony_cli",
@@ -16,7 +16,9 @@ members = [
  "harmony_secret_derive",
  "harmony_secret",
  "adr/agent_discovery/mdns",
-  "brocade",
+   "brocade",
+   "harmony_agent",
+   "harmony_agent/deploy", "harmony_node_readiness", "harmony-k8s",
 ]

 [workspace.package]
@@ -35,6 +37,8 @@ tokio = { version = "1.40", features = [
  "macros",
  "rt-multi-thread",
 ] }
+tokio-retry = "0.3.0"
+tokio-util = "0.7.15"
 cidr = { features = ["serde"], version = "0.2" }
 russh = "0.45"
 russh-keys = "0.45"
@@ -49,6 +53,7 @@ kube = { version = "1.1.0", features = [
  "jsonpatch",
 ] }
 k8s-openapi = { version = "0.25", features = ["v1_30"] }
+# TODO replace with https://github.com/bourumir-wyngs/serde-saphyr as serde_yaml is deprecated https://github.com/sebastienrousseau/serde_yml
 serde_yaml = "0.9"
 serde-value = "0.7"
 http = "1.2"
--- a/README.md
+++ b/README.md
@@ -1,4 +1,8 @@
-# Harmony : Open-source infrastructure orchestration that treats your platform like first-class code
+# Harmony
+
+Open-source infrastructure orchestration that treats your platform like first-class code.
+
+In other words, Harmony is a **next-generation platform engineering framework**.

 _By [NationTech](https://nationtech.io)_

@@ -18,9 +22,7 @@ All in **one strongly-typed Rust codebase**.

 From a **developer laptop** to a **global production cluster**, a single **source of truth** drives the **full software lifecycle.**

---
-
-## 1 · The Harmony Philosophy
+## The Harmony Philosophy

 Infrastructure is essential, but it shouldn’t be your core business. Harmony is built on three guiding principles that make modern platforms reliable, repeatable, and easy to reason about.

@@ -32,9 +34,18 @@ Infrastructure is essential, but it shouldn’t be your core business. Harmony i

 These principles surface as simple, ergonomic Rust APIs that let teams focus on their product while trusting the platform underneath.

---
+## Where to Start

-## 2 · Quick Start
+We have a comprehensive set of documentation right here in the repository.
+
+| I want to...      | Start Here                                                         |
+| ----------------- | ------------------------------------------------------------------ |
+| Get Started       | [Getting Started Guide](./docs/guides/getting-started.md)          |
+| See an Example    | [Use Case: Deploy a Rust Web App](./docs/use-cases/rust-webapp.md) |
+| Explore           | [Documentation Hub](./docs/README.md)                              |
+| See Core Concepts | [Core Concepts Explained](./docs/concepts.md)                      |
+
+## Quick Look: Deploy a Rust Webapp

 The snippet below spins up a complete **production-grade Rust + Leptos Webapp** with monitoring. Swap it for your own scores to deploy anything from microservices to machine-learning pipelines.

@@ -92,63 +103,33 @@ async fn main() {
 }
 ```

-Run it:
+To run this:

-```bash
-cargo run
-```
+- Clone the repository: `git clone https://git.nationtech.io/nationtech/harmony`
+- Build the workspace (this also fetches dependencies): `cargo build --release`
+- Run the example: `cargo run --example try_rust_webapp`

-Harmony analyses the code, shows an execution plan in a TUI, and applies it once you confirm. Same code, same binary—every environment.
+## Documentation

---
+All documentation is in the `/docs` directory.

-## 3 · Core Concepts
+- [Documentation Hub](./docs/README.md): The main entry point for all documentation.
+- [Core Concepts](./docs/concepts.md): A detailed look at Score, Topology, Capability, Inventory, and Interpret.
+- [Component Catalogs](./docs/catalogs/README.md): Discover all available Scores, Topologies, and Capabilities.
+- [Developer Guide](./docs/guides/developer-guide.md): Learn how to write your own Scores and Topologies.

-| Term             | One-liner                                                                                            |
-| ---------------- | ---------------------------------------------------------------------------------------------------- |
-| **Score<T>**     | Declarative description of the desired state (e.g., `LAMPScore`).                                    |
-| **Interpret<T>** | Imperative logic that realises a `Score` on a specific environment.                                  |
-| **Topology**     | An environment (local k3d, AWS, bare-metal) exposing verified _Capabilities_ (Kubernetes, DNS, …).   |
-| **Maestro**      | Orchestrator that compiles Scores + Topology, ensuring all capabilities line up **at compile-time**. |
-| **Inventory**    | Optional catalogue of physical assets for bare-metal and edge deployments.                           |
+## Architectural Decision Records

-A visual overview is in the diagram below.
+- [ADR-001 · Why Rust](adr/001-rust.md)
+- [ADR-003 · Infrastructure Abstractions](adr/003-infrastructure-abstractions.md)
+- [ADR-006 · Secret Management](adr/006-secret-management.md)
+- [ADR-011 · Multi-Tenant Cluster](adr/011-multi-tenant-cluster.md)

-[Harmony Core Architecture](docs/diagrams/Harmony_Core_Architecture.drawio.svg)
+## Contribute

---
+Discussions and roadmap live in [Issues](https://git.nationtech.io/nationtech/harmony/-/issues). PRs, ideas, and feedback are welcome!

-## 4 · Install
-
-Prerequisites:
-
- Rust
- Docker (if you deploy locally)
- `kubectl` / `helm` for Kubernetes-based topologies
-
-```bash
-git clone https://git.nationtech.io/nationtech/harmony
-cd harmony
-cargo build --release          # builds the CLI, TUI and libraries
-```
-
---
-
-## 5 · Learning More
-
- **Architectural Decision Records** – dive into the rationale
-  - [ADR-001 · Why Rust](adr/001-rust.md)
-  - [ADR-003 · Infrastructure Abstractions](adr/003-infrastructure-abstractions.md)
-  - [ADR-006 · Secret Management](adr/006-secret-management.md)
-  - [ADR-011 · Multi-Tenant Cluster](adr/011-multi-tenant-cluster.md)
-
- **Extending Harmony** – write new Scores / Interprets, add hardware like OPNsense firewalls, or embed Harmony in your own tooling (`/docs`).
-
- **Community** – discussions and roadmap live in [GitLab issues](https://git.nationtech.io/nationtech/harmony/-/issues). PRs, ideas, and feedback are welcome!
-
---
-
-## 6 · License
+## License

 Harmony is released under the **GNU AGPL v3**.

--- a/adr/017-1-Nats-Clusters-Interconnection-Topology.md
+++ b/adr/017-1-Nats-Clusters-Interconnection-Topology.md
@@ -0,0 +1,189 @@
+### 1. ADR 017-1: NATS Cluster Interconnection & Trust Topology
+
+# Architecture Decision Record: NATS Cluster Interconnection & Trust Topology
+
+**Status:** Proposed
+**Date:** 2026-01-12
+**Precedes:** [017-Staleness-Detection-for-Failover.md]
+
+## Context
+
+In ADR 017, we defined the failover mechanisms for the Harmony mesh. However, for a Primary (Site A) and a Replica (Site B) to communicate securely—or for the Global Mesh to function across disparate locations—we must establish a robust Transport Layer Security (TLS) strategy.
+
+Our primary deployment platform is OKD (Kubernetes). While OKD provides an internal `service-ca`, it is designed primarily for intra-cluster service-to-service communication. It lacks the flexibility required for:
+1.  **Public/External Gateway Identities:** NATS Gateways need to identify themselves via public DNS names or external IPs, not just internal `.svc` cluster domains.
+2.  **Cross-Cluster Trust:** We need a mechanism to allow Cluster A to trust Cluster B without sharing a single private root key.
+
+## Decision
+
+We will implement an **"Islands of Trust"** topology using **cert-manager** on OKD.
+
+### 1. Per-Cluster Certificate Authorities (CA)
+
+* We explicitly **reject** the use of a single "Supercluster CA" shared across all sites.
+    * Instead, every Harmony Cluster (Site A, Site B, etc.) will generate its own unique Self-Signed Root CA managed by `cert-manager` inside that cluster.
+*   **Lifecycle:** Root CAs will have a long duration (e.g., 10 years) to minimize rotation friction, while Leaf Certificates (NATS servers) will remain short-lived (e.g., 90 days) and rotate automatically.
+
+> Note: Whether each deployment should have a single CA for all workloads managed by Harmony, or a separate CA for each service that requires interconnection, has not been decided yet. This ADR leans towards one CA per service, which allows for maximum flexibility, but the direction might still change. The alternative—establishing that each cluster/Harmony deployment has a single identity—could make mTLS between tenants very simple.
+
+### 2. Trust Federation via Bundle Exchange
+
+To enable secure communication (mTLS) between clusters (e.g., for NATS Gateways or Leaf Nodes):
+
+*   **No Private Keys are shared.**
+*   We will aggregate the **Public CA Certificates** of all trusted clusters into a shared `ca-bundle.pem`.
+*   This bundle is distributed to the NATS configuration of every node.
+*   **Verification Logic:** When Site A connects to Site B, Site A verifies Site B's certificate against the bundle. Since Site B's CA public key is in the bundle, the connection is accepted.
+
+### 3. Tooling
+
+* We will use **cert-manager** (deployed via Operator on OKD) rather than OKD's built-in `service-ca`. This provides us with standard CRDs (`Issuer`, `Certificate`) to manage the lifecycle, rotation, and complex SANs (Subject Alternative Names) required for external connectivity.
+* Harmony will manage installation, configuration, and bundle creation across all sites.
+
+## Rationale
+
+**Security Blast Radius (The "Key Leak" Scenario)**
+If we used a single global CA and the private key for Site A was compromised (e.g., physical theft of a server from a basement), the attacker could impersonate *any* site in the global mesh.
+By using Per-Cluster CAs:
+*   If Site A is compromised, only Site A's identity is stolen.
+*   We can "evict" Site A from the mesh simply by removing Site A's Public CA from the `ca-bundle.pem` on the remaining healthy clusters and reloading. The attacker can no longer authenticate.
+
+**Decentralized Autonomy**
+This aligns with the "Humane Computing" vision. A local cluster owns its identity. It does not depend on a central authority to issue its certificates. It can function in isolation (offline) indefinitely without needing to "phone home" to renew credentials.
+
+## Consequences
+
+**Positive**
+*   **High Security:** Compromise of one node does not compromise the global mesh.
+*   **Flexibility:** Easier to integrate with third-party clusters or partners by simply adding their public CA to the bundle.
+*   **Standardization:** `cert-manager` is the industry standard, making the configuration portable to non-OKD K8s clusters if needed.
+
+**Negative**
+*   **Configuration Complexity:** We must manage a mechanism to distribute the `ca-bundle.pem` containing public keys to all sites. This should be automated (e.g., via a Harmony Agent) to ensure timely updates and revocation.
+*   **Revocation Latency:** Revoking a compromised cluster requires updating and reloading the bundle on all other clusters. This is slower than OCSP/CRL but acceptable for infrastructure-level trust if automation is in place.
+
+---
+
+# 2. Concrete overview of the process, how it can be implemented manually across multiple OKD clusters
+
+All of this will be automated via Harmony, but to make the process easier to understand, it is outlined in detail here:
+
+## 1. Deploying and Configuring cert-manager on OKD
+
+While OKD has a built-in `service-ca` controller, it is "opinionated" and primarily signs certs for internal services (like `my-svc.my-namespace.svc`). It is **not suitable** for the Harmony Global Mesh because you cannot easily control the Subject Alternative Names (SANs) for external routes (e.g., `nats.site-a.nationtech.io`), nor can you easily export its CA to other clusters.
+
+**The Solution:** Use the **cert-manager Operator for Red Hat OpenShift**.
+
+### Step 1: Install the Operator
+1.  Log in to the OKD Web Console.
+2.  Navigate to **Operators** -> **OperatorHub**.
+3.  Search for **"cert-manager"**.
+4.  Choose the **"cert-manager Operator for Red Hat OpenShift"** (Red Hat provided) or the community version.
+5.  Click **Install**. Use the default settings (Namespace: `cert-manager-operator`).
+
+### Step 2: Create the "Island" CA (The Issuer)
+Once installed, you define your cluster's unique identity. Apply this YAML to your NATS namespace.
+
+```yaml
+# filepath: k8s/01-issuer.yaml
+apiVersion: cert-manager.io/v1
+kind: Issuer
+metadata:
+  name: harmony-selfsigned-issuer
+  namespace: harmony-nats
+spec:
+  selfSigned: {}
+---
+# This generates the unique Root CA for THIS specific cluster
+apiVersion: cert-manager.io/v1
+kind: Certificate
+metadata:
+  name: harmony-root-ca
+  namespace: harmony-nats
+spec:
+  isCA: true
+  commonName: "harmony-site-a-ca" # CHANGE THIS per cluster (e.g., harmony-site-b-ca)
+  duration: 87600h # 10 years
+  renewBefore: 2160h # 3 months before expiry
+  secretName: harmony-root-ca-secret
+  privateKey:
+    algorithm: ECDSA
+    size: 256
+  issuerRef:
+    name: harmony-selfsigned-issuer
+    kind: Issuer
+    group: cert-manager.io
+---
+# This Issuer uses the Root CA generated above to sign NATS certs
+apiVersion: cert-manager.io/v1
+kind: Issuer
+metadata:
+  name: harmony-ca-issuer
+  namespace: harmony-nats
+spec:
+  ca:
+    secretName: harmony-root-ca-secret
+```
+
+### Step 3: Generate the NATS Server Certificate
+This certificate will be used by the NATS server. It includes both internal DNS names (for local clients) and external DNS names (for the global mesh).
+
+```yaml
+# filepath: k8s/02-nats-cert.yaml
+apiVersion: cert-manager.io/v1
+kind: Certificate
+metadata:
+  name: nats-server-cert
+  namespace: harmony-nats
+spec:
+  secretName: nats-server-tls
+  duration: 2160h # 90 days
+  renewBefore: 360h # 15 days
+  issuerRef:
+    name: harmony-ca-issuer
+    kind: Issuer
+  # CRITICAL: Define all names this server can be reached by
+  dnsNames:
+  - "nats"
+  - "nats.harmony-nats.svc"
+  - "nats.harmony-nats.svc.cluster.local"
+  - "*.nats.harmony-nats.svc.cluster.local"
+  - "nats-gateway.site-a.nationtech.io" # External Route for Mesh
+```
+
+## 2. Implementing the "Islands of Trust" (Trust Bundle)
+
+To make Site A and Site B talk, you need to exchange their **public CA certificates** (never the private keys).
+
+1.  **Extract Public CA from Site A:**
+    ```bash
+    oc get secret harmony-root-ca-secret -n harmony-nats -o jsonpath='{.data.ca\.crt}' | base64 -d > site-a.crt
+    ```
+2.  **Extract Public CA from Site B:**
+    ```bash
+    oc get secret harmony-root-ca-secret -n harmony-nats -o jsonpath='{.data.ca\.crt}' | base64 -d > site-b.crt
+    ```
+3.  **Create the Bundle:**
+    Combine them into one file.
+    ```bash
+    cat site-a.crt site-b.crt > ca-bundle.crt
+    ```
+4.  **Upload Bundle to Both Clusters:**
+    Create a ConfigMap or Secret in *both* clusters containing this combined bundle.
+    ```bash
+    oc create configmap nats-trust-bundle --from-file=ca.crt=ca-bundle.crt -n harmony-nats
+    ```
+5.  **Configure NATS:**
+    Mount this ConfigMap and point NATS to it.
+
+    ```conf
+    # nats.conf snippet
+    tls {
+      cert_file: "/etc/nats-certs/tls.crt"
+      key_file:  "/etc/nats-certs/tls.key"
+      # Point to the bundle containing BOTH Site A and Site B public CAs
+      ca_file:   "/etc/nats-trust/ca.crt"
+    }
+    ```
+
+This setup ensures that Site A can verify Site B's certificate (signed by `harmony-site-b-ca`) because Site B's CA is in Site A's trust store, and vice versa, without ever sharing the private keys that generated them.
--- a/adr/018-Template-Hydration-For-Workload-Deployment.md
+++ b/adr/018-Template-Hydration-For-Workload-Deployment.md
@@ -0,0 +1,141 @@
+# Architecture Decision Record: Template Hydration for Kubernetes Manifest Generation
+
+Initial Author: Jean-Gabriel Gill-Couture & Sylvain Tremblay
+
+Initial Date: 2025-01-23
+
+Last Updated Date: 2025-01-23
+
+## Status
+
+Implemented
+
+## Context
+
+Harmony's philosophy is built on three guiding principles: Infrastructure as Resilient Code, Prove It Works — Before You Deploy, and One Unified Model. Our goal is to shift validation and verification as far left as possible—ideally to compile time—rather than discovering errors at deploy time.
+
+After investigating a few approaches, such as compile-checked Askama templates that generate Kubernetes manifests for Helm charts, we found that template-based generation suffers from several fundamental limitations:
+
+*   **Late Validation:** Typos in template syntax or field names are only discovered at deployment time, not during compilation. A mistyped `metadata.name` won't surface until Helm attempts to render the template.
+*   **Brittle Maintenance:** Templates are string-based with limited IDE support. Refactoring requires grep-and-replace across YAML-like template files, risking subtle breakage.
+*   **Hard-to-Test Logic:** Testing template output requires mocking the template engine and comparing serialized strings rather than asserting against typed data structures.
+*   **No Type Safety:** There is no guarantee that the generated YAML will be valid Kubernetes resources without runtime validation.
+
+We also faced a strategic choice around Helm: use it as both *templating engine* and *packaging mechanism*, or decouple these concerns. While Helm's ecosystem integration (Harbor, ArgoCD, OCI registry support) is valuable, its Go-template-based templating is at odds with Harmony's "code-first" ethos.
+
+## Decision
+
+We will adopt the **Template Hydration Pattern**—constructing Kubernetes manifests programmatically using strongly-typed `k8s_openapi` structs (the resource types used by `kube-rs`), then serializing them to YAML files for packaging into Helm charts.
+
+Specifically:
+
+*   **Write strongly typed `k8s_openapi` Structs:** All Kubernetes resources (Deployment, Service, ConfigMap, etc.) will be constructed using the typed structs generated by `k8s_openapi`.
+*   **Direct Serialization to YAML:** Rather than rendering templates, we use `serde_yaml::to_string()` to serialize typed objects directly into YAML manifests. This way, YAML is only used as a data-transfer format and not a templating/programming language - which it is not.
+*   **Helm as Packaging-Only:** Helm's role is reduced to packaging pre-rendered templates into a tarball and pushing to OCI registries. No template rendering logic resides within Helm.
+*   **Ecosystem Preservation:** The generated Helm charts remain fully compatible with Harbor, ArgoCD, and any Helm-compatible tool—the only difference is that the `templates/` directory contains static YAML files.
+
+The implementation in `backend_app.rs` demonstrates this pattern:
+
+```rust
+let deployment = Deployment {
+    metadata: ObjectMeta {
+        name: Some(self.name.clone()),
+        labels: Some([("app.kubernetes.io/name".to_string(), self.name.clone())].into()),
+        ..Default::default()
+    },
+    spec: Some(DeploymentSpec { /* ... */ }),
+    ..Default::default()
+};
+
+let deployment_yaml = serde_yaml::to_string(&deployment)?;
+fs::write(templates_dir.join("deployment.yaml"), deployment_yaml)?;
+```
+
+## Rationale
+
+**Aligns with "Infrastructure as Resilient Code"**
+
+Harmony's first principle states that infrastructure should be treated like application code. By expressing Kubernetes manifests as Rust structs, we gain:
+
+*   **Refactorability:** Rename a label and the compiler catches all usages.
+*   **IDE Support:** Autocomplete for all Kubernetes API fields; documentation inline.
+*   **Code Navigation:** Jump to definition shows exactly where a value comes from.
+
+**Achieves "Prove It Works — Before You Deploy"**
+
+The compiler now validates that:
+
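+*   Fields the API schema marks as required are non-`Option` struct fields and must be populated; optional fields are explicit `Option`s, so omitting one is a deliberate choice rather than an accident.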
+*   Field types match expectations (ports are integers, not strings).
+*   Enums contain valid values (e.g., `ServiceType::ClusterIP`).
+
+This moves what was runtime validation into compile-time checks, fulfilling the "shift left" promise.
+
+**Enables True Unit Testing**
+
+Developers can now write unit tests that assert directly against typed objects:
+
+```rust
+let deployment = create_deployment(&app);
+assert_eq!(deployment.spec.unwrap().replicas.unwrap(), 3);
+assert_eq!(deployment.metadata.name.unwrap(), "my-app");
+```
+
+No string parsing, no YAML serialization, no fragile assertions against rendered output.
+
+**Preserves Ecosystem Benefits**
+
+By generating standard Helm chart structures, Harmony retains compatibility with:
+
+*   **OCI Registries (Harbor, GHCR):** `helm push` works exactly as before.
+*   **ArgoCD:** Syncs and manages releases using the generated charts.
+*   **Existing Workflows:** Teams already consuming Helm charts see no change.
+
+The Helm tarball becomes a "dumb pipe" for transport, which is arguably its ideal role.
+
+## Consequences
+
+### Positive
+
+*   **Compile-Time Safety:** A broad class of errors (typos, missing fields, type mismatches) is now caught at build time.
+*   **Better Developer Experience:** IDE autocomplete, inline documentation, and refactor support significantly reduce the learning curve for Kubernetes manifests.
+*   **Testability:** Unit tests can validate manifest structure without integration or runtime checks.
+*   **Auditability:** The source-of-truth for manifests is now pure Rust—easier to review in pull requests than template logic scattered across files.
+*   **Future-Extensibility:** CustomResources (CRDs) can be supported via `kopium`-generated Rust types, maintaining the same strong typing.
+
+### Negative
+
+*   **API Schema Drift:** Kubernetes API changes require regenerating `k8s_openapi` types and updating code. A change in a struct field will cause the build to fail—intentionally, but still requiring the pipeline to be updated.
+*   **Verbosity:** Typed construction is more verbose than the equivalent template. Builder patterns or helper functions will be needed to keep code readable.
+*   **Learning Curve:** Contributors must understand both the Kubernetes resource spec *and* the Rust type system, rather than just YAML.
+*   **Debugging Shift:** When debugging generated YAML, you now trace through Rust code rather than template files—more precise, but a different mental model.
+
+## Alternatives Considered
+
+### 1. Enhance Askama with Compile-Time Validation
+*Pros:* Stay within familiar templating paradigm; minimal code changes.
+*Cons:* Rust's type system cannot fully express Kubernetes schema validation without significant macro boilerplate. Errors would still surface at template evaluation time, not compilation.
+
+### 2. Use Helm SDK Programmatically (Go)
+*Pros:* Direct access to Helm's template engine; no YAML serialization step.
+*Cons:* Would introduce a second language (Go) into a Rust codebase, increasing cognitive load and compilation complexity. No improvement in compile-time safety.
+
+### 3. Raw YAML String Templating (Manual)
+*Pros:* Maximum control; no external dependencies.
+*Cons:* Even more error-prone than Askama; no structure validation; string concatenation errors abound.
+
+### 4. Use Kustomize for All Manifests
+*Pros:* Declarative overlays; standard tool.
+*Cons:* Kustomize is itself a layer over YAML templates with its own DSL. It does not provide compile-time type safety and would require externalizing manifest management outside Harmony's codebase.
+
+__Note that this template hydration architecture still allows templates to be overridden with tools like Kustomize when required.__
+
+## Additional Notes
+
+**Scalability to Future Topologies**
+
+The Template Hydration pattern enables future Harmony architectures to generate manifests dynamically based on topology context. For example, a `CostTopology` might adjust resource requests based on cluster pricing, manipulating the typed `Deployment::spec` directly before serialization.
+
+**Implementation Status**
+
+As of this writing, the pattern is implemented for `BackendApp` deployments (`backend_app.rs`). The next phase is to extend this pattern across all application modules (`webapp.rs`, etc.) and to standardize on this approach for any new implementations.
--- a/adr/019-Network-bond-setup.md
+++ b/adr/019-Network-bond-setup.md
@@ -0,0 +1,65 @@
+# Architecture Decision Record: Network Bonding Configuration via External Automation
+
+Initial Author: Jean-Gabriel Gill-Couture & Sylvain Tremblay
+
+Initial Date: 2026-02-13
+
+Last Updated Date: 2026-02-13
+
+## Status
+
+Accepted
+
+## Context
+
+We need to configure LACP bonds on 10GbE interfaces across all worker nodes in the OpenShift cluster. A significant challenge is that interface names (e.g., `enp1s0f0` vs `ens1f0`) vary across different hardware nodes.
+
+The standard OpenShift mechanism (MachineConfig) applies identical configurations to all nodes in a MachineConfigPool. Since the interface names differ, a single static MachineConfig cannot target specific physical devices across the entire cluster without complex workarounds.
+
+## Decision
+
+We will use the existing "Harmony" automation tool to generate and apply host-specific NetworkManager configuration files directly to the nodes.
+
+1.  Harmony will generate the specific `.nmconnection` files for the bond and slaves based on its inventory of interface names.
+2.  Files will be pushed to `/etc/NetworkManager/system-connections/` on each node.
+3.  Configuration will be applied via `nmcli` reload or a node reboot.
+
+## Rationale
+
+*   **Inventory Awareness:** Harmony already possesses the specific interface mapping data for each host.
+*   **Persistence:** Fedora CoreOS/SCOS allows writing to `/etc`, and these files persist across reboots and OS upgrades (rpm-ostree updates).
+*   **Avoids Complexity:** This approach avoids the operational overhead of creating unique MachineConfigPools for every single host or hardware variant.
+*   **Safety:** Unlike wildcard matching, this ensures explicit interface selection, preventing accidental bonding of reserved interfaces (e.g., future separation of Ceph storage traffic).
+
+## Consequences
+
+**Pros:**
+*   Precise, per-host configuration without polluting the Kubernetes API with hundreds of MachineConfigs.
+*   Standard Linux networking behavior; easy to debug locally.
+*   Prevents accidental interface capture (unlike wildcards).
+
+**Cons:**
+*   **Loss of Declarative K8s State:** The network config is not managed by the Machine Config Operator (MCO).
+*   **Node Replacement Friction:** Newly provisioned nodes (replacements) will boot with default config. Harmony must be run against new nodes manually or via a hook before they can fully join the cluster workload.
+
+## Alternatives considered
+
+1.  **Wildcard Matching in NetworkManager (e.g., `interface-name=enp*`):**
+    *   *Pros:* Single MachineConfig for the whole cluster.
+    *   *Cons:* Rejected because it is too broad. It risks capturing interfaces intended for other purposes (e.g., splitting storage and cluster networks later).
+
+2.  **"Kitchen Sink" Configuration:**
+    *   *Pros:* Single file listing every possible interface name as a slave.
+    *   *Cons:* "Dirty" configuration; results in many inactive connections on every host; brittle if new naming schemes appear.
+
+3.  **Per-Host MachineConfig:**
+    *   *Pros:* Fully declarative within OpenShift.
+    *   *Cons:* Requires a unique `MachineConfigPool` per host, which is an anti-pattern and unmaintainable at scale.
+
+4.  **On-boot Generation Script:**
+    *   *Pros:* Dynamic detection.
+    *   *Cons:* Increases boot complexity; harder to debug if the script fails during startup.
+
+## Additional Notes
+
+While `/etc` is writable and persistent on CoreOS, this configuration falls outside the "Day 1" Ignition process. Operational runbooks must be updated to ensure Harmony runs on every node replacement event.
--- a/adr/020-monitoring-alerting-architecture.md
+++ b/adr/020-monitoring-alerting-architecture.md
@@ -0,0 +1,318 @@
+# Architecture Decision Record: Monitoring and Alerting Architecture
+
+Initial Author: Willem Rolleman, Jean-Gabriel Carrier
+
+Initial Date: March 9, 2026
+
+Last Updated Date: March 9, 2026
+
+## Status
+
+Accepted
+
+Supersedes: [ADR-010](010-monitoring-and-alerting.md)
+
+## Context
+
+Harmony needs a unified approach to monitoring and alerting across different infrastructure targets:
+
+1. **Cluster-level monitoring**: Administrators managing entire Kubernetes/OKD clusters need to define cluster-wide alerts, receivers, and scrape targets.
+
+2. **Tenant-level monitoring**: Multi-tenant clusters where teams are confined to namespaces need monitoring scoped to their resources.
+
+3. **Application-level monitoring**: Developers deploying applications want zero-config monitoring that "just works" for their services.
+
+The monitoring landscape is fragmented:
+- **OKD/OpenShift**: Built-in Prometheus with AlertmanagerConfig CRDs
+- **KubePrometheus**: Helm-based stack with PrometheusRule CRDs
+- **RHOB (Red Hat Observability)**: Operator-based with MonitoringStack CRDs
+- **Standalone Prometheus**: Raw Prometheus deployments
+
+Each system has different CRDs, different installation methods, and different configuration APIs.
+
+## Decision
+
+We implement a **trait-based architecture with compile-time capability verification** that provides:
+
+1. **Type-safe abstractions** via parameterized traits: `AlertReceiver<S>`, `AlertRule<S>`, `ScrapeTarget<S>`
+2. **Compile-time topology compatibility** via the `Observability<S>` capability bound
+3. **Three levels of abstraction**: Cluster, Tenant, and Application monitoring
+4. **Pre-built alert rules** as functions that return typed structs
+
+### Core Traits
+
+```rust
+// domain/topology/monitoring.rs
+
+/// Marker trait for systems that send alerts (Prometheus, etc.)
+pub trait AlertSender: Send + Sync + std::fmt::Debug {
+    fn name(&self) -> String;
+}
+
+/// Defines how a receiver (Discord, Slack, etc.) builds its configuration
+/// for a specific sender type
+pub trait AlertReceiver<S: AlertSender>: std::fmt::Debug + Send + Sync {
+    fn build(&self) -> Result<ReceiverInstallPlan, InterpretError>;
+    fn name(&self) -> String;
+    fn clone_box(&self) -> Box<dyn AlertReceiver<S>>;
+}
+
+/// Defines how an alert rule builds its PrometheusRule configuration
+pub trait AlertRule<S: AlertSender>: std::fmt::Debug + Send + Sync {
+    fn build_rule(&self) -> Result<serde_json::Value, InterpretError>;
+    fn name(&self) -> String;
+    fn clone_box(&self) -> Box<dyn AlertRule<S>>;
+}
+
+/// Capability that topologies implement to support monitoring
+pub trait Observability<S: AlertSender> {
+    async fn install_alert_sender(&self, sender: &S, inventory: &Inventory) 
+        -> Result<PreparationOutcome, PreparationError>;
+    async fn install_receivers(&self, sender: &S, inventory: &Inventory, 
+        receivers: Option<Vec<Box<dyn AlertReceiver<S>>>>) -> Result<...>;
+    async fn install_rules(&self, sender: &S, inventory: &Inventory,
+        rules: Option<Vec<Box<dyn AlertRule<S>>>>) -> Result<...>;
+    async fn add_scrape_targets(&self, sender: &S, inventory: &Inventory,
+        scrape_targets: Option<Vec<Box<dyn ScrapeTarget<S>>>>) -> Result<...>;
+    async fn ensure_monitoring_installed(&self, sender: &S, inventory: &Inventory) 
+        -> Result<...>;
+}
+```
+
+### Alert Sender Types
+
+Each monitoring stack is a distinct `AlertSender`:
+
+| Sender | Module | Use Case |
+|--------|--------|----------|
+| `OpenshiftClusterAlertSender` | `monitoring/okd/` | OKD/OpenShift built-in monitoring |
+| `KubePrometheus` | `monitoring/kube_prometheus/` | Helm-deployed kube-prometheus-stack |
+| `Prometheus` | `monitoring/prometheus/` | Standalone Prometheus via Helm |
+| `RedHatClusterObservability` | `monitoring/red_hat_cluster_observability/` | RHOB operator |
+| `Grafana` | `monitoring/grafana/` | Grafana-managed alerting |
+
+### Three Levels of Monitoring
+
+#### 1. Cluster-Level Monitoring
+
+For cluster administrators. Full control over monitoring infrastructure.
+
+```rust
+// examples/okd_cluster_alerts/src/main.rs
+OpenshiftClusterAlertScore {
+    sender: OpenshiftClusterAlertSender,
+    receivers: vec![Box::new(DiscordReceiver { ... })],
+    rules: vec![Box::new(alert_rules)],
+    scrape_targets: Some(vec![Box::new(external_exporters)]),
+}
+```
+
+**Characteristics:**
+- Cluster-scoped CRDs and resources
+- Can add external scrape targets (outside cluster)
+- Manages Alertmanager configuration
+- Requires cluster-admin privileges
+
+#### 2. Tenant-Level Monitoring
+
+For teams confined to namespaces. The topology determines tenant context.
+
+```rust
+// The topology's Observability impl handles namespace scoping
+impl Observability<KubePrometheus> for K8sAnywhereTopology {
+    async fn install_rules(&self, sender: &KubePrometheus, ...) {
+        // Topology knows if it's tenant-scoped
+        let namespace = self.get_tenant_config().await
+            .map(|t| t.name)
+            .unwrap_or_else(|| "default".to_string());
+        // Install rules in tenant namespace
+    }
+}
+```
+
+**Characteristics:**
+- Namespace-scoped resources
+- Cannot modify cluster-level monitoring config
+- May have restricted receiver types
+- Runtime validation of permissions (cannot be fully compile-time)
+
+#### 3. Application-Level Monitoring
+
+For developers. Zero-config, opinionated monitoring.
+
+```rust
+// modules/application/features/monitoring.rs
+pub struct Monitoring {
+    pub application: Arc<dyn Application>,
+    pub alert_receiver: Vec<Box<dyn AlertReceiver<Prometheus>>>,
+}
+
+impl<T: Topology + Observability<Prometheus> + TenantManager + ...> 
+    ApplicationFeature<T> for Monitoring 
+{
+    async fn ensure_installed(&self, topology: &T) -> Result<...> {
+        // Auto-creates ServiceMonitor
+        // Auto-installs Ntfy for notifications
+        // Handles tenant namespace automatically
+        // Wires up sensible defaults
+    }
+}
+```
+
+**Characteristics:**
+- Automatic ServiceMonitor creation
+- Opinionated notification channel (Ntfy)
+- Tenant-aware via topology
+- Minimal configuration required
+
+## Rationale
+
+### Why Generic Traits Instead of Unified Types?
+
+Each monitoring stack (OKD, KubePrometheus, RHOB) has fundamentally different CRDs:
+
+```rust
+// OKD uses AlertmanagerConfig with different structure
+AlertmanagerConfig { spec: { receivers: [...] } }
+
+// RHOB uses secret references for webhook URLs
+MonitoringStack { spec: { alertmanagerConfig: { discordConfigs: [{ apiURL: { key: "..." } }] } } }
+
+// KubePrometheus uses Alertmanager CRD with different field names
+Alertmanager { spec: { config: { receivers: [...] } } }
+```
+
+A unified type would either:
+1. Be a lowest-common-denominator (loses stack-specific features)
+2. Be a complex union type (hard to use, easy to misconfigure)
+
+Generic traits let each stack express its configuration naturally while providing a consistent interface.
+
+### Why Compile-Time Capability Bounds?
+
+```rust
+impl<T: Topology + Observability<OpenshiftClusterAlertSender>> Score<T> 
+    for OpenshiftClusterAlertScore { ... }
+```
+
+This fails at compile time if you try to use `OpenshiftClusterAlertScore` with a topology that doesn't support OKD monitoring. This prevents the "config-is-valid-but-platform-is-wrong" errors that Harmony was designed to eliminate.
+
+### Why Not a MonitoringStack Abstraction (V2 Approach)?
+
+The V2 approach proposed a unified `MonitoringStack` that hides sender selection:
+
+```rust
+// V2 approach - rejected
+MonitoringStack::new(MonitoringApiVersion::V2CRD)
+    .add_alert_channel(discord)
+```
+
+**Problems:**
+1. Hides which sender you're using, losing compile-time guarantees
+2. "Version selection" actually chooses between fundamentally different systems
+3. Would need to handle all stack-specific features through a generic interface
+
+The current approach is explicit: you choose `OpenshiftClusterAlertSender` and the compiler verifies compatibility.
+
+### Why Runtime Validation for Tenants?
+
+Tenant confinement is determined at runtime by the topology and K8s RBAC. We cannot know at compile time whether a user has cluster-admin or namespace-only access.
+
+Options considered:
+1. **Compile-time tenant markers** - Would require modeling entire RBAC hierarchy in types. Over-engineering.
+2. **Runtime validation** - Current approach. Fails with clear K8s permission errors if insufficient access.
+3. **No tenant support** - Would exclude a major use case.
+
+Runtime validation is the pragmatic choice. The failure mode is clear (K8s API error) and occurs early in execution.
+
+> Note: we will eventually have compile-time validation for such things. Rust macros are powerful, and we could discover the actual capabilities we are dealing with at build time, similar to sqlx's approach with its `query!` macros.
+
+## Consequences
+
+### Pros
+
+1. **Type Safety**: Invalid configurations are caught at compile time
+2. **Extensibility**: Adding a new monitoring stack requires implementing traits, not modifying core code
+3. **Clear Separation**: Cluster/Tenant/Application levels have distinct entry points
+4. **Reusable Rules**: Pre-built alert rules as functions (`high_pvc_fill_rate_over_two_days()`)
+5. **CRD Accuracy**: Type definitions match actual Kubernetes CRDs exactly
+
+### Cons
+
+1. **Implementation Explosion**: `DiscordReceiver` implements `AlertReceiver<S>` for each sender type (3+ implementations)
+2. **Learning Curve**: Understanding the trait hierarchy takes time
+3. **clone_box Boilerplate**: Required for trait object cloning (3 lines per impl)
+
+### Mitigations
+
+- Implementation explosion is contained: each receiver type has O(senders) implementations, but receivers are rare compared to rules
+- Learning curve is documented with examples at each level
+- clone_box boilerplate is minimal and copy-paste
+
+## Alternatives Considered
+
+### Unified MonitoringStack Type
+
+See "Why Not a MonitoringStack Abstraction" above. Rejected for losing compile-time safety.
+
+### Helm-Only Approach
+
+Use `HelmScore` directly for each monitoring deployment. Rejected because:
+- No type safety for alert rules
+- Cannot compose with application features
+- No tenant awareness
+
+### Separate Modules Per Use Case
+
+Have `cluster_monitoring/`, `tenant_monitoring/`, `app_monitoring/` as separate modules. Rejected because:
+- Massive code duplication
+- No shared abstraction for receivers/rules
+- Adding a feature requires three implementations
+
+## Implementation Notes
+
+### Module Structure
+
+```
+modules/monitoring/
+├── mod.rs                     # Public exports
+├── alert_channel/             # Receivers (Discord, Webhook)
+├── alert_rule/                # Rules and pre-built alerts
+│   ├── prometheus_alert_rule.rs
+│   └── alerts/                # Library of pre-built rules
+│       ├── k8s/               # K8s-specific (pvc, pod, memory)
+│       └── infra/             # Infrastructure (opnsense, dell)
+├── okd/                       # OpenshiftClusterAlertSender
+├── kube_prometheus/           # KubePrometheus
+├── prometheus/                # Prometheus
+├── red_hat_cluster_observability/  # RHOB
+├── grafana/                   # Grafana
+├── application_monitoring/    # Application-level scores
+└── scrape_target/             # External scrape targets
+```
+
+### Adding a New Alert Sender
+
+1. Create sender type: `pub struct MySender; impl AlertSender for MySender { ... }`
+2. Implement `Observability<MySender>` for topologies that support it
+3. Create CRD types in `crd/` subdirectory
+4. Implement `AlertReceiver<MySender>` for existing receivers
+5. Implement `AlertRule<MySender>` for `AlertManagerRuleGroup`
+
+### Adding a New Alert Rule
+
+```rust
+pub fn my_custom_alert() -> PrometheusAlertRule {
+    PrometheusAlertRule::new("MyAlert", "up == 0")
+        .for_duration("5m")
+        .label("severity", "critical")
+        .annotation("summary", "Service is down")
+}
+```
+
+No trait implementation needed - `AlertManagerRuleGroup` already handles conversion.
+
+## Related ADRs
+
+- [ADR-013](013-monitoring-notifications.md): Notification channel selection (ntfy)
+- [ADR-011](011-multi-tenant-cluster.md): Multi-tenant cluster architecture
--- a/adr/020-monitoring-alerting-architecture/monitoring_v2/Cargo.toml
+++ b/adr/020-monitoring-alerting-architecture/monitoring_v2/Cargo.toml
@@ -0,0 +1,21 @@
+[package]
+name = "example-monitoring-v2"
+edition = "2024"
+version.workspace = true
+readme.workspace = true
+license.workspace = true
+
+[dependencies]
+# This crate lives three directories below the repository root
+# (adr/020-monitoring-alerting-architecture/monitoring_v2), so sibling
+# workspace crates are reached with `../../../`.
+harmony = { path = "../../../harmony" }
+harmony_cli = { path = "../../../harmony_cli" }
+harmony-k8s = { path = "../../../harmony-k8s" }
+harmony_types = { path = "../../../harmony_types" }
+kube = { workspace = true }
+schemars = "0.8"
+serde = { workspace = true, features = ["derive"] }
+serde_json = { workspace = true }
+serde_yaml = { workspace = true }
+url = { workspace = true }
+log = { workspace = true }
+async-trait = { workspace = true }
+k8s-openapi = { workspace = true }
--- a/adr/020-monitoring-alerting-architecture/monitoring_v2/README.md
+++ b/adr/020-monitoring-alerting-architecture/monitoring_v2/README.md
@@ -0,0 +1,91 @@
+# Monitoring v2 - Improved Architecture
+
+This example demonstrates the improved monitoring architecture that addresses the "WTF/minute" issues in the original design.
+
+## Key Improvements
+
+### 1. **Single AlertChannel Trait with Generic Sender**
+
+The original design required 9-12 implementations for each alert channel (Discord, Webhook, etc.) - one for each sender type. The new design uses a single trait with generic sender parameterization:
+
+```rust
+pub trait AlertChannel<Sender: AlertSender> {
+    async fn install_config(&self, sender: &Sender) -> Result<Outcome, InterpretError>;
+    fn name(&self) -> String;
+    fn as_any(&self) -> &dyn std::any::Any;
+}
+```
+
+**Benefits:**
+- One Discord implementation works with all sender types
+- Type safety at compile time
+- No runtime dispatch overhead
+
+### 2. **MonitoringStack Abstraction**
+
+Instead of manually selecting CRDPrometheus vs KubePrometheus vs RHOBObservability, you now have a unified MonitoringStack that handles versioning:
+
+```rust
+let monitoring_stack = MonitoringStack::new(MonitoringApiVersion::V2CRD)
+    .set_namespace("monitoring")
+    .add_alert_channel(discord_receiver)
+    .set_scrape_targets(vec![...]);
+```
+
+**Benefits:**
+- Single source of truth for monitoring configuration
+- Easy to switch between monitoring versions
+- Automatic version-specific configuration
+
+### 3. **TenantMonitoringScore - True Composition**
+
+The original monitoring_with_tenant example just put tenant and monitoring as separate items in a vec. The new design truly composes them:
+
+```rust
+let tenant_score = TenantMonitoringScore::new("test-tenant", monitoring_stack);
+```
+
+This creates a single score that:
+- Has tenant context
+- Has monitoring configuration
+- Automatically installs monitoring scoped to tenant namespace
+
+**Benefits:**
+- No more "two separate things" confusion
+- Automatic tenant namespace scoping
+- Clear ownership: tenant owns its monitoring
+
+### 4. **Versioned Monitoring APIs**
+
+Clear versioning makes it obvious which monitoring stack you're using:
+
+```rust
+pub enum MonitoringApiVersion {
+    V1Helm,    // Old Helm charts
+    V2CRD,     // Current CRDs
+    V3RHOB,    // RHOB (future)
+}
+```
+
+**Benefits:**
+- No guessing which API version you're using
+- Easy to migrate between versions
+- Backward compatibility path
+
+## Comparison
+
+### Original Design (monitoring_with_tenant)
+- Manual selection of each component
+- Manual installation of both components
+- Need to remember to pass both to harmony_cli::run
+- Monitoring not scoped to tenant automatically
+
+### New Design (monitoring_v2)
+- Single composed score
+- One score does it all
+
+## Usage
+
+```sh
+cd adr/020-monitoring-alerting-architecture/monitoring_v2
+cargo run
+```
+
+## Migration Path
+
+To migrate from the old design to the new:
+
+1. Replace individual alert channel implementations with AlertChannel<Sender>
+2. Use MonitoringStack instead of manual *Prometheus selection
+3. Use TenantMonitoringScore instead of separate TenantScore + monitoring scores
+4. Select monitoring version via MonitoringApiVersion
--- a/adr/020-monitoring-alerting-architecture/monitoring_v2/src/lib.rs
+++ b/adr/020-monitoring-alerting-architecture/monitoring_v2/src/lib.rs
@@ -0,0 +1,343 @@
+use std::collections::HashMap;
+use std::sync::{Arc, Mutex};
+
+
+use log::debug;
+use serde::{Deserialize, Serialize};
+use serde_yaml::{Mapping, Value};
+
+use harmony::data::Version;
+use harmony::interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome};
+use harmony::inventory::Inventory;
+use harmony::score::Score;
+use harmony::topology::{Topology, tenant::TenantManager};
+
+use harmony_k8s::K8sClient;
+use harmony_types::k8s_name::K8sName;
+use harmony_types::net::Url;
+
+/// A monitoring backend that alerts originate from.
+///
+/// Implementors must be `Send + Sync` and `Debug`, as required by the
+/// supertrait bounds.
+pub trait AlertSender: Send + Sync + std::fmt::Debug {
+    /// Returns the sender's name.
+    fn name(&self) -> String;
+    /// Returns the namespace associated with this sender.
+    fn namespace(&self) -> String;
+}
+
+/// Alert sender that carries a namespace and a shared Kubernetes client handle.
+#[derive(Debug)]
+pub struct CRDPrometheus {
+    /// Namespace reported by [`AlertSender::namespace`].
+    pub namespace: String,
+    /// Shared client handle; not read anywhere in this module.
+    pub client: Arc<K8sClient>,
+}
+
+impl AlertSender for CRDPrometheus {
+    /// Always returns the literal `"CRDPrometheus"`.
+    fn name(&self) -> String {
+        String::from("CRDPrometheus")
+    }
+
+    /// Returns an owned copy of the `namespace` field.
+    fn namespace(&self) -> String {
+        self.namespace.to_owned()
+    }
+}
+
+/// Alert sender that carries a namespace and a shared Kubernetes client handle.
+#[derive(Debug)]
+pub struct RHOBObservability {
+    /// Namespace reported by [`AlertSender::namespace`].
+    pub namespace: String,
+    /// Shared client handle; not read anywhere in this module.
+    pub client: Arc<K8sClient>,
+}
+
+impl AlertSender for RHOBObservability {
+    /// Always returns the literal `"RHOBObservability"`.
+    fn name(&self) -> String {
+        String::from("RHOBObservability")
+    }
+
+    /// Returns an owned copy of the `namespace` field.
+    fn namespace(&self) -> String {
+        self.namespace.to_owned()
+    }
+}
+
+/// Alert sender whose settings live behind a shared, mutex-guarded config.
+#[derive(Debug)]
+pub struct KubePrometheus {
+    /// Shared mutable configuration; alert channels append receiver configs to it.
+    pub config: Arc<Mutex<KubePrometheusConfig>>,
+}
+
+impl Default for KubePrometheus {
+    /// Equivalent to [`KubePrometheus::new`].
+    fn default() -> Self {
+        KubePrometheus::new()
+    }
+}
+
+impl KubePrometheus {
+    /// Builds a sender wrapping a fresh [`KubePrometheusConfig`].
+    pub fn new() -> Self {
+        let config = Arc::new(Mutex::new(KubePrometheusConfig::new()));
+        Self { config }
+    }
+}
+
+impl AlertSender for KubePrometheus {
+    /// Always returns the literal `"KubePrometheus"`.
+    fn name(&self) -> String {
+        String::from("KubePrometheus")
+    }
+
+    /// Returns the configured namespace, or `"monitoring"` when none is set.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the config mutex is poisoned.
+    fn namespace(&self) -> String {
+        let guard = self.config.lock().unwrap();
+        match &guard.namespace {
+            Some(ns) => ns.clone(),
+            None => "monitoring".to_string(),
+        }
+    }
+}
+
+/// Settings for the [`KubePrometheus`] sender.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct KubePrometheusConfig {
+    /// Target namespace; `None` makes the sender fall back to `"monitoring"`.
+    pub namespace: Option<String>,
+    /// Receiver/route pairs registered by alert channels.
+    ///
+    /// Skipped by serde, so it is never serialized and is empty after
+    /// deserialization.
+    #[serde(skip)]
+    pub alert_receiver_configs: Vec<AlertManagerChannelConfig>,
+}
+
+impl Default for KubePrometheusConfig {
+    /// Equivalent to [`KubePrometheusConfig::new`]; provided so the type with a
+    /// no-argument constructor also works with `Default`-based APIs.
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl KubePrometheusConfig {
+    /// Creates a config with no namespace and no registered receivers.
+    pub fn new() -> Self {
+        Self {
+            namespace: None,
+            alert_receiver_configs: Vec::new(),
+        }
+    }
+}
+
+/// A receiver definition together with the route that points at it, both kept
+/// as raw YAML values.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct AlertManagerChannelConfig {
+    /// YAML value describing the receiver.
+    pub channel_receiver: serde_yaml::Value,
+    /// YAML value describing the route.
+    pub channel_route: serde_yaml::Value,
+}
+
+impl Default for AlertManagerChannelConfig {
+    /// Both fields default to an empty YAML mapping.
+    fn default() -> Self {
+        let empty_mapping = || serde_yaml::Value::Mapping(Mapping::new());
+        Self {
+            channel_receiver: empty_mapping(),
+            channel_route: empty_mapping(),
+        }
+    }
+}
+
+/// One scrape target, described by three strings.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ScrapeTargetConfig {
+    /// Name of the service to scrape.
+    pub service_name: String,
+    /// Port, kept as a string.
+    pub port: String,
+    /// Path component of the target (presumably the metrics endpoint path — TODO confirm).
+    pub path: String,
+}
+
+/// Selects which monitoring backend flavor a [`MonitoringStack`] targets.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub enum MonitoringApiVersion {
+    /// Logged as "Helm monitoring" by the tenant interpret.
+    V1Helm,
+    /// Logged as "CRD monitoring"; also the version used by the serde default stack.
+    V2CRD,
+    /// Logged as "RHOB monitoring" by the tenant interpret.
+    V3RHOB,
+}
+
+/// Builder-style description of a monitoring deployment.
+#[derive(Debug, Clone)]
+pub struct MonitoringStack {
+    /// Backend flavor to install.
+    pub version: MonitoringApiVersion,
+    /// Target namespace; starts as `"monitoring"`.
+    pub namespace: String,
+    /// Registered channels.
+    ///
+    /// NOTE(review): elements are typed as `AlertSender`, not `AlertChannel`;
+    /// confirm this is intended.
+    pub alert_channels: Vec<Arc<dyn AlertSender>>,
+    /// Scrape targets configured through [`MonitoringStack::set_scrape_targets`].
+    pub scrape_targets: Vec<ScrapeTargetConfig>,
+}
+
+impl MonitoringStack {
+    /// Creates a stack for `version` in the `"monitoring"` namespace with no
+    /// channels and no scrape targets.
+    pub fn new(version: MonitoringApiVersion) -> Self {
+        let namespace = String::from("monitoring");
+        Self {
+            version,
+            namespace,
+            alert_channels: vec![],
+            scrape_targets: vec![],
+        }
+    }
+
+    /// Replaces the namespace and returns the updated stack.
+    pub fn set_namespace(mut self, namespace: &str) -> Self {
+        self.namespace = namespace.to_owned();
+        self
+    }
+
+    /// Appends `channel` (wrapped in an `Arc`) and returns the updated stack.
+    pub fn add_alert_channel(mut self, channel: impl AlertSender + 'static) -> Self {
+        let shared: Arc<dyn AlertSender> = Arc::new(channel);
+        self.alert_channels.push(shared);
+        self
+    }
+
+    /// Replaces all scrape targets with the given `(service name, port, path)`
+    /// tuples and returns the updated stack.
+    pub fn set_scrape_targets(mut self, targets: Vec<(&str, &str, String)>) -> Self {
+        let mut configs = Vec::with_capacity(targets.len());
+        for (service, port, path) in targets {
+            configs.push(ScrapeTargetConfig {
+                service_name: service.to_owned(),
+                port: port.to_owned(),
+                path,
+            });
+        }
+        self.scrape_targets = configs;
+        self
+    }
+}
+
+/// A notification channel that can be installed for a specific sender type.
+///
+/// The `Sender` parameter lets one channel type provide a separate
+/// implementation per [`AlertSender`].
+pub trait AlertChannel<Sender: AlertSender> {
+    /// Installs this channel's configuration for `sender`. Returns nothing, so
+    /// failures cannot be reported to the caller through this signature.
+    fn install_config(&self, sender: &Sender);
+    /// Returns the channel's name.
+    fn name(&self) -> String;
+}
+
+/// Discord notification channel identified by a name and a webhook URL.
+#[derive(Debug, Clone)]
+pub struct DiscordWebhook {
+    /// Receiver name used in both the route and the receiver entry.
+    pub name: K8sName,
+    /// Discord webhook URL.
+    pub url: Url,
+    /// Not read by `get_config`; presumably label selectors for routing — TODO confirm.
+    pub selectors: Vec<HashMap<String, String>>,
+}
+
+impl DiscordWebhook {
+    /// Renders the route and receiver YAML for this webhook.
+    ///
+    /// The route targets the receiver named after `self.name` and matches
+    /// `alertname!=Watchdog`; the receiver holds a single `discord_configs`
+    /// entry with the webhook URL.
+    fn get_config(&self) -> AlertManagerChannelConfig {
+        let key = |k: &str| Value::String(k.to_owned());
+        let receiver_name = Value::String(self.name.to_string());
+
+        // Key order matches insertion order: "receiver" then "matchers".
+        let route: Mapping = [
+            (key("receiver"), receiver_name.clone()),
+            (
+                key("matchers"),
+                Value::Sequence(vec![Value::String("alertname!=Watchdog".to_string())]),
+            ),
+        ]
+        .into_iter()
+        .collect();
+
+        let discord_config: Mapping =
+            [(key("webhook_url"), Value::String(self.url.to_string()))]
+                .into_iter()
+                .collect();
+
+        // Key order: "name" then "discord_configs".
+        let receiver: Mapping = [
+            (key("name"), receiver_name),
+            (
+                key("discord_configs"),
+                Value::Sequence(vec![Value::Mapping(discord_config)]),
+            ),
+        ]
+        .into_iter()
+        .collect();
+
+        AlertManagerChannelConfig {
+            channel_receiver: Value::Mapping(receiver),
+            channel_route: Value::Mapping(route),
+        }
+    }
+}
+
+/// Discord channel wiring for the [`CRDPrometheus`] sender.
+impl AlertChannel<CRDPrometheus> for DiscordWebhook {
+    /// Logs the target namespace and the rendered config at debug level.
+    /// Nothing beyond logging happens here.
+    fn install_config(&self, sender: &CRDPrometheus) {
+        debug!(
+            "Installing Discord webhook for CRDPrometheus in namespace: {}",
+            sender.namespace()
+        );
+        debug!("Config: {:?}", self.get_config());
+        debug!("Installed!");
+    }
+
+    /// Always returns the literal `"discord-webhook"`.
+    fn name(&self) -> String {
+        String::from("discord-webhook")
+    }
+}
+
+/// Discord channel wiring for the [`RHOBObservability`] sender.
+impl AlertChannel<RHOBObservability> for DiscordWebhook {
+    /// Logs the target namespace and the rendered config at debug level.
+    /// Nothing beyond logging happens here.
+    fn install_config(&self, sender: &RHOBObservability) {
+        debug!("Installing Discord webhook for RHOBObservability in namespace: {}", sender.namespace());
+        debug!("Config: {:?}", self.get_config());
+        debug!("Installed!");
+    }
+
+    /// Returns `"discord-webhook"`, the same channel name the other
+    /// `DiscordWebhook` implementations report (previously this one alone
+    /// returned `"webhook-receiver"`).
+    fn name(&self) -> String {
+        "discord-webhook".to_string()
+    }
+}
+
+/// Discord channel wiring for the [`KubePrometheus`] sender.
+impl AlertChannel<KubePrometheus> for DiscordWebhook {
+    /// Appends this webhook's receiver/route config to the sender's shared
+    /// configuration.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the sender's config mutex is poisoned.
+    fn install_config(&self, sender: &KubePrometheus) {
+        // `sender.namespace()` takes the config lock itself, so it must run
+        // before the guard below is acquired.
+        debug!("Installing Discord webhook for KubePrometheus in namespace: {}", sender.namespace());
+
+        // Lock exactly once: `std::sync::Mutex` is not reentrant, so taking a
+        // second guard while the first is alive would deadlock (or panic).
+        let mut config = sender.config.lock().unwrap();
+        let ns = config.namespace.clone().unwrap_or_else(|| "monitoring".to_string());
+        debug!("Namespace: {}", ns);
+        config.alert_receiver_configs.push(self.get_config());
+        debug!("Installed!");
+    }
+
+    /// Always returns the literal `"discord-webhook"`.
+    fn name(&self) -> String {
+        "discord-webhook".to_string()
+    }
+}
+
+/// Stack used by serde for the skipped `monitoring_stack` field: a fresh
+/// `V2CRD` stack.
+fn default_monitoring_stack() -> MonitoringStack {
+    let version = MonitoringApiVersion::V2CRD;
+    MonitoringStack::new(version)
+}
+
+/// Score pairing a tenant with the monitoring stack to install for it.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct TenantMonitoringScore {
+    /// Tenant identifier; set to `Id::default()` by [`TenantMonitoringScore::new`].
+    pub tenant_id: harmony_types::id::Id,
+    /// Tenant display name, used in the score name and the success message.
+    pub tenant_name: String,
+    /// Monitoring stack; never (de)serialized, rebuilt from
+    /// `default_monitoring_stack` on deserialization.
+    #[serde(skip, default = "default_monitoring_stack")]
+    pub monitoring_stack: MonitoringStack,
+}
+
+impl TenantMonitoringScore {
+    /// Creates a score for `tenant_name` with a default tenant id.
+    pub fn new(tenant_name: &str, monitoring_stack: MonitoringStack) -> Self {
+        let tenant_id = harmony_types::id::Id::default();
+        Self {
+            tenant_id,
+            tenant_name: tenant_name.to_owned(),
+            monitoring_stack,
+        }
+    }
+}
+
+/// Usable on any topology that also provides tenant management.
+impl<T: Topology + TenantManager> Score<T> for TenantMonitoringScore {
+    /// Wraps a clone of this score in a [`TenantMonitoringInterpret`].
+    fn create_interpret(&self) -> Box<dyn Interpret<T>> {
+        let interpret = TenantMonitoringInterpret {
+            score: self.clone(),
+        };
+        Box::new(interpret)
+    }
+
+    /// Returns `"<tenant_name> monitoring [TenantMonitoringScore]"`.
+    fn name(&self) -> String {
+        let tenant = &self.tenant_name;
+        format!("{} monitoring [TenantMonitoringScore]", tenant)
+    }
+}
+
+/// Interpret that carries out a [`TenantMonitoringScore`].
+#[derive(Debug)]
+pub struct TenantMonitoringInterpret {
+    /// The score being executed.
+    pub score: TenantMonitoringScore,
+}
+
+#[async_trait::async_trait]
+impl<T: Topology + TenantManager> Interpret<T> for TenantMonitoringInterpret {
+    /// Looks up the tenant from the topology, logs which backend flavor would
+    /// be installed, and reports success. No resources are created here.
+    ///
+    /// NOTE(review): the tenant lookup is `unwrap()`ed, so this panics when
+    /// the topology yields no tenant config — consider returning an
+    /// `InterpretError` instead.
+    async fn execute(
+        &self,
+        _inventory: &Inventory,
+        topology: &T,
+    ) -> Result<Outcome, InterpretError> {
+        let tenant_ns = topology.get_tenant_config().await.unwrap().name.clone();
+        let version = &self.score.monitoring_stack.version;
+
+        match version {
+            MonitoringApiVersion::V1Helm => {
+                debug!("Installing Helm monitoring for tenant {}", tenant_ns)
+            }
+            MonitoringApiVersion::V2CRD => {
+                debug!("Installing CRD monitoring for tenant {}", tenant_ns)
+            }
+            MonitoringApiVersion::V3RHOB => {
+                debug!("Installing RHOB monitoring for tenant {}", tenant_ns)
+            }
+        }
+
+        let message = format!(
+            "Installed monitoring stack for tenant {} with version {:?}",
+            self.score.tenant_name, version
+        );
+        Ok(Outcome::success(message))
+    }
+
+    /// Identifies this interpret as the custom `"TenantMonitoringInterpret"`.
+    fn get_name(&self) -> InterpretName {
+        InterpretName::Custom("TenantMonitoringInterpret")
+    }
+
+    /// Fixed version `1.0.0`.
+    fn get_version(&self) -> Version {
+        let parsed = Version::from("1.0.0");
+        parsed.unwrap()
+    }
+
+    /// Always reports `SUCCESS`, regardless of whether `execute` has run.
+    fn get_status(&self) -> InterpretStatus {
+        InterpretStatus::SUCCESS
+    }
+
+    /// This interpret has no children.
+    fn get_children(&self) -> Vec<harmony_types::id::Id> {
+        vec![]
+    }
+}
--- a/brocade/Cargo.toml
+++ b/brocade/Cargo.toml
@@ -16,3 +16,4 @@ env_logger.workspace = true
 regex = "1.11.3"
 harmony_secret = { path = "../harmony_secret" }
 serde.workspace = true
+schemars = "0.8"
--- a/brocade/examples/main.rs
+++ b/brocade/examples/main.rs
@@ -3,9 +3,10 @@ use std::net::{IpAddr, Ipv4Addr};
 use brocade::{BrocadeOptions, ssh};
 use harmony_secret::{Secret, SecretManager};
 use harmony_types::switch::PortLocation;
+use schemars::JsonSchema;
 use serde::{Deserialize, Serialize};

-#[derive(Secret, Clone, Debug, Serialize, Deserialize)]
+#[derive(Secret, Clone, Debug, JsonSchema, Serialize, Deserialize)]
 struct BrocadeSwitchAuth {
    username: String,
    password: String,
@@ -20,17 +21,15 @@ async fn main() {
    // let ip = IpAddr::V4(Ipv4Addr::new(192, 168, 4, 11)); // brocade @ st
    let switch_addresses = vec![ip];

-    // let config = SecretManager::get_or_prompt::<BrocadeSwitchAuth>()
-    //     .await
-    //     .unwrap();
+    let config = SecretManager::get_or_prompt::<BrocadeSwitchAuth>()
+        .await
+        .unwrap();

    let brocade = brocade::init(
        &switch_addresses,
-        // &config.username,
-        // &config.password,
-        "admin",
-        "password",
-        BrocadeOptions {
+        &config.username,
+        &config.password,
+        &BrocadeOptions {
            dry_run: true,
            ssh: ssh::SshOptions {
                port: 2222,
--- a/brocade/src/fast_iron.rs
+++ b/brocade/src/fast_iron.rs
@@ -1,8 +1,7 @@
 use super::BrocadeClient;
 use crate::{
    BrocadeInfo, Error, ExecutionMode, InterSwitchLink, InterfaceInfo, MacAddressEntry,
-    PortChannelId, PortOperatingMode, SecurityLevel, parse_brocade_mac_address,
-    shell::BrocadeShell,
+    PortChannelId, PortOperatingMode, parse_brocade_mac_address, shell::BrocadeShell,
 };

 use async_trait::async_trait;
--- a/brocade/src/lib.rs
+++ b/brocade/src/lib.rs
@@ -144,7 +144,7 @@ pub async fn init(
    ip_addresses: &[IpAddr],
    username: &str,
    password: &str,
-    options: BrocadeOptions,
+    options: &BrocadeOptions,
 ) -> Result<Box<dyn BrocadeClient + Send + Sync>, Error> {
    let shell = BrocadeShell::init(ip_addresses, username, password, options).await?;

--- a/brocade/src/network_operating_system.rs
+++ b/brocade/src/network_operating_system.rs
@@ -8,7 +8,7 @@ use regex::Regex;
 use crate::{
    BrocadeClient, BrocadeInfo, Error, ExecutionMode, InterSwitchLink, InterfaceInfo,
    InterfaceStatus, InterfaceType, MacAddressEntry, PortChannelId, PortOperatingMode,
-    SecurityLevel, parse_brocade_mac_address, shell::BrocadeShell,
+    parse_brocade_mac_address, shell::BrocadeShell,
 };

 #[derive(Debug)]
--- a/brocade/src/shell.rs
+++ b/brocade/src/shell.rs
@@ -28,7 +28,7 @@ impl BrocadeShell {
        ip_addresses: &[IpAddr],
        username: &str,
        password: &str,
-        options: BrocadeOptions,
+        options: &BrocadeOptions,
    ) -> Result<Self, Error> {
        let ip = ip_addresses
            .first()
--- a/brocade/src/ssh.rs
+++ b/brocade/src/ssh.rs
@@ -70,7 +70,7 @@ pub async fn try_init_client(
    username: &str,
    password: &str,
    ip: &std::net::IpAddr,
-    base_options: BrocadeOptions,
+    base_options: &BrocadeOptions,
 ) -> Result<BrocadeOptions, Error> {
    let mut default = SshOptions::default();
    default.port = base_options.ssh.port;
--- a/docs/README.md
+++ b/docs/README.md
@@ -1 +1,46 @@
-Not much here yet, see the `adr` folder for now. More to come in time!
+# Harmony Documentation Hub
+
+Welcome to the Harmony documentation. This is the main entry point for learning everything from core concepts to building your own Scores, Topologies, and Capabilities.
+
+## 1. Getting Started
+
+If you're new to Harmony, start here:
+
+- [**Getting Started Guide**](./guides/getting-started.md): A step-by-step tutorial that takes you from an empty project to deploying your first application.
+- [**Core Concepts**](./concepts.md): A high-level overview of the key concepts in Harmony: `Score`, `Topology`, `Capability`, `Inventory`, `Interpret`, ...
+
+## 2. Use Cases & Examples
+
+See how to use Harmony to solve real-world problems.
+
+- [**OKD on Bare Metal**](./use-cases/okd-on-bare-metal.md): A detailed walkthrough of bootstrapping a high-availability OKD cluster from physical hardware.
+- [**Deploy a Rust Web App**](./use-cases/deploy-rust-webapp.md): A quick guide to deploying a monitored, containerized web application to a Kubernetes cluster.
+
+## 3. Component Catalogs
+
+Discover existing, reusable components you can use in your Harmony projects.
+
+- [**Scores Catalog**](./catalogs/scores.md): A categorized list of all available `Scores` (the "what").
+- [**Topologies Catalog**](./catalogs/topologies.md): A list of all available `Topologies` (the "where").
+- [**Capabilities Catalog**](./catalogs/capabilities.md): A list of all available `Capabilities` (the "how").
+
+## 4. Developer Guides
+
+Ready to build your own components? These guides show you how.
+
+- [**Writing a Score**](./guides/writing-a-score.md): Learn how to create your own `Score` and `Interpret` logic to define a new desired state.
+- [**Writing a Topology**](./guides/writing-a-topology.md): Learn how to model a new environment (like AWS, GCP, or custom hardware) as a `Topology`.
+- [**Adding Capabilities**](./guides/adding-capabilities.md): See how to add a `Capability` to your custom `Topology`.
+- [**Coding Guide**](./coding-guide.md): Conventions and best practices for writing Harmony code.
+
+## 5. Module Documentation
+
+Deep dives into specific Harmony modules and features.
+
+- [**Monitoring and Alerting**](./monitoring.md): Comprehensive guide to cluster, tenant, and application-level monitoring with support for OKD, KubePrometheus, RHOB, and more.
+
+## 6. Architecture Decision Records
+
+Important architectural decisions are documented in the `adr/` directory:
+
+- [Full ADR Index](../adr/)
--- a/docs/catalogs/README.md
+++ b/docs/catalogs/README.md
@@ -0,0 +1,7 @@
+# Component Catalogs
+
+This section is the "dictionary" for Harmony. It lists all the reusable components available out-of-the-box.
+
+- [**Scores Catalog**](./scores.md): Discover all available `Scores` (the "what").
+- [**Topologies Catalog**](./topologies.md): A list of all available `Topologies` (the "where").
+- [**Capabilities Catalog**](./capabilities.md): A list of all available `Capabilities` (the "how").
--- a/docs/catalogs/capabilities.md
+++ b/docs/catalogs/capabilities.md
@@ -0,0 +1,40 @@
+# Capabilities Catalog
+
+A `Capability` is a specific feature or API that a `Topology` offers. `Interpret` logic uses these capabilities to execute a `Score`.
+
+This list is primarily for developers **writing new Topologies or Scores**. As a user, you just need to know that the `Topology` you pick (like `K8sAnywhereTopology`) provides the capabilities your `Scores` (like `ApplicationScore`) need.
+
+<!--toc:start-->
+
+- [Capabilities Catalog](#capabilities-catalog)
+  - [Kubernetes & Application](#kubernetes-application)
+  - [Monitoring & Observability](#monitoring-observability)
+  - [Networking (Core Services)](#networking-core-services)
+  - [Networking (Hardware & Host)](#networking-hardware-host)
+
+<!--toc:end-->
+
+## Kubernetes & Application
+
+- **K8sClient**: Provides an authenticated client to interact with a Kubernetes API (create/read/update/delete resources).
+- **HelmCommand**: Provides the ability to execute Helm commands (install, upgrade, template).
+- **TenantManager**: Provides methods for managing tenants in a multi-tenant cluster.
+- **Ingress**: Provides an interface for managing ingress controllers and resources.
+
+## Monitoring & Observability
+
+- **Grafana**: Provides an API for configuring Grafana (datasources, dashboards).
+- **Monitoring**: A general capability for configuring monitoring (e.g., creating Prometheus rules).
+
+## Networking (Core Services)
+
+- **DnsServer**: Provides an interface for creating and managing DNS records.
+- **LoadBalancer**: Provides an interface for configuring a load balancer (e.g., OPNsense, MetalLB).
+- **DhcpServer**: Provides an interface for managing DHCP leases and host bindings.
+- **TftpServer**: Provides an interface for managing files on a TFTP server (e.g., iPXE boot files).
+
+## Networking (Hardware & Host)
+
+- **Router**: Provides an interface for configuring routing rules, typically on a firewall like OPNsense.
+- **Switch**: Provides an interface for configuring a physical network switch (e.g., managing VLANs and port channels).
+- **NetworkManager**: Provides an interface for configuring host-level networking (e.g., creating bonds and bridges on a node).
--- a/docs/catalogs/scores.md
+++ b/docs/catalogs/scores.md
@@ -0,0 +1,102 @@
+# Scores Catalog
+
+A `Score` is a declarative description of a desired state. Find the Score you need and add it to your `harmony!` block's `scores` array.
+
+<!--toc:start-->
+
+- [Scores Catalog](#scores-catalog)
+  - [Application Deployment](#application-deployment)
+  - [OKD / Kubernetes Cluster Setup](#okd-kubernetes-cluster-setup)
+  - [Cluster Services & Management](#cluster-services-management)
+  - [Monitoring & Alerting](#monitoring-alerting)
+  - [Infrastructure & Networking (Bare Metal)](#infrastructure-networking-bare-metal)
+  - [Infrastructure & Networking (Cluster)](#infrastructure-networking-cluster)
+  - [Tenant Management](#tenant-management)
+  - [Utility](#utility)
+
+<!--toc:end-->
+
+## Application Deployment
+
+Scores for deploying and managing end-user applications.
+
+- **ApplicationScore**: The primary score for deploying a web application. Describes the application, its framework, and the features it requires (e.g., monitoring, CI/CD).
+- **HelmChartScore**: Deploys a generic Helm chart to a Kubernetes cluster.
+- **ArgoHelmScore**: Deploys an application using an ArgoCD Helm chart.
+- **LAMPScore**: A specialized score for deploying a classic LAMP (Linux, Apache, MySQL, PHP) stack.
+
+## OKD / Kubernetes Cluster Setup
+
+This collection of Scores is used to provision an entire OKD cluster from bare metal. They are typically used in order.
+
+- **OKDSetup01InventoryScore**: Discovers and catalogs the physical hardware.
+- **OKDSetup02BootstrapScore**: Configures the bootstrap node, renders iPXE files, and kicks off the SCOS installation.
+- **OKDSetup03ControlPlaneScore**: Renders iPXE configurations for the control plane nodes.
+- **OKDSetupPersistNetworkBondScore**: Configures network bonds on the nodes and port channels on the switches.
+- **OKDSetup04WorkersScore**: Renders iPXE configurations for the worker nodes.
+- **OKDSetup06InstallationReportScore**: Runs post-installation checks and generates a report.
+- **OKDUpgradeScore**: Manages the upgrade process for an existing OKD cluster.
+
+## Cluster Services & Management
+
+Scores for installing and managing services _inside_ a Kubernetes cluster.
+
+- **K3DInstallationScore**: Installs and configures a local K3D (k3s-in-docker) cluster. Used by `K8sAnywhereTopology`.
+- **CertManagerHelmScore**: Deploys the `cert-manager` Helm chart.
+- **ClusterIssuerScore**: Configures a `ClusterIssuer` for `cert-manager` (e.g., for Let's Encrypt).
+- **K8sNamespaceScore**: Ensures a Kubernetes namespace exists.
+- **K8sDeploymentScore**: Deploys a generic `Deployment` resource to Kubernetes.
+- **K8sIngressScore**: Configures an `Ingress` resource for a service.
+
+## Monitoring & Alerting
+
+Scores for configuring observability, dashboards, and alerts.
+
+- **ApplicationMonitoringScore**: A generic score to set up monitoring for an application.
+- **ApplicationRHOBMonitoringScore**: A specialized score for setting up monitoring via the Red Hat Observability stack.
+- **HelmPrometheusAlertingScore**: Configures Prometheus alerts via a Helm chart.
+- **K8sPrometheusCRDAlertingScore**: Configures Prometheus alerts using the `PrometheusRule` CRD.
+- **PrometheusAlertScore**: A generic score for creating a Prometheus alert.
+- **RHOBAlertingScore**: Configures alerts specifically for the Red Hat Observability stack.
+- **NtfyScore**: Configures alerts to be sent to a `ntfy.sh` server.
+
+## Infrastructure & Networking (Bare Metal)
+
+Low-level scores for managing physical hardware and network services.
+
+- **DhcpScore**: Configures a DHCP server.
+- **OKDDhcpScore**: A specialized DHCP configuration for the OKD bootstrap process.
+- **OKDBootstrapDhcpScore**: Configures DHCP specifically for the bootstrap node.
+- **DhcpHostBindingScore**: Creates a specific MAC-to-IP binding in the DHCP server.
+- **DnsScore**: Configures a DNS server.
+- **OKDDnsScore**: A specialized DNS configuration for the OKD cluster (e.g., `api.*`, `*.apps.*`).
+- **StaticFilesHttpScore**: Serves a directory of static files (e.g., a documentation site) over HTTP.
+- **TftpScore**: Configures a TFTP server, typically for serving iPXE boot files.
+- **IPxeMacBootFileScore**: Assigns a specific iPXE boot file to a MAC address in the TFTP server.
+- **OKDIpxeScore**: A specialized score for generating the iPXE boot scripts for OKD.
+- **OPNsenseShellCommandScore**: Executes a shell command on an OPNsense firewall.
+
+## Infrastructure & Networking (Cluster)
+
+Network services that run inside the cluster or as part of the topology.
+
+- **LoadBalancerScore**: Configures a general-purpose load balancer.
+- **OKDLoadBalancerScore**: Configures the high-availability load balancers for the OKD API and ingress.
+- **OKDBootstrapLoadBalancerScore**: Configures the load balancer specifically for the bootstrap-time API endpoint.
+- **K8sIngressScore**: Configures an Ingress controller or resource.
+- [**HighAvailabilityHostNetworkScore**](../../harmony/src/modules/okd/host_network.rs): Configures network bonds on a host and the corresponding port channels on the switch stack for high availability.
+
+## Tenant Management
+
+Scores for managing multi-tenancy within a cluster.
+
+- **TenantScore**: Creates a new tenant (e.g., a namespace, quotas, network policies).
+- **TenantCredentialScore**: Generates and provisions credentials for a new tenant.
+
+## Utility
+
+Helper scores for discovery and inspection.
+
+- **LaunchDiscoverInventoryAgentScore**: Launches the agent responsible for the `OKDSetup01InventoryScore`.
+- **DiscoverHostForRoleScore**: A utility score to find a host matching a specific role in the inventory.
+- **InspectInventoryScore**: Dumps the discovered inventory for inspection.
--- a/docs/catalogs/topologies.md
+++ b/docs/catalogs/topologies.md
@@ -0,0 +1,59 @@
+# Topologies Catalog
+
+A `Topology` is the logical representation of your infrastructure and its `Capabilities`. You select a `Topology` in your Harmony project to define _where_ your `Scores` will be applied.
+
+<!--toc:start-->
+
+- [Topologies Catalog](#topologies-catalog)
+  - [HAClusterTopology](#haclustertopology)
+  - [K8sAnywhereTopology](#k8sanywheretopology)
+
+<!--toc:end-->
+
+## HAClusterTopology
+
+- **`HAClusterTopology::autoload()`**
+
+This `Topology` represents a high-availability, bare-metal cluster. It is designed for production-grade deployments like OKD.
+
+It models an environment consisting of:
+
+- At least 3 cluster nodes (for control plane/workers)
+- 2 redundant firewalls (e.g., OPNsense)
+- 2 redundant network switches
+
+**Provided Capabilities:**
+This topology provides a rich set of capabilities required for bare-metal provisioning and cluster management, including:
+
+- `K8sClient` (once the cluster is bootstrapped)
+- `DnsServer`
+- `LoadBalancer`
+- `DhcpServer`
+- `TftpServer`
+- `Router` (via the firewalls)
+- `Switch`
+- `NetworkManager` (for host-level network config)
+
+---
+
+## K8sAnywhereTopology
+
+- **`K8sAnywhereTopology::from_env()`**
+
+This `Topology` is designed for development and application deployment. It provides a simple, abstract way to deploy to _any_ Kubernetes cluster.
+
+**How it works:**
+
+1. By default (`from_env()` with no env vars), it automatically provisions a **local K3D (k3s-in-docker) cluster** on your machine. This is perfect for local development and testing.
+2. If you provide a `KUBECONFIG` environment variable, it will instead connect to that **existing Kubernetes cluster** (e.g., your staging or production OKD cluster).
+
+This allows you to use the _exact same code_ to deploy your application locally as you do to deploy it to production.
+
+**Provided Capabilities:**
+
+- `K8sClient`
+- `HelmCommand`
+- `TenantManager`
+- `Ingress`
+- `Monitoring`
+- ...and more.
--- a/docs/concepts.md
+++ b/docs/concepts.md
@@ -0,0 +1,40 @@
+# Core Concepts
+
+Harmony's design is based on a few key concepts. Understanding them is the key to unlocking the framework's power.
+
+### 1. Score
+
+- **What it is:** A **Score** is a declarative description of a desired state. It's a "resource" that defines _what_ you want to achieve, not _how_ to do it.
+- **Example:** `ApplicationScore` declares "I want this web application to be running and monitored."
+
+### 2. Topology
+
+- **What it is:** A **Topology** is the logical representation of your infrastructure and its abilities. It's the "where" your Scores will be applied.
+- **Key Job:** A Topology's most important job is to expose which `Capabilities` it supports.
+- **Example:** `HAClusterTopology` represents a bare-metal cluster and exposes `Capabilities` like `NetworkManager` and `Switch`. `K8sAnywhereTopology` represents a Kubernetes cluster and exposes the `K8sClient` `Capability`.
+
+### 3. Capability
+
+- **What it is:** A **Capability** is a specific feature or API that a `Topology` offers. It's the "how" a `Topology` can fulfill a `Score`'s request.
+- **Example:** The `K8sClient` capability offers a way to interact with a Kubernetes API. The `Switch` capability offers a way to configure a physical network switch.
+
+### 4. Interpret
+
+- **What it is:** An **Interpret** is the execution logic that makes a `Score` a reality. It's the "glue" that connects the _desired state_ (`Score`) to the _environment's abilities_ (`Topology`'s `Capabilities`).
+- **How it works:** When you apply a `Score`, Harmony finds the matching `Interpret` for your `Topology`. This `Interpret` then uses the `Capabilities` provided by the `Topology` to execute the necessary steps.
+
+### 5. Inventory
+
+- **What it is:** An **Inventory** is the physical material (the "what") used in a cluster. This is most relevant for bare-metal or on-premise topologies.
+- **Example:** A list of nodes with their roles (control plane, worker), CPU, RAM, and network interfaces. For the `K8sAnywhereTopology`, the inventory might be empty or autoloaded, as the infrastructure is more abstract.
+
+---
+
+### How They Work Together (The Compile-Time Check)
+
+1. You **write a `Score`** (e.g., `ApplicationScore`).
+2. Your `Score`'s `Interpret` logic requires certain **`Capabilities`** (e.g., `K8sClient` and `Ingress`).
+3. You choose a **`Topology`** to run it on (e.g., `HAClusterTopology`).
+4. **At compile-time**, Harmony checks: "Does `HAClusterTopology` provide the `K8sClient` and `Ingress` capabilities that `ApplicationScore` needs?"
+   - **If Yes:** Your code compiles. You can be confident it will run.
+   - **If No:** The compiler gives you an error. You've just prevented a "config-is-valid-but-platform-is-wrong" runtime error before you even deployed.
--- a/docs/guides/getting-started.md
+++ b/docs/guides/getting-started.md
@@ -0,0 +1,42 @@
+# Getting Started Guide
+
+Welcome to Harmony! This guide will walk you through installing the Harmony framework, setting up a new project, and deploying your first application.
+
+We will build and deploy the "Rust Web App" example, which automatically:
+
+1. Provisions a local K3D (Kubernetes in Docker) cluster.
+2. Deploys a sample Rust web application.
+3. Sets up monitoring for the application.
+
+## Prerequisites
+
+Before you begin, you'll need a few tools installed on your system:
+
+- **Rust & Cargo:** [Install Rust](https://www.rust-lang.org/tools/install)
+- **Docker:** [Install Docker](https://docs.docker.com/get-docker/) (Required for the K3D local cluster)
+- **kubectl:** [Install kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl/) (For inspecting the cluster)
+
+## 1. Install Harmony
+
+First, clone the Harmony repository and build the project. This gives you the `harmony` CLI and all the core libraries.
+
+```bash
+# Clone the main repository
+git clone https://git.nationtech.io/nationtech/harmony
+cd harmony
+
+# Build the project (this may take a few minutes)
+cargo build --release
+```
+
+...
+
+## Next Steps
+
+Congratulations, you've just deployed an application using true infrastructure-as-code!
+
+From here, you can:
+
+- [Explore the Catalogs](../catalogs/README.md): See what other [Scores](../catalogs/scores.md) and [Topologies](../catalogs/topologies.md) are available.
+- [Read the Use Cases](../use-cases/README.md): Check out the [OKD on Bare Metal](../use-cases/okd-on-bare-metal.md) guide for a more advanced scenario.
+- [Write your own Score](../guides/writing-a-score.md): Dive into the [Developer Guide](./developer-guide.md) to start building your own components.
--- a/docs/monitoring.md
+++ b/docs/monitoring.md
@@ -0,0 +1,443 @@
+# Monitoring and Alerting in Harmony
+
+Harmony provides a unified, type-safe approach to monitoring and alerting across Kubernetes, OpenShift, and bare-metal infrastructure. This guide explains the architecture and how to use it at different levels of abstraction.
+
+## Overview
+
+Harmony's monitoring module supports three distinct use cases:
+
+| Level | Who Uses It | What It Provides |
+|-------|-------------|------------------|
+| **Cluster** | Cluster administrators | Full control over monitoring stack, cluster-wide alerts, external scrape targets |
+| **Tenant** | Platform teams | Namespace-scoped monitoring in multi-tenant environments |
+| **Application** | Application developers | Zero-config monitoring that "just works" |
+
+Each level builds on the same underlying abstractions, ensuring consistency while providing appropriate complexity for each audience.
+
+## Core Concepts
+
+### AlertSender
+
+An `AlertSender` represents the system that evaluates alert rules and sends notifications. Harmony supports multiple monitoring stacks:
+
+| Sender | Description | Use When |
+|--------|-------------|----------|
+| `OpenshiftClusterAlertSender` | OKD/OpenShift built-in monitoring | Running on OKD/OpenShift |
+| `KubePrometheus` | kube-prometheus-stack via Helm | Standard Kubernetes, need full stack |
+| `Prometheus` | Standalone Prometheus | Custom Prometheus deployment |
+| `RedHatClusterObservability` | RHOB operator | Red Hat managed clusters |
+| `Grafana` | Grafana-managed alerting | Grafana as primary alerting layer |
+
+### AlertReceiver
+
+An `AlertReceiver` defines where alerts are sent (Discord, Slack, email, webhook, etc.). Receivers are parameterized by sender type because each monitoring stack has different configuration formats.
+
+```rust
+pub trait AlertReceiver<S: AlertSender> {
+    fn build(&self) -> Result<ReceiverInstallPlan, InterpretError>;
+    fn name(&self) -> String;
+}
+```
+
+Built-in receivers:
+- `DiscordReceiver` - Discord webhooks
+- `WebhookReceiver` - Generic HTTP webhooks
+
+### AlertRule
+
+An `AlertRule` defines a Prometheus alert expression. Rules are also parameterized by sender to handle different CRD formats.
+
+```rust
+pub trait AlertRule<S: AlertSender> {
+    fn build_rule(&self) -> Result<serde_json::Value, InterpretError>;
+    fn name(&self) -> String;
+}
+```
+
+### Observability Capability
+
+Topologies implement `Observability<S>` to indicate they support a specific alert sender:
+
+```rust
+impl Observability<OpenshiftClusterAlertSender> for K8sAnywhereTopology {
+    async fn install_receivers(&self, sender, inventory, receivers) { ... }
+    async fn install_rules(&self, sender, inventory, rules) { ... }
+    // ...
+}
+```
+
+This provides **compile-time verification**: if you try to use `OpenshiftClusterAlertScore` with a topology that doesn't implement `Observability<OpenshiftClusterAlertSender>`, the code won't compile.
+
+---
+
+## Level 1: Cluster Monitoring
+
+Cluster monitoring is for administrators who need full control over the monitoring infrastructure. This includes:
+- Installing/managing the monitoring stack
+- Configuring cluster-wide alert receivers
+- Defining cluster-level alert rules
+- Adding external scrape targets (e.g., bare-metal servers, firewalls)
+
+### Example: OKD Cluster Alerts
+
+```rust
+use harmony::{
+    modules::monitoring::{
+        alert_channel::discord_alert_channel::DiscordReceiver,
+        alert_rule::{alerts::k8s::pvc::high_pvc_fill_rate_over_two_days, prometheus_alert_rule::AlertManagerRuleGroup},
+        okd::openshift_cluster_alerting_score::OpenshiftClusterAlertScore,
+        scrape_target::prometheus_node_exporter::PrometheusNodeExporter,
+    },
+    topology::{K8sAnywhereTopology, monitoring::{AlertMatcher, AlertRoute, MatchOp}},
+};
+
+let severity_matcher = AlertMatcher {
+    label: "severity".to_string(),
+    operator: MatchOp::Eq,
+    value: "critical".to_string(),
+};
+
+let rule_group = AlertManagerRuleGroup::new(
+    "cluster-rules",
+    vec![high_pvc_fill_rate_over_two_days()],
+);
+
+let external_exporter = PrometheusNodeExporter {
+    job_name: "firewall".to_string(),
+    metrics_path: "/metrics".to_string(),
+    listen_address: ip!("192.168.1.1"),
+    port: 9100,
+    ..Default::default()
+};
+
+harmony_cli::run(
+    Inventory::autoload(),
+    K8sAnywhereTopology::from_env(),
+    vec![Box::new(OpenshiftClusterAlertScore {
+        sender: OpenshiftClusterAlertSender,
+        receivers: vec![Box::new(DiscordReceiver {
+            name: "critical-alerts".to_string(),
+            url: hurl!("https://discord.com/api/webhooks/..."),
+            route: AlertRoute {
+                matchers: vec![severity_matcher],
+                ..AlertRoute::default("critical-alerts".to_string())
+            },
+        })],
+        rules: vec![Box::new(rule_group)],
+        scrape_targets: Some(vec![Box::new(external_exporter)]),
+    })],
+    None,
+).await?;
+```
+
+### What This Does
+
+1. **Enables cluster monitoring** - Activates OKD's built-in Prometheus
+2. **Enables user workload monitoring** - Allows namespace-scoped rules
+3. **Configures Alertmanager** - Adds Discord receiver with route matching
+4. **Deploys alert rules** - Creates an `AlertingRule` resource containing the PVC fill-rate alert
+5. **Adds external scrape target** - Configures Prometheus to scrape the firewall
+
+### Compile-Time Safety
+
+The `OpenshiftClusterAlertScore` requires:
+
+```rust
+impl<T: Topology + Observability<OpenshiftClusterAlertSender>> Score<T>
+    for OpenshiftClusterAlertScore
+```
+
+If `K8sAnywhereTopology` didn't implement `Observability<OpenshiftClusterAlertSender>`, this code would fail to compile. You cannot accidentally deploy OKD alerts to a cluster that doesn't support them.
+
+---
+
+## Level 2: Tenant Monitoring
+
+In multi-tenant clusters, teams are often confined to specific namespaces. Tenant monitoring adapts to this constraint:
+
+- Resources are deployed in the tenant's namespace
+- Cannot modify cluster-level monitoring configuration
+- The topology determines namespace context at runtime
+
+### How It Works
+
+The topology's `Observability` implementation handles tenant scoping:
+
+```rust
+impl Observability<KubePrometheus> for K8sAnywhereTopology {
+    async fn install_rules(&self, sender, inventory, rules) {
+        // Topology knows if it's tenant-scoped
+        let namespace = self.get_tenant_config().await
+            .map(|t| t.name)
+            .unwrap_or_else(|| "monitoring".to_string());
+        
+        // Rules are installed in the appropriate namespace
+        for rule in rules.unwrap_or_default() {
+            let score = KubePrometheusRuleScore {
+                sender: sender.clone(),
+                rule,
+                namespace: namespace.clone(), // Tenant namespace
+            };
+            score.create_interpret().execute(inventory, self).await?;
+        }
+    }
+}
+```
+
+### Tenant vs Cluster Resources
+
+| Resource | Cluster-Level | Tenant-Level |
+|----------|---------------|--------------|
+| Alertmanager config | Global receivers | Namespaced receivers (where supported) |
+| PrometheusRules | Cluster-wide alerts | Namespace alerts only |
+| ServiceMonitors | Any namespace | Own namespace only |
+| External scrape targets | Can add | Cannot add (cluster config) |
+
+### Runtime Validation
+
+Tenant constraints are validated at runtime via Kubernetes RBAC. If a tenant-scoped deployment attempts cluster-level operations, it fails with a clear permission error from the Kubernetes API.
+
+This cannot be fully compile-time because tenant context is determined by who's running the code and what permissions they have—information only available at runtime.
+
+---
+
+## Level 3: Application Monitoring
+
+Application monitoring provides zero-config, opinionated monitoring for developers. Just add the `Monitoring` feature to your application and it works.
+
+### Example
+
+```rust
+use harmony::modules::{
+    application::{Application, ApplicationFeature},
+    monitoring::alert_channel::webhook_receiver::WebhookReceiver,
+};
+
+// Define your application
+let my_app = MyApplication::new();
+
+// Add monitoring as a feature
+let monitoring = Monitoring {
+    application: Arc::new(my_app),
+    alert_receiver: vec![], // Uses defaults
+};
+
+// Install with the application
+my_app.add_feature(monitoring);
+```
+
+### What Application Monitoring Provides
+
+1. **Automatic ServiceMonitor** - Creates a ServiceMonitor for your application's pods
+2. **Ntfy Notification Channel** - Auto-installs and configures Ntfy for push notifications
+3. **Tenant Awareness** - Automatically scopes to the correct namespace
+4. **Sensible Defaults** - Pre-configured alert routes and receivers
+
+### Under the Hood
+
+```rust
+impl<T: Topology + Observability<Prometheus> + TenantManager> 
+    ApplicationFeature<T> for Monitoring 
+{
+    async fn ensure_installed(&self, topology: &T) -> Result<...> {
+        // 1. Get tenant namespace (or use app name)
+        let namespace = topology.get_tenant_config().await
+            .map(|ns| ns.name.clone())
+            .unwrap_or_else(|| self.application.name());
+
+        // 2. Create ServiceMonitor for the app
+        let app_service_monitor = ServiceMonitor {
+            metadata: ObjectMeta {
+                name: Some(self.application.name()),
+                namespace: Some(namespace.clone()),
+                ..Default::default()
+            },
+            spec: ServiceMonitorSpec::default(),
+        };
+
+        // 3. Install Ntfy for notifications
+        let ntfy = NtfyScore { namespace, host };
+        ntfy.interpret(&Inventory::empty(), topology).await?;
+
+        // 4. Wire up webhook receiver to Ntfy
+        let ntfy_receiver = WebhookReceiver { ... };
+        
+        // 5. Execute monitoring score
+        alerting_score.interpret(&Inventory::empty(), topology).await?;
+    }
+}
+```
+
+---
+
+## Pre-Built Alert Rules
+
+Harmony provides a library of common alert rules in `modules/monitoring/alert_rule/alerts/`:
+
+### Kubernetes Alerts (`alerts/k8s/`)
+
+```rust
+use harmony::modules::monitoring::alert_rule::alerts::k8s::{
+    pod::pod_failed,
+    pvc::high_pvc_fill_rate_over_two_days,
+    memory_usage::alert_high_memory_usage,
+};
+
+let rules = AlertManagerRuleGroup::new("k8s-rules", vec![
+    pod_failed(),
+    high_pvc_fill_rate_over_two_days(),
+    alert_high_memory_usage(),
+]);
+```
+
+Available rules:
+- `pod_failed()` - Pod in failed state
+- `alert_container_restarting()` - Container restart loop
+- `alert_pod_not_ready()` - Pod not ready for extended period
+- `high_pvc_fill_rate_over_two_days()` - PVC will fill within 2 days
+- `alert_high_memory_usage()` - Memory usage above threshold
+- `alert_high_cpu_usage()` - CPU usage above threshold
+
+### Infrastructure Alerts (`alerts/infra/`)
+
+```rust
+use harmony::modules::monitoring::alert_rule::alerts::infra::opnsense::high_http_error_rate;
+
+let rules = AlertManagerRuleGroup::new("infra-rules", vec![
+    high_http_error_rate(),
+]);
+```
+
+### Creating Custom Rules
+
+```rust
+use harmony::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;
+
+pub fn my_custom_alert() -> PrometheusAlertRule {
+    PrometheusAlertRule::new("MyServiceDown", "up{job=\"my-service\"} == 0")
+        .for_duration("5m")
+        .label("severity", "critical")
+        .annotation("summary", "My service is down")
+        .annotation("description", "The my-service job has been down for more than 5 minutes")
+}
+```
+
+---
+
+## Alert Receivers
+
+### Discord Webhook
+
+```rust
+use harmony::modules::monitoring::alert_channel::discord_alert_channel::DiscordReceiver;
+use harmony::topology::monitoring::{AlertRoute, AlertMatcher, MatchOp};
+
+let discord = DiscordReceiver {
+    name: "ops-alerts".to_string(),
+    url: hurl!("https://discord.com/api/webhooks/123456/abcdef"),
+    route: AlertRoute {
+        receiver: "ops-alerts".to_string(),
+        matchers: vec![AlertMatcher {
+            label: "severity".to_string(),
+            operator: MatchOp::Eq,
+            value: "critical".to_string(),
+        }],
+        group_by: vec!["alertname".to_string()],
+        repeat_interval: Some("30m".to_string()),
+        continue_matching: false,
+        children: vec![],
+    },
+};
+```
+
+### Generic Webhook
+
+```rust
+use harmony::modules::monitoring::alert_channel::webhook_receiver::WebhookReceiver;
+
+let webhook = WebhookReceiver {
+    name: "custom-webhook".to_string(),
+    url: hurl!("https://api.example.com/alerts"),
+    route: AlertRoute::default("custom-webhook".to_string()),
+};
+```
+
+---
+
+## Adding a New Monitoring Stack
+
+To add support for a new monitoring stack:
+
+1. **Create the sender type** in `modules/monitoring/my_sender/mod.rs`:
+   ```rust
+   #[derive(Debug, Clone)]
+   pub struct MySender;
+   
+   impl AlertSender for MySender {
+       fn name(&self) -> String { "MySender".to_string() }
+   }
+   ```
+
+2. **Define CRD types** in `modules/monitoring/my_sender/crd/`:
+   ```rust
+   #[derive(CustomResource, Debug, Serialize, Deserialize, Clone)]
+   #[kube(group = "monitoring.example.com", version = "v1", kind = "MyAlertRule")]
+   pub struct MyAlertRuleSpec { ... }
+   ```
+
+3. **Implement Observability** in `domain/topology/k8s_anywhere/observability/my_sender.rs`:
+   ```rust
+   impl Observability<MySender> for K8sAnywhereTopology {
+       async fn install_receivers(&self, sender, inventory, receivers) { ... }
+       async fn install_rules(&self, sender, inventory, rules) { ... }
+       // ...
+   }
+   ```
+
+4. **Implement receiver conversions** for existing receivers:
+   ```rust
+   impl AlertReceiver<MySender> for DiscordReceiver {
+       fn build(&self) -> Result<ReceiverInstallPlan, InterpretError> {
+           // Convert DiscordReceiver to MySender's format
+       }
+   }
+   ```
+
+5. **Create score types**:
+   ```rust
+   pub struct MySenderAlertScore {
+       pub sender: MySender,
+       pub receivers: Vec<Box<dyn AlertReceiver<MySender>>>,
+       pub rules: Vec<Box<dyn AlertRule<MySender>>>,
+   }
+   ```
+
+---
+
+## Architecture Principles
+
+### Type Safety Over Flexibility
+
+Each monitoring stack has distinct CRDs and configuration formats. Rather than a unified "MonitoringStack" type that loses stack-specific features, we use generic traits that provide type safety while allowing each stack to express its unique configuration.
+
+### Compile-Time Capability Verification
+
+The `Observability<S>` bound ensures you can't deploy OKD alerts to a KubePrometheus cluster. The compiler catches platform mismatches before deployment.
+
+### Explicit Over Implicit
+
+Monitoring stacks are chosen explicitly (`OpenshiftClusterAlertSender` vs `KubePrometheus`). There's no "auto-detection" that could lead to surprising behavior.
+
+### Three Levels, One Foundation
+
+Cluster, tenant, and application monitoring all use the same traits (`AlertSender`, `AlertReceiver`, `AlertRule`). The difference is in how scores are constructed and how topologies interpret them.
+
+---
+
+## Related Documentation
+
+- [ADR-020: Monitoring and Alerting Architecture](../adr/020-monitoring-alerting-architecture.md)
+- [ADR-013: Monitoring Notifications (ntfy)](../adr/013-monitoring-notifications.md)
+- [ADR-011: Multi-Tenant Cluster Architecture](../adr/011-multi-tenant-cluster.md)
+- [Coding Guide](coding-guide.md)
+- [Core Concepts](concepts.md)
--- a/examples/application_monitoring_with_tenant/src/main.rs
+++ b/examples/application_monitoring_with_tenant/src/main.rs
@@ -7,7 +7,7 @@ use harmony::{
        monitoring::alert_channel::webhook_receiver::WebhookReceiver,
        tenant::TenantScore,
    },
-    topology::{K8sAnywhereTopology, tenant::TenantConfig},
+    topology::{K8sAnywhereTopology, monitoring::AlertRoute, tenant::TenantConfig},
 };
 use harmony_types::id::Id;
 use harmony_types::net::Url;
@@ -33,9 +33,14 @@ async fn main() {
        service_port: 3000,
    });

+    let receiver_name = "sample-webhook-receiver".to_string();
+
    let webhook_receiver = WebhookReceiver {
-        name: "sample-webhook-receiver".to_string(),
+        name: receiver_name.clone(),
        url: Url::Url(url::Url::parse("https://webhook-doesnt-exist.com").unwrap()),
+        route: AlertRoute {
+            ..AlertRoute::default(receiver_name)
+        },
    };

    let app = ApplicationScore {
--- a/examples/brocade_switch/src/main.rs
+++ b/examples/brocade_switch/src/main.rs
@@ -1,22 +1,28 @@
 use std::str::FromStr;

-use async_trait::async_trait;
 use brocade::{BrocadeOptions, PortOperatingMode};
 use harmony::{
-    data::Version,
-    infra::brocade::BrocadeSwitchClient,
-    interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome},
+    infra::brocade::BrocadeSwitchConfig,
    inventory::Inventory,
-    score::Score,
-    topology::{
-        HostNetworkConfig, PortConfig, PreparationError, PreparationOutcome, Switch, SwitchClient,
-        SwitchError, Topology,
-    },
+    modules::brocade::{BrocadeSwitchAuth, BrocadeSwitchScore, SwitchTopology},
 };
 use harmony_macros::ip;
-use harmony_types::{id::Id, net::MacAddress, switch::PortLocation};
-use log::{debug, info};
-use serde::Serialize;
+use harmony_types::{id::Id, switch::PortLocation};
+
+fn get_switch_config() -> BrocadeSwitchConfig {
+    let mut options = BrocadeOptions::default();
+    options.ssh.port = 2222; // override only the SSH port; every other option keeps its default
+    let auth = BrocadeSwitchAuth {
+        username: "admin".to_string(),
+        password: "password".to_string(), // literal credentials; NOTE(review): presumably for a local test switch only — confirm
+    };
+
+    BrocadeSwitchConfig {
+        ips: vec![ip!("127.0.0.1")], // a single switch address, on loopback
+        auth,
+        options,
+    }
+}

 #[tokio::main]
 async fn main() {
@@ -32,126 +38,13 @@ async fn main() {
            (PortLocation(1, 0, 18), PortOperatingMode::Trunk),
        ],
    };
+
    harmony_cli::run(
        Inventory::autoload(),
-        SwitchTopology::new().await,
+        SwitchTopology::new(get_switch_config()).await,
        vec![Box::new(switch_score)],
        None,
    )
    .await
    .unwrap();
 }
-
-#[derive(Clone, Debug, Serialize)]
-struct BrocadeSwitchScore {
-    port_channels_to_clear: Vec<Id>,
-    ports_to_configure: Vec<PortConfig>,
-}
-
-impl<T: Topology + Switch> Score<T> for BrocadeSwitchScore {
-    fn name(&self) -> String {
-        "BrocadeSwitchScore".to_string()
-    }
-
-    #[doc(hidden)]
-    fn create_interpret(&self) -> Box<dyn Interpret<T>> {
-        Box::new(BrocadeSwitchInterpret {
-            score: self.clone(),
-        })
-    }
-}
-
-#[derive(Debug)]
-struct BrocadeSwitchInterpret {
-    score: BrocadeSwitchScore,
-}
-
-#[async_trait]
-impl<T: Topology + Switch> Interpret<T> for BrocadeSwitchInterpret {
-    async fn execute(
-        &self,
-        _inventory: &Inventory,
-        topology: &T,
-    ) -> Result<Outcome, InterpretError> {
-        info!("Applying switch configuration {:?}", self.score);
-        debug!(
-            "Clearing port channel {:?}",
-            self.score.port_channels_to_clear
-        );
-        topology
-            .clear_port_channel(&self.score.port_channels_to_clear)
-            .await
-            .map_err(|e| InterpretError::new(e.to_string()))?;
-        debug!("Configuring interfaces {:?}", self.score.ports_to_configure);
-        topology
-            .configure_interface(&self.score.ports_to_configure)
-            .await
-            .map_err(|e| InterpretError::new(e.to_string()))?;
-        Ok(Outcome::success("switch configured".to_string()))
-    }
-    fn get_name(&self) -> InterpretName {
-        InterpretName::Custom("BrocadeSwitchInterpret")
-    }
-    fn get_version(&self) -> Version {
-        todo!()
-    }
-    fn get_status(&self) -> InterpretStatus {
-        todo!()
-    }
-    fn get_children(&self) -> Vec<Id> {
-        todo!()
-    }
-}
-
-struct SwitchTopology {
-    client: Box<dyn SwitchClient>,
-}
-
-#[async_trait]
-impl Topology for SwitchTopology {
-    fn name(&self) -> &str {
-        "SwitchTopology"
-    }
-
-    async fn ensure_ready(&self) -> Result<PreparationOutcome, PreparationError> {
-        Ok(PreparationOutcome::Noop)
-    }
-}
-
-impl SwitchTopology {
-    async fn new() -> Self {
-        let mut options = BrocadeOptions::default();
-        options.ssh.port = 2222;
-        let client =
-            BrocadeSwitchClient::init(&vec![ip!("127.0.0.1")], &"admin", &"password", options)
-                .await
-                .expect("Failed to connect to switch");
-
-        let client = Box::new(client);
-        Self { client }
-    }
-}
-
-#[async_trait]
-impl Switch for SwitchTopology {
-    async fn setup_switch(&self) -> Result<(), SwitchError> {
-        todo!()
-    }
-
-    async fn get_port_for_mac_address(
-        &self,
-        _mac_address: &MacAddress,
-    ) -> Result<Option<PortLocation>, SwitchError> {
-        todo!()
-    }
-
-    async fn configure_port_channel(&self, _config: &HostNetworkConfig) -> Result<(), SwitchError> {
-        todo!()
-    }
-    async fn clear_port_channel(&self, ids: &Vec<Id>) -> Result<(), SwitchError> {
-        self.client.clear_port_channel(ids).await
-    }
-    async fn configure_interface(&self, ports: &Vec<PortConfig>) -> Result<(), SwitchError> {
-        self.client.configure_interface(ports).await
-    }
-}
--- a/examples/cert_manager/Cargo.toml
+++ b/examples/cert_manager/Cargo.toml
@@ -1,5 +1,5 @@
 [package]
-name = "example-nanodc"
+name = "cert_manager"
 edition = "2024"
 version.workspace = true
 readme.workspace = true
@@ -8,14 +8,12 @@ publish = false

 [dependencies]
 harmony = { path = "../../harmony" }
-harmony_tui = { path = "../../harmony_tui" }
+harmony_cli = { path = "../../harmony_cli" }
 harmony_types = { path = "../../harmony_types" }
 cidr = { workspace = true }
 tokio = { workspace = true }
 harmony_macros = { path = "../../harmony_macros" }
-harmony_secret = { path = "../../harmony_secret" }
 log = { workspace = true }
 env_logger = { workspace = true }
 url = { workspace = true }
-serde = { workspace = true }
-brocade = { path = "../../brocade" }
+assert_cmd = "2.0.16"
--- a/examples/cert_manager/src/main.rs
+++ b/examples/cert_manager/src/main.rs
@@ -0,0 +1,42 @@
+use harmony::{
+    inventory::Inventory,
+    modules::cert_manager::{
+        capability::CertificateManagementConfig, score_certificate::CertificateScore,
+        score_issuer::CertificateIssuerScore,
+    },
+    topology::K8sAnywhereTopology,
+};
+
+#[tokio::main]
+async fn main() {
+    let config = CertificateManagementConfig {
+        namespace: Some("test".to_string()),
+        acme_issuer: None,
+        ca_issuer: None,
+        self_signed: true, // only the self-signed option is set; no ACME or CA issuer
+    };
+
+    let issuer_name = "test-self-signed-issuer".to_string();
+    let issuer = CertificateIssuerScore {
+        issuer_name: issuer_name.clone(), // cloned: the certificate score below takes the original
+        config: config.clone(), // cloned for the same reason
+    };
+
+    let cert = CertificateScore {
+        config, // last use of `config`, so it is moved rather than cloned again
+        issuer_name, // same name as the issuer score above
+        cert_name: "test-self-signed-cert".to_string(),
+        common_name: None,
+        dns_names: Some(vec!["test.dns.name".to_string()]),
+        is_ca: Some(false),
+    };
+
+    harmony_cli::run(
+        Inventory::autoload(),
+        K8sAnywhereTopology::from_env(),
+        vec![Box::new(issuer), Box::new(cert)], // issuer is listed before the certificate that names it
+        None,
+    )
+    .await
+    .unwrap(); // the example simply panics if the run fails
+}
--- a/examples/ha_cluster/README.md
+++ b/examples/ha_cluster/README.md
@@ -1,15 +0,0 @@
-## OPNSense demo
-
-Download the virtualbox snapshot from {{TODO URL}}
-
-Start the virtualbox image
-
-This virtualbox image is configured to use a bridge on the host's physical interface, make sure the bridge is up and the virtual machine can reach internet.
-
-Credentials are opnsense default (root/opnsense)
-
-Run the project with the correct ip address on the command line :
-
-```bash
-cargo run -p example-opnsense -- 192.168.5.229
-```
--- a/examples/ha_cluster/src/main.rs
+++ b/examples/ha_cluster/src/main.rs
@@ -1,143 +0,0 @@
-use std::{
-    net::{IpAddr, Ipv4Addr},
-    sync::{Arc, OnceLock},
-};
-
-use brocade::BrocadeOptions;
-use cidr::Ipv4Cidr;
-use harmony::{
-    hardware::{HostCategory, Location, PhysicalHost, SwitchGroup},
-    infra::{brocade::BrocadeSwitchClient, opnsense::OPNSenseManagementInterface},
-    inventory::Inventory,
-    modules::{
-        dummy::{ErrorScore, PanicScore, SuccessScore},
-        http::StaticFilesHttpScore,
-        okd::{dhcp::OKDDhcpScore, dns::OKDDnsScore, load_balancer::OKDLoadBalancerScore},
-        opnsense::OPNsenseShellCommandScore,
-        tftp::TftpScore,
-    },
-    topology::{LogicalHost, UnmanagedRouter},
-};
-use harmony_macros::{ip, mac_address};
-use harmony_secret::{Secret, SecretManager};
-use harmony_types::net::Url;
-use serde::{Deserialize, Serialize};
-
-#[tokio::main]
-async fn main() {
-    let firewall = harmony::topology::LogicalHost {
-        ip: ip!("192.168.5.229"),
-        name: String::from("opnsense-1"),
-    };
-
-    let switch_auth = SecretManager::get_or_prompt::<BrocadeSwitchAuth>()
-        .await
-        .expect("Failed to get credentials");
-
-    let switches: Vec<IpAddr> = vec![ip!("192.168.5.101")]; // TODO: Adjust me
-    let brocade_options = BrocadeOptions {
-        dry_run: *harmony::config::DRY_RUN,
-        ..Default::default()
-    };
-    let switch_client = BrocadeSwitchClient::init(
-        &switches,
-        &switch_auth.username,
-        &switch_auth.password,
-        brocade_options,
-    )
-    .await
-    .expect("Failed to connect to switch");
-
-    let switch_client = Arc::new(switch_client);
-
-    let opnsense = Arc::new(
-        harmony::infra::opnsense::OPNSenseFirewall::new(firewall, None, "root", "opnsense").await,
-    );
-    let lan_subnet = Ipv4Addr::new(10, 100, 8, 0);
-    let gateway_ipv4 = Ipv4Addr::new(10, 100, 8, 1);
-    let gateway_ip = IpAddr::V4(gateway_ipv4);
-    let topology = harmony::topology::HAClusterTopology {
-        kubeconfig: None,
-        domain_name: "demo.harmony.mcd".to_string(),
-        router: Arc::new(UnmanagedRouter::new(
-            gateway_ip,
-            Ipv4Cidr::new(lan_subnet, 24).unwrap(),
-        )),
-        load_balancer: opnsense.clone(),
-        firewall: opnsense.clone(),
-        tftp_server: opnsense.clone(),
-        http_server: opnsense.clone(),
-        dhcp_server: opnsense.clone(),
-        dns_server: opnsense.clone(),
-        control_plane: vec![LogicalHost {
-            ip: ip!("10.100.8.20"),
-            name: "cp0".to_string(),
-        }],
-        bootstrap_host: LogicalHost {
-            ip: ip!("10.100.8.20"),
-            name: "cp0".to_string(),
-        },
-        workers: vec![],
-        switch_client: switch_client.clone(),
-        node_exporter: opnsense.clone(),
-        network_manager: OnceLock::new(),
-    };
-
-    let inventory = Inventory {
-        location: Location::new(
-            "232 des Éperviers, Wendake, Qc, G0A 4V0".to_string(),
-            "wk".to_string(),
-        ),
-        switch: SwitchGroup::from([]),
-        firewall_mgmt: Box::new(OPNSenseManagementInterface::new()),
-        storage_host: vec![],
-        worker_host: vec![],
-        control_plane_host: vec![
-            PhysicalHost::empty(HostCategory::Server)
-                .mac_address(mac_address!("08:00:27:62:EC:C3")),
-        ],
-    };
-
-    // TODO regroup smaller scores in a larger one such as this
-    // let okd_boostrap_preparation();
-
-    let dhcp_score = OKDDhcpScore::new(&topology, &inventory);
-    let dns_score = OKDDnsScore::new(&topology);
-    let load_balancer_score = OKDLoadBalancerScore::new(&topology);
-
-    let tftp_score = TftpScore::new(Url::LocalFolder("./data/watchguard/tftpboot".to_string()));
-    let http_score = StaticFilesHttpScore {
-        folder_to_serve: Some(Url::LocalFolder(
-            "./data/watchguard/pxe-http-files".to_string(),
-        )),
-        files: vec![],
-        remote_path: None,
-    };
-
-    harmony_tui::run(
-        inventory,
-        topology,
-        vec![
-            Box::new(dns_score),
-            Box::new(dhcp_score),
-            Box::new(load_balancer_score),
-            Box::new(tftp_score),
-            Box::new(http_score),
-            Box::new(OPNsenseShellCommandScore {
-                opnsense: opnsense.get_opnsense_config(),
-                command: "touch /tmp/helloharmonytouching".to_string(),
-            }),
-            Box::new(SuccessScore {}),
-            Box::new(ErrorScore {}),
-            Box::new(PanicScore {}),
-        ],
-    )
-    .await
-    .unwrap();
-}
-
-#[derive(Secret, Serialize, Deserialize, Debug)]
-pub struct BrocadeSwitchAuth {
-    pub username: String,
-    pub password: String,
-}
--- a/examples/k8s_drain_node/Cargo.toml
+++ b/examples/k8s_drain_node/Cargo.toml
@@ -0,0 +1,21 @@
+[package]
+name = "example-k8s-drain-node"
+edition = "2024"
+version.workspace = true
+readme.workspace = true
+license.workspace = true
+publish = false
+
+[dependencies]
+harmony = { path = "../../harmony" }
+harmony_cli = { path = "../../harmony_cli" }
+harmony_types = { path = "../../harmony_types" }
+harmony_macros = { path = "../../harmony_macros" }
+harmony-k8s = { path = "../../harmony-k8s" }
+cidr.workspace = true
+tokio.workspace = true
+log.workspace = true
+env_logger.workspace = true
+url.workspace = true
+assert_cmd = "2.0.16"
+inquire.workspace = true
--- a/examples/k8s_drain_node/src/main.rs
+++ b/examples/k8s_drain_node/src/main.rs
@@ -0,0 +1,61 @@
+use std::time::Duration;
+
+use harmony_k8s::{DrainOptions, K8sClient};
+use log::{info, trace};
+
+#[tokio::main]
+async fn main() {
+    env_logger::init();
+    let k8s = K8sClient::try_default().await.unwrap();
+    let nodes = k8s.get_nodes(None).await.unwrap();
+    trace!("Got nodes : {nodes:#?}");
+    let node_names = nodes
+        .iter()
+        .map(|n| n.metadata.name.as_ref().unwrap()) // panics if a node has no metadata.name
+        .collect::<Vec<&String>>();
+
+    info!("Got nodes : {:?}", node_names);
+
+    let node_name = inquire::Select::new("What node do you want to operate on?", node_names)
+        .prompt()
+        .unwrap();
+
+    let drain = inquire::Confirm::new("Do you wish to drain the node now ?")
+        .prompt()
+        .unwrap();
+
+    if drain {
+        let mut options = DrainOptions::default_ignore_daemonset_delete_emptydir_data();
+        options.timeout = Duration::from_secs(1); // NOTE(review): a 1 s drain timeout looks very short — confirm it is intentional
+        k8s.drain_node(&node_name, &options).await.unwrap();
+
+        info!("Node {node_name} successfully drained");
+    }
+
+    let uncordon =
+        inquire::Confirm::new("Do you wish to uncordon node to resume scheduling workloads now?")
+            .prompt()
+            .unwrap();
+
+    if uncordon {
+        info!("Uncordoning node {node_name}");
+        k8s.uncordon_node(node_name).await.unwrap();
+        info!("Node {node_name} uncordoned");
+    }
+
+    let reboot = inquire::Confirm::new("Do you wish to reboot node now?")
+        .prompt()
+        .unwrap();
+
+    if reboot {
+        k8s.reboot_node(
+            &node_name,
+            &DrainOptions::default_ignore_daemonset_delete_emptydir_data(), // fresh defaults: the 1 s timeout override above is not applied here
+            Duration::from_secs(3600), // one hour; NOTE(review): presumably the reboot wait/timeout — confirm against reboot_node
+        )
+        .await
+        .unwrap();
+    }
+
+    info!("All done playing with nodes, happy harmonizing!");
+}
--- a/examples/k8s_write_file_on_node/Cargo.toml
+++ b/examples/k8s_write_file_on_node/Cargo.toml
@@ -0,0 +1,21 @@
+[package]
+name = "example-k8s-write-file-on-node"
+edition = "2024"
+version.workspace = true
+readme.workspace = true
+license.workspace = true
+publish = false
+
+[dependencies]
+harmony = { path = "../../harmony" }
+harmony_cli = { path = "../../harmony_cli" }
+harmony_types = { path = "../../harmony_types" }
+harmony_macros = { path = "../../harmony_macros" }
+harmony-k8s = { path = "../../harmony-k8s" }
+cidr.workspace = true
+tokio.workspace = true
+log.workspace = true
+env_logger.workspace = true
+url.workspace = true
+assert_cmd = "2.0.16"
+inquire.workspace = true
--- a/examples/k8s_write_file_on_node/src/main.rs
+++ b/examples/k8s_write_file_on_node/src/main.rs
@@ -0,0 +1,45 @@
+use harmony_k8s::{K8sClient, NodeFile};
+use log::{info, trace};
+
+#[tokio::main]
+async fn main() {
+    env_logger::init();
+    let k8s = K8sClient::try_default().await.unwrap();
+    let nodes = k8s.get_nodes(None).await.unwrap();
+    trace!("Got nodes : {nodes:#?}");
+    let node_names = nodes
+        .iter()
+        .map(|n| n.metadata.name.as_ref().unwrap()) // panics if a node has no metadata.name
+        .collect::<Vec<&String>>();
+
+    info!("Got nodes : {:?}", node_names);
+
+    let node = inquire::Select::new("What node do you want to write file to?", node_names)
+        .prompt()
+        .unwrap();
+
+    let path = inquire::Text::new("File path on node").prompt().unwrap();
+    let content = inquire::Text::new("File content").prompt().unwrap();
+
+    let node_file = NodeFile {
+        path,
+        content,
+        mode: 0o600, // octal literal; presumably a Unix permission mode — confirm against NodeFile
+    };
+
+    k8s.write_files_to_node(&node, &vec![node_file.clone()]) // clone: node_file is read again in the final log line
+        .await
+        .unwrap();
+
+    let cmd = inquire::Text::new("Command to run on node")
+        .prompt()
+        .unwrap();
+    k8s.run_privileged_command_on_node(&node, &cmd)
+        .await
+        .unwrap();
+
+    info!(
+        "File {} mode {:o} written in node {node}",
+        node_file.path, node_file.mode // {:o} prints the mode in octal (600) to match the 0o600 literal, not decimal 384
+    );
+}
--- a/examples/monitoring/src/main.rs
+++ b/examples/monitoring/src/main.rs
@@ -1,37 +1,45 @@
-use std::collections::HashMap;
+use std::{
+    collections::HashMap,
+    sync::{Arc, Mutex},
+};

 use harmony::{
    inventory::Inventory,
-    modules::{
-        monitoring::{
-            alert_channel::discord_alert_channel::DiscordWebhook,
-            alert_rule::prometheus_alert_rule::AlertManagerRuleGroup,
-            kube_prometheus::{
-                helm_prometheus_alert_score::HelmPrometheusAlertingScore,
-                types::{
-                    HTTPScheme, MatchExpression, Operator, Selector, ServiceMonitor,
-                    ServiceMonitorEndpoint,
+    modules::monitoring::{
+        alert_channel::discord_alert_channel::DiscordReceiver,
+        alert_rule::{
+            alerts::{
+                infra::dell_server::{
+                    alert_global_storage_status_critical,
+                    alert_global_storage_status_non_recoverable,
+                    global_storage_status_degraded_non_critical,
                },
+                k8s::pvc::high_pvc_fill_rate_over_two_days,
            },
+            prometheus_alert_rule::AlertManagerRuleGroup,
        },
-        prometheus::alerts::{
-            infra::dell_server::{
-                alert_global_storage_status_critical, alert_global_storage_status_non_recoverable,
-                global_storage_status_degraded_non_critical,
+        kube_prometheus::{
+            helm::config::KubePrometheusConfig,
+            kube_prometheus_alerting_score::KubePrometheusAlertingScore,
+            types::{
+                HTTPScheme, MatchExpression, Operator, Selector, ServiceMonitor,
+                ServiceMonitorEndpoint,
            },
-            k8s::pvc::high_pvc_fill_rate_over_two_days,
        },
    },
-    topology::K8sAnywhereTopology,
+    topology::{K8sAnywhereTopology, monitoring::AlertRoute},
 };
 use harmony_types::{k8s_name::K8sName, net::Url};

 #[tokio::main]
 async fn main() {
-    let discord_receiver = DiscordWebhook {
-        name: K8sName("test-discord".to_string()),
+    let receiver_name = "test-discord".to_string();
+    let discord_receiver = DiscordReceiver {
+        name: receiver_name.clone(),
        url: Url::Url(url::Url::parse("https://discord.doesnt.exist.com").unwrap()),
-        selectors: vec![],
+        route: AlertRoute {
+            ..AlertRoute::default(receiver_name)
+        },
    };

    let high_pvc_fill_rate_over_two_days_alert = high_pvc_fill_rate_over_two_days();
@@ -70,10 +78,15 @@ async fn main() {
        endpoints: vec![service_monitor_endpoint],
        ..Default::default()
    };
-    let alerting_score = HelmPrometheusAlertingScore {
+
+    let config = Arc::new(Mutex::new(KubePrometheusConfig::new()));
+
+    let alerting_score = KubePrometheusAlertingScore {
        receivers: vec![Box::new(discord_receiver)],
        rules: vec![Box::new(additional_rules), Box::new(additional_rules2)],
        service_monitors: vec![service_monitor],
+        scrape_targets: None,
+        config,
    };

    harmony_cli::run(
--- a/examples/monitoring_with_tenant/src/main.rs
+++ b/examples/monitoring_with_tenant/src/main.rs
@@ -1,24 +1,32 @@
-use std::{collections::HashMap, str::FromStr};
+use std::{
+    collections::HashMap,
+    str::FromStr,
+    sync::{Arc, Mutex},
+};

 use harmony::{
    inventory::Inventory,
    modules::{
        monitoring::{
-            alert_channel::discord_alert_channel::DiscordWebhook,
-            alert_rule::prometheus_alert_rule::AlertManagerRuleGroup,
+            alert_channel::discord_alert_channel::DiscordReceiver,
+            alert_rule::{
+                alerts::k8s::pvc::high_pvc_fill_rate_over_two_days,
+                prometheus_alert_rule::AlertManagerRuleGroup,
+            },
            kube_prometheus::{
-                helm_prometheus_alert_score::HelmPrometheusAlertingScore,
+                helm::config::KubePrometheusConfig,
+                kube_prometheus_alerting_score::KubePrometheusAlertingScore,
                types::{
                    HTTPScheme, MatchExpression, Operator, Selector, ServiceMonitor,
                    ServiceMonitorEndpoint,
                },
            },
        },
-        prometheus::alerts::k8s::pvc::high_pvc_fill_rate_over_two_days,
        tenant::TenantScore,
    },
    topology::{
        K8sAnywhereTopology,
+        monitoring::AlertRoute,
        tenant::{ResourceLimits, TenantConfig, TenantNetworkPolicy},
    },
 };
@@ -42,10 +50,13 @@ async fn main() {
        },
    };

-    let discord_receiver = DiscordWebhook {
-        name: K8sName("test-discord".to_string()),
+    let receiver_name = "test-discord".to_string();
+    let discord_receiver = DiscordReceiver {
+        name: receiver_name.clone(),
        url: Url::Url(url::Url::parse("https://discord.doesnt.exist.com").unwrap()),
-        selectors: vec![],
+        route: AlertRoute {
+            ..AlertRoute::default(receiver_name)
+        },
    };

    let high_pvc_fill_rate_over_two_days_alert = high_pvc_fill_rate_over_two_days();
@@ -74,10 +85,14 @@ async fn main() {
        ..Default::default()
    };

-    let alerting_score = HelmPrometheusAlertingScore {
+    let config = Arc::new(Mutex::new(KubePrometheusConfig::new()));
+
+    let alerting_score = KubePrometheusAlertingScore {
        receivers: vec![Box::new(discord_receiver)],
        rules: vec![Box::new(additional_rules)],
        service_monitors: vec![service_monitor],
+        scrape_targets: None,
+        config,
    };

    harmony_cli::run(
--- a/examples/nanodc/rook-cephcluster/install-rook-cephcluster.sh
+++ b/examples/nanodc/rook-cephcluster/install-rook-cephcluster.sh
@@ -1,4 +0,0 @@
-#!/bin/bash
-
-helm install --create-namespace --namespace rook-ceph rook-ceph-cluster \
-   --set operatorNamespace=rook-ceph rook-release/rook-ceph-cluster -f values.yaml
--- a/examples/nanodc/rook-cephcluster/values.yaml
+++ b/examples/nanodc/rook-cephcluster/values.yaml
@@ -1,721 +0,0 @@
-# Default values for a single rook-ceph cluster
-# This is a YAML-formatted file.
-# Declare variables to be passed into your templates.
-
-# -- Namespace of the main rook operator
-operatorNamespace: rook-ceph
-
-# -- The metadata.name of the CephCluster CR
-# @default -- The same as the namespace
-clusterName:
-
-# -- Optional override of the target kubernetes version
-kubeVersion:
-
-# -- Cluster ceph.conf override
-configOverride:
-# configOverride: |
-#   [global]
-#   mon_allow_pool_delete = true
-#   osd_pool_default_size = 3
-#   osd_pool_default_min_size = 2
-
-# Installs a debugging toolbox deployment
-toolbox:
-  # -- Enable Ceph debugging pod deployment. See [toolbox](../Troubleshooting/ceph-toolbox.md)
-  enabled: true
-  # -- Toolbox image, defaults to the image used by the Ceph cluster
-  image: #quay.io/ceph/ceph:v19.2.2
-  # -- Toolbox tolerations
-  tolerations: []
-  # -- Toolbox affinity
-  affinity: {}
-  # -- Toolbox container security context
-  containerSecurityContext:
-    runAsNonRoot: true
-    runAsUser: 2016
-    runAsGroup: 2016
-    capabilities:
-      drop: ["ALL"]
-  # -- Toolbox resources
-  resources:
-    limits:
-      memory: "1Gi"
-    requests:
-      cpu: "100m"
-      memory: "128Mi"
-  # -- Set the priority class for the toolbox if desired
-  priorityClassName:
-
-monitoring:
-  # -- Enable Prometheus integration, will also create necessary RBAC rules to allow Operator to create ServiceMonitors.
-  # Monitoring requires Prometheus to be pre-installed
-  enabled: false
-  # -- Whether to disable the metrics reported by Ceph. If false, the prometheus mgr module and Ceph exporter are enabled
-  metricsDisabled: false
-  # -- Whether to create the Prometheus rules for Ceph alerts
-  createPrometheusRules: false
-  # -- The namespace in which to create the prometheus rules, if different from the rook cluster namespace.
-  # If you have multiple rook-ceph clusters in the same k8s cluster, choose the same namespace (ideally, namespace with prometheus
-  # deployed) to set rulesNamespaceOverride for all the clusters. Otherwise, you will get duplicate alerts with multiple alert definitions.
-  rulesNamespaceOverride:
-  # Monitoring settings for external clusters:
-  # externalMgrEndpoints: <list of endpoints>
-  # externalMgrPrometheusPort: <port>
-  # Scrape interval for prometheus
-  # interval: 10s
-  # allow adding custom labels and annotations to the prometheus rule
-  prometheusRule:
-    # -- Labels applied to PrometheusRule
-    labels: {}
-    # -- Annotations applied to PrometheusRule
-    annotations: {}
-
-# -- Create & use PSP resources. Set this to the same value as the rook-ceph chart.
-pspEnable: false
-
-# imagePullSecrets option allow to pull docker images from private docker registry. Option will be passed to all service accounts.
-# imagePullSecrets:
-# - name: my-registry-secret
-
-# All values below are taken from the CephCluster CRD
-# -- Cluster configuration.
-# @default -- See [below](#ceph-cluster-spec)
-cephClusterSpec:
-  # This cluster spec example is for a converged cluster where all the Ceph daemons are running locally,
-  # as in the host-based example (cluster.yaml). For a different configuration such as a
-  # PVC-based cluster (cluster-on-pvc.yaml), external cluster (cluster-external.yaml),
-  # or stretch cluster (cluster-stretched.yaml), replace this entire `cephClusterSpec`
-  # with the specs from those examples.
-
-  # For more details, check https://rook.io/docs/rook/v1.10/CRDs/Cluster/ceph-cluster-crd/
-  cephVersion:
-    # The container image used to launch the Ceph daemon pods (mon, mgr, osd, mds, rgw).
-    # v18 is Reef, v19 is Squid
-    # RECOMMENDATION: In production, use a specific version tag instead of the general v18 flag, which pulls the latest release and could result in different
-    # versions running within the cluster. See tags available at https://hub.docker.com/r/ceph/ceph/tags/.
-    # If you want to be more precise, you can always use a timestamp tag such as quay.io/ceph/ceph:v19.2.2-20250409
-    # This tag might not contain a new Ceph version, just security fixes from the underlying operating system, which will reduce vulnerabilities
-    image: quay.io/ceph/ceph:v19.2.2
-    # Whether to allow unsupported versions of Ceph. Currently Reef and Squid are supported.
-    # Future versions such as Tentacle (v20) would require this to be set to `true`.
-    # Do not set to true in production.
-    allowUnsupported: false
-
-  # The path on the host where configuration files will be persisted. Must be specified. If there are multiple clusters, the directory must be unique for each cluster.
-  # Important: if you reinstall the cluster, make sure you delete this directory from each host or else the mons will fail to start on the new cluster.
-  # In Minikube, the '/data' directory is configured to persist across reboots. Use "/data/rook" in Minikube environment.
-  dataDirHostPath: /var/lib/rook
-
-  # Whether or not upgrade should continue even if a check fails
-  # This means Ceph's status could be degraded and we don't recommend upgrading but you might decide otherwise
-  # Use at your OWN risk
-  # To understand Rook's upgrade process of Ceph, read https://rook.io/docs/rook/v1.10/Upgrade/ceph-upgrade/
-  skipUpgradeChecks: false
-
-  # Whether or not continue if PGs are not clean during an upgrade
-  continueUpgradeAfterChecksEvenIfNotHealthy: false
-
-  # WaitTimeoutForHealthyOSDInMinutes defines the time (in minutes) the operator would wait before an OSD can be stopped for upgrade or restart.
-  # If the timeout exceeds and OSD is not ok to stop, then the operator would skip upgrade for the current OSD and proceed with the next one
-  # if `continueUpgradeAfterChecksEvenIfNotHealthy` is `false`. If `continueUpgradeAfterChecksEvenIfNotHealthy` is `true`, then operator would
-  # continue with the upgrade of an OSD even if its not ok to stop after the timeout. This timeout won't be applied if `skipUpgradeChecks` is `true`.
-  # The default wait timeout is 10 minutes.
-  waitTimeoutForHealthyOSDInMinutes: 10
-
-  # Whether or not requires PGs are clean before an OSD upgrade. If set to `true` OSD upgrade process won't start until PGs are healthy.
-  # This configuration will be ignored if `skipUpgradeChecks` is `true`.
-  # Default is false.
-  upgradeOSDRequiresHealthyPGs: false
-
-  mon:
-    # Set the number of mons to be started. Generally recommended to be 3.
-    # For highest availability, an odd number of mons should be specified.
-    count: 3
-    # The mons should be on unique nodes. For production, at least 3 nodes are recommended for this reason.
-    # Mons should only be allowed on the same node for test environments where data loss is acceptable.
-    allowMultiplePerNode: false
-
-  mgr:
-    # When higher availability of the mgr is needed, increase the count to 2.
-    # In that case, one mgr will be active and one in standby. When Ceph updates which
-    # mgr is active, Rook will update the mgr services to match the active mgr.
-    count: 2
-    allowMultiplePerNode: false
-    modules:
-      # List of modules to optionally enable or disable.
-      # Note the "dashboard" and "monitoring" modules are already configured by other settings in the cluster CR.
-      # - name: rook
-      #   enabled: true
-
-  # enable the ceph dashboard for viewing cluster status
-  dashboard:
-    enabled: true
-    # serve the dashboard under a subpath (useful when you are accessing the dashboard via a reverse proxy)
-    # urlPrefix: /ceph-dashboard
-    # serve the dashboard at the given port.
-    # port: 8443
-    # Serve the dashboard using SSL (if using ingress to expose the dashboard and `ssl: true` you need to set
-    # the corresponding "backend protocol" annotation(s) for your ingress controller of choice)
-    ssl: true
-
-  # Network configuration, see: https://github.com/rook/rook/blob/master/Documentation/CRDs/Cluster/ceph-cluster-crd.md#network-configuration-settings
-  network:
-    connections:
-      # Whether to encrypt the data in transit across the wire to prevent eavesdropping the data on the network.
-      # The default is false. When encryption is enabled, all communication between clients and Ceph daemons, or between Ceph daemons will be encrypted.
-      # When encryption is not enabled, clients still establish a strong initial authentication and data integrity is still validated with a crc check.
-      # IMPORTANT: Encryption requires the 5.11 kernel for the latest nbd and cephfs drivers. Alternatively for testing only,
-      # you can set the "mounter: rbd-nbd" in the rbd storage class, or "mounter: fuse" in the cephfs storage class.
-      # The nbd and fuse drivers are *not* recommended in production since restarting the csi driver pod will disconnect the volumes.
-      encryption:
-        enabled: false
-      # Whether to compress the data in transit across the wire. The default is false.
-      # The kernel requirements above for encryption also apply to compression.
-      compression:
-        enabled: false
-      # Whether to require communication over msgr2. If true, the msgr v1 port (6789) will be disabled
-      # and clients will be required to connect to the Ceph cluster with the v2 port (3300).
-      # Requires a kernel that supports msgr v2 (kernel 5.11 or CentOS 8.4 or newer).
-      requireMsgr2: false
-  #   # enable host networking
-  #   provider: host
-  #   # EXPERIMENTAL: enable the Multus network provider
-  #   provider: multus
-  #   selectors:
-  #     # The selector keys are required to be `public` and `cluster`.
-  #     # Based on the configuration, the operator will do the following:
-  #     #   1. if only the `public` selector key is specified both public_network and cluster_network Ceph settings will listen on that interface
-  #     #   2. if both `public` and `cluster` selector keys are specified the first one will point to 'public_network' flag and the second one to 'cluster_network'
-  #     #
-  #     # In order to work, each selector value must match a NetworkAttachmentDefinition object in Multus
-  #     #
-  #     # public: public-conf --> NetworkAttachmentDefinition object name in Multus
-  #     # cluster: cluster-conf --> NetworkAttachmentDefinition object name in Multus
-  #   # Provide internet protocol version. IPv6, IPv4 or empty string are valid options. Empty string would mean IPv4
-  #   ipFamily: "IPv6"
-  #   # Ceph daemons to listen on both IPv4 and Ipv6 networks
-  #   dualStack: false
-
-  # enable the crash collector for ceph daemon crash collection
-  crashCollector:
-    disable: false
-    # Uncomment daysToRetain to prune ceph crash entries older than the
-    # specified number of days.
-    # daysToRetain: 30
-
-  # enable log collector, daemons will log on files and rotate
-  logCollector:
-    enabled: true
-    periodicity: daily # one of: hourly, daily, weekly, monthly
-    maxLogSize: 500M # SUFFIX may be 'M' or 'G'. Must be at least 1M.
-
-  # automate [data cleanup process](https://github.com/rook/rook/blob/master/Documentation/Storage-Configuration/ceph-teardown.md#delete-the-data-on-hosts) in cluster destruction.
-  cleanupPolicy:
-    # Since cluster cleanup is destructive to data, confirmation is required.
-    # To destroy all Rook data on hosts during uninstall, confirmation must be set to "yes-really-destroy-data".
-    # This value should only be set when the cluster is about to be deleted. After the confirmation is set,
-    # Rook will immediately stop configuring the cluster and only wait for the delete command.
-    # If the empty string is set, Rook will not destroy any data on hosts during uninstall.
-    confirmation: ""
-    # sanitizeDisks represents settings for sanitizing OSD disks on cluster deletion
-    sanitizeDisks:
-      # method indicates if the entire disk should be sanitized or simply ceph's metadata
-      # in both case, re-install is possible
-      # possible choices are 'complete' or 'quick' (default)
-      method: quick
-      # dataSource indicate where to get random bytes from to write on the disk
-      # possible choices are 'zero' (default) or 'random'
-      # using random sources will consume entropy from the system and will take much more time then the zero source
-      dataSource: zero
-      # iteration overwrite N times instead of the default (1)
-      # takes an integer value
-      iteration: 1
-    # allowUninstallWithVolumes defines how the uninstall should be performed
-    # If set to true, cephCluster deletion does not wait for the PVs to be deleted.
-    allowUninstallWithVolumes: false
-
-  # To control where various services will be scheduled by kubernetes, use the placement configuration sections below.
-  # The example under 'all' would have all services scheduled on kubernetes nodes labeled with 'role=storage-node' and
-  # tolerate taints with a key of 'storage-node'.
-  # placement:
-  #   all:
-  #     nodeAffinity:
-  #       requiredDuringSchedulingIgnoredDuringExecution:
-  #         nodeSelectorTerms:
-  #           - matchExpressions:
-  #             - key: role
-  #               operator: In
-  #               values:
-  #               - storage-node
-  #     podAffinity:
-  #     podAntiAffinity:
-  #     topologySpreadConstraints:
-  #     tolerations:
-  #     - key: storage-node
-  #       operator: Exists
-  #   # The above placement information can also be specified for mon, osd, and mgr components
-  #   mon:
-  #   # Monitor deployments may contain an anti-affinity rule for avoiding monitor
-  #   # collocation on the same node. This is a required rule when host network is used
-  #   # or when AllowMultiplePerNode is false. Otherwise this anti-affinity rule is a
-  #   # preferred rule with weight: 50.
-  #   osd:
-  #   mgr:
-  #   cleanup:
-
-  # annotations:
-  #   all:
-  #   mon:
-  #   osd:
-  #   cleanup:
-  #   prepareosd:
-  #   # If no mgr annotations are set, prometheus scrape annotations will be set by default.
-  #   mgr:
-  #   dashboard:
-
-  # labels:
-  #   all:
-  #   mon:
-  #   osd:
-  #   cleanup:
-  #   mgr:
-  #   prepareosd:
-  #   # monitoring is a list of key-value pairs. It is injected into all the monitoring resources created by operator.
-  #   # These labels can be passed as LabelSelector to Prometheus
-  #   monitoring:
-  #   dashboard:
-
-  resources:
-    mgr:
-      limits:
-        memory: "1Gi"
-      requests:
-        cpu: "500m"
-        memory: "512Mi"
-    mon:
-      limits:
-        memory: "2Gi"
-      requests:
-        cpu: "1000m"
-        memory: "1Gi"
-    osd:
-      limits:
-        memory: "4Gi"
-      requests:
-        cpu: "1000m"
-        memory: "4Gi"
-    prepareosd:
-      # limits: It is not recommended to set limits on the OSD prepare job
-      #         since it's a one-time burst for memory that must be allowed to
-      #         complete without an OOM kill.  Note however that if a k8s
-      #         limitRange guardrail is defined external to Rook, the lack of
-      #         a limit here may result in a sync failure, in which case a
-      #         limit should be added.  1200Mi may suffice for up to 15Ti
-      #         OSDs ; for larger devices 2Gi may be required.
-      #         cf. https://github.com/rook/rook/pull/11103
-      requests:
-        cpu: "500m"
-        memory: "50Mi"
-    mgr-sidecar:
-      limits:
-        memory: "100Mi"
-      requests:
-        cpu: "100m"
-        memory: "40Mi"
-    crashcollector:
-      limits:
-        memory: "60Mi"
-      requests:
-        cpu: "100m"
-        memory: "60Mi"
-    logcollector:
-      limits:
-        memory: "1Gi"
-      requests:
-        cpu: "100m"
-        memory: "100Mi"
-    cleanup:
-      limits:
-        memory: "1Gi"
-      requests:
-        cpu: "500m"
-        memory: "100Mi"
-    exporter:
-      limits:
-        memory: "128Mi"
-      requests:
-        cpu: "50m"
-        memory: "50Mi"
-
-  # The option to automatically remove OSDs that are out and are safe to destroy.
-  removeOSDsIfOutAndSafeToRemove: false
-
-  # priority classes to apply to ceph resources
-  priorityClassNames:
-    mon: system-node-critical
-    osd: system-node-critical
-    mgr: system-cluster-critical
-
-  storage: # cluster level storage configuration and selection
-    useAllNodes: true
-    useAllDevices: true
-    # deviceFilter:
-    # config:
-    #   crushRoot: "custom-root" # specify a non-default root label for the CRUSH map
-    #   metadataDevice: "md0" # specify a non-rotational storage so ceph-volume will use it as block db device of bluestore.
-    #   databaseSizeMB: "1024" # uncomment if the disks are smaller than 100 GB
-    #   osdsPerDevice: "1" # this value can be overridden at the node or device level
-    #   encryptedDevice: "true" # the default value for this option is "false"
-    # # Individual nodes and their config can be specified as well, but 'useAllNodes' above must be set to false. Then, only the named
-    # # nodes below will be used as storage resources. Each node's 'name' field should match their 'kubernetes.io/hostname' label.
-    # nodes:
-    #   - name: "172.17.4.201"
-    #     devices: # specific devices to use for storage can be specified for each node
-    #       - name: "sdb"
-    #       - name: "nvme01" # multiple osds can be created on high performance devices
-    #         config:
-    #           osdsPerDevice: "5"
-    #       - name: "/dev/disk/by-id/ata-ST4000DM004-XXXX" # devices can be specified using full udev paths
-    #     config: # configuration can be specified at the node level which overrides the cluster level config
-    #   - name: "172.17.4.301"
-    #     deviceFilter: "^sd."
-
-  # The section for configuring management of daemon disruptions during upgrade or fencing.
-  disruptionManagement:
-    # If true, the operator will create and manage PodDisruptionBudgets for OSD, Mon, RGW, and MDS daemons. OSD PDBs are managed dynamically
-    # via the strategy outlined in the [design](https://github.com/rook/rook/blob/master/design/ceph/ceph-managed-disruptionbudgets.md). The operator will
-    # block eviction of OSDs by default and unblock them safely when drains are detected.
-    managePodBudgets: true
-    # A duration in minutes that determines how long an entire failureDomain like `region/zone/host` will be held in `noout` (in addition to the
-    # default DOWN/OUT interval) when it is draining. This is only relevant when  `managePodBudgets` is `true`. The default value is `30` minutes.
-    osdMaintenanceTimeout: 30
-
-  # Configure the healthcheck and liveness probes for ceph pods.
-  # Valid values for daemons are 'mon', 'osd', 'status'
-  healthCheck:
-    daemonHealth:
-      mon:
-        disabled: false
-        interval: 45s
-      osd:
-        disabled: false
-        interval: 60s
-      status:
-        disabled: false
-        interval: 60s
-    # Change pod liveness probe, it works for all mon, mgr, and osd pods.
-    livenessProbe:
-      mon:
-        disabled: false
-      mgr:
-        disabled: false
-      osd:
-        disabled: false
-
-ingress:
-  # -- Enable an ingress for the ceph-dashboard
-  dashboard:
-    # {}
-    # labels:
-       # external-dns/private: "true"
-     annotations:
-       "route.openshift.io/termination": "passthrough"
-       # external-dns.alpha.kubernetes.io/hostname: dashboard.example.com
-       # nginx.ingress.kubernetes.io/rewrite-target: /ceph-dashboard/$2
-       # If the dashboard has ssl: true the following will make sure the NGINX Ingress controller can expose the dashboard correctly
-       # nginx.ingress.kubernetes.io/backend-protocol: "HTTPS"
-       # nginx.ingress.kubernetes.io/server-snippet: |
-       #   proxy_ssl_verify off;
-     host:
-       name: ceph.apps.ncd0.harmony.mcd
-       path: null # TODO the chart does not allow removing the path, and it causes openshift to fail creating a route, because path is not supported with termination mode passthrough
-       pathType: ImplementationSpecific
-     tls:
-     - {}
-    #   secretName: testsecret-tls
-    # Note: Only one of ingress class annotation or the `ingressClassName:` can be used at a time
-    # to set the ingress class
-    # ingressClassName: openshift-default
-    # labels:
-    #   external-dns/private: "true"
-    # annotations:
-    #   external-dns.alpha.kubernetes.io/hostname: dashboard.example.com
-    #   nginx.ingress.kubernetes.io/rewrite-target: /ceph-dashboard/$2
-    # If the dashboard has ssl: true the following will make sure the NGINX Ingress controller can expose the dashboard correctly
-    #   nginx.ingress.kubernetes.io/backend-protocol: "HTTPS"
-    #   nginx.ingress.kubernetes.io/server-snippet: |
-    #     proxy_ssl_verify off;
-    # host:
-    #   name: dashboard.example.com
-    #   path: "/ceph-dashboard(/|$)(.*)"
-    #   pathType: Prefix
-    # tls:
-    # - hosts:
-    #     - dashboard.example.com
-    #   secretName: testsecret-tls
-    ## Note: Only one of ingress class annotation or the `ingressClassName:` can be used at a time
-    ## to set the ingress class
-    # ingressClassName: nginx
-
-# -- A list of CephBlockPool configurations to deploy
-# @default -- See [below](#ceph-block-pools)
-cephBlockPools:
-  - name: ceph-blockpool
-    # see https://github.com/rook/rook/blob/master/Documentation/CRDs/Block-Storage/ceph-block-pool-crd.md#spec for available configuration
-    spec:
-      failureDomain: host
-      replicated:
-        size: 3
-      # Enables collecting RBD per-image IO statistics by enabling dynamic OSD performance counters. Defaults to false.
-      # For reference: https://docs.ceph.com/docs/latest/mgr/prometheus/#rbd-io-statistics
-      # enableRBDStats: true
-    storageClass:
-      enabled: true
-      name: ceph-block
-      annotations: {}
-      labels: {}
-      isDefault: true
-      reclaimPolicy: Delete
-      allowVolumeExpansion: true
-      volumeBindingMode: "Immediate"
-      mountOptions: []
-      # see https://kubernetes.io/docs/concepts/storage/storage-classes/#allowed-topologies
-      allowedTopologies: []
-      #        - matchLabelExpressions:
-      #            - key: rook-ceph-role
-      #              values:
-      #                - storage-node
-      # see https://github.com/rook/rook/blob/master/Documentation/Storage-Configuration/Block-Storage-RBD/block-storage.md#provision-storage for available configuration
-      parameters:
-        # (optional) mapOptions is a comma-separated list of map options.
-        # For krbd options refer
-        # https://docs.ceph.com/docs/latest/man/8/rbd/#kernel-rbd-krbd-options
-        # For nbd options refer
-        # https://docs.ceph.com/docs/latest/man/8/rbd-nbd/#options
-        # mapOptions: lock_on_read,queue_depth=1024
-
-        # (optional) unmapOptions is a comma-separated list of unmap options.
-        # For krbd options refer
-        # https://docs.ceph.com/docs/latest/man/8/rbd/#kernel-rbd-krbd-options
-        # For nbd options refer
-        # https://docs.ceph.com/docs/latest/man/8/rbd-nbd/#options
-        # unmapOptions: force
-
-        # RBD image format. Defaults to "2".
-        imageFormat: "2"
-
-        # RBD image features, equivalent to OR'd bitfield value: 63
-        # Available for imageFormat: "2". Older releases of CSI RBD
-        # support only the `layering` feature. The Linux kernel (KRBD) supports the
-        # full feature complement as of 5.4
-        imageFeatures: layering
-
-        # These secrets contain Ceph admin credentials.
-        csi.storage.k8s.io/provisioner-secret-name: rook-csi-rbd-provisioner
-        csi.storage.k8s.io/provisioner-secret-namespace: "{{ .Release.Namespace }}"
-        csi.storage.k8s.io/controller-expand-secret-name: rook-csi-rbd-provisioner
-        csi.storage.k8s.io/controller-expand-secret-namespace: "{{ .Release.Namespace }}"
-        csi.storage.k8s.io/node-stage-secret-name: rook-csi-rbd-node
-        csi.storage.k8s.io/node-stage-secret-namespace: "{{ .Release.Namespace }}"
-        # Specify the filesystem type of the volume. If not specified, csi-provisioner
-        # will set default as `ext4`. Note that `xfs` is not recommended due to potential deadlock
-        # in hyperconverged settings where the volume is mounted on the same node as the osds.
-        csi.storage.k8s.io/fstype: ext4
-
-# -- A list of CephFileSystem configurations to deploy
-# @default -- See [below](#ceph-file-systems)
-cephFileSystems:
-  - name: ceph-filesystem
-    # see https://github.com/rook/rook/blob/master/Documentation/CRDs/Shared-Filesystem/ceph-filesystem-crd.md#filesystem-settings for available configuration
-    spec:
-      metadataPool:
-        replicated:
-          size: 3
-      dataPools:
-        - failureDomain: host
-          replicated:
-            size: 3
-          # Optional and highly recommended, 'data0' by default, see https://github.com/rook/rook/blob/master/Documentation/CRDs/Shared-Filesystem/ceph-filesystem-crd.md#pools
-          name: data0
-      metadataServer:
-        activeCount: 1
-        activeStandby: true
-        resources:
-          limits:
-            memory: "4Gi"
-          requests:
-            cpu: "1000m"
-            memory: "4Gi"
-        priorityClassName: system-cluster-critical
-    storageClass:
-      enabled: true
-      isDefault: false
-      name: ceph-filesystem
-      # (Optional) specify a data pool to use, must be the name of one of the data pools above, 'data0' by default
-      pool: data0
-      reclaimPolicy: Delete
-      allowVolumeExpansion: true
-      volumeBindingMode: "Immediate"
-      annotations: {}
-      labels: {}
-      mountOptions: []
-      # see https://github.com/rook/rook/blob/master/Documentation/Storage-Configuration/Shared-Filesystem-CephFS/filesystem-storage.md#provision-storage for available configuration
-      parameters:
-        # The secrets contain Ceph admin credentials.
-        csi.storage.k8s.io/provisioner-secret-name: rook-csi-cephfs-provisioner
-        csi.storage.k8s.io/provisioner-secret-namespace: "{{ .Release.Namespace }}"
-        csi.storage.k8s.io/controller-expand-secret-name: rook-csi-cephfs-provisioner
-        csi.storage.k8s.io/controller-expand-secret-namespace: "{{ .Release.Namespace }}"
-        csi.storage.k8s.io/node-stage-secret-name: rook-csi-cephfs-node
-        csi.storage.k8s.io/node-stage-secret-namespace: "{{ .Release.Namespace }}"
-        # Specify the filesystem type of the volume. If not specified, csi-provisioner
-        # will set default as `ext4`. Note that `xfs` is not recommended due to potential deadlock
-        # in hyperconverged settings where the volume is mounted on the same node as the osds.
-        csi.storage.k8s.io/fstype: ext4
-
-# -- Settings for the filesystem snapshot class
-# @default -- See [CephFS Snapshots](../Storage-Configuration/Ceph-CSI/ceph-csi-snapshot.md#cephfs-snapshots)
-cephFileSystemVolumeSnapshotClass:
-  enabled: false
-  name: ceph-filesystem
-  isDefault: true
-  deletionPolicy: Delete
-  annotations: {}
-  labels: {}
-  # see https://rook.io/docs/rook/v1.10/Storage-Configuration/Ceph-CSI/ceph-csi-snapshot/#cephfs-snapshots for available configuration
-  parameters: {}
-
-# -- Settings for the block pool snapshot class
-# @default -- See [RBD Snapshots](../Storage-Configuration/Ceph-CSI/ceph-csi-snapshot.md#rbd-snapshots)
-cephBlockPoolsVolumeSnapshotClass:
-  enabled: false
-  name: ceph-block
-  isDefault: false
-  deletionPolicy: Delete
-  annotations: {}
-  labels: {}
-  # see https://rook.io/docs/rook/v1.10/Storage-Configuration/Ceph-CSI/ceph-csi-snapshot/#rbd-snapshots for available configuration
-  parameters: {}
-
-# -- A list of CephObjectStore configurations to deploy
-# @default -- See [below](#ceph-object-stores)
-cephObjectStores:
-  - name: ceph-objectstore
-    # see https://github.com/rook/rook/blob/master/Documentation/CRDs/Object-Storage/ceph-object-store-crd.md#object-store-settings for available configuration
-    spec:
-      metadataPool:
-        failureDomain: host
-        replicated:
-          size: 3
-      dataPool:
-        failureDomain: host
-        erasureCoded:
-          dataChunks: 2
-          codingChunks: 1
-        parameters:
-          bulk: "true"
-      preservePoolsOnDelete: true
-      gateway:
-        port: 80
-        resources:
-          limits:
-            memory: "2Gi"
-          requests:
-            cpu: "1000m"
-            memory: "1Gi"
-        # securePort: 443
-        # sslCertificateRef:
-        instances: 1
-        priorityClassName: system-cluster-critical
-        # opsLogSidecar:
-        #   resources:
-        #     limits:
-        #       memory: "100Mi"
-        #     requests:
-        #       cpu: "100m"
-        #       memory: "40Mi"
-    storageClass:
-      enabled: true
-      name: ceph-bucket
-      reclaimPolicy: Delete
-      volumeBindingMode: "Immediate"
-      annotations: {}
-      labels: {}
-      # see https://github.com/rook/rook/blob/master/Documentation/Storage-Configuration/Object-Storage-RGW/ceph-object-bucket-claim.md#storageclass for available configuration
-      parameters:
-        # note: objectStoreNamespace and objectStoreName are configured by the chart
-        region: us-east-1
-    ingress:
-      # Enable an ingress for the ceph-objectstore
-      enabled: true
-      # The ingress port by default will be the object store's "securePort" (if set), or the gateway "port".
-      # To override those defaults, set this ingress port to the desired port.
-      # port: 80
-      # annotations: {}
-      host:
-        name: objectstore.apps.ncd0.harmony.mcd
-        path: /
-        pathType: Prefix
-      # tls:
-      # - hosts:
-      #     - objectstore.example.com
-      #   secretName: ceph-objectstore-tls
-      # ingressClassName: nginx
-## cephECBlockPools are disabled by default, please remove the comments and set desired values to enable it
-## For erasure coded a replicated metadata pool is required.
-## https://rook.io/docs/rook/latest/CRDs/Shared-Filesystem/ceph-filesystem-crd/#erasure-coded
-#cephECBlockPools:
-#  - name: ec-pool
-#    spec:
-#      metadataPool:
-#        replicated:
-#          size: 2
-#      dataPool:
-#        failureDomain: osd
-#        erasureCoded:
-#          dataChunks: 2
-#          codingChunks: 1
-#        deviceClass: hdd
-#
-#    parameters:
-#      # clusterID is the namespace where the rook cluster is running
-#      # If you change this namespace, also change the namespace below where the secret namespaces are defined
-#      clusterID: rook-ceph # namespace:cluster
-#      # (optional) mapOptions is a comma-separated list of map options.
-#      # For krbd options refer
-#      # https://docs.ceph.com/docs/latest/man/8/rbd/#kernel-rbd-krbd-options
-#      # For nbd options refer
-#      # https://docs.ceph.com/docs/latest/man/8/rbd-nbd/#options
-#      # mapOptions: lock_on_read,queue_depth=1024
-#
-#      # (optional) unmapOptions is a comma-separated list of unmap options.
-#      # For krbd options refer
-#      # https://docs.ceph.com/docs/latest/man/8/rbd/#kernel-rbd-krbd-options
-#      # For nbd options refer
-#      # https://docs.ceph.com/docs/latest/man/8/rbd-nbd/#options
-#      # unmapOptions: force
-#
-#      # RBD image format. Defaults to "2".
-#      imageFormat: "2"
-#
-#      # RBD image features, equivalent to OR'd bitfield value: 63
-#      # Available for imageFormat: "2". Older releases of CSI RBD
-#      # support only the `layering` feature. The Linux kernel (KRBD) supports the
-#      # full feature complement as of 5.4
-#      # imageFeatures: layering,fast-diff,object-map,deep-flatten,exclusive-lock
-#      imageFeatures: layering
-#
-#    storageClass:
-#      provisioner: rook-ceph.rbd.csi.ceph.com # csi-provisioner-name
-#      enabled: true
-#      name: rook-ceph-block
-#      isDefault: false
-#      annotations: { }
-#      labels: { }
-#      allowVolumeExpansion: true
-#      reclaimPolicy: Delete
-
-# -- CSI driver name prefix for cephfs, rbd and nfs.
-# @default -- `namespace name where rook-ceph operator is deployed`
-csiDriverNamePrefix:
--- a/examples/nanodc/rook-operator/install-rook-operator.sh
+++ b/examples/nanodc/rook-operator/install-rook-operator.sh
@@ -1,3 +0,0 @@
-#!/bin/bash
-helm repo add rook-release https://charts.rook.io/release
-helm install --create-namespace --namespace rook-ceph rook-ceph rook-release/rook-ceph -f values.yaml
--- a/examples/nanodc/rook-operator/values.yaml
+++ b/examples/nanodc/rook-operator/values.yaml
@@ -1,674 +0,0 @@
-# Default values for rook-ceph-operator
-# This is a YAML-formatted file.
-# Declare variables to be passed into your templates.
-
-image:
-  # -- Image
-  repository: docker.io/rook/ceph
-  # -- Image tag
-  # @default -- `master`
-  tag: v1.17.1
-  # -- Image pull policy
-  pullPolicy: IfNotPresent
-
-crds:
-  # -- Whether the helm chart should create and update the CRDs. If false, the CRDs must be
-  # managed independently with deploy/examples/crds.yaml.
-  # **WARNING** Only set during first deployment. If later disabled the cluster may be DESTROYED.
-  # If the CRDs are deleted in this case, see
-  # [the disaster recovery guide](https://rook.io/docs/rook/latest/Troubleshooting/disaster-recovery/#restoring-crds-after-deletion)
-  # to restore them.
-  enabled: true
-
-# -- Pod resource requests & limits
-resources:
-  limits:
-    memory: 512Mi
-  requests:
-    cpu: 200m
-    memory: 128Mi
-
-# -- Kubernetes [`nodeSelector`](https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#nodeselector) to add to the Deployment.
-nodeSelector: {}
-# Constraint rook-ceph-operator Deployment to nodes with label `disktype: ssd`.
-# For more info, see https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#nodeselector
-#  disktype: ssd
-
-# -- List of Kubernetes [`tolerations`](https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/) to add to the Deployment.
-tolerations: []
-
-# -- Delay to use for the `node.kubernetes.io/unreachable` pod failure toleration to override
-# the Kubernetes default of 5 minutes
-unreachableNodeTolerationSeconds: 5
-
-# -- Whether the operator should watch cluster CRD in its own namespace or not
-currentNamespaceOnly: false
-
-# -- Custom pod labels for the operator
-operatorPodLabels: {}
-
-# -- Pod annotations
-annotations: {}
-
-# -- Global log level for the operator.
-# Options: `ERROR`, `WARNING`, `INFO`, `DEBUG`
-logLevel: INFO
-
-# -- If true, create & use RBAC resources
-rbacEnable: true
-
-rbacAggregate:
-  # -- If true, create a ClusterRole aggregated to [user facing roles](https://kubernetes.io/docs/reference/access-authn-authz/rbac/#user-facing-roles) for objectbucketclaims
-  enableOBCs: false
-
-# -- If true, create & use PSP resources
-pspEnable: false
-
-# -- Set the priority class for the rook operator deployment if desired
-priorityClassName:
-
-# -- Set the container security context for the operator
-containerSecurityContext:
-  runAsNonRoot: true
-  runAsUser: 2016
-  runAsGroup: 2016
-  capabilities:
-    drop: ["ALL"]
-# -- If true, loop devices are allowed to be used for osds in test clusters
-allowLoopDevices: false
-
-# Settings for whether to disable the drivers or other daemons if they are not
-# needed
-csi:
-  # -- Enable Ceph CSI RBD driver
-  enableRbdDriver: true
-  # -- Enable Ceph CSI CephFS driver
-  enableCephfsDriver: true
-  # -- Disable the CSI driver.
-  disableCsiDriver: "false"
-
-  # -- Enable host networking for CSI CephFS and RBD nodeplugins. This may be necessary
-  # in some network configurations where the SDN does not provide access to an external cluster or
-  # there is significant drop in read/write performance
-  enableCSIHostNetwork: true
-  # -- Enable Snapshotter in CephFS provisioner pod
-  enableCephfsSnapshotter: true
-  # -- Enable Snapshotter in NFS provisioner pod
-  enableNFSSnapshotter: true
-  # -- Enable Snapshotter in RBD provisioner pod
-  enableRBDSnapshotter: true
-  # -- Enable Host mount for `/etc/selinux` directory for Ceph CSI nodeplugins
-  enablePluginSelinuxHostMount: false
-  # -- Enable Ceph CSI PVC encryption support
-  enableCSIEncryption: false
-
-  # -- Enable volume group snapshot feature. This feature is
-  # enabled by default as long as the necessary CRDs are available in the cluster.
-  enableVolumeGroupSnapshot: true
-  # -- PriorityClassName to be set on csi driver plugin pods
-  pluginPriorityClassName: system-node-critical
-
-  # -- PriorityClassName to be set on csi driver provisioner pods
-  provisionerPriorityClassName: system-cluster-critical
-
-  # -- Policy for modifying a volume's ownership or permissions when the RBD PVC is being mounted.
-  # supported values are documented at https://kubernetes-csi.github.io/docs/support-fsgroup.html
-  rbdFSGroupPolicy: "File"
-
-  # -- Policy for modifying a volume's ownership or permissions when the CephFS PVC is being mounted.
-  # supported values are documented at https://kubernetes-csi.github.io/docs/support-fsgroup.html
-  cephFSFSGroupPolicy: "File"
-
-  # -- Policy for modifying a volume's ownership or permissions when the NFS PVC is being mounted.
-  # supported values are documented at https://kubernetes-csi.github.io/docs/support-fsgroup.html
-  nfsFSGroupPolicy: "File"
-
-  # -- OMAP generator generates the omap mapping between the PV name and the RBD image
-  # which helps CSI to identify the rbd images for CSI operations.
-  # `CSI_ENABLE_OMAP_GENERATOR` needs to be enabled when we are using rbd mirroring feature.
-  # By default OMAP generator is disabled and when enabled, it will be deployed as a
-  # sidecar with CSI provisioner pod, to enable set it to true.
-  enableOMAPGenerator: false
-
-  # -- Set CephFS Kernel mount options to use https://docs.ceph.com/en/latest/man/8/mount.ceph/#options.
-  # Set to "ms_mode=secure" when connections.encrypted is enabled in CephCluster CR
-  cephFSKernelMountOptions:
-
-  # -- Enable adding volume metadata on the CephFS subvolumes and RBD images.
-  # Not all users might be interested in getting volume/snapshot details as metadata on CephFS subvolume and RBD images.
-  # Hence enable metadata is false by default
-  enableMetadata: false
-
-  # -- Set replicas for csi provisioner deployment
-  provisionerReplicas: 2
-
-  # -- Cluster name identifier to set as metadata on the CephFS subvolume and RBD images. This will be useful
-  # in cases like for example, when two container orchestrator clusters (Kubernetes/OCP) are using a single ceph cluster
-  clusterName:
-
-  # -- Set logging level for cephCSI containers maintained by the cephCSI.
-  # Supported values from 0 to 5. 0 for general useful logs, 5 for trace level verbosity.
-  logLevel: 0
-
-  # -- Set logging level for Kubernetes-csi sidecar containers.
-  # Supported values from 0 to 5. 0 for general useful logs (the default), 5 for trace level verbosity.
-  # @default -- `0`
-  sidecarLogLevel:
-
-  # -- CSI driver name prefix for cephfs, rbd and nfs.
-  # @default -- `namespace name where rook-ceph operator is deployed`
-  csiDriverNamePrefix:
-
-  # -- CSI RBD plugin daemonset update strategy, supported values are OnDelete and RollingUpdate
-  # @default -- `RollingUpdate`
-  rbdPluginUpdateStrategy:
-
-  # -- A maxUnavailable parameter of CSI RBD plugin daemonset update strategy.
-  # @default -- `1`
-  rbdPluginUpdateStrategyMaxUnavailable:
-
-  # -- CSI CephFS plugin daemonset update strategy, supported values are OnDelete and RollingUpdate
-  # @default -- `RollingUpdate`
-  cephFSPluginUpdateStrategy:
-
-  # -- A maxUnavailable parameter of CSI cephFS plugin daemonset update strategy.
-  # @default -- `1`
-  cephFSPluginUpdateStrategyMaxUnavailable:
-
-  # -- CSI NFS plugin daemonset update strategy, supported values are OnDelete and RollingUpdate
-  # @default -- `RollingUpdate`
-  nfsPluginUpdateStrategy:
-
-  # -- Set GRPC timeout for csi containers (in seconds). It should be >= 120. If this value is not set or is invalid, it defaults to 150
-  grpcTimeoutInSeconds: 150
-
-  # -- Burst to use while communicating with the kubernetes apiserver.
-  kubeApiBurst:
-
-  # -- QPS to use while communicating with the kubernetes apiserver.
-  kubeApiQPS:
-
-  # -- The volume of the CephCSI RBD plugin DaemonSet
-  csiRBDPluginVolume:
-  #  - name: lib-modules
-  #    hostPath:
-  #      path: /run/booted-system/kernel-modules/lib/modules/
-  #  - name: host-nix
-  #    hostPath:
-  #      path: /nix
-
-  # -- The volume mounts of the CephCSI RBD plugin DaemonSet
-  csiRBDPluginVolumeMount:
-  #  - name: host-nix
-  #    mountPath: /nix
-  #    readOnly: true
-
-  # -- The volume of the CephCSI CephFS plugin DaemonSet
-  csiCephFSPluginVolume:
-  #  - name: lib-modules
-  #    hostPath:
-  #      path: /run/booted-system/kernel-modules/lib/modules/
-  #  - name: host-nix
-  #    hostPath:
-  #      path: /nix
-
-  # -- The volume mounts of the CephCSI CephFS plugin DaemonSet
-  csiCephFSPluginVolumeMount:
-  #  - name: host-nix
-  #    mountPath: /nix
-  #    readOnly: true
-
-  # -- CEPH CSI RBD provisioner resource requirement list
-  # csi-omap-generator resources will be applied only if `enableOMAPGenerator` is set to `true`
-  # @default -- see values.yaml
-  csiRBDProvisionerResource: |
-    - name : csi-provisioner
-      resource:
-        requests:
-          memory: 128Mi
-          cpu: 100m
-        limits:
-          memory: 256Mi
-    - name : csi-resizer
-      resource:
-        requests:
-          memory: 128Mi
-          cpu: 100m
-        limits:
-          memory: 256Mi
-    - name : csi-attacher
-      resource:
-        requests:
-          memory: 128Mi
-          cpu: 100m
-        limits:
-          memory: 256Mi
-    - name : csi-snapshotter
-      resource:
-        requests:
-          memory: 128Mi
-          cpu: 100m
-        limits:
-          memory: 256Mi
-    - name : csi-rbdplugin
-      resource:
-        requests:
-          memory: 512Mi
-        limits:
-          memory: 1Gi
-    - name : csi-omap-generator
-      resource:
-        requests:
-          memory: 512Mi
-          cpu: 250m
-        limits:
-          memory: 1Gi
-    - name : liveness-prometheus
-      resource:
-        requests:
-          memory: 128Mi
-          cpu: 50m
-        limits:
-          memory: 256Mi
-
-  # -- CEPH CSI RBD plugin resource requirement list
-  # @default -- see values.yaml
-  csiRBDPluginResource: |
-    - name : driver-registrar
-      resource:
-        requests:
-          memory: 128Mi
-          cpu: 50m
-        limits:
-          memory: 256Mi
-    - name : csi-rbdplugin
-      resource:
-        requests:
-          memory: 512Mi
-          cpu: 250m
-        limits:
-          memory: 1Gi
-    - name : liveness-prometheus
-      resource:
-        requests:
-          memory: 128Mi
-          cpu: 50m
-        limits:
-          memory: 256Mi
-
-  # -- CEPH CSI CephFS provisioner resource requirement list
-  # @default -- see values.yaml
-  csiCephFSProvisionerResource: |
-    - name : csi-provisioner
-      resource:
-        requests:
-          memory: 128Mi
-          cpu: 100m
-        limits:
-          memory: 256Mi
-    - name : csi-resizer
-      resource:
-        requests:
-          memory: 128Mi
-          cpu: 100m
-        limits:
-          memory: 256Mi
-    - name : csi-attacher
-      resource:
-        requests:
-          memory: 128Mi
-          cpu: 100m
-        limits:
-          memory: 256Mi
-    - name : csi-snapshotter
-      resource:
-        requests:
-          memory: 128Mi
-          cpu: 100m
-        limits:
-          memory: 256Mi
-    - name : csi-cephfsplugin
-      resource:
-        requests:
-          memory: 512Mi
-          cpu: 250m
-        limits:
-          memory: 1Gi
-    - name : liveness-prometheus
-      resource:
-        requests:
-          memory: 128Mi
-          cpu: 50m
-        limits:
-          memory: 256Mi
-
-  # -- CEPH CSI CephFS plugin resource requirement list
-  # @default -- see values.yaml
-  csiCephFSPluginResource: |
-    - name : driver-registrar
-      resource:
-        requests:
-          memory: 128Mi
-          cpu: 50m
-        limits:
-          memory: 256Mi
-    - name : csi-cephfsplugin
-      resource:
-        requests:
-          memory: 512Mi
-          cpu: 250m
-        limits:
-          memory: 1Gi
-    - name : liveness-prometheus
-      resource:
-        requests:
-          memory: 128Mi
-          cpu: 50m
-        limits:
-          memory: 256Mi
-
-  # -- CEPH CSI NFS provisioner resource requirement list
-  # @default -- see values.yaml
-  csiNFSProvisionerResource: |
-    - name : csi-provisioner
-      resource:
-        requests:
-          memory: 128Mi
-          cpu: 100m
-        limits:
-          memory: 256Mi
-    - name : csi-nfsplugin
-      resource:
-        requests:
-          memory: 512Mi
-          cpu: 250m
-        limits:
-          memory: 1Gi
-    - name : csi-attacher
-      resource:
-        requests:
-          memory: 512Mi
-          cpu: 250m
-        limits:
-          memory: 1Gi
-
-  # -- CEPH CSI NFS plugin resource requirement list
-  # @default -- see values.yaml
-  csiNFSPluginResource: |
-    - name : driver-registrar
-      resource:
-        requests:
-          memory: 128Mi
-          cpu: 50m
-        limits:
-          memory: 256Mi
-    - name : csi-nfsplugin
-      resource:
-        requests:
-          memory: 512Mi
-          cpu: 250m
-        limits:
-          memory: 1Gi
-
-  # Set provisionerTolerations and provisionerNodeAffinity for provisioner pod.
-  # The CSI provisioner would be best to start on the same nodes as other ceph daemons.
-
-  # -- Array of tolerations in YAML format which will be added to CSI provisioner deployment
-  provisionerTolerations:
-  #    - key: key
-  #      operator: Exists
-  #      effect: NoSchedule
-
-  # -- The node labels for affinity of the CSI provisioner deployment [^1]
-  provisionerNodeAffinity: #key1=value1,value2; key2=value3
-  # Set pluginTolerations and pluginNodeAffinity for plugin daemonset pods.
-  # The CSI plugins need to be started on all the nodes where the clients need to mount the storage.
-
-  # -- Array of tolerations in YAML format which will be added to CephCSI plugin DaemonSet
-  pluginTolerations:
-  #    - key: key
-  #      operator: Exists
-  #      effect: NoSchedule
-
-  # -- The node labels for affinity of the CephCSI RBD plugin DaemonSet [^1]
-  pluginNodeAffinity: # key1=value1,value2; key2=value3
-
-  # -- Enable Ceph CSI Liveness sidecar deployment
-  enableLiveness: false
-
-  # -- CSI CephFS driver metrics port
-  # @default -- `9081`
-  cephfsLivenessMetricsPort:
-
-  # -- CSI Addons server port
-  # @default -- `9070`
-  csiAddonsPort:
-  # -- CSI Addons server port for the RBD provisioner
-  # @default -- `9070`
-  csiAddonsRBDProvisionerPort:
-  # -- CSI Addons server port for the Ceph FS provisioner
-  # @default -- `9070`
-  csiAddonsCephFSProvisionerPort:
-
-  # -- Enable Ceph Kernel clients on kernel < 4.17. If your kernel does not support quotas for CephFS
-  # you may want to disable this setting. However, this will cause an issue during upgrades
-  # with the FUSE client. See the [upgrade guide](https://rook.io/docs/rook/v1.2/ceph-upgrade.html)
-  forceCephFSKernelClient: true
-
-  # -- Ceph CSI RBD driver metrics port
-  # @default -- `8080`
-  rbdLivenessMetricsPort:
-
-  serviceMonitor:
-    # -- Enable ServiceMonitor for Ceph CSI drivers
-    enabled: false
-    # -- Service monitor scrape interval
-    interval: 10s
-    # -- ServiceMonitor additional labels
-    labels: {}
-    # -- Use a different namespace for the ServiceMonitor
-    namespace:
-
-  # -- Kubelet root directory path (if the Kubelet uses a different path for the `--root-dir` flag)
-  # @default -- `/var/lib/kubelet`
-  kubeletDirPath:
-
-  # -- Duration in seconds that non-leader candidates will wait to force acquire leadership.
-  # @default -- `137s`
-  csiLeaderElectionLeaseDuration:
-
-  # -- Deadline in seconds that the acting leader will retry refreshing leadership before giving up.
-  # @default -- `107s`
-  csiLeaderElectionRenewDeadline:
-
-  # -- Retry period in seconds the LeaderElector clients should wait between tries of actions.
-  # @default -- `26s`
-  csiLeaderElectionRetryPeriod:
-
-  cephcsi:
-    # -- Ceph CSI image repository
-    repository: quay.io/cephcsi/cephcsi
-    # -- Ceph CSI image tag
-    tag: v3.14.0
-
-  registrar:
-    # -- Kubernetes CSI registrar image repository
-    repository: registry.k8s.io/sig-storage/csi-node-driver-registrar
-    # -- Registrar image tag
-    tag: v2.13.0
-
-  provisioner:
-    # -- Kubernetes CSI provisioner image repository
-    repository: registry.k8s.io/sig-storage/csi-provisioner
-    # -- Provisioner image tag
-    tag: v5.1.0
-
-  snapshotter:
-    # -- Kubernetes CSI snapshotter image repository
-    repository: registry.k8s.io/sig-storage/csi-snapshotter
-    # -- Snapshotter image tag
-    tag: v8.2.0
-
-  attacher:
-    # -- Kubernetes CSI Attacher image repository
-    repository: registry.k8s.io/sig-storage/csi-attacher
-    # -- Attacher image tag
-    tag: v4.8.0
-
-  resizer:
-    # -- Kubernetes CSI resizer image repository
-    repository: registry.k8s.io/sig-storage/csi-resizer
-    # -- Resizer image tag
-    tag: v1.13.1
-
-  # -- Image pull policy
-  imagePullPolicy: IfNotPresent
-
-  # -- Labels to add to the CSI CephFS Deployments and DaemonSets Pods
-  cephfsPodLabels: #"key1=value1,key2=value2"
-
-  # -- Labels to add to the CSI NFS Deployments and DaemonSets Pods
-  nfsPodLabels: #"key1=value1,key2=value2"
-
-  # -- Labels to add to the CSI RBD Deployments and DaemonSets Pods
-  rbdPodLabels: #"key1=value1,key2=value2"
-
-  csiAddons:
-    # -- Enable CSIAddons
-    enabled: false
-    # -- CSIAddons sidecar image repository
-    repository: quay.io/csiaddons/k8s-sidecar
-    # -- CSIAddons sidecar image tag
-    tag: v0.12.0
-
-  nfs:
-    # -- Enable the nfs csi driver
-    enabled: false
-
-  topology:
-    # -- Enable topology based provisioning
-    enabled: false
-    # NOTE: the value here serves as an example and needs to be
-    # updated with node labels that define domains of interest
-    # -- domainLabels define which node labels to use as domains
-    # for CSI nodeplugins to advertise their domains
-    domainLabels:
-    # - kubernetes.io/hostname
-    # - topology.kubernetes.io/zone
-    # - topology.rook.io/rack
-
-  # -- Whether to skip any attach operation altogether for CephFS PVCs. See more details
-  # [here](https://kubernetes-csi.github.io/docs/skip-attach.html#skip-attach-with-csi-driver-object).
-  # If cephFSAttachRequired is set to false it skips the volume attachments and makes the creation
-  # of pods using the CephFS PVC fast. **WARNING** It's highly discouraged to use this for
-  # CephFS RWO volumes. Refer to this [issue](https://github.com/kubernetes/kubernetes/issues/103305) for more details.
-  cephFSAttachRequired: true
-  # -- Whether to skip any attach operation altogether for RBD PVCs. See more details
-  # [here](https://kubernetes-csi.github.io/docs/skip-attach.html#skip-attach-with-csi-driver-object).
-  # If set to false it skips the volume attachments and makes the creation of pods using the RBD PVC fast.
-  # **WARNING** It's highly discouraged to use this for RWO volumes as it can cause data corruption.
-  # csi-addons operations like Reclaimspace and PVC Keyrotation will also not be supported if set
-  # to false since we'll have no VolumeAttachments to determine which node the PVC is mounted on.
-  # Refer to this [issue](https://github.com/kubernetes/kubernetes/issues/103305) for more details.
-  rbdAttachRequired: true
-  # -- Whether to skip any attach operation altogether for NFS PVCs. See more details
-  # [here](https://kubernetes-csi.github.io/docs/skip-attach.html#skip-attach-with-csi-driver-object).
-  # If cephFSAttachRequired is set to false it skips the volume attachments and makes the creation
-  # of pods using the NFS PVC fast. **WARNING** It's highly discouraged to use this for
-  # NFS RWO volumes. Refer to this [issue](https://github.com/kubernetes/kubernetes/issues/103305) for more details.
-  nfsAttachRequired: true
-
-# -- Enable discovery daemon
-enableDiscoveryDaemon: false
-# -- Set the discovery daemon device discovery interval (default to 60m)
-discoveryDaemonInterval: 60m
-
-# -- The timeout for ceph commands in seconds
-cephCommandsTimeoutSeconds: "15"
-
-# -- If true, run rook operator on the host network
-useOperatorHostNetwork:
-
-# -- If true, scale down the rook operator.
-# This is useful for administrative actions where the rook operator must be scaled down, while using gitops style tooling
-# to deploy your helm charts.
-scaleDownOperator: false
-
-## Rook Discover configuration
-## toleration: NoSchedule, PreferNoSchedule or NoExecute
-## tolerationKey: Set this to the specific key of the taint to tolerate
-## tolerations: Array of tolerations in YAML format which will be added to agent deployment
-## nodeAffinity: Set to labels of the node to match
-
-discover:
-  # -- Toleration for the discover pods.
-  # Options: `NoSchedule`, `PreferNoSchedule` or `NoExecute`
-  toleration:
-  # -- The specific key of the taint to tolerate
-  tolerationKey:
-  # -- Array of tolerations in YAML format which will be added to discover deployment
-  tolerations:
-  #   - key: key
-  #     operator: Exists
-  #     effect: NoSchedule
-  # -- The node labels for affinity of `discover-agent` [^1]
-  nodeAffinity:
-  #   key1=value1,value2; key2=value3
-  #
-  #   or
-  #
-  #   requiredDuringSchedulingIgnoredDuringExecution:
-  #     nodeSelectorTerms:
-  #       - matchExpressions:
-  #           - key: storage-node
-  #             operator: Exists
-  # -- Labels to add to the discover pods
-  podLabels: # "key1=value1,key2=value2"
-  # -- Add resources to discover daemon pods
-  resources:
-  #   - limits:
-  #       memory: 512Mi
-  #   - requests:
-  #       cpu: 100m
-  #       memory: 128Mi
-
-# -- Custom label to identify node hostname. If not set `kubernetes.io/hostname` will be used
-customHostnameLabel:
-
-# -- Runs Ceph Pods as privileged to be able to write to `hostPaths` in OpenShift with SELinux restrictions.
-hostpathRequiresPrivileged: false
-
-# -- Whether to create all Rook pods to run on the host network, for example in environments where a CNI is not enabled
-enforceHostNetwork: false
-
-# -- Disable automatic orchestration when new devices are discovered.
-disableDeviceHotplug: false
-
-# -- The revision history limit for all pods created by Rook. If blank, the K8s default is 10.
-revisionHistoryLimit:
-
-# -- Blacklist certain disks according to the regex provided.
-discoverDaemonUdev:
-
-# -- imagePullSecrets option allow to pull docker images from private docker registry. Option will be passed to all service accounts.
-imagePullSecrets:
-# - name: my-registry-secret
-
-# -- Whether the OBC provisioner should watch on the operator namespace or not, if not the namespace of the cluster will be used
-enableOBCWatchOperatorNamespace: true
-
-# -- Specify the prefix for the OBC provisioner in place of the cluster namespace
-# @default -- `ceph cluster namespace`
-obcProvisionerNamePrefix:
-
-# -- Many OBC additional config fields may be risky for administrators to allow users control over.
-# The safe and default-allowed fields are 'maxObjects' and 'maxSize'.
-# Other fields should be considered risky. To allow all additional configs, use this value:
-#   "maxObjects,maxSize,bucketMaxObjects,bucketMaxSize,bucketPolicy,bucketLifecycle,bucketOwner"
-# @default -- "maxObjects,maxSize"
-obcAllowAdditionalConfigFields: "maxObjects,maxSize"
-
-monitoring:
-  # -- Enable monitoring. Requires Prometheus to be pre-installed.
-  # Enabling will also create RBAC rules to allow Operator to create ServiceMonitors
-  enabled: false
--- a/examples/nanodc/src/main.rs
+++ b/examples/nanodc/src/main.rs
@@ -1,199 +0,0 @@
-use std::{
-    net::{IpAddr, Ipv4Addr},
-    sync::{Arc, OnceLock},
-};
-
-use brocade::BrocadeOptions;
-use cidr::Ipv4Cidr;
-use harmony::{
-    config::secret::SshKeyPair,
-    data::{FileContent, FilePath},
-    hardware::{HostCategory, Location, PhysicalHost, SwitchGroup},
-    infra::{brocade::BrocadeSwitchClient, opnsense::OPNSenseManagementInterface},
-    inventory::Inventory,
-    modules::{
-        http::StaticFilesHttpScore,
-        okd::{
-            bootstrap_dhcp::OKDBootstrapDhcpScore,
-            bootstrap_load_balancer::OKDBootstrapLoadBalancerScore, dhcp::OKDDhcpScore,
-            dns::OKDDnsScore, ipxe::OKDIpxeScore,
-        },
-        tftp::TftpScore,
-    },
-    topology::{LogicalHost, UnmanagedRouter},
-};
-use harmony_macros::{ip, mac_address};
-use harmony_secret::{Secret, SecretManager};
-use harmony_types::net::Url;
-use serde::{Deserialize, Serialize};
-
-#[tokio::main]
-async fn main() {
-    let firewall = harmony::topology::LogicalHost {
-        ip: ip!("192.168.33.1"),
-        name: String::from("fw0"),
-    };
-
-    let switch_auth = SecretManager::get_or_prompt::<BrocadeSwitchAuth>()
-        .await
-        .expect("Failed to get credentials");
-
-    let switches: Vec<IpAddr> = vec![ip!("192.168.33.101")];
-    let brocade_options = BrocadeOptions {
-        dry_run: *harmony::config::DRY_RUN,
-        ..Default::default()
-    };
-    let switch_client = BrocadeSwitchClient::init(
-        &switches,
-        &switch_auth.username,
-        &switch_auth.password,
-        brocade_options,
-    )
-    .await
-    .expect("Failed to connect to switch");
-
-    let switch_client = Arc::new(switch_client);
-
-    let opnsense = Arc::new(
-        harmony::infra::opnsense::OPNSenseFirewall::new(firewall, None, "root", "opnsense").await,
-    );
-    let lan_subnet = Ipv4Addr::new(192, 168, 33, 0);
-    let gateway_ipv4 = Ipv4Addr::new(192, 168, 33, 1);
-    let gateway_ip = IpAddr::V4(gateway_ipv4);
-    let topology = harmony::topology::HAClusterTopology {
-        kubeconfig: None,
-        domain_name: "ncd0.harmony.mcd".to_string(), // TODO this must be set manually correctly
-        // when setting up the opnsense firewall
-        router: Arc::new(UnmanagedRouter::new(
-            gateway_ip,
-            Ipv4Cidr::new(lan_subnet, 24).unwrap(),
-        )),
-        load_balancer: opnsense.clone(),
-        firewall: opnsense.clone(),
-        tftp_server: opnsense.clone(),
-        http_server: opnsense.clone(),
-        dhcp_server: opnsense.clone(),
-        dns_server: opnsense.clone(),
-        control_plane: vec![
-            LogicalHost {
-                ip: ip!("192.168.33.20"),
-                name: "cp0".to_string(),
-            },
-            LogicalHost {
-                ip: ip!("192.168.33.21"),
-                name: "cp1".to_string(),
-            },
-            LogicalHost {
-                ip: ip!("192.168.33.22"),
-                name: "cp2".to_string(),
-            },
-        ],
-        bootstrap_host: LogicalHost {
-            ip: ip!("192.168.33.66"),
-            name: "bootstrap".to_string(),
-        },
-        workers: vec![
-            LogicalHost {
-                ip: ip!("192.168.33.30"),
-                name: "wk0".to_string(),
-            },
-            LogicalHost {
-                ip: ip!("192.168.33.31"),
-                name: "wk1".to_string(),
-            },
-            LogicalHost {
-                ip: ip!("192.168.33.32"),
-                name: "wk2".to_string(),
-            },
-        ],
-        node_exporter: opnsense.clone(),
-        switch_client: switch_client.clone(),
-        network_manager: OnceLock::new(),
-    };
-
-    let inventory = Inventory {
-        location: Location::new("I am mobile".to_string(), "earth".to_string()),
-        switch: SwitchGroup::from([]),
-        firewall_mgmt: Box::new(OPNSenseManagementInterface::new()),
-        storage_host: vec![],
-        worker_host: vec![
-            PhysicalHost::empty(HostCategory::Server)
-                .mac_address(mac_address!("C4:62:37:02:61:0F")),
-            PhysicalHost::empty(HostCategory::Server)
-                .mac_address(mac_address!("C4:62:37:02:61:26")),
-            // thisone
-            // Then create the ipxe file
-            // set the dns static leases
-            // bootstrap nodes
-            // start ceph cluster
-            // try installation of lampscore
-            // bingo?
-            PhysicalHost::empty(HostCategory::Server)
-                .mac_address(mac_address!("C4:62:37:02:61:70")),
-        ],
-        control_plane_host: vec![
-            PhysicalHost::empty(HostCategory::Server)
-                .mac_address(mac_address!("C4:62:37:02:60:FA")),
-            PhysicalHost::empty(HostCategory::Server)
-                .mac_address(mac_address!("C4:62:37:02:61:1A")),
-            PhysicalHost::empty(HostCategory::Server)
-                .mac_address(mac_address!("C4:62:37:01:BC:68")),
-        ],
-    };
-
-    // TODO regroup smaller scores in a larger one such as this
-    // let okd_boostrap_preparation();
-
-    let bootstrap_dhcp_score = OKDBootstrapDhcpScore::new(&topology, &inventory);
-    let bootstrap_load_balancer_score = OKDBootstrapLoadBalancerScore::new(&topology);
-    let dhcp_score = OKDDhcpScore::new(&topology, &inventory);
-    let dns_score = OKDDnsScore::new(&topology);
-    let load_balancer_score =
-        harmony::modules::okd::load_balancer::OKDLoadBalancerScore::new(&topology);
-
-    let ssh_key = SecretManager::get_or_prompt::<SshKeyPair>().await.unwrap();
-
-    let tftp_score = TftpScore::new(Url::LocalFolder("./data/watchguard/tftpboot".to_string()));
-    let http_score = StaticFilesHttpScore {
-        folder_to_serve: Some(Url::LocalFolder(
-            "./data/watchguard/pxe-http-files".to_string(),
-        )),
-        files: vec![],
-        remote_path: None,
-    };
-
-    let kickstart_filename = "inventory.kickstart".to_string();
-    let harmony_inventory_agent = "harmony_inventory_agent".to_string();
-
-    let ipxe_score = OKDIpxeScore {
-        kickstart_filename,
-        harmony_inventory_agent,
-        cluster_pubkey: FileContent {
-            path: FilePath::Relative("cluster_ssh_key.pub".to_string()),
-            content: ssh_key.public,
-        },
-    };
-
-    harmony_tui::run(
-        inventory,
-        topology,
-        vec![
-            Box::new(dns_score),
-            Box::new(bootstrap_dhcp_score),
-            Box::new(bootstrap_load_balancer_score),
-            Box::new(load_balancer_score),
-            Box::new(tftp_score),
-            Box::new(http_score),
-            Box::new(ipxe_score),
-            Box::new(dhcp_score),
-        ],
-    )
-    .await
-    .unwrap();
-}
-
-#[derive(Secret, Serialize, Deserialize, Debug)]
-pub struct BrocadeSwitchAuth {
-    pub username: String,
-    pub password: String,
-}
--- a/examples/nats-module/Cargo.toml
+++ b/examples/nats-module/Cargo.toml
@@ -1,5 +1,5 @@
 [package]
-name = "example-ha-cluster"
+name = "example-nats-module-supercluster"
 edition = "2024"
 version.workspace = true
 readme.workspace = true
@@ -8,7 +8,7 @@ publish = false

 [dependencies]
 harmony = { path = "../../harmony" }
-harmony_tui = { path = "../../harmony_tui" }
+harmony_cli = { path = "../../harmony_cli" }
 harmony_types = { path = "../../harmony_types" }
 cidr = { workspace = true }
 tokio = { workspace = true }
@@ -16,6 +16,4 @@ harmony_macros = { path = "../../harmony_macros" }
 log = { workspace = true }
 env_logger = { workspace = true }
 url = { workspace = true }
-harmony_secret = { path = "../../harmony_secret" }
-brocade = { path = "../../brocade" }
-serde = { workspace = true }
+k8s-openapi.workspace = true
--- a/examples/nats-module/env_example.sh
+++ b/examples/nats-module/env_example.sh
@@ -0,0 +1,9 @@
+# Cluster 1
+export HARMONY_DECENTRALIZED_TOPOLOGY_K8S_SITE_1="kubeconfig=$HOME/.kube/config,context=cluster-context"
+export HARMONY_NATS_SITE_1_DOMAIN="your.domain.1"
+# Cluster 2
+export HARMONY_DECENTRALIZED_TOPOLOGY_K8S_SITE_2="kubeconfig=$HOME/.kube/config,context=cluster-context"
+export HARMONY_NATS_SITE_2_DOMAIN="your.domain.2"
+# Cluster 3
+export HARMONY_DECENTRALIZED_TOPOLOGY_K8S_SITE_3="kubeconfig=$HOME/.kube/config,context=cluster-context"
+export HARMONY_NATS_SITE_3_DOMAIN="your.domain.3"
--- a/examples/nats-module/src/main.rs
+++ b/examples/nats-module/src/main.rs
@@ -0,0 +1,77 @@
+use harmony::{
+    inventory::Inventory,
+    modules::nats::{capability::NatsCluster, score_nats_supercluster::NatsSuperclusterScore},
+    topology::{K8sAnywhereTopology, decentralized::DecentralizedTopology},
+};
+
+/// Deploys the three-site NATS supercluster example; panics if a site domain env var is unset.
+#[tokio::main]
+async fn main() {
+    let supercluster_ca_secret_name = "nats-supercluster-ca-bundle";
+    let tls_cert_name = "nats-gateway";
+    let jetstream_enabled = "false";
+    let nats_namespace = "nats-example".to_string();
+
+    let site_1_name = "site-1".to_string();
+    let site_1_domain =
+        std::env::var("HARMONY_NATS_SITE_1_DOMAIN").expect("missing domain in env for site_1");
+
+    let nats_site_1 = NatsCluster {
+        namespace: nats_namespace.clone(),
+        domain: site_1_domain.clone(),
+        replicas: 1,
+        name: site_1_name.clone(),
+        gateway_advertise: format!("{site_1_name}-gw.{site_1_domain}:443"),
+        dns_name: format!("{site_1_name}-gw.{site_1_domain}"),
+        supercluster_ca_secret_name,
+        tls_cert_name,
+        jetstream_enabled,
+    };
+
+    let site_2_name = "site-2".to_string();
+    let site_2_domain =
+        std::env::var("HARMONY_NATS_SITE_2_DOMAIN").expect("missing domain in env for site_2");
+
+    let nats_site_2 = NatsCluster {
+        namespace: nats_namespace.clone(),
+        domain: site_2_domain.clone(),
+        replicas: 1,
+        name: site_2_name.clone(),
+        gateway_advertise: format!("{site_2_name}-gw.{site_2_domain}:443"),
+        dns_name: format!("{site_2_name}-gw.{site_2_domain}"),
+        supercluster_ca_secret_name,
+        tls_cert_name,
+        jetstream_enabled,
+    };
+
+    let site_3_name = "site-3".to_string();
+    let site_3_domain =
+        std::env::var("HARMONY_NATS_SITE_3_DOMAIN").expect("missing domain in env for site_3");
+
+    let nats_site_3 = NatsCluster {
+        namespace: nats_namespace.clone(),
+        domain: site_3_domain.clone(),
+        replicas: 1,
+        name: site_3_name.clone(),
+        gateway_advertise: format!("{site_3_name}-gw.{site_3_domain}:443"),
+        dns_name: format!("{site_3_name}-gw.{site_3_domain}"),
+        supercluster_ca_secret_name,
+        tls_cert_name,
+        jetstream_enabled,
+    };
+
+    // All three sites go into the one `NatsSuperclusterScore`.
+    let nats_supercluster = NatsSuperclusterScore {
+        nats_cluster: vec![nats_site_1, nats_site_2, nats_site_3],
+        ca_certs: None,
+    };
+
+    harmony_cli::run(
+        Inventory::autoload(),
+        DecentralizedTopology::<K8sAnywhereTopology>::from_env(),
+        vec![Box::new(nats_supercluster)],
+        None,
+    )
+    .await
+    .unwrap();
+}
--- a/examples/nats-supercluster/Cargo.toml
+++ b/examples/nats-supercluster/Cargo.toml
@@ -0,0 +1,19 @@
+[package]
+name = "example-nats-supercluster"
+edition = "2024"
+version.workspace = true
+readme.workspace = true
+license.workspace = true
+publish = false
+
+[dependencies]
+harmony = { path = "../../harmony" }
+harmony_cli = { path = "../../harmony_cli" }
+harmony_types = { path = "../../harmony_types" }
+cidr = { workspace = true }
+tokio = { workspace = true }
+harmony_macros = { path = "../../harmony_macros" }
+log = { workspace = true }
+env_logger = { workspace = true }
+url = { workspace = true }
+k8s-openapi.workspace = true
--- a/examples/nats-supercluster/env_example.sh
+++ b/examples/nats-supercluster/env_example.sh
@@ -0,0 +1,6 @@
+# Cluster 1
+export HARMONY_NATS_SITE_1="kubeconfig=$HOME/.config/nt/kube/config,context=your_cluster_1_kube_context_name"
+export HARMONY_NATS_SITE_1_DOMAIN="your_cluster_1_public_domain"
+# Cluster 2
+export HARMONY_NATS_SITE_2="kubeconfig=$HOME/.config/nt/kube/config,context=your_cluster_2_kube_context_name"
+export HARMONY_NATS_SITE_2_DOMAIN="your_cluster_2_public_domain"
--- a/examples/nats-supercluster/src/main.rs
+++ b/examples/nats-supercluster/src/main.rs
@@ -0,0 +1,481 @@
+use std::{collections::BTreeMap, str::FromStr};
+
+use harmony::{
+    interpret::{InterpretError, Outcome},
+    inventory::Inventory,
+    modules::{
+        cert_manager::{
+            capability::{CertificateManagement, CertificateManagementConfig},
+            crd::CaIssuer,
+        },
+        helm::chart::{HelmChartScore, HelmRepository, NonBlankString},
+        k8s::resource::K8sResourceScore,
+        okd::{
+            crd::route::{RoutePort, RouteSpec, RouteTargetReference, TLSConfig},
+            route::OKDRouteScore,
+        },
+    },
+    score::Score,
+    topology::{
+        HelmCommand, K8sAnywhereConfig, K8sAnywhereTopology, K8sclient, TlsRouter, Topology,
+    },
+};
+use harmony_macros::hurl;
+use k8s_openapi::{
+    ByteString, api::core::v1::Secret, apimachinery::pkg::apis::meta::v1::ObjectMeta,
+};
+use log::{debug, info};
+
+/// Bootstraps a two-site NATS supercluster example.
+///
+/// Creates the certificates on both sites, shares one CA bundle, then applies the scores.
+/// Site topology and domain come from `HARMONY_NATS_SITE_{1,2}` and `..._DOMAIN`.
+#[tokio::main]
+async fn main() -> Result<(), InterpretError> {
+    let namespace = "nats-supercluster-test";
+    let self_signed_issuer_name = "harmony-self-signed-issuer";
+    let ca_issuer_name = "harmony-ca-issuer";
+    let root_ca_cert_name = "harmony-root-ca";
+
+    log::info!("starting nats supercluster bootstrap");
+
+    // --------------------------------------------------
+    // 1. Build site contexts
+    // --------------------------------------------------
+
+    let site1 = site(
+        "HARMONY_NATS_SITE_1",
+        "HARMONY_NATS_SITE_1_DOMAIN",
+        "nats-sto1-cert-test1",
+    );
+
+    let site2 = site(
+        "HARMONY_NATS_SITE_2",
+        "HARMONY_NATS_SITE_2_DOMAIN",
+        "nats-cb1-cert-test2",
+    );
+
+    // --------------------------------------------------
+    // 2. Ensure clusters are reachable
+    // --------------------------------------------------
+
+    log::info!("ensuring both topologies are ready");
+
+    tokio::try_join!(site1.topology.ensure_ready(), site2.topology.ensure_ready(),)?;
+
+    // --------------------------------------------------
+    // 3. Create certificates
+    // --------------------------------------------------
+
+    log::info!("creating certificates");
+
+    let root_ca_config = CertificateManagementConfig {
+        namespace: Some(namespace.into()),
+        acme_issuer: None,
+        ca_issuer: Some(CaIssuer {
+            secret_name: format!("{}-tls", root_ca_cert_name),
+        }),
+        self_signed: false,
+    };
+
+    let self_signed_config = CertificateManagementConfig {
+        namespace: Some(namespace.into()),
+        acme_issuer: None,
+        ca_issuer: None,
+        self_signed: true,
+    };
+
+    tokio::try_join!(
+        create_nats_certs(
+            site1.topology.clone(),
+            &site1.cluster,
+            ca_issuer_name,
+            &root_ca_config,
+            self_signed_issuer_name,
+            &self_signed_config,
+            root_ca_cert_name
+        ),
+        create_nats_certs(
+            site2.topology.clone(),
+            &site2.cluster,
+            ca_issuer_name,
+            &root_ca_config,
+            self_signed_issuer_name,
+            &self_signed_config,
+            root_ca_cert_name
+        ),
+    )?;
+
+    // --------------------------------------------------
+    // 4. Build CA bundle
+    // --------------------------------------------------
+
+    log::info!("building supercluster CA bundle");
+
+    let ca_bundle = vec![
+        site1
+            .topology
+            .get_ca_certificate(root_ca_cert_name.to_string(), &root_ca_config)
+            .await?,
+        site2
+            .topology
+            .get_ca_certificate(root_ca_cert_name.to_string(), &root_ca_config)
+            .await?,
+    ];
+
+    // --------------------------------------------------
+    // 5. Build Scores
+    // --------------------------------------------------
+
+    log::info!("building scores");
+
+    let site1_scores = vec![
+        build_ca_bundle_secret_score(
+            site1.topology.clone(),
+            &site1.cluster,
+            &ca_bundle,
+            namespace.into(),
+        )
+        .await,
+        build_route_score(site1.topology.clone(), &site1.cluster, namespace.into()).await,
+        build_deploy_nats_score(
+            site1.topology.clone(),
+            &site1.cluster,
+            vec![&site2.cluster],
+            namespace.into(),
+        )
+        .await,
+    ];
+
+    let site2_scores = vec![
+        build_ca_bundle_secret_score(
+            site2.topology.clone(),
+            &site2.cluster,
+            &ca_bundle,
+            namespace.into(),
+        )
+        .await,
+        build_route_score(site2.topology.clone(), &site2.cluster, namespace.into()).await,
+        build_deploy_nats_score(
+            site2.topology.clone(),
+            &site2.cluster,
+            vec![&site1.cluster],
+            namespace.into(),
+        )
+        .await,
+    ];
+
+    // --------------------------------------------------
+    // 6. Apply Scores
+    // --------------------------------------------------
+
+    log::info!("applying scores");
+
+    tokio::try_join!(
+        apply_scores(site1.topology.clone(), site1_scores),
+        apply_scores(site2.topology.clone(), site2_scores),
+    )?;
+
+    log::info!("supercluster bootstrap complete");
+    log::info!(
+        "Enjoy! You can test your nats cluster by running : `kubectl exec -n {namespace} -it deployment/nats-box -- nats pub test hi`"
+    );
+    Ok(())
+}
+
+/// Runs `scores` on `topology` via the harmony CLI; failures become an `InterpretError`.
+async fn apply_scores<T: Topology + 'static>(
+    topology: T,
+    scores: Vec<Box<dyn Score<T>>>,
+) -> Result<(), InterpretError> {
+    info!("applying {} scores", scores.len());
+
+    match harmony_cli::run(Inventory::autoload(), topology, scores, None).await {
+        Ok(_) => Ok(()),
+        Err(e) => Err(InterpretError::new(e.to_string())),
+    }
+}
+
+/// Builds one site's context (topology from `topo_env`, domain from `domain_env`).
+/// Panics, naming the variable, if `domain_env` is unset or not valid Unicode.
+fn site(
+    topo_env: &str,
+    domain_env: &str,
+    cluster_name: &'static str,
+) -> SiteContext<K8sAnywhereTopology> {
+    let domain = std::env::var(domain_env)
+        .unwrap_or_else(|err| panic!("missing domain env `{domain_env}`: {err}"));
+    let config = K8sAnywhereConfig::remote_k8s_from_env_var(topo_env);
+    SiteContext {
+        topology: K8sAnywhereTopology::with_config(config),
+        cluster: NatsCluster {
+            replicas: 1,
+            name: cluster_name,
+            gateway_advertise: format!("{cluster_name}-gw.{domain}:443"),
+            dns_name: format!("{cluster_name}-gw.{domain}"),
+            supercluster_ca_secret_name: "nats-supercluster-ca-bundle",
+            tls_cert_name: "nats-gateway",
+            jetstream_enabled: "true",
+        },
+    }
+}
+
+struct SiteContext<T> { // one supercluster site: where to deploy and what to deploy
+    topology: T, // handle used to reach the site's cluster
+    cluster: NatsCluster, // NATS settings for that site
+}
+
+struct NatsCluster { // per-site values consumed by the cert, route and Helm-values builders
+    replicas: usize, // rendered as `cluster.replicas` in the Helm values
+    name: &'static str, // cluster/gateway name; also the Helm release, route and Service name
+    gateway_advertise: String, // `host:port` advertised to peers and used in their gateway URLs
+    dns_name: String, // route host; also passed when creating the gateway certificate
+    supercluster_ca_secret_name: &'static str, // name of the Secret holding the CA bundle
+    tls_cert_name: &'static str, // gateway certificate name; Helm uses the `<name>-tls` secret
+    jetstream_enabled: &'static str, // text pasted verbatim into the YAML values, e.g. "true"
+}
+
+/// Creates the issuers and certificates one site needs for its NATS gateway.
+///
+/// Call order matters: the self-signed issuer comes first, then the root CA certificate it
+/// issues, then the CA issuer, and finally the gateway certificate issued by that CA issuer.
+/// The first failing step aborts the sequence and its error is returned.
+async fn create_nats_certs<T: Topology + CertificateManagement>(
+    topology: T,
+    cluster: &NatsCluster,
+    ca_issuer_name: &str,
+    ca_cert_mgmt_config: &CertificateManagementConfig,
+    self_signed_issuer_name: &str,
+    self_signed_cert_config: &CertificateManagementConfig,
+    root_ca_cert_name: &str,
+) -> Result<Outcome, InterpretError> {
+    debug!(
+        "Applying certs to ns {:#?}",
+        ca_cert_mgmt_config.namespace
+    );
+
+    debug!("creating issuer '{}'", self_signed_issuer_name);
+    topology
+        .create_issuer(self_signed_issuer_name.to_string(), self_signed_cert_config)
+        .await?;
+
+    debug!("creating certificate {root_ca_cert_name}");
+    topology
+        .create_certificate(
+            root_ca_cert_name.to_string(),
+            self_signed_issuer_name.to_string(),
+            Some(format!("harmony-{}-ca", cluster.name)),
+            None,
+            Some(true),
+            ca_cert_mgmt_config,
+        )
+        .await?;
+
+    debug!("creating issuer '{}'", ca_issuer_name);
+    topology
+        .create_issuer(ca_issuer_name.to_string(), ca_cert_mgmt_config)
+        .await?;
+
+    debug!("creating certificate {}", cluster.tls_cert_name);
+    topology
+        .create_certificate(
+            cluster.tls_cert_name.to_string(),
+            ca_issuer_name.to_string(),
+            None,
+            Some(vec![cluster.dns_name.clone()]),
+            Some(true),
+            ca_cert_mgmt_config,
+        )
+        .await?;
+
+    Ok(Outcome::success("success".to_string()))
+}
+
+/// Builds the opaque `Secret` that carries the CA bundle for `nats_cluster` in `namespace`.
+async fn build_ca_bundle_secret(
+    namespace: &str,
+    nats_cluster: &NatsCluster,
+    bundle: &[String],
+) -> Secret {
+    Secret {
+        metadata: ObjectMeta {
+            name: Some(nats_cluster.supercluster_ca_secret_name.to_string()),
+            namespace: Some(namespace.to_string()),
+            ..Default::default()
+        },
+        data: Some(build_secret_data(bundle).await),
+        immutable: Some(false),
+        type_: Some("Opaque".to_string()),
+        string_data: None,
+    }
+}
+
+/// Returns the Secret `data` map: every entry of `bundle`, newline-joined, under `ca.crt`.
+///
+/// Stays `async` only so the existing `.await` call site keeps compiling.
+async fn build_secret_data(bundle: &[String]) -> BTreeMap<String, ByteString> {
+    let joined = bundle.join("\n").into_bytes();
+    let mut data = BTreeMap::new();
+    data.insert("ca.crt".to_string(), ByteString(joined));
+    data
+}
+
+/// Wraps the CA-bundle `Secret` for `nats_cluster` in a score targeting `namespace`.
+async fn build_ca_bundle_secret_score<T: Topology + K8sclient + 'static>(
+    _topology: T,
+    nats_cluster: &NatsCluster,
+    ca_bundle: &Vec<String>,
+    namespace: String,
+) -> Box<dyn Score<T>> {
+    let secret = build_ca_bundle_secret(&namespace, nats_cluster, ca_bundle).await;
+    debug!(
+        "deploying secret to ns: {} \nsecret: {:#?}",
+        namespace, secret
+    );
+    Box::new(K8sResourceScore::single(secret, Some(namespace)))
+}
+
+/// Builds the passthrough-TLS route sending `dns_name` to port 7222 of the `name` Service.
+async fn build_route_score<T: Topology + K8sclient + 'static>(
+    _topology: T,
+    cluster: &NatsCluster,
+    namespace: String,
+) -> Box<dyn Score<T>> {
+    Box::new(OKDRouteScore {
+        name: cluster.name.to_string(),
+        namespace,
+        spec: RouteSpec {
+            to: RouteTargetReference {
+                kind: "Service".to_string(),
+                name: cluster.name.to_string(),
+                weight: Some(100),
+            },
+            host: Some(cluster.dns_name.clone()),
+            port: Some(RoutePort { target_port: 7222 }),
+            tls: Some(TLSConfig {
+                insecure_edge_termination_policy: None,
+                termination: "passthrough".to_string(),
+                ..Default::default()
+            }),
+            wildcard_policy: None,
+            ..Default::default()
+        },
+    })
+}
+
+/// Builds the Helm score that installs `nats/nats` for `cluster`, with `peers` as remote gateways.
+///
+/// Panics if `topology` cannot provide an internal domain (used for the websocket ingress host).
+/// NOTE(review): the values embed literal passwords and enable debug/trace; example use only.
+async fn build_deploy_nats_score<T: Topology + HelmCommand + TlsRouter + 'static>(
+    topology: T,
+    cluster: &NatsCluster,
+    peers: Vec<&NatsCluster>,
+    namespace: String,
+) -> Box<dyn Score<T>> {
+    let mut gateway_gateways = String::new();
+    for peer in peers {
+        // One gateway entry per peer, pointing at the nats:// address that peer advertises
+        gateway_gateways.push_str(&format!(
+            r#"
+        - name: {}
+          urls:
+            - nats://{}"#,
+            peer.name, peer.gateway_advertise
+        ));
+    }
+    let domain = topology.get_internal_domain().await.unwrap().unwrap();
+
+    // Inject gateway config into the 'merge' block to comply with chart structure
+    let values_yaml = Some(format!(
+        r#"config:
+  merge:
+    authorization:
+      default_permissions:
+        publish: ["TEST.*"]
+        subscribe:  ["PUBLIC.>"]
+      users:
+      # - user: "admin"
+      #   password: "admin_1"
+      #   permissions:
+      #       publish: ">"
+      #       subscribe: ">"
+      - password: "enGk0cgZUabM6bN6FXHT"
+        user: "testUser"
+    accounts:
+      system:
+          users:
+              - user: "admin"
+                password: "admin_2"
+    logtime: true
+    debug: true
+    trace: true
+    system_account: system
+  cluster:
+    name: {cluster_name}
+    enabled: true
+    replicas: {replicas}
+  jetstream:
+    enabled: {jetstream_enabled}
+    fileStorage:
+      enabled: true
+      size: 10Gi
+      storageDirectory: /data/jetstream
+  leafnodes:
+    enabled: false
+  websocket:
+    enabled: false
+    ingress:
+      enabled: true
+      className: openshift-default
+      pathType: Prefix
+      hosts: 
+        - nats-ws.{domain}
+  gateway:
+    enabled: true
+    port: 7222
+    name: {cluster_name}
+    merge:
+        advertise: {gateway_advertise}
+        gateways: {gateway_gateways}
+    tls:
+        enabled: true
+        secretName: {tls_secret_name}
+        # merge:
+        #    ca_file: "/etc/nats-certs/gateway/ca.crt"
+service:
+  ports:
+    gateway:
+      enabled: true
+tlsCA:
+    enabled: true
+    secretName: {supercluster_ca_secret_name}
+natsBox:
+  container:
+    image:
+      tag: nonroot"#,
+        cluster_name = cluster.name,
+        replicas = cluster.replicas,
+        gateway_advertise = cluster.gateway_advertise,
+        tls_secret_name = format!("{}-tls", cluster.tls_cert_name),
+        jetstream_enabled = cluster.jetstream_enabled,
+        supercluster_ca_secret_name = cluster.supercluster_ca_secret_name,
+    ));
+
+    debug!("Prepared Helm Chart values : \n{values_yaml:#?}");
+    Box::new(HelmChartScore {
+        namespace: Some(NonBlankString::from_str(&namespace).unwrap()),
+        release_name: NonBlankString::from_str(cluster.name).unwrap(),
+        chart_name: NonBlankString::from_str("nats/nats").unwrap(),
+        chart_version: None,
+        values_overrides: None,
+        values_yaml,
+        create_namespace: true,
+        install_only: false,
+        repository: Some(HelmRepository::new(
+            "nats".to_string(),
+            hurl!("https://nats-io.github.io/k8s/helm/charts/"),
+            true,
+        )),
+    })
+}
--- a/examples/nats/src/main.rs
+++ b/examples/nats/src/main.rs
@@ -3,15 +3,58 @@ use std::str::FromStr;
 use harmony::{
    inventory::Inventory,
    modules::helm::chart::{HelmChartScore, HelmRepository, NonBlankString},
-    topology::K8sAnywhereTopology,
+    topology::{HelmCommand, K8sAnywhereConfig, K8sAnywhereTopology, TlsRouter, Topology},
 };
 use harmony_macros::hurl;
 use log::info;

 #[tokio::main]
 async fn main() {
-    // env_logger::init();
-    let values_yaml = Some(
+    let site1_topo = K8sAnywhereTopology::with_config(K8sAnywhereConfig::remote_k8s_from_env_var(
+        "HARMONY_NATS_SITE_1",
+    ));
+    let site2_topo = K8sAnywhereTopology::with_config(K8sAnywhereConfig::remote_k8s_from_env_var(
+        "HARMONY_NATS_SITE_2",
+    ));
+
+    let site1_domain = site1_topo.get_internal_domain().await.unwrap().unwrap();
+    let site2_domain = site2_topo.get_internal_domain().await.unwrap().unwrap();
+
+    let site1_gateway = format!("nats-gateway.{}", site1_domain);
+    let site2_gateway = format!("nats-gateway.{}", site2_domain);
+
+    tokio::join!(
+        deploy_nats(
+            site1_topo,
+            "site-1",
+            vec![("site-2".to_string(), site2_gateway)]
+        ),
+        deploy_nats(
+            site2_topo,
+            "site-2",
+            vec![("site-1".to_string(), site1_gateway)]
+        ),
+    );
+}
+
+async fn deploy_nats<T: Topology + HelmCommand + TlsRouter + 'static>(
+    topology: T,
+    cluster_name: &str,
+    remote_gateways: Vec<(String, String)>,
+) {
+    topology.ensure_ready().await.unwrap();
+
+    let mut gateway_gateways = String::new();
+    for (name, url) in remote_gateways {
+        gateway_gateways.push_str(&format!(
+            r#"
+      - name: {name}
+        urls:
+          - nats://{url}:7222"#
+        ));
+    }
+
+    let values_yaml = Some(format!(
        r#"config:
  cluster:
    enabled: true
@@ -25,16 +68,31 @@ async fn main() {
  leafnodes:
    enabled: false
    # port: 7422
+  websocket:
+    enabled: true
+    ingress:
+      enabled: true
+      className: openshift-default
+      pathType: Prefix
+      hosts: 
+        - nats-ws.{}
  gateway:
-    enabled: false
-    # name: my-gateway
-    # port: 7522
+    enabled: true
+    name: {}
+    port: 7222
+    gateways: {}
+service:
+  ports:
+    gateway:
+      enabled: true
 natsBox:
  container:
    image:
-      tag: nonroot"#
-            .to_string(),
-    );
+      tag: nonroot"#,
+        topology.get_internal_domain().await.unwrap().unwrap(),
+        cluster_name,
+        gateway_gateways,
+    ));
    let namespace = "nats";
    let nats = HelmChartScore {
        namespace: Some(NonBlankString::from_str(namespace).unwrap()),
@@ -52,14 +110,9 @@ natsBox:
        )),
    };

-    harmony_cli::run(
-        Inventory::autoload(),
-        K8sAnywhereTopology::from_env(),
-        vec![Box::new(nats)],
-        None,
-    )
-    .await
-    .unwrap();
+    harmony_cli::run(Inventory::autoload(), topology, vec![Box::new(nats)], None)
+        .await
+        .unwrap();

    info!(
        "Enjoy! You can test your nats cluster by running : `kubectl exec -n {namespace} -it deployment/nats-box -- nats pub test hi`"
--- a/examples/node_health/Cargo.toml
+++ b/examples/node_health/Cargo.toml
@@ -0,0 +1,16 @@
+[package]
+name = "example-node-health"
+edition = "2024"
+version.workspace = true
+readme.workspace = true
+license.workspace = true
+publish = false
+
+[dependencies]
+harmony = { path = "../../harmony" }
+harmony_cli = { path = "../../harmony_cli" }
+harmony_types = { path = "../../harmony_types" }
+tokio = { workspace = true }
+harmony_macros = { path = "../../harmony_macros" }
+log = { workspace = true }
+env_logger = { workspace = true }
--- a/examples/node_health/src/main.rs
+++ b/examples/node_health/src/main.rs
@@ -0,0 +1,17 @@
+use harmony::{
+    inventory::Inventory, modules::node_health::NodeHealthScore, topology::K8sAnywhereTopology,
+};
+
+/// Runs `NodeHealthScore` on the `K8sAnywhereTopology` built from the environment.
+#[tokio::main]
+async fn main() {
+    // Panics (via `unwrap`) if the harmony CLI run returns an error.
+    harmony_cli::run(
+        Inventory::autoload(),
+        K8sAnywhereTopology::from_env(),
+        vec![Box::new(NodeHealthScore {})],
+        None,
+    )
+    .await
+    .unwrap();
+}
--- a/examples/okd_cluster_alerts/src/main.rs
+++ b/examples/okd_cluster_alerts/src/main.rs
@@ -1,35 +1,64 @@
-use std::collections::HashMap;
-
 use harmony::{
    inventory::Inventory,
    modules::monitoring::{
-        alert_channel::discord_alert_channel::DiscordWebhook,
-        okd::cluster_monitoring::OpenshiftClusterAlertScore,
+        alert_channel::discord_alert_channel::DiscordReceiver,
+        alert_rule::{
+            alerts::{
+                infra::opnsense::high_http_error_rate, k8s::pvc::high_pvc_fill_rate_over_two_days,
+            },
+            prometheus_alert_rule::AlertManagerRuleGroup,
+        },
+        okd::openshift_cluster_alerting_score::OpenshiftClusterAlertScore,
+        scrape_target::prometheus_node_exporter::PrometheusNodeExporter,
+    },
+    topology::{
+        K8sAnywhereTopology,
+        monitoring::{AlertMatcher, AlertRoute, MatchOp},
    },
-    topology::K8sAnywhereTopology,
 };
-use harmony_macros::hurl;
-use harmony_types::k8s_name::K8sName;
+
+use harmony_macros::{hurl, ip};

 #[tokio::main]
 async fn main() {
-    let mut sel = HashMap::new();
-    sel.insert(
-        "openshift_io_alert_source".to_string(),
-        "platform".to_string(),
-    );
-    let mut sel2 = HashMap::new();
-    sel2.insert("openshift_io_alert_source".to_string(), "".to_string());
-    let selectors = vec![sel, sel2];
+    let platform_matcher = AlertMatcher {
+        label: "prometheus".to_string(),
+        operator: MatchOp::Eq,
+        value: "openshift-monitoring/k8s".to_string(),
+    };
+    let severity = AlertMatcher {
+        label: "severity".to_string(),
+        operator: MatchOp::Eq,
+        value: "critical".to_string(),
+    };
+
+    let high_http_error_rate = high_http_error_rate();
+
+    let additional_rules = AlertManagerRuleGroup::new("test-rule", vec![high_http_error_rate]);
+
+    let scrape_target = PrometheusNodeExporter {
+        job_name: "firewall".to_string(),
+        metrics_path: "/metrics".to_string(),
+        listen_address: ip!("192.168.1.1"),
+        port: 9100,
+        ..Default::default()
+    };
+
    harmony_cli::run(
        Inventory::autoload(),
        K8sAnywhereTopology::from_env(),
        vec![Box::new(OpenshiftClusterAlertScore {
-            receivers: vec![Box::new(DiscordWebhook {
-                name: K8sName("wills-discord-webhook-example".to_string()),
-                url: hurl!("https://something.io"),
-                selectors: selectors,
+            receivers: vec![Box::new(DiscordReceiver {
+                name: "crit-wills-discord-channel-example".to_string(),
+                url: hurl!("https://test.io"),
+                route: AlertRoute {
+                    matchers: vec![severity],
+                    ..AlertRoute::default("crit-wills-discord-channel-example".to_string())
+                },
            })],
+            sender: harmony::modules::monitoring::okd::OpenshiftClusterAlertSender,
+            rules: vec![Box::new(additional_rules)],
+            scrape_targets: Some(vec![Box::new(scrape_target)]),
        })],
        None,
    )
--- a/examples/okd_installation/Cargo.toml
+++ b/examples/okd_installation/Cargo.toml
@@ -20,3 +20,4 @@ env_logger = { workspace = true }
 url = { workspace = true }
 serde.workspace = true
 brocade = { path = "../../brocade" }
+schemars = "0.8"
--- a/examples/okd_installation/src/topology.rs
+++ b/examples/okd_installation/src/topology.rs
@@ -2,19 +2,24 @@ use brocade::BrocadeOptions;
 use cidr::Ipv4Cidr;
 use harmony::{
    hardware::{Location, SwitchGroup},
-    infra::{brocade::BrocadeSwitchClient, opnsense::OPNSenseManagementInterface},
+    infra::{
+        brocade::{BrocadeSwitchClient, BrocadeSwitchConfig},
+        opnsense::OPNSenseManagementInterface,
+    },
    inventory::Inventory,
+    modules::brocade::BrocadeSwitchAuth,
    topology::{HAClusterTopology, LogicalHost, UnmanagedRouter},
 };
 use harmony_macros::{ip, ipv4};
 use harmony_secret::{Secret, SecretManager};
+use schemars::JsonSchema;
 use serde::{Deserialize, Serialize};
 use std::{
    net::IpAddr,
    sync::{Arc, OnceLock},
 };

-#[derive(Secret, Serialize, Deserialize, Debug, PartialEq)]
+#[derive(Secret, Serialize, Deserialize, JsonSchema, Debug, PartialEq)]
 struct OPNSenseFirewallConfig {
    username: String,
    password: String,
@@ -35,12 +40,11 @@ pub async fn get_topology() -> HAClusterTopology {
        dry_run: *harmony::config::DRY_RUN,
        ..Default::default()
    };
-    let switch_client = BrocadeSwitchClient::init(
-        &switches,
-        &switch_auth.username,
-        &switch_auth.password,
-        brocade_options,
-    )
+    let switch_client = BrocadeSwitchClient::init(BrocadeSwitchConfig {
+        ips: switches,
+        auth: switch_auth,
+        options: brocade_options,
+    })
    .await
    .expect("Failed to connect to switch");

@@ -102,9 +106,3 @@ pub fn get_inventory() -> Inventory {
        control_plane_host: vec![],
    }
 }
-
-#[derive(Secret, Serialize, Deserialize, Debug)]
-pub struct BrocadeSwitchAuth {
-    pub username: String,
-    pub password: String,
-}
--- a/examples/okd_pxe/Cargo.toml
+++ b/examples/okd_pxe/Cargo.toml
@@ -20,3 +20,4 @@ env_logger = { workspace = true }
 url = { workspace = true }
 serde.workspace = true
 brocade = { path = "../../brocade" }
+schemars = "0.8"
--- a/examples/okd_pxe/src/topology.rs
+++ b/examples/okd_pxe/src/topology.rs
@@ -3,13 +3,16 @@ use cidr::Ipv4Cidr;
 use harmony::{
    config::secret::OPNSenseFirewallCredentials,
    hardware::{Location, SwitchGroup},
-    infra::{brocade::BrocadeSwitchClient, opnsense::OPNSenseManagementInterface},
+    infra::{
+        brocade::{BrocadeSwitchClient, BrocadeSwitchConfig},
+        opnsense::OPNSenseManagementInterface,
+    },
    inventory::Inventory,
+    modules::brocade::BrocadeSwitchAuth,
    topology::{HAClusterTopology, LogicalHost, UnmanagedRouter},
 };
 use harmony_macros::{ip, ipv4};
-use harmony_secret::{Secret, SecretManager};
-use serde::{Deserialize, Serialize};
+use harmony_secret::SecretManager;
 use std::{
    net::IpAddr,
    sync::{Arc, OnceLock},
@@ -30,12 +33,11 @@ pub async fn get_topology() -> HAClusterTopology {
        dry_run: *harmony::config::DRY_RUN,
        ..Default::default()
    };
-    let switch_client = BrocadeSwitchClient::init(
-        &switches,
-        &switch_auth.username,
-        &switch_auth.password,
-        brocade_options,
-    )
+    let switch_client = BrocadeSwitchClient::init(BrocadeSwitchConfig {
+        ips: switches,
+        auth: switch_auth,
+        options: brocade_options,
+    })
    .await
    .expect("Failed to connect to switch");

@@ -97,9 +99,3 @@ pub fn get_inventory() -> Inventory {
        control_plane_host: vec![],
    }
 }
-
-#[derive(Secret, Serialize, Deserialize, Debug)]
-pub struct BrocadeSwitchAuth {
-    pub username: String,
-    pub password: String,
-}
--- a/examples/openbao/src/main.rs
+++ b/examples/openbao/src/main.rs
@@ -1,59 +1,11 @@
-use std::str::FromStr;
-
 use harmony::{
-    inventory::Inventory,
-    modules::helm::chart::{HelmChartScore, HelmRepository, NonBlankString},
-    topology::K8sAnywhereTopology,
+    inventory::Inventory, modules::openbao::OpenbaoScore, topology::K8sAnywhereTopology,
 };
-use harmony_macros::hurl;

 #[tokio::main]
 async fn main() {
-    let values_yaml = Some(
-        r#"server:
-  standalone:
-    enabled: true
-    config: |
-      listener "tcp" {
-        tls_disable = true
-        address = "[::]:8200"
-        cluster_address = "[::]:8201"
-      }
-
-      storage "file" {
-        path = "/openbao/data"
-      }
-
-  service:
-    enabled: true
-
-  dataStorage:
-    enabled: true
-    size: 10Gi
-    storageClass: null
-    accessMode: ReadWriteOnce
-
-  auditStorage:
-    enabled: true
-    size: 10Gi
-    storageClass: null
-    accessMode: ReadWriteOnce"#
-            .to_string(),
-    );
-    let openbao = HelmChartScore {
-        namespace: Some(NonBlankString::from_str("openbao").unwrap()),
-        release_name: NonBlankString::from_str("openbao").unwrap(),
-        chart_name: NonBlankString::from_str("openbao/openbao").unwrap(),
-        chart_version: None,
-        values_overrides: None,
-        values_yaml,
-        create_namespace: true,
-        install_only: true,
-        repository: Some(HelmRepository::new(
-            "openbao".to_string(),
-            hurl!("https://openbao.github.io/openbao-helm"),
-            true,
-        )),
+    let openbao = OpenbaoScore {
+        host: "openbao.sebastien.sto1.nationtech.io".to_string(),
    };

    harmony_cli::run(
--- a/examples/operatorhub_catalog/src/main.rs
+++ b/examples/operatorhub_catalog/src/main.rs
@@ -1,5 +1,3 @@
-use std::str::FromStr;
-
 use harmony::{
    inventory::Inventory,
    modules::{k8s::apps::OperatorHubCatalogSourceScore, postgresql::CloudNativePgOperatorScore},
@@ -9,7 +7,7 @@ use harmony::{
 #[tokio::main]
 async fn main() {
    let operatorhub_catalog = OperatorHubCatalogSourceScore::default();
-    let cnpg_operator = CloudNativePgOperatorScore::default();
+    let cnpg_operator = CloudNativePgOperatorScore::default_openshift();

    harmony_cli::run(
        Inventory::autoload(),
--- a/examples/opnsense/Cargo.toml
+++ b/examples/opnsense/Cargo.toml
@@ -19,3 +19,4 @@ url = { workspace = true }
 harmony_secret = { path = "../../harmony_secret" }
 brocade = { path = "../../brocade" }
 serde = { workspace = true }
+schemars = "0.8"
--- a/examples/opnsense/src/main.rs
+++ b/examples/opnsense/src/main.rs
@@ -7,6 +7,7 @@ use harmony::{
 };
 use harmony_macros::{ip, ipv4};
 use harmony_secret::{Secret, SecretManager};
+use schemars::JsonSchema;
 use serde::{Deserialize, Serialize};

 #[tokio::main]
@@ -70,7 +71,7 @@ async fn main() {
    .unwrap();
 }

-#[derive(Secret, Serialize, Deserialize, Debug)]
+#[derive(Secret, Serialize, Deserialize, JsonSchema, Debug)]
 pub struct BrocadeSwitchAuth {
    pub username: String,
    pub password: String,
--- a/examples/opnsense_node_exporter/src/main.rs
+++ b/examples/opnsense_node_exporter/src/main.rs
@@ -1,22 +1,13 @@
-use std::{
-    net::{IpAddr, Ipv4Addr},
-    sync::Arc,
-};
+use std::sync::Arc;

 use async_trait::async_trait;
-use cidr::Ipv4Cidr;
 use harmony::{
    executors::ExecutorError,
-    hardware::{HostCategory, Location, PhysicalHost, SwitchGroup},
-    infra::opnsense::OPNSenseManagementInterface,
    inventory::Inventory,
    modules::opnsense::node_exporter::NodeExporterScore,
-    topology::{
-        HAClusterTopology, LogicalHost, PreparationError, PreparationOutcome, Topology,
-        UnmanagedRouter, node_exporter::NodeExporter,
-    },
+    topology::{PreparationError, PreparationOutcome, Topology, node_exporter::NodeExporter},
 };
-use harmony_macros::{ip, ipv4, mac_address};
+use harmony_macros::ip;

 #[derive(Debug)]
 struct OpnSenseTopology {
--- a/examples/public_postgres/src/main.rs
+++ b/examples/public_postgres/src/main.rs
@@ -1,8 +1,7 @@
 use harmony::{
    inventory::Inventory,
    modules::postgresql::{
-        K8sPostgreSQLScore, PostgreSQLConnectionScore, PublicPostgreSQLScore,
-        capability::PostgreSQLConfig,
+        PostgreSQLConnectionScore, PublicPostgreSQLScore, capability::PostgreSQLConfig,
    },
    topology::K8sAnywhereTopology,
 };
--- a/examples/rhob_application_monitoring/src/main.rs
+++ b/examples/rhob_application_monitoring/src/main.rs
@@ -1,4 +1,4 @@
-use std::{collections::HashMap, path::PathBuf, sync::Arc};
+use std::{path::PathBuf, sync::Arc};

 use harmony::{
    inventory::Inventory,
@@ -6,9 +6,9 @@ use harmony::{
        application::{
            ApplicationScore, RustWebFramework, RustWebapp, features::rhob_monitoring::Monitoring,
        },
-        monitoring::alert_channel::discord_alert_channel::DiscordWebhook,
+        monitoring::alert_channel::discord_alert_channel::DiscordReceiver,
    },
-    topology::K8sAnywhereTopology,
+    topology::{K8sAnywhereTopology, monitoring::AlertRoute},
 };
 use harmony_types::{k8s_name::K8sName, net::Url};

@@ -22,18 +22,21 @@ async fn main() {
        service_port: 3000,
    });

-    let discord_receiver = DiscordWebhook {
-        name: K8sName("test-discord".to_string()),
+    let receiver_name = "test-discord".to_string();
+    let discord_receiver = DiscordReceiver {
+        name: receiver_name.clone(),
        url: Url::Url(url::Url::parse("https://discord.doesnt.exist.com").unwrap()),
-        selectors: vec![],
+        route: AlertRoute {
+            ..AlertRoute::default(receiver_name)
+        },
    };

    let app = ApplicationScore {
        features: vec![
-            Box::new(Monitoring {
-                application: application.clone(),
-                alert_receiver: vec![Box::new(discord_receiver)],
-            }),
+            // Box::new(Monitoring {
+            //     application: application.clone(),
+            //     alert_receiver: vec![Box::new(discord_receiver)],
+            // }),
            // TODO add backups, multisite ha, etc
        ],
        application,
--- a/examples/rust/src/main.rs
+++ b/examples/rust/src/main.rs
@@ -1,4 +1,4 @@
-use std::{collections::HashMap, path::PathBuf, sync::Arc};
+use std::{path::PathBuf, sync::Arc};

 use harmony::{
    inventory::Inventory,
@@ -8,13 +8,13 @@ use harmony::{
            features::{Monitoring, PackagingDeployment},
        },
        monitoring::alert_channel::{
-            discord_alert_channel::DiscordWebhook, webhook_receiver::WebhookReceiver,
+            discord_alert_channel::DiscordReceiver, webhook_receiver::WebhookReceiver,
        },
    },
-    topology::K8sAnywhereTopology,
+    topology::{K8sAnywhereTopology, monitoring::AlertRoute},
 };
 use harmony_macros::hurl;
-use harmony_types::k8s_name::K8sName;
+use harmony_types::{k8s_name::K8sName, net::Url};

 #[tokio::main]
 async fn main() {
@@ -26,15 +26,23 @@ async fn main() {
        service_port: 3000,
    });

-    let discord_receiver = DiscordWebhook {
-        name: K8sName("test-discord".to_string()),
-        url: hurl!("https://discord.doesnt.exist.com"),
-        selectors: vec![],
+    let receiver_name = "test-discord".to_string();
+    let discord_receiver = DiscordReceiver {
+        name: receiver_name.clone(),
+        url: Url::Url(url::Url::parse("https://discord.doesnt.exist.com").unwrap()),
+        route: AlertRoute {
+            ..AlertRoute::default(receiver_name)
+        },
    };

+    let receiver_name = "sample-webhook-receiver".to_string();
+
    let webhook_receiver = WebhookReceiver {
-        name: "sample-webhook-receiver".to_string(),
+        name: receiver_name.clone(),
        url: hurl!("https://webhook-doesnt-exist.com"),
+        route: AlertRoute {
+            ..AlertRoute::default(receiver_name)
+        },
    };

    let app = ApplicationScore {
@@ -42,10 +50,10 @@ async fn main() {
            Box::new(PackagingDeployment {
                application: application.clone(),
            }),
-            Box::new(Monitoring {
-                application: application.clone(),
-                alert_receiver: vec![Box::new(discord_receiver), Box::new(webhook_receiver)],
-            }),
+            // Box::new(Monitoring {
+            //     application: application.clone(),
+            //     alert_receiver: vec![Box::new(discord_receiver), Box::new(webhook_receiver)],
+            // }),
            // TODO add backups, multisite ha, etc
        ],
        application,
--- a/examples/sttest/Cargo.toml
+++ b/examples/sttest/Cargo.toml
@@ -20,3 +20,4 @@ env_logger = { workspace = true }
 url = { workspace = true }
 serde = { workspace = true }
 brocade = { path = "../../brocade" }
+schemars = "0.8"
--- a/examples/sttest/src/topology.rs
+++ b/examples/sttest/src/topology.rs
@@ -7,13 +7,14 @@ use harmony::{
 };
 use harmony_macros::{ip, ipv4};
 use harmony_secret::{Secret, SecretManager};
+use schemars::JsonSchema;
 use serde::{Deserialize, Serialize};
 use std::{
    net::IpAddr,
    sync::{Arc, OnceLock},
 };

-#[derive(Secret, Serialize, Deserialize, Debug, PartialEq)]
+#[derive(Secret, Serialize, Deserialize, JsonSchema, Debug, PartialEq)]
 struct OPNSenseFirewallConfig {
    username: String,
    password: String,
--- a/examples/try_rust_webapp/Cargo.toml
+++ b/examples/try_rust_webapp/Cargo.toml
@@ -5,6 +5,10 @@ version.workspace = true
 readme.workspace = true
 license.workspace = true

+[[example]]
+name = "try_rust_webapp"
+path = "src/main.rs"
+
 [dependencies]
 harmony = { path = "../../harmony" }
 harmony_cli = { path = "../../harmony_cli" }
--- a/examples/try_rust_webapp/src/main.rs
+++ b/examples/try_rust_webapp/src/main.rs
@@ -1,11 +1,8 @@
 use harmony::{
    inventory::Inventory,
-    modules::{
-        application::{
-            ApplicationScore, RustWebFramework, RustWebapp,
-            features::{Monitoring, PackagingDeployment},
-        },
-        monitoring::alert_channel::discord_alert_channel::DiscordWebhook,
+    modules::application::{
+        ApplicationScore, RustWebFramework, RustWebapp,
+        features::{Monitoring, PackagingDeployment},
    },
    topology::K8sAnywhereTopology,
 };
@@ -30,14 +27,14 @@ async fn main() {
            Box::new(PackagingDeployment {
                application: application.clone(),
            }),
-            Box::new(Monitoring {
-                application: application.clone(),
-                alert_receiver: vec![Box::new(DiscordWebhook {
-                    name: K8sName("test-discord".to_string()),
-                    url: hurl!("https://discord.doesnt.exist.com"),
-                    selectors: vec![],
-                })],
-            }),
+            // Box::new(Monitoring {
+            //     application: application.clone(),
+            //     alert_receiver: vec![Box::new(DiscordWebhook {
+            //         name: K8sName("test-discord".to_string()),
+            //         url: hurl!("https://discord.doesnt.exist.com"),
+            //         selectors: vec![],
+            //     })],
+            // }),
        ],
        application,
    };
--- a/examples/zitadel/Cargo.toml
+++ b/examples/zitadel/Cargo.toml
@@ -0,0 +1,14 @@
+[package]
+name = "example-zitadel"
+edition = "2024"
+version.workspace = true
+readme.workspace = true
+license.workspace = true
+
+[dependencies]
+harmony = { path = "../../harmony" }
+harmony_cli = { path = "../../harmony_cli" }
+harmony_macros = { path = "../../harmony_macros" }
+harmony_types = { path = "../../harmony_types" }
+tokio.workspace = true
+url.workspace = true
--- a/examples/zitadel/src/main.rs
+++ b/examples/zitadel/src/main.rs
@@ -0,0 +1,20 @@
+use harmony::{
+    inventory::Inventory, modules::zitadel::ZitadelScore, topology::K8sAnywhereTopology,
+};
+
+#[tokio::main]
+async fn main() {
+    let zitadel = ZitadelScore {
+        host: "sso.sto1.nationtech.io".to_string(),
+        zitadel_version: "v4.12.1".to_string(),
+    };
+
+    harmony_cli::run(
+        Inventory::autoload(),
+        K8sAnywhereTopology::from_env(),
+        vec![Box::new(zitadel)],
+        None,
+    )
+    .await
+    .unwrap();
+}
--- a/examples/zitadel/zitadel-9.24.0.tgz
+++ b/examples/zitadel/zitadel-9.24.0.tgz
--- a/harmony-k8s/Cargo.toml
+++ b/harmony-k8s/Cargo.toml
@@ -0,0 +1,23 @@
+[package]
+name = "harmony-k8s"
+edition = "2024"
+version.workspace = true
+readme.workspace = true
+license.workspace = true
+
+[dependencies]
+kube.workspace = true
+k8s-openapi.workspace = true
+tokio.workspace = true
+tokio-retry.workspace = true
+serde.workspace = true
+serde_json.workspace = true
+serde_yaml.workspace = true
+log.workspace = true
+similar.workspace = true
+reqwest.workspace = true
+url.workspace = true
+inquire.workspace = true
+
+[dev-dependencies]
+pretty_assertions.workspace = true
--- a/harmony-k8s/src/apply.rs
+++ b/harmony-k8s/src/apply.rs
@@ -0,0 +1,593 @@
+use kube::{
+    Client, Error, Resource,
+    api::{
+        Api, ApiResource, DynamicObject, GroupVersionKind, Patch, PatchParams, PostParams,
+        ResourceExt,
+    },
+    core::ErrorResponse,
+    discovery::Scope,
+    error::DiscoveryError,
+};
+use log::{debug, error, trace, warn};
+use serde::{Serialize, de::DeserializeOwned};
+use serde_json::Value;
+use similar::TextDiff;
+use url::Url;
+
+use crate::client::K8sClient;
+use crate::helper;
+use crate::types::WriteMode;
+
+/// The field-manager token sent with every server-side apply request.
+pub const FIELD_MANAGER: &str = "harmony-k8s";
+
+// ── Private helpers ──────────────────────────────────────────────────────────
+
+/// Serialise any `Serialize` payload to a [`DynamicObject`] via JSON.
+fn to_dynamic<T: Serialize>(payload: &T) -> Result<DynamicObject, Error> {
+    serde_json::from_value(serde_json::to_value(payload).map_err(Error::SerdeError)?)
+        .map_err(Error::SerdeError)
+}
+
+/// Fetch the live resource, drop its top-level `status` field, and print a
+/// line-by-line diff against `payload` to stdout. Only a GET is issued; nothing is written.
+///
+/// A 404 is treated as "resource would be created" — not an error.
+async fn show_dry_run<T: Serialize>(
+    api: &Api<DynamicObject>,
+    name: &str,
+    payload: &T,
+) -> Result<(), Error> {
+    let new_yaml = serde_yaml::to_string(payload)
+        .unwrap_or_else(|_| "Failed to serialize new resource".to_string());
+
+    match api.get(name).await {
+        Ok(current) => {
+            println!("\nDry-run for resource: '{name}'");
+            let mut current_val = serde_yaml::to_value(&current).unwrap_or(serde_yaml::Value::Null);
+            if let Some(map) = current_val.as_mapping_mut() {
+                map.remove(&serde_yaml::Value::String("status".to_string()));
+            }
+            let current_yaml = serde_yaml::to_string(&current_val)
+                .unwrap_or_else(|_| "Failed to serialize current resource".to_string());
+
+            if current_yaml == new_yaml {
+                println!("No changes detected.");
+            } else {
+                println!("Changes detected:");
+                let diff = TextDiff::from_lines(&current_yaml, &new_yaml);
+                for change in diff.iter_all_changes() {
+                    let sign = match change.tag() {
+                        similar::ChangeTag::Delete => "-",
+                        similar::ChangeTag::Insert => "+",
+                        similar::ChangeTag::Equal => " ",
+                    };
+                    print!("{sign}{change}");
+                }
+            }
+            Ok(())
+        }
+        Err(Error::Api(ErrorResponse { code: 404, .. })) => {
+            println!("\nDry-run for new resource: '{name}'");
+            println!("Resource does not exist. Would be created:");
+            for line in new_yaml.lines() {
+                println!("+{line}");
+            }
+            Ok(())
+        }
+        Err(e) => {
+            error!("Failed to fetch resource '{name}' for dry-run: {e}");
+            Err(e)
+        }
+    }
+}
+
+/// Execute the real (non-dry-run) apply, respecting [`WriteMode`].
+async fn do_apply<T: Serialize + std::fmt::Debug>(
+    api: &Api<DynamicObject>,
+    name: &str,
+    payload: &T,
+    patch_params: &PatchParams,
+    write_mode: &WriteMode,
+) -> Result<DynamicObject, Error> {
+    match write_mode {
+        WriteMode::CreateOrUpdate => {
+            // TODO: refactor this arm to try the `WriteMode::Update` logic first and, if it
+            // fails with a 404, fall back to the `WriteMode::Create` logic. That removes the
+            // duplicated api.patch / api.create calls here and eases maintenance.
+            match api.patch(name, patch_params, &Patch::Apply(payload)).await {
+                Ok(obj) => Ok(obj),
+                Err(Error::Api(ErrorResponse { code: 404, .. })) => {
+                    debug!("Resource '{name}' not found via SSA, falling back to POST");
+                    let dyn_obj = to_dynamic(payload)?;
+                    api.create(&PostParams::default(), &dyn_obj)
+                        .await
+                        .map_err(|e| {
+                            error!("Failed to create '{name}': {e}");
+                            e
+                        })
+                }
+                Err(e) => {
+                    error!("Failed to apply '{name}': {e}");
+                    Err(e)
+                }
+            }
+        }
+        WriteMode::Create => {
+            let dyn_obj = to_dynamic(payload)?;
+            api.create(&PostParams::default(), &dyn_obj)
+                .await
+                .map_err(|e| {
+                    error!("Failed to create '{name}': {e}");
+                    e
+                })
+        }
+        WriteMode::Update => match api.patch(name, patch_params, &Patch::Apply(payload)).await {
+            Ok(obj) => Ok(obj),
+            Err(Error::Api(ErrorResponse { code: 404, .. })) => Err(Error::Api(ErrorResponse {
+                code: 404,
+                message: format!("Resource '{name}' not found and WriteMode is UpdateOnly"),
+                reason: "NotFound".to_string(),
+                status: "Failure".to_string(),
+            })),
+            Err(e) => {
+                error!("Failed to update '{name}': {e}");
+                Err(e)
+            }
+        },
+    }
+}
+
+// ── Public API ───────────────────────────────────────────────────────────────
+
+impl K8sClient {
+    /// Server-side apply: create if absent, update if present.
+    /// Roughly equivalent to `kubectl apply --server-side`.
+    pub async fn apply<K>(&self, resource: &K, namespace: Option<&str>) -> Result<K, Error>
+    where
+        K: Resource + Clone + std::fmt::Debug + DeserializeOwned + Serialize,
+        <K as Resource>::DynamicType: Default,
+    {
+        self.apply_with_strategy(resource, namespace, WriteMode::CreateOrUpdate)
+            .await
+    }
+
+    /// POST only — returns an error if the resource already exists.
+    pub async fn create<K>(&self, resource: &K, namespace: Option<&str>) -> Result<K, Error>
+    where
+        K: Resource + Clone + std::fmt::Debug + DeserializeOwned + Serialize,
+        <K as Resource>::DynamicType: Default,
+    {
+        self.apply_with_strategy(resource, namespace, WriteMode::Create)
+            .await
+    }
+
+    /// Server-side apply only — returns an error if the resource does not exist.
+    pub async fn update<K>(&self, resource: &K, namespace: Option<&str>) -> Result<K, Error>
+    where
+        K: Resource + Clone + std::fmt::Debug + DeserializeOwned + Serialize,
+        <K as Resource>::DynamicType: Default,
+    {
+        self.apply_with_strategy(resource, namespace, WriteMode::Update)
+            .await
+    }
+
+    pub async fn apply_with_strategy<K>(
+        &self,
+        resource: &K,
+        namespace: Option<&str>,
+        write_mode: WriteMode,
+    ) -> Result<K, Error>
+    where
+        K: Resource + Clone + std::fmt::Debug + DeserializeOwned + Serialize,
+        <K as Resource>::DynamicType: Default,
+    {
+        debug!(
+            "apply_with_strategy: {:?} ns={:?}",
+            resource.meta().name,
+            namespace
+        );
+        trace!("{:#}", serde_json::to_value(resource).unwrap_or_default());
+
+        let dyntype = K::DynamicType::default();
+        let gvk = GroupVersionKind {
+            group: K::group(&dyntype).to_string(),
+            version: K::version(&dyntype).to_string(),
+            kind: K::kind(&dyntype).to_string(),
+        };
+
+        let discovery = self.discovery().await?;
+        let (ar, caps) = discovery.resolve_gvk(&gvk).ok_or_else(|| {
+            Error::Discovery(DiscoveryError::MissingResource(format!(
+                "Cannot resolve GVK: {gvk:?}"
+            )))
+        })?;
+
+        let effective_ns = if caps.scope == Scope::Cluster {
+            None
+        } else {
+            namespace.or_else(|| resource.meta().namespace.as_deref())
+        };
+
+        let api: Api<DynamicObject> =
+            get_dynamic_api(ar, caps, self.client.clone(), effective_ns, false);
+
+        let name = resource
+            .meta()
+            .name
+            .as_deref()
+            .expect("Kubernetes resource must have a name");
+
+        if self.dry_run {
+            show_dry_run(&api, name, resource).await?;
+            return Ok(resource.clone());
+        }
+
+        let patch_params = PatchParams::apply(FIELD_MANAGER);
+        do_apply(&api, name, resource, &patch_params, &write_mode)
+            .await
+            .and_then(helper::dyn_to_typed)
+    }
+
+    /// Applies resources in order, one at a time, stopping at the first failure.
+    pub async fn apply_many<K>(&self, resources: &[K], ns: Option<&str>) -> Result<Vec<K>, Error>
+    where
+        K: Resource + Clone + std::fmt::Debug + DeserializeOwned + Serialize,
+        <K as Resource>::DynamicType: Default,
+    {
+        let mut result = Vec::new();
+        for r in resources.iter() {
+            let res = self.apply(r, ns).await;
+            if res.is_err() {
+                // NOTE: this may log sensitive data; downgrade to debug if needed.
+                warn!(
+                    "Failed to apply k8s resource: {}",
+                    serde_json::to_string_pretty(r).map_err(Error::SerdeError)?
+                );
+            }
+            result.push(res?);
+        }
+        Ok(result)
+    }
+
+    /// Apply a [`DynamicObject`] resource using server-side apply.
+    pub async fn apply_dynamic(
+        &self,
+        resource: &DynamicObject,
+        namespace: Option<&str>,
+        force_conflicts: bool,
+    ) -> Result<DynamicObject, Error> {
+        trace!("apply_dynamic {resource:#?} ns={namespace:?} force={force_conflicts}");
+
+        let discovery = self.discovery().await?;
+        let type_meta = resource.types.as_ref().ok_or_else(|| {
+            Error::BuildRequest(kube::core::request::Error::Validation(
+                "DynamicObject must have types (apiVersion and kind)".to_string(),
+            ))
+        })?;
+
+        let gvk = GroupVersionKind::try_from(type_meta).map_err(|_| {
+            Error::BuildRequest(kube::core::request::Error::Validation(format!(
+                "Invalid GVK in DynamicObject: {type_meta:?}"
+            )))
+        })?;
+
+        let (ar, caps) = discovery.resolve_gvk(&gvk).ok_or_else(|| {
+            Error::Discovery(DiscoveryError::MissingResource(format!(
+                "Cannot resolve GVK: {gvk:?}"
+            )))
+        })?;
+
+        let effective_ns = if caps.scope == Scope::Cluster {
+            None
+        } else {
+            namespace.or_else(|| resource.metadata.namespace.as_deref())
+        };
+
+        let api = get_dynamic_api(ar, caps, self.client.clone(), effective_ns, false);
+        let name = resource.metadata.name.as_deref().ok_or_else(|| {
+            Error::BuildRequest(kube::core::request::Error::Validation(
+                "DynamicObject must have metadata.name".to_string(),
+            ))
+        })?;
+
+        debug!(
+            "apply_dynamic kind={:?} name='{name}' ns={effective_ns:?}",
+            resource.types.as_ref().map(|t| &t.kind),
+        );
+
+        // NOTE: it would be nice to improve cohesion between the dynamic and typed APIs and
+        // avoid copy-pasting the dry-run handling (and some other logic) between them.
+        if self.dry_run {
+            show_dry_run(&api, name, resource).await?;
+            return Ok(resource.clone());
+        }
+
+        let mut patch_params = PatchParams::apply(FIELD_MANAGER);
+        patch_params.force = force_conflicts;
+
+        do_apply(
+            &api,
+            name,
+            resource,
+            &patch_params,
+            &WriteMode::CreateOrUpdate,
+        )
+        .await
+    }
+
+    pub async fn apply_dynamic_many(
+        &self,
+        resources: &[DynamicObject],
+        namespace: Option<&str>,
+        force_conflicts: bool,
+    ) -> Result<Vec<DynamicObject>, Error> {
+        let mut result = Vec::new();
+        for r in resources.iter() {
+            result.push(self.apply_dynamic(r, namespace, force_conflicts).await?);
+        }
+        Ok(result)
+    }
+
+    pub async fn apply_yaml_many(
+        &self,
+        #[allow(clippy::ptr_arg)] yaml: &Vec<serde_yaml::Value>,
+        ns: Option<&str>,
+    ) -> Result<(), Error> {
+        for y in yaml.iter() {
+            self.apply_yaml(y, ns).await?;
+        }
+        Ok(())
+    }
+
+    pub async fn apply_yaml(
+        &self,
+        yaml: &serde_yaml::Value,
+        ns: Option<&str>,
+    ) -> Result<(), Error> {
+        // NOTE wouldn't it be possible to parse this into a DynamicObject and simply call
+        // apply_dynamic instead of reimplementing api interactions?
+        let obj: DynamicObject =
+            serde_yaml::from_value(yaml.clone()).expect("YAML must deserialise to DynamicObject");
+        let name = obj.metadata.name.as_ref().expect("YAML must have a name");
+
+        let api_version = yaml["apiVersion"].as_str().expect("missing apiVersion");
+        let kind = yaml["kind"].as_str().expect("missing kind");
+
+        let mut it = api_version.splitn(2, '/');
+        let first = it.next().unwrap();
+        let (g, v) = match it.next() {
+            Some(second) => (first, second),
+            None => ("", first),
+        };
+
+        let api_resource = ApiResource::from_gvk(&GroupVersionKind::gvk(g, v, kind));
+        let namespace = ns.unwrap_or_else(|| {
+            obj.metadata
+                .namespace
+                .as_deref()
+                .expect("YAML must have a namespace when ns is not provided")
+        });
+
+        let api: Api<DynamicObject> =
+            Api::namespaced_with(self.client.clone(), namespace, &api_resource);
+
+        println!("Applying '{name}' in namespace '{namespace}'...");
+        let patch_params = PatchParams::apply(FIELD_MANAGER);
+        let result = api.patch(name, &patch_params, &Patch::Apply(&obj)).await?;
+        println!("Successfully applied '{}'.", result.name_any());
+        Ok(())
+    }
+
+    /// Similar to `kubectl apply -f <url>`; documents with an unresolvable GVK are skipped.
+    pub async fn apply_url(&self, url: Url, ns: Option<&str>) -> Result<(), Error> {
+        let patch_params = PatchParams::apply(FIELD_MANAGER);
+        let discovery = self.discovery().await?;
+
+        let yaml = reqwest::get(url)
+            .await
+            .expect("Could not fetch URL")
+            .text()
+            .await
+            .expect("Could not read response body");
+
+        for doc in multidoc_deserialize(&yaml).expect("Failed to parse YAML from URL") {
+            let obj: DynamicObject =
+                serde_yaml::from_value(doc).expect("YAML document is not a valid object");
+            let namespace = obj.metadata.namespace.as_deref().or(ns);
+            let type_meta = obj.types.as_ref().expect("Object is missing TypeMeta");
+            let gvk =
+                GroupVersionKind::try_from(type_meta).expect("Object has invalid GroupVersionKind");
+            let name = obj.name_any();
+
+            if let Some((ar, caps)) = discovery.resolve_gvk(&gvk) {
+                let api = get_dynamic_api(ar, caps, self.client.clone(), namespace, false);
+                trace!(
+                    "Applying {}:\n{}",
+                    gvk.kind,
+                    serde_yaml::to_string(&obj).unwrap_or_default()
+                );
+                let data: Value = serde_json::to_value(&obj).expect("serialisation failed");
+                let _r = api.patch(&name, &patch_params, &Patch::Apply(data)).await?;
+                debug!("Applied {} '{name}'", gvk.kind);
+            } else {
+                warn!("Skipping document with unknown GVK: {gvk:?}");
+            }
+        }
+        Ok(())
+    }
+
+    /// Build a dynamic API client from a [`DynamicObject`]'s type metadata.
+    pub(crate) fn get_api_for_dynamic_object(
+        &self,
+        object: &DynamicObject,
+        ns: Option<&str>,
+    ) -> Result<Api<DynamicObject>, Error> {
+        let ar = object
+            .types
+            .as_ref()
+            .and_then(|t| {
+                let parts: Vec<&str> = t.api_version.split('/').collect();
+                match parts.as_slice() {
+                    [version] => Some(ApiResource::from_gvk(&GroupVersionKind::gvk(
+                        "", version, &t.kind,
+                    ))),
+                    [group, version] => Some(ApiResource::from_gvk(&GroupVersionKind::gvk(
+                        group, version, &t.kind,
+                    ))),
+                    _ => None,
+                }
+            })
+            .ok_or_else(|| {
+                Error::BuildRequest(kube::core::request::Error::Validation(format!(
+                    "Invalid apiVersion in DynamicObject: {object:#?}"
+                )))
+            })?;
+
+        Ok(match ns {
+            Some(ns) => Api::namespaced_with(self.client.clone(), ns, &ar),
+            None => Api::default_namespaced_with(self.client.clone(), &ar),
+        })
+    }
+}
+
+// ── Free functions ───────────────────────────────────────────────────────────
+
+pub(crate) fn get_dynamic_api(
+    resource: kube::api::ApiResource,
+    capabilities: kube::discovery::ApiCapabilities,
+    client: Client,
+    ns: Option<&str>,
+    all: bool,
+) -> Api<DynamicObject> {
+    if capabilities.scope == Scope::Cluster || all {
+        Api::all_with(client, &resource)
+    } else if let Some(namespace) = ns {
+        Api::namespaced_with(client, namespace, &resource)
+    } else {
+        Api::default_namespaced_with(client, &resource)
+    }
+}
+
+pub(crate) fn multidoc_deserialize(
+    data: &str,
+) -> Result<Vec<serde_yaml::Value>, serde_yaml::Error> {
+    use serde::Deserialize;
+    let mut docs = vec![];
+    for de in serde_yaml::Deserializer::from_str(data) {
+        docs.push(serde_yaml::Value::deserialize(de)?);
+    }
+    Ok(docs)
+}
+
+// ── Tests ────────────────────────────────────────────────────────────────────
+
+#[cfg(test)]
+mod apply_tests {
+    use std::collections::BTreeMap;
+    use std::time::{SystemTime, UNIX_EPOCH};
+
+    use k8s_openapi::api::core::v1::ConfigMap;
+    use k8s_openapi::apimachinery::pkg::apis::meta::v1::ObjectMeta;
+    use kube::api::{DeleteParams, TypeMeta};
+
+    use super::*;
+
+    #[tokio::test]
+    #[ignore = "requires kubernetes cluster"]
+    async fn apply_creates_new_configmap() {
+        let client = K8sClient::try_default().await.unwrap();
+        let ns = "default";
+        let name = format!(
+            "test-cm-{}",
+            SystemTime::now()
+                .duration_since(UNIX_EPOCH)
+                .unwrap()
+                .as_millis()
+        );
+
+        let cm = ConfigMap {
+            metadata: ObjectMeta {
+                name: Some(name.clone()),
+                namespace: Some(ns.to_string()),
+                ..Default::default()
+            },
+            data: Some(BTreeMap::from([("key1".to_string(), "value1".to_string())])),
+            ..Default::default()
+        };
+
+        assert!(client.apply(&cm, Some(ns)).await.is_ok());
+
+        let api: Api<ConfigMap> = Api::namespaced(client.client.clone(), ns);
+        let _ = api.delete(&name, &DeleteParams::default()).await;
+    }
+
+    #[tokio::test]
+    #[ignore = "requires kubernetes cluster"]
+    async fn apply_is_idempotent() {
+        let client = K8sClient::try_default().await.unwrap();
+        let ns = "default";
+        let name = format!(
+            "test-idem-{}",
+            SystemTime::now()
+                .duration_since(UNIX_EPOCH)
+                .unwrap()
+                .as_millis()
+        );
+
+        let cm = ConfigMap {
+            metadata: ObjectMeta {
+                name: Some(name.clone()),
+                namespace: Some(ns.to_string()),
+                ..Default::default()
+            },
+            data: Some(BTreeMap::from([("key".to_string(), "value".to_string())])),
+            ..Default::default()
+        };
+
+        assert!(
+            client.apply(&cm, Some(ns)).await.is_ok(),
+            "first apply failed"
+        );
+        assert!(
+            client.apply(&cm, Some(ns)).await.is_ok(),
+            "second apply failed (not idempotent)"
+        );
+
+        let api: Api<ConfigMap> = Api::namespaced(client.client.clone(), ns);
+        let _ = api.delete(&name, &DeleteParams::default()).await;
+    }
+
+    #[tokio::test]
+    #[ignore = "requires kubernetes cluster"]
+    async fn apply_dynamic_creates_new_resource() {
+        let client = K8sClient::try_default().await.unwrap();
+        let ns = "default";
+        let name = format!(
+            "test-dyn-{}",
+            SystemTime::now()
+                .duration_since(UNIX_EPOCH)
+                .unwrap()
+                .as_millis()
+        );
+
+        let obj = DynamicObject {
+            types: Some(TypeMeta {
+                api_version: "v1".to_string(),
+                kind: "ConfigMap".to_string(),
+            }),
+            metadata: ObjectMeta {
+                name: Some(name.clone()),
+                namespace: Some(ns.to_string()),
+                ..Default::default()
+            },
+            data: serde_json::json!({}),
+        };
+
+        let result = client.apply_dynamic(&obj, Some(ns), false).await;
+        assert!(result.is_ok(), "apply_dynamic failed: {:?}", result.err());
+
+        let api: Api<ConfigMap> = Api::namespaced(client.client.clone(), ns);
+        let _ = api.delete(&name, &DeleteParams::default()).await;
+    }
+}
--- a/harmony-k8s/src/bundle.rs
+++ b/harmony-k8s/src/bundle.rs
@@ -0,0 +1,133 @@
+//! Resource Bundle Pattern Implementation
+//!
+//! This module implements the Resource Bundle pattern for managing groups of
+//! Kubernetes resources that form a logical unit of work.
+//!
+//! ## Purpose
+//!
+//! The ResourceBundle pattern addresses the need to manage ephemeral privileged
+//! pods along with their platform-specific security requirements (e.g., OpenShift
+//! Security Context Constraints).
+//!
+//! ## Use Cases
+//!
+//! - Writing files to node filesystems (e.g., NetworkManager configurations for
+//!   network bonding as described in ADR-019)
+//! - Running privileged commands on nodes (e.g., reboots, system configuration)
+//!
+//! ## Benefits
+//!
+//! - **Separation of Concerns**: Client code doesn't need to know about
+//!   platform-specific RBAC requirements
+//! - **Grouped Operations**: Resources are applied and deleted as a unit
+//!   (sequentially, one request per resource; this is not transactional)
+//! - **Clean Abstractions**: Privileged operations are encapsulated in bundles
+//!   rather than scattered throughout client methods
+//!
+//! ## Example
+//!
+//! ```
+//! use harmony_k8s::{K8sClient, helper};
+//! use harmony_k8s::KubernetesDistribution;
+//!
+//! async fn write_network_config(client: &K8sClient, node: &str) {
+//!     // Create a bundle with platform-specific RBAC
+//!     let bundle = helper::build_privileged_bundle(
+//!         helper::PrivilegedPodConfig {
+//!             name: "network-config".to_string(),
+//!             namespace: "default".to_string(),
+//!             node_name: node.to_string(),
+//!             // ... other config
+//!             ..Default::default()
+//!         },
+//!         &KubernetesDistribution::OpenshiftFamily,
+//!     );
+//!     
+//!     // Apply all resources (RBAC + Pod) in the order they were added
+//!     bundle.apply(client).await.unwrap();
+//!     
+//!     // ... wait for completion ...
+//!     
+//!     // Cleanup all resources
+//!     bundle.delete(client).await.unwrap();
+//! }
+//! ```
+
+use kube::{Error, Resource, ResourceExt, api::DynamicObject};
+use serde::Serialize;
+use serde_json;
+
+use crate::K8sClient;
+
+/// A ResourceBundle represents a logical unit of work consisting of multiple
+/// Kubernetes resources that should be applied or deleted together.
+///
+/// This pattern is useful for managing ephemeral privileged pods along with
+/// their required RBAC bindings (e.g., OpenShift SCC bindings).
+///
+/// `Default` yields an empty bundle, the same value `ResourceBundle::new()` returns.
+#[derive(Debug, Default)]
+pub struct ResourceBundle {
+    /// Resources in insertion order: `apply` walks them front to back,
+    /// `delete` walks them back to front.
+    pub resources: Vec<DynamicObject>,
+}
+
+impl ResourceBundle {
+    /// Creates an empty bundle.
+    pub fn new() -> Self {
+        Self {
+            resources: Vec::new(),
+        }
+    }
+
+    /// Add a Kubernetes resource to this bundle.
+    /// The resource is converted to a DynamicObject for generic handling.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the resource cannot be serialized to JSON or if that JSON
+    /// cannot be read back as a `DynamicObject`.
+    pub fn add<K>(&mut self, resource: K)
+    where
+        K: Resource + Serialize,
+        <K as Resource>::DynamicType: Default,
+    {
+        // Convert the typed resource to JSON, then to DynamicObject
+        let json = serde_json::to_value(&resource).expect("Failed to serialize resource");
+        let mut obj: DynamicObject =
+            serde_json::from_value(json).expect("Failed to convert to DynamicObject");
+
+        // Ensure type metadata is set: fall back to what the Rust type reports
+        // when the serialized form carried no apiVersion/kind.
+        if obj.types.is_none() {
+            let dynamic_type = K::DynamicType::default();
+            obj.types = Some(kube::api::TypeMeta {
+                api_version: K::api_version(&dynamic_type).to_string(),
+                kind: K::kind(&dynamic_type).to_string(),
+            });
+        }
+
+        self.resources.push(obj);
+    }
+
+    /// Apply all resources in this bundle to the cluster.
+    /// Resources are applied in the order they were added.
+    ///
+    /// # Errors
+    ///
+    /// Stops at, and returns, the first error reported by `apply_dynamic`;
+    /// resources applied before that point are left in place.
+    pub async fn apply(&self, client: &K8sClient) -> Result<(), Error> {
+        for res in &self.resources {
+            let namespace = res.namespace();
+            client
+                .apply_dynamic(res, namespace.as_deref(), true)
+                .await?;
+        }
+        Ok(())
+    }
+
+    /// Delete all resources in this bundle from the cluster.
+    /// Resources are deleted in reverse order to respect dependencies.
+    ///
+    /// Deletion is best effort: a failed delete never aborts the loop. A 404
+    /// (already gone) is ignored silently, any other failure is logged as a
+    /// warning so it is no longer invisible.
+    ///
+    /// # Errors
+    ///
+    /// Only fails when the API handle for a resource cannot be resolved.
+    pub async fn delete(&self, client: &K8sClient) -> Result<(), Error> {
+        // FIXME delete all in parallel and retry using kube::client::retry::RetryPolicy
+        for res in self.resources.iter().rev() {
+            let api = client.get_api_for_dynamic_object(res, res.namespace().as_deref())?;
+            let name = res.name_any();
+            // FIXME non-404 errors are only logged, not retried. A normal error case is
+            // when we delete a resource bundle with dependencies between various
+            // resources. Such as a pod with a dependency on a ClusterRoleBinding. Trying
+            // to delete the ClusterRoleBinding first is expected to fail
+            match api.delete(&name, &kube::api::DeleteParams::default()).await {
+                Ok(_) => {}
+                // The resource is already absent, which is the desired end state.
+                Err(Error::Api(response)) if response.code == 404 => {}
+                Err(e) => {
+                    log::warn!("Failed to delete bundle resource '{}': {}", name, e);
+                }
+            }
+        }
+        Ok(())
+    }
+}
--- a/harmony-k8s/src/client.rs
+++ b/harmony-k8s/src/client.rs
@@ -0,0 +1,99 @@
+use std::sync::Arc;
+
+use kube::config::{KubeConfigOptions, Kubeconfig};
+use kube::{Client, Config, Discovery, Error};
+use log::error;
+use serde::Serialize;
+use tokio::sync::OnceCell;
+
+use crate::types::KubernetesDistribution;
+
+// TODO not cool, should use a proper configuration mechanism
+// cli arg, env var, config file
+/// Reports whether dry-run mode is requested through the `DRY_RUN` environment variable.
+///
+/// Only the exact values `true` and `1` enable it; an unset, non-unicode or
+/// differently spelled value yields `false`.
+fn read_dry_run_from_env() -> bool {
+    matches!(std::env::var("DRY_RUN").as_deref(), Ok("true" | "1"))
+}
+
+/// Wrapper around a `kube` [`Client`] that adds a dry-run flag and lazily
+/// filled caches for cluster discovery data.
+///
+/// Cloning shares both caches, since they sit behind `Arc`s.
+#[derive(Clone)]
+pub struct K8sClient {
+    /// Underlying `kube` client used for every API call.
+    pub(crate) client: Client,
+    /// When `true` no mutation is sent to the API server; diffs are printed
+    /// to stdout instead. Initialised from the `DRY_RUN` environment variable.
+    pub(crate) dry_run: bool,
+    /// Detected Kubernetes distribution, filled on first use.
+    pub(crate) k8s_distribution: Arc<OnceCell<KubernetesDistribution>>,
+    /// Result of Kubernetes API discovery, filled on first use.
+    pub(crate) discovery: Arc<OnceCell<Discovery>>,
+}
+
+impl Serialize for K8sClient {
+    /// Always fails: a live cluster client has no meaningful serialized form.
+    ///
+    /// The failure is reported through the serializer's own error type rather
+    /// than by panicking, so a caller that serializes a structure containing a
+    /// `K8sClient` receives an `Err` it can handle.
+    fn serialize<S>(&self, _serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        Err(<S::Error as serde::ser::Error>::custom(
+            "K8sClient serialization is not meaningful; remove this impl if unused",
+        ))
+    }
+}
+
+impl std::fmt::Debug for K8sClient {
+    /// Prints only the client's default namespace and the dry-run flag.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let namespace = self.client.default_namespace();
+        write!(
+            f,
+            "K8sClient {{ namespace: {}, dry_run: {} }}",
+            namespace, self.dry_run,
+        )
+    }
+}
+
+impl K8sClient {
+    /// Create a client, reading `DRY_RUN` from the environment.
+    pub fn new(client: Client) -> Self {
+        Self {
+            dry_run: read_dry_run_from_env(),
+            client,
+            k8s_distribution: Arc::new(OnceCell::new()),
+            discovery: Arc::new(OnceCell::new()),
+        }
+    }
+
+    /// Create a client that always operates in dry-run mode, regardless of
+    /// the environment variable.
+    pub fn new_dry_run(client: Client) -> Self {
+        Self {
+            dry_run: true,
+            ..Self::new(client)
+        }
+    }
+
+    /// Returns `true` if this client is operating in dry-run mode.
+    pub fn is_dry_run(&self) -> bool {
+        self.dry_run
+    }
+
+    /// Builds a client from the configuration `kube` infers by default.
+    ///
+    /// # Errors
+    ///
+    /// Returns the error reported by `Client::try_default`.
+    pub async fn try_default() -> Result<Self, Error> {
+        Ok(Self::new(Client::try_default().await?))
+    }
+
+    /// Builds a client from the kubeconfig file at `path` using default options.
+    ///
+    /// Returns `None` (after logging the cause) when the client cannot be built.
+    pub async fn from_kubeconfig(path: &str) -> Option<Self> {
+        Self::from_kubeconfig_with_opts(path, &KubeConfigOptions::default()).await
+    }
+
+    /// Builds a client from the kubeconfig file at `path`, selecting `context`
+    /// when one is given.
+    ///
+    /// Returns `None` (after logging the cause) when the client cannot be built.
+    pub async fn from_kubeconfig_with_context(path: &str, context: Option<String>) -> Option<Self> {
+        let opts = KubeConfigOptions {
+            context,
+            ..Default::default()
+        };
+        Self::from_kubeconfig_with_opts(path, &opts).await
+    }
+
+    /// Builds a client from the kubeconfig file at `path` with explicit options.
+    ///
+    /// Every failure is logged and reported as `None`: an unreadable file, a
+    /// kubeconfig that cannot be turned into a client configuration, or a
+    /// configuration the client cannot be constructed from. None of them panics.
+    pub async fn from_kubeconfig_with_opts(path: &str, opts: &KubeConfigOptions) -> Option<Self> {
+        let k = match Kubeconfig::read_from(path) {
+            Ok(k) => k,
+            Err(e) => {
+                error!("Failed to load kubeconfig from {path}: {e}");
+                return None;
+            }
+        };
+        let config = match Config::from_custom_kubeconfig(k, opts).await {
+            Ok(config) => config,
+            Err(e) => {
+                error!("Failed to build client configuration from kubeconfig {path}: {e}");
+                return None;
+            }
+        };
+        match Client::try_from(config) {
+            Ok(client) => Some(Self::new(client)),
+            Err(e) => {
+                error!("Failed to create Kubernetes client from kubeconfig {path}: {e}");
+                None
+            }
+        }
+    }
+}
--- a/harmony-k8s/src/config.rs
+++ b/harmony-k8s/src/config.rs
@@ -0,0 +1 @@
+/// Container image used for the container of the privileged pods built by
+/// `helper::build_privileged_pod`.
+// NOTE(review): the `latest` tag is not pinned — confirm this is intentional.
+pub const PRIVILEGED_POD_IMAGE: &str = "hub.nationtech.io/redhat/ubi10:latest";
--- a/harmony-k8s/src/discovery.rs
+++ b/harmony-k8s/src/discovery.rs
@@ -0,0 +1,83 @@
+use std::time::Duration;
+
+use kube::{Discovery, Error};
+use log::{debug, error, info, trace, warn};
+use tokio::sync::Mutex;
+use tokio_retry::{Retry, strategy::ExponentialBackoff};
+
+use crate::client::K8sClient;
+use crate::types::KubernetesDistribution;
+
+impl K8sClient {
+    /// Fetches the version information reported by the API server.
+    pub async fn get_apiserver_version(
+        &self,
+    ) -> Result<k8s_openapi::apimachinery::pkg::version::Info, Error> {
+        self.client.apiserver_version().await
+    }
+
+    /// Runs (and caches) Kubernetes API discovery with exponential-backoff retries.
+    ///
+    /// After the first successful run the cached `Discovery` is returned
+    /// without contacting the API server again.
+    ///
+    /// # Errors
+    ///
+    /// Returns the last discovery error once the initial attempt and all six
+    /// retries have failed.
+    pub async fn discovery(&self) -> Result<&Discovery, Error> {
+        // tokio-retry computes the n-th delay as `base^n * factor` milliseconds.
+        // Base 2 with factor 500 gives 1s, 2s, 4s, 8s, 16s, 32s. A base of 1000
+        // would jump from 1s straight to the 32s cap on the second retry.
+        let retry_strategy = ExponentialBackoff::from_millis(2)
+            .factor(500)
+            .max_delay(Duration::from_secs(32))
+            .take(6);
+
+        // Attempt counter, used for log messages only.
+        let attempt = Mutex::new(0u32);
+        Retry::spawn(retry_strategy, || async {
+            let mut n = attempt.lock().await;
+            *n += 1;
+            self.discovery
+                .get_or_try_init(async || {
+                    debug!("Running Kubernetes API discovery (attempt {})", *n);
+                    let d = Discovery::new(self.client.clone()).run().await?;
+                    debug!("Kubernetes API discovery completed");
+                    Ok(d)
+                })
+                .await
+                .inspect_err(|e| {
+                    warn!("Kubernetes API discovery failed (attempt {}): {}", *n, e);
+                })
+        })
+        .await
+        .inspect_err(|e| {
+            error!("Kubernetes API discovery failed after all retries: {}", e);
+        })
+    }
+
+    /// Detect which Kubernetes distribution is running. Result is cached for
+    /// the lifetime of the client.
+    ///
+    /// A cluster exposing the `project.openshift.io` API group is reported as
+    /// `OpenshiftFamily`; otherwise a server `git_version` containing `k3s`
+    /// yields `K3sFamily`; anything else is `Default`.
+    ///
+    /// # Errors
+    ///
+    /// Returns the error from listing the API groups or, when that call is
+    /// needed, from querying the API server version. Nothing is cached on failure.
+    pub async fn get_k8s_distribution(&self) -> Result<KubernetesDistribution, Error> {
+        self.k8s_distribution
+            .get_or_try_init(async || {
+                debug!("Detecting Kubernetes distribution");
+                let api_groups = self.client.list_api_groups().await?;
+                trace!("list_api_groups: {:?}", api_groups);
+
+                if api_groups
+                    .groups
+                    .iter()
+                    .any(|g| g.name == "project.openshift.io")
+                {
+                    info!("Detected distribution: OpenshiftFamily");
+                    return Ok(KubernetesDistribution::OpenshiftFamily);
+                }
+
+                // The server version is only needed for the k3s check, so it is
+                // requested after the OpenShift check has been ruled out.
+                let version = self.get_apiserver_version().await?;
+                if version.git_version.contains("k3s") {
+                    info!("Detected distribution: K3sFamily");
+                    return Ok(KubernetesDistribution::K3sFamily);
+                }
+
+                info!("Distribution not identified, using Default");
+                Ok(KubernetesDistribution::Default)
+            })
+            .await
+            .cloned()
+    }
+}
--- a/harmony-k8s/src/helper.rs
+++ b/harmony-k8s/src/helper.rs
@@ -0,0 +1,613 @@
+use std::collections::BTreeMap;
+use std::time::Duration;
+
+use crate::KubernetesDistribution;
+
+use super::bundle::ResourceBundle;
+use super::config::PRIVILEGED_POD_IMAGE;
+use k8s_openapi::api::core::v1::{
+    Container, HostPathVolumeSource, Pod, PodSpec, SecurityContext, Volume, VolumeMount,
+};
+use k8s_openapi::api::rbac::v1::{ClusterRoleBinding, RoleRef, Subject};
+use k8s_openapi::apimachinery::pkg::apis::meta::v1::ObjectMeta;
+use kube::api::DynamicObject;
+use kube::error::DiscoveryError;
+use log::{debug, error, info, warn};
+use serde::de::DeserializeOwned;
+
+/// Parameters consumed by `build_privileged_pod` and `build_privileged_bundle`.
+#[derive(Debug)]
+pub struct PrivilegedPodConfig {
+    /// Pod name (`metadata.name`).
+    pub name: String,
+    /// Namespace the pod is created in (`metadata.namespace`).
+    pub namespace: String,
+    /// Value placed in the pod spec's `node_name`.
+    pub node_name: String,
+    /// Name of the pod's single container.
+    pub container_name: String,
+    /// Command of that container.
+    pub command: Vec<String>,
+    /// Volumes added to the pod spec.
+    pub volumes: Vec<Volume>,
+    /// Volume mounts added to the container.
+    pub volume_mounts: Vec<VolumeMount>,
+    /// Value for the pod spec's `host_pid`.
+    pub host_pid: bool,
+    /// Value for the pod spec's `host_network`.
+    pub host_network: bool,
+}
+
+impl Default for PrivilegedPodConfig {
+    /// Defaults: pod `privileged-pod` in namespace `harmony`, container
+    /// `privileged-container`, empty node name, no command, no volumes or
+    /// mounts, and host PID / host network both off.
+    fn default() -> Self {
+        Self {
+            name: String::from("privileged-pod"),
+            namespace: String::from("harmony"),
+            node_name: String::new(),
+            container_name: String::from("privileged-container"),
+            command: Vec::new(),
+            volumes: Vec::new(),
+            volume_mounts: Vec::new(),
+            host_pid: false,
+            host_network: false,
+        }
+    }
+}
+
+/// Builds the `Pod` manifest for a single privileged container described by `config`.
+///
+/// The container runs `PRIVILEGED_POD_IMAGE` with `privileged: true`, and the
+/// pod uses restart policy `Never`. For `KubernetesDistribution::OpenshiftFamily`
+/// the pod also carries the `openshift.io/scc` and `openshift.io/required-scc`
+/// annotations, both set to `privileged`; other distributions get no annotations.
+pub fn build_privileged_pod(
+    config: PrivilegedPodConfig,
+    k8s_distribution: &KubernetesDistribution,
+) -> Pod {
+    let PrivilegedPodConfig {
+        name,
+        namespace,
+        node_name,
+        container_name,
+        command,
+        volumes,
+        volume_mounts,
+        host_pid,
+        host_network,
+    } = config;
+
+    let on_openshift = matches!(k8s_distribution, KubernetesDistribution::OpenshiftFamily);
+    let annotations = on_openshift.then(|| {
+        BTreeMap::from([
+            ("openshift.io/scc".to_string(), "privileged".to_string()),
+            (
+                "openshift.io/required-scc".to_string(),
+                "privileged".to_string(),
+            ),
+        ])
+    });
+
+    let container = Container {
+        name: container_name,
+        image: Some(PRIVILEGED_POD_IMAGE.to_string()),
+        command: Some(command),
+        security_context: Some(SecurityContext {
+            privileged: Some(true),
+            ..Default::default()
+        }),
+        volume_mounts: Some(volume_mounts),
+        ..Default::default()
+    };
+
+    let spec = PodSpec {
+        node_name: Some(node_name),
+        restart_policy: Some("Never".to_string()),
+        host_pid: Some(host_pid),
+        host_network: Some(host_network),
+        containers: vec![container],
+        volumes: Some(volumes),
+        ..Default::default()
+    };
+
+    let metadata = ObjectMeta {
+        name: Some(name),
+        namespace: Some(namespace),
+        annotations,
+        ..Default::default()
+    };
+
+    Pod {
+        metadata,
+        spec: Some(spec),
+        ..Default::default()
+    }
+}
+
+/// Returns a hostPath volume for `/` together with a mount placing it at `/host`.
+///
+/// Both halves share the name `host`, so they can be passed directly as
+/// `volumes` / `volume_mounts` of a `PrivilegedPodConfig`.
+pub fn host_root_volume() -> (Volume, VolumeMount) {
+    let host_path = HostPathVolumeSource {
+        path: "/".to_string(),
+        ..Default::default()
+    };
+    let volume = Volume {
+        name: "host".to_string(),
+        host_path: Some(host_path),
+        ..Default::default()
+    };
+    let mount = VolumeMount {
+        name: "host".to_string(),
+        mount_path: "/host".to_string(),
+        ..Default::default()
+    };
+    (volume, mount)
+}
+
+/// Assemble a `ResourceBundle` holding a privileged pod plus the RBAC it needs.
+///
+/// # Platform-Specific Behavior
+///
+/// - **OpenShift** (`KubernetesDistribution::OpenshiftFamily`): a
+///   ClusterRoleBinding named `<pod name>-scc-binding` is added first. It binds
+///   the `default` ServiceAccount of the pod's namespace to the
+///   `system:openshift:scc:privileged` ClusterRole.
+/// - **Every other distribution**: only the Pod is added.
+///
+/// # Arguments
+///
+/// * `config` - Configuration for the privileged pod (name, namespace, command, etc.)
+/// * `k8s_distribution` - The distribution that decides whether the binding is needed
+///
+/// # Returns
+///
+/// A `ResourceBundle` with one or two resources, in this order:
+/// - ClusterRoleBinding (OpenShift only)
+/// - Pod (all distributions)
+///
+/// # Example
+///
+/// ```
+/// use harmony_k8s::helper::{build_privileged_bundle, PrivilegedPodConfig};
+/// use harmony_k8s::KubernetesDistribution;
+/// let bundle = build_privileged_bundle(
+///     PrivilegedPodConfig {
+///         name: "network-setup".to_string(),
+///         namespace: "default".to_string(),
+///         node_name: "worker-01".to_string(),
+///         container_name: "setup".to_string(),
+///         command: vec!["nmcli".to_string(), "connection".to_string(), "reload".to_string()],
+///         ..Default::default()
+///     },
+///     &KubernetesDistribution::OpenshiftFamily,
+/// );
+/// // Bundle now contains ClusterRoleBinding + Pod
+/// ```
+pub fn build_privileged_bundle(
+    config: PrivilegedPodConfig,
+    k8s_distribution: &KubernetesDistribution,
+) -> ResourceBundle {
+    debug!(
+        "Building privileged bundle for config {config:#?} on distribution {k8s_distribution:?}"
+    );
+    let mut bundle = ResourceBundle::new();
+
+    // OpenShift only: the default ServiceAccount must be bound to the
+    // privileged SCC through the system:openshift:scc:privileged ClusterRole.
+    if matches!(k8s_distribution, KubernetesDistribution::OpenshiftFamily) {
+        let service_account = Subject {
+            kind: "ServiceAccount".to_string(),
+            name: "default".to_string(),
+            namespace: Some(config.namespace.clone()),
+            ..Default::default()
+        };
+        let role_ref = RoleRef {
+            api_group: "rbac.authorization.k8s.io".to_string(),
+            kind: "ClusterRole".to_string(),
+            name: "system:openshift:scc:privileged".to_string(),
+        };
+        bundle.add(ClusterRoleBinding {
+            metadata: ObjectMeta {
+                name: Some(format!("{}-scc-binding", config.name)),
+                ..Default::default()
+            },
+            role_ref,
+            subjects: Some(vec![service_account]),
+        });
+    }
+
+    // The pod always comes last, after any binding it depends on.
+    bundle.add(build_privileged_pod(config, k8s_distribution));
+
+    bundle
+}
+
+/// Action to take when a drain operation times out.
+///
+/// Derives `Debug`, `Clone`, `Copy`, `PartialEq` and `Eq` so callers can log,
+/// copy and compare the chosen action.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum DrainTimeoutAction {
+    /// Accept the partial drain and continue
+    Accept,
+    /// Retry the drain for another timeout period
+    Retry,
+    /// Abort the drain operation
+    Abort,
+}
+
+/// Interactively asks the operator how to proceed after a node drain timed out.
+///
+/// A menu offers three choices. Choosing "Accept" additionally requires typing
+/// the exact confirmation string `yes-accept-drain:<node_name>=<pending_count>`;
+/// a mismatch logs a warning and shows the menu again.
+///
+/// # Returns
+///
+/// - `Ok(DrainTimeoutAction::Accept)` once the confirmation string was typed correctly.
+/// - `Ok(DrainTimeoutAction::Retry)` when the operator chose to retry.
+/// - `Ok(DrainTimeoutAction::Abort)` when the operator chose to abort, or when
+///   the confirmation prompt was cancelled or failed.
+///
+/// # Errors
+///
+/// Returns a `kube::Error::Discovery` carrying the prompt failure message when
+/// the selection menu itself fails.
+pub fn prompt_drain_timeout_action(
+    node_name: &str,
+    pending_count: usize,
+    timeout_duration: Duration,
+) -> Result<DrainTimeoutAction, kube::Error> {
+    let prompt_msg = format!(
+        "Drain operation timed out on node '{}' with {} pod(s) remaining. What would you like to do?",
+        node_name, pending_count
+    );
+
+    // These strings do not change between passes of the loop, so build them once.
+    let required_confirmation = format!("yes-accept-drain:{}={}", node_name, pending_count);
+    let confirmation_prompt = format!(
+        "To accept this partial drain, type exactly: {}",
+        required_confirmation
+    );
+    let confirmation_help = format!(
+        "This action acknowledges {} pods will remain on the node",
+        pending_count
+    );
+
+    loop {
+        // `Select::new` takes the choices by value, so they are rebuilt per pass.
+        let choices = vec![
+            "Accept drain failure (requires confirmation)".to_string(),
+            format!("Retry drain for another {:?}", timeout_duration),
+            "Abort operation".to_string(),
+        ];
+
+        let selection = inquire::Select::new(&prompt_msg, choices)
+            .with_help_message("Use arrow keys to navigate, Enter to select")
+            .prompt()
+            .map_err(|e| {
+                kube::Error::Discovery(DiscoveryError::MissingResource(format!(
+                    "Prompt failed: {}",
+                    e
+                )))
+            })?;
+
+        if selection.starts_with("Accept") {
+            // Require typed confirmation - retry until correct or user cancels
+            match inquire::Text::new(&confirmation_prompt)
+                .with_help_message(&confirmation_help)
+                .prompt()
+            {
+                Ok(input) if input == required_confirmation => {
+                    warn!(
+                        "User accepted partial drain of node '{}' with {} pods remaining (confirmation: {})",
+                        node_name, pending_count, required_confirmation
+                    );
+                    return Ok(DrainTimeoutAction::Accept);
+                }
+                Ok(input) => {
+                    // Wrong text: fall through to the next pass and show the menu again.
+                    warn!(
+                        "Confirmation failed. Expected '{}', got '{}'. Please try again.",
+                        required_confirmation, input
+                    );
+                }
+                Err(e) => {
+                    // User cancelled (Ctrl+C) or prompt system failed
+                    error!("Confirmation prompt cancelled or failed: {}", e);
+                    return Ok(DrainTimeoutAction::Abort);
+                }
+            }
+        } else if selection.starts_with("Retry") {
+            info!(
+                "User chose to retry drain operation for another {:?}",
+                timeout_duration
+            );
+            return Ok(DrainTimeoutAction::Retry);
+        } else {
+            error!("Drain operation aborted by user");
+            return Ok(DrainTimeoutAction::Abort);
+        }
+    }
+}
+
+/// Converts a `DynamicObject` into the typed resource `K` through a JSON round-trip.
+///
+/// Intended for objects the apiserver produced from a payload that was
+/// originally serialized from `K`, so both sides share one schema.
+///
+/// # Errors
+///
+/// A failure of either serde step is returned as `kube::Error::SerdeError`.
+pub(crate) fn dyn_to_typed<K: DeserializeOwned>(obj: DynamicObject) -> Result<K, kube::Error> {
+    let value = serde_json::to_value(obj).map_err(kube::Error::SerdeError)?;
+    serde_json::from_value(value).map_err(kube::Error::SerdeError)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use pretty_assertions::assert_eq;
+
+    /// `host_root_volume` must pair a hostPath volume for `/` with a mount at
+    /// `/host`, both named `host`.
+    #[test]
+    fn test_host_root_volume() {
+        let (vol, vol_mount) = host_root_volume();
+
+        assert_eq!(vol.name, "host");
+        let host_path = vol.host_path.as_ref().unwrap();
+        assert_eq!(host_path.path, "/");
+
+        assert_eq!(vol_mount.name, "host");
+        assert_eq!(vol_mount.mount_path, "/host");
+    }
+
+    /// A config that sets only identity and command must yield a pod with the
+    /// expected metadata, one privileged container and host access disabled.
+    #[test]
+    fn test_build_privileged_pod_minimal() {
+        let config = PrivilegedPodConfig {
+            name: "minimal-pod".to_string(),
+            namespace: "kube-system".to_string(),
+            node_name: "node-123".to_string(),
+            container_name: "debug-container".to_string(),
+            command: vec!["sleep".to_string(), "3600".to_string()],
+            ..Default::default()
+        };
+        let pod = build_privileged_pod(config, &KubernetesDistribution::Default);
+
+        // Metadata
+        assert_eq!(pod.metadata.name, Some("minimal-pod".to_string()));
+        assert_eq!(pod.metadata.namespace, Some("kube-system".to_string()));
+
+        // Pod-level spec
+        let spec = pod.spec.as_ref().expect("Pod spec should be present");
+        assert_eq!(spec.node_name, Some("node-123".to_string()));
+        assert_eq!(spec.restart_policy, Some("Never".to_string()));
+        assert_eq!(spec.host_pid, Some(false));
+        assert_eq!(spec.host_network, Some(false));
+
+        // Container
+        assert_eq!(spec.containers.len(), 1);
+        let container = &spec.containers[0];
+        assert_eq!(container.name, "debug-container");
+        assert_eq!(container.image, Some(PRIVILEGED_POD_IMAGE.to_string()));
+        let expected_command = vec!["sleep".to_string(), "3600".to_string()];
+        assert_eq!(container.command, Some(expected_command));
+
+        // Security context check
+        let sec_ctx = container
+            .security_context
+            .as_ref()
+            .expect("Security context missing");
+        assert_eq!(sec_ctx.privileged, Some(true));
+    }
+
+    /// Volumes, mounts and the host PID / host network flags from the config
+    /// must all reach the generated pod.
+    #[test]
+    fn test_build_privileged_pod_with_volumes_and_host_access() {
+        let (host_vol, host_mount) = host_root_volume();
+        let config = PrivilegedPodConfig {
+            name: "full-pod".to_string(),
+            namespace: "default".to_string(),
+            node_name: "node-1".to_string(),
+            container_name: "runner".to_string(),
+            command: vec!["/bin/sh".to_string()],
+            volumes: vec![host_vol.clone()],
+            volume_mounts: vec![host_mount.clone()],
+            host_pid: true,
+            host_network: true,
+        };
+
+        let pod = build_privileged_pod(config, &KubernetesDistribution::Default);
+
+        let spec = pod.spec.as_ref().expect("Pod spec should be present");
+        assert_eq!(spec.host_pid, Some(true));
+        assert_eq!(spec.host_network, Some(true));
+
+        // Check volumes in Spec
+        let volumes = spec.volumes.as_ref().expect("Volumes should be present");
+        assert_eq!(volumes.len(), 1);
+        assert_eq!(volumes[0].name, "host");
+
+        // Check mounts in Container
+        let mounts = spec.containers[0]
+            .volume_mounts
+            .as_ref()
+            .expect("Mounts should be present");
+        assert_eq!(mounts.len(), 1);
+        assert_eq!(mounts[0].name, "host");
+        assert_eq!(mounts[0].mount_path, "/host");
+    }
+
+    /// Checks the "template": every piece of the config must land at the
+    /// right depth of the generated pod.
+    #[test]
+    fn test_build_privileged_pod_structure_correctness() {
+        let custom_vol = Volume {
+            name: "custom-vol".to_string(),
+            ..Default::default()
+        };
+        let custom_mount = VolumeMount {
+            name: "custom-vol".to_string(),
+            mount_path: "/custom".to_string(),
+            ..Default::default()
+        };
+        let config = PrivilegedPodConfig {
+            name: "structure-test".to_string(),
+            namespace: "test-ns".to_string(),
+            node_name: "test-node".to_string(),
+            container_name: "test-container".to_string(),
+            command: vec!["cmd".to_string()],
+            volumes: vec![custom_vol],
+            volume_mounts: vec![custom_mount],
+            ..Default::default()
+        };
+
+        let pod = build_privileged_pod(config, &KubernetesDistribution::Default);
+
+        // Validate structure depth
+        let spec = pod.spec.as_ref().unwrap();
+
+        // 1. Spec level fields
+        assert!(spec.node_name.is_some());
+        assert!(spec.volumes.is_some());
+
+        // 2. Container level fields
+        let container = &spec.containers[0];
+        assert!(container.security_context.is_some());
+        assert!(container.volume_mounts.is_some());
+
+        // 3. Nested fields
+        let security_context = container.security_context.as_ref().unwrap();
+        assert!(security_context.privileged.unwrap());
+        let first_volume = &spec.volumes.as_ref().unwrap()[0];
+        assert_eq!(first_volume.name, "custom-vol");
+        let first_mount = &container.volume_mounts.as_ref().unwrap()[0];
+        assert_eq!(first_mount.mount_path, "/custom");
+    }
+
+    /// On the `Default` distribution the bundle holds nothing but the Pod.
+    #[test]
+    fn test_build_privileged_bundle_default_distribution() {
+        let config = PrivilegedPodConfig {
+            name: "test-bundle".to_string(),
+            namespace: "test-ns".to_string(),
+            node_name: "node-1".to_string(),
+            container_name: "test-container".to_string(),
+            command: vec!["echo".to_string(), "hello".to_string()],
+            ..Default::default()
+        };
+
+        let bundle = build_privileged_bundle(config, &KubernetesDistribution::Default);
+
+        // For Default distribution, only the Pod should be in the bundle
+        assert_eq!(bundle.resources.len(), 1);
+
+        let pod_meta = &bundle.resources[0].metadata;
+        assert_eq!(pod_meta.name.as_deref(), Some("test-bundle"));
+        assert_eq!(pod_meta.namespace.as_deref(), Some("test-ns"));
+    }
+
+    /// On OpenShift the bundle must contain the SCC ClusterRoleBinding first
+    /// and the Pod second.
+    #[test]
+    fn test_build_privileged_bundle_openshift_distribution() {
+        let bundle = build_privileged_bundle(
+            PrivilegedPodConfig {
+                name: "test-bundle-ocp".to_string(),
+                namespace: "test-ns".to_string(),
+                node_name: "node-1".to_string(),
+                container_name: "test-container".to_string(),
+                command: vec!["echo".to_string(), "hello".to_string()],
+                ..Default::default()
+            },
+            &KubernetesDistribution::OpenshiftFamily,
+        );
+
+        // For OpenShift, both ClusterRoleBinding and Pod should be in the bundle
+        assert_eq!(bundle.resources.len(), 2);
+
+        // First resource should be the ClusterRoleBinding
+        let crb_obj = &bundle.resources[0];
+        assert_eq!(
+            crb_obj.metadata.name.as_deref(),
+            Some("test-bundle-ocp-scc-binding")
+        );
+
+        // Verify it's targeting the privileged SCC. A missing roleRef must fail
+        // the test instead of silently skipping the assertion.
+        let role_ref = crb_obj
+            .data
+            .get("roleRef")
+            .expect("ClusterRoleBinding should carry a roleRef");
+        assert_eq!(
+            role_ref.get("name").and_then(|v| v.as_str()),
+            Some("system:openshift:scc:privileged")
+        );
+
+        // Second resource should be the Pod
+        let pod_obj = &bundle.resources[1];
+        assert_eq!(pod_obj.metadata.name.as_deref(), Some("test-bundle-ocp"));
+        assert_eq!(pod_obj.metadata.namespace.as_deref(), Some("test-ns"));
+    }
+
+    /// On K3s no binding is generated: the bundle contains only the Pod.
+    #[test]
+    fn test_build_privileged_bundle_k3s_distribution() {
+        let config = PrivilegedPodConfig {
+            name: "test-bundle-k3s".to_string(),
+            namespace: "test-ns".to_string(),
+            node_name: "node-1".to_string(),
+            container_name: "test-container".to_string(),
+            command: vec!["echo".to_string(), "hello".to_string()],
+            ..Default::default()
+        };
+
+        let bundle = build_privileged_bundle(config, &KubernetesDistribution::K3sFamily);
+
+        // For K3s, only the Pod should be in the bundle (no special SCC)
+        assert_eq!(bundle.resources.len(), 1);
+
+        let pod_meta = &bundle.resources[0].metadata;
+        assert_eq!(pod_meta.name.as_deref(), Some("test-bundle-k3s"));
+    }
+
+    /// Pins the exact YAML produced for a pod on the `Default` distribution.
+    ///
+    /// `volumeMounts: []` and `volumes: []` appear because the builder always
+    /// wraps both lists in `Some`, even when they are empty.
+    #[test]
+    fn test_pod_yaml_rendering_expected() {
+        let pod = build_privileged_pod(
+            PrivilegedPodConfig {
+                name: "pod_name".to_string(),
+                namespace: "pod_namespace".to_string(),
+                node_name: "node name".to_string(),
+                container_name: "container name".to_string(),
+                command: vec!["command".to_string(), "argument".to_string()],
+                host_pid: true,
+                host_network: true,
+                ..Default::default()
+            },
+            &KubernetesDistribution::Default,
+        );
+
+        assert_eq!(
+            &serde_yaml::to_string(&pod).unwrap(),
+            "apiVersion: v1
+kind: Pod
+metadata:
+  name: pod_name
+  namespace: pod_namespace
+spec:
+  containers:
+  - command:
+    - command
+    - argument
+    image: hub.nationtech.io/redhat/ubi10:latest
+    name: container name
+    securityContext:
+      privileged: true
+    volumeMounts: []
+  hostNetwork: true
+  hostPID: true
+  nodeName: node name
+  restartPolicy: Never
+  volumes: []
+"
+        );
+    }
+
+    /// Pins the exact YAML produced for a pod on `OpenshiftFamily`.
+    ///
+    /// Compared with the `Default` rendering, the only difference is the two
+    /// `openshift.io/*` annotations under `metadata`.
+    #[test]
+    fn test_pod_yaml_rendering_openshift() {
+        let pod = build_privileged_pod(
+            PrivilegedPodConfig {
+                name: "pod_name".to_string(),
+                namespace: "pod_namespace".to_string(),
+                node_name: "node name".to_string(),
+                container_name: "container name".to_string(),
+                command: vec!["command".to_string(), "argument".to_string()],
+                host_pid: true,
+                host_network: true,
+                ..Default::default()
+            },
+            &KubernetesDistribution::OpenshiftFamily,
+        );
+
+        assert_eq!(
+            &serde_yaml::to_string(&pod).unwrap(),
+            "apiVersion: v1
+kind: Pod
+metadata:
+  annotations:
+    openshift.io/required-scc: privileged
+    openshift.io/scc: privileged
+  name: pod_name
+  namespace: pod_namespace
+spec:
+  containers:
+  - command:
+    - command
+    - argument
+    image: hub.nationtech.io/redhat/ubi10:latest
+    name: container name
+    securityContext:
+      privileged: true
+    volumeMounts: []
+  hostNetwork: true
+  hostPID: true
+  nodeName: node name
+  restartPolicy: Never
+  volumes: []
+"
+        );
+    }
+}
--- a/harmony-k8s/src/lib.rs
+++ b/harmony-k8s/src/lib.rs
@@ -0,0 +1,13 @@
+//! Library root of the `harmony-k8s` crate: declares the crate's public
+//! modules and re-exports a few types at the crate root.
+
+pub mod apply;
+pub mod bundle;
+pub mod client;
+pub mod config;
+pub mod discovery;
+pub mod helper;
+pub mod node;
+pub mod pod;
+pub mod resources;
+pub mod types;
+
+// Crate-root re-exports, so these names can be imported without spelling out
+// the `client` / `types` module path.
+pub use client::K8sClient;
+pub use types::{DrainOptions, KubernetesDistribution, NodeFile, ScopeResolver, WriteMode};
--- a/harmony-k8s/src/main.rs
+++ b/harmony-k8s/src/main.rs
@@ -0,0 +1,3 @@
+// Binary entry point: prints a fixed greeting and exits.
+// NOTE(review): nothing from the crate's library modules is used here; this
+// looks like scaffold output — confirm whether the binary target is needed.
+fn main() {
+    println!("Hello, world!");
+}
--- a/harmony-k8s/src/node.rs
+++ b/harmony-k8s/src/node.rs
@@ -0,0 +1,722 @@
+use std::collections::BTreeMap;
+use std::time::{Duration, SystemTime, UNIX_EPOCH};
+
+use k8s_openapi::api::core::v1::{
+    ConfigMap, ConfigMapVolumeSource, Node, Pod, Volume, VolumeMount,
+};
+use k8s_openapi::apimachinery::pkg::apis::meta::v1::ObjectMeta;
+use kube::{
+    Error,
+    api::{Api, DeleteParams, EvictParams, ListParams, PostParams},
+    core::ErrorResponse,
+    error::DiscoveryError,
+};
+use log::{debug, error, info, warn};
+use tokio::time::sleep;
+
+use crate::client::K8sClient;
+use crate::helper::{self, PrivilegedPodConfig};
+use crate::types::{DrainOptions, NodeFile};
+
+impl K8sClient {
+    /// Cordons the node named `node_name` through the cluster-wide `Node` API.
+    ///
+    /// The node object returned by the API call is discarded; any API error is
+    /// returned unchanged.
+    pub async fn cordon_node(&self, node_name: &str) -> Result<(), Error> {
+        let nodes: Api<Node> = Api::all(self.client.clone());
+        nodes.cordon(node_name).await.map(|_| ())
+    }
+
+    /// Uncordons the node named `node_name` through the cluster-wide `Node` API.
+    ///
+    /// The node object returned by the API call is discarded; any API error is
+    /// returned unchanged.
+    pub async fn uncordon_node(&self, node_name: &str) -> Result<(), Error> {
+        let nodes: Api<Node> = Api::all(self.client.clone());
+        nodes.uncordon(node_name).await.map(|_| ())
+    }
+
+    /// Waits for `node_name` to report Ready, giving up after 600 seconds.
+    ///
+    /// Thin wrapper around `wait_for_node_ready_with_timeout`.
+    pub async fn wait_for_node_ready(&self, node_name: &str) -> Result<(), Error> {
+        let default_timeout = Duration::from_secs(600);
+        self.wait_for_node_ready_with_timeout(node_name, default_timeout)
+            .await
+    }
+
+    /// Polls the node every 5 seconds until one of its status conditions has
+    /// type `Ready` and status `True`.
+    ///
+    /// A failed poll is logged at debug level and does not end the wait. Once
+    /// more than `timeout` has elapsed since the wait began, an
+    /// `Error::Discovery(DiscoveryError::MissingResource(..))` describing the
+    /// timeout is returned.
+    async fn wait_for_node_ready_with_timeout(
+        &self,
+        node_name: &str,
+        timeout: Duration,
+    ) -> Result<(), Error> {
+        let nodes: Api<Node> = Api::all(self.client.clone());
+        let poll_interval = Duration::from_secs(5);
+        let started = tokio::time::Instant::now();
+
+        while started.elapsed() <= timeout {
+            match nodes.get(node_name).await {
+                Err(e) => debug!("Error polling node '{node_name}': {e}"),
+                Ok(node) => {
+                    // A missing status or missing condition list counts as "not ready".
+                    let ready = node
+                        .status
+                        .iter()
+                        .flat_map(|s| s.conditions.iter().flatten())
+                        .any(|c| c.type_ == "Ready" && c.status == "True");
+                    if ready {
+                        debug!("Node '{node_name}' is Ready");
+                        return Ok(());
+                    }
+                }
+            }
+            sleep(poll_interval).await;
+        }
+
+        Err(Error::Discovery(DiscoveryError::MissingResource(format!(
+            "Node '{node_name}' did not become Ready within {timeout:?}"
+        ))))
+    }
+
+    /// Polls the node every 5 seconds until it no longer has a status condition
+    /// of type `Ready` with status `True`.
+    ///
+    /// A node without a status or without a condition list is treated as not
+    /// ready. A failed poll is logged at debug level and does not end the wait.
+    /// Once more than `timeout` has elapsed since the wait began, an
+    /// `Error::Discovery(DiscoveryError::MissingResource(..))` is returned.
+    async fn wait_for_node_not_ready(
+        &self,
+        node_name: &str,
+        timeout: Duration,
+    ) -> Result<(), Error> {
+        let nodes: Api<Node> = Api::all(self.client.clone());
+        let poll_interval = Duration::from_secs(5);
+        let started = tokio::time::Instant::now();
+
+        while started.elapsed() <= timeout {
+            let observed = nodes.get(node_name).await;
+            match observed {
+                Ok(node) => {
+                    let conditions = node
+                        .status
+                        .as_ref()
+                        .and_then(|s| s.conditions.as_deref())
+                        .unwrap_or(&[]);
+                    let ready = conditions
+                        .iter()
+                        .any(|c| c.type_ == "Ready" && c.status == "True");
+                    if !ready {
+                        debug!("Node '{node_name}' is NotReady");
+                        return Ok(());
+                    }
+                }
+                Err(e) => debug!("Error polling node '{node_name}': {e}"),
+            }
+            sleep(poll_interval).await;
+        }
+
+        Err(Error::Discovery(DiscoveryError::MissingResource(format!(
+            "Node '{node_name}' did not become NotReady within {timeout:?}"
+        ))))
+    }
+
+    /// Lists pods from all namespaces whose `spec.nodeName` field equals
+    /// `node_name`, using a field selector on the list request.
+    async fn list_pods_on_node(&self, node_name: &str) -> Result<Vec<Pod>, Error> {
+        let selector = format!("spec.nodeName={node_name}");
+        let params = ListParams::default().fields(&selector);
+        let pods: Api<Pod> = Api::all(self.client.clone());
+        pods.list(&params).await.map(|list| list.items)
+    }
+
+    /// Returns `true` when the pod carries the `kubernetes.io/config.mirror`
+    /// annotation (the annotation's value is not inspected).
+    fn is_mirror_pod(pod: &Pod) -> bool {
+        match pod.metadata.annotations.as_ref() {
+            Some(annotations) => annotations.contains_key("kubernetes.io/config.mirror"),
+            None => false,
+        }
+    }
+
+    /// Returns `true` when at least one owner reference of the pod has kind
+    /// `DaemonSet`.
+    fn is_daemonset_pod(pod: &Pod) -> bool {
+        pod.metadata
+            .owner_references
+            .iter()
+            .flatten()
+            .any(|owner| owner.kind == "DaemonSet")
+    }
+
+    /// Returns `true` when the pod spec declares at least one volume whose
+    /// `empty_dir` source is set.
+    fn has_emptydir_volume(pod: &Pod) -> bool {
+        pod.spec
+            .iter()
+            .flat_map(|spec| spec.volumes.iter().flatten())
+            .any(|volume| volume.empty_dir.is_some())
+    }
+
+    /// Returns `true` when the pod's status phase is `Succeeded` or `Failed`.
+    /// A pod without a status or phase is not considered completed.
+    fn is_completed_pod(pod: &Pod) -> bool {
+        let phase = pod
+            .status
+            .as_ref()
+            .and_then(|status| status.phase.as_deref());
+        matches!(phase, Some("Succeeded") | Some("Failed"))
+    }
+
+    /// Sorts the pods found on a node into those to evict and those to leave
+    /// alone.
+    ///
+    /// Checks are applied per pod in this order; the first match decides:
+    /// 1. mirror pod → skipped
+    /// 2. phase `Succeeded` / `Failed` → skipped
+    /// 3. DaemonSet-owned → skipped when `options.ignore_daemonsets` is set,
+    ///    blocking otherwise
+    /// 4. has an emptyDir volume while `options.delete_emptydir_data` is unset
+    ///    → blocking
+    /// 5. anything else → evictable (the pod is cloned into the result)
+    ///
+    /// Returns `(evictable pods, descriptions of skipped pods)`. If at least
+    /// one pod is blocking, returns `Err` with a message listing every
+    /// blocking pod instead.
+    fn classify_pods_for_drain(
+        pods: &[Pod],
+        options: &DrainOptions,
+    ) -> Result<(Vec<Pod>, Vec<String>), String> {
+        let mut to_evict: Vec<Pod> = Vec::new();
+        let mut skip_notes: Vec<String> = Vec::new();
+        let mut blockers: Vec<String> = Vec::new();
+
+        for pod in pods {
+            let qualified = format!(
+                "{}/{}",
+                pod.metadata.namespace.as_deref().unwrap_or("<unknown>"),
+                pod.metadata.name.as_deref().unwrap_or("<unknown>")
+            );
+
+            if Self::is_mirror_pod(pod) {
+                skip_notes.push(format!("{qualified} (mirror pod)"));
+            } else if Self::is_completed_pod(pod) {
+                skip_notes.push(format!("{qualified} (completed)"));
+            } else if Self::is_daemonset_pod(pod) {
+                if options.ignore_daemonsets {
+                    skip_notes.push(format!("{qualified} (DaemonSet-managed)"));
+                } else {
+                    blockers.push(format!(
+                        "{qualified} is managed by a DaemonSet (set ignore_daemonsets to skip)"
+                    ));
+                }
+            } else if !options.delete_emptydir_data && Self::has_emptydir_volume(pod) {
+                blockers.push(format!(
+                    "{qualified} uses emptyDir volumes (set delete_emptydir_data to allow eviction)"
+                ));
+            } else {
+                to_evict.push(pod.clone());
+            }
+        }
+
+        if blockers.is_empty() {
+            Ok((to_evict, skip_notes))
+        } else {
+            Err(format!(
+                "Cannot drain node — the following pods block eviction:\n  - {}",
+                blockers.join("\n  - ")
+            ))
+        }
+    }
+
+    /// Requests eviction of `pod` in its own namespace with default
+    /// `EvictParams`. A missing name or namespace is passed on as an empty
+    /// string.
+    async fn evict_pod(&self, pod: &Pod) -> Result<(), Error> {
+        let meta = &pod.metadata;
+        let ns = meta.namespace.as_deref().unwrap_or_default();
+        let name = meta.name.as_deref().unwrap_or_default();
+        debug!("Evicting pod {ns}/{name}");
+
+        let pods: Api<Pod> = Api::namespaced(self.client.clone(), ns);
+        pods.evict(name, &EvictParams::default()).await?;
+        Ok(())
+    }
+
+    /// Drains a node: cordon → classify → evict & wait.
+    ///
+    /// Steps:
+    /// 1. Cordon the node.
+    /// 2. List the pods scheduled on it and classify them with
+    ///    `classify_pods_for_drain`; if any pod blocks the drain, the message
+    ///    is logged and returned as
+    ///    `Error::Discovery(DiscoveryError::MissingResource(..))`.
+    /// 3. Repeatedly request eviction of every pending pod, sleep 5 seconds,
+    ///    and drop from the pending list every pod that is gone.
+    ///
+    /// A pod counts as gone when no pod of that name exists any more, or when
+    /// the pod now found under that name has a different UID. The second case
+    /// covers controllers that recreate pods under the same name, such as
+    /// StatefulSets; comparing by name alone would wait on the replacement
+    /// until the timeout.
+    ///
+    /// Eviction responses with code 404 are ignored and 429 is logged and
+    /// retried on the next round; any other eviction error aborts the drain.
+    /// When `options.timeout` is exceeded, `helper::prompt_drain_timeout_action`
+    /// decides whether to accept the current state, restart the timer, or
+    /// abort with an error.
+    pub async fn drain_node(&self, node_name: &str, options: &DrainOptions) -> Result<(), Error> {
+        debug!("Cordoning '{node_name}'");
+        self.cordon_node(node_name).await?;
+
+        let pods = self.list_pods_on_node(node_name).await?;
+        debug!("Found {} pod(s) on '{node_name}'", pods.len());
+
+        let (evictable, skipped) =
+            Self::classify_pods_for_drain(&pods, options).map_err(|msg| {
+                error!("{msg}");
+                Error::Discovery(DiscoveryError::MissingResource(msg))
+            })?;
+
+        for s in &skipped {
+            info!("Skipping pod: {s}");
+        }
+        if evictable.is_empty() {
+            info!("No pods to evict on '{node_name}'");
+            return Ok(());
+        }
+        info!("Evicting {} pod(s) from '{node_name}'", evictable.len());
+
+        let mut start = tokio::time::Instant::now();
+        let poll = Duration::from_secs(5);
+        let mut pending = evictable;
+
+        loop {
+            // Eviction is re-requested every round for everything still pending,
+            // so requests rejected with 429 are retried.
+            for pod in &pending {
+                match self.evict_pod(pod).await {
+                    Ok(()) => {}
+                    Err(Error::Api(ErrorResponse { code: 404, .. })) => {}
+                    Err(Error::Api(ErrorResponse { code: 429, .. })) => {
+                        warn!(
+                            "PDB blocked eviction of {}/{}; will retry",
+                            pod.metadata.namespace.as_deref().unwrap_or(""),
+                            pod.metadata.name.as_deref().unwrap_or("")
+                        );
+                    }
+                    Err(e) => {
+                        error!(
+                            "Failed to evict {}/{}: {e}",
+                            pod.metadata.namespace.as_deref().unwrap_or(""),
+                            pod.metadata.name.as_deref().unwrap_or("")
+                        );
+                        return Err(e);
+                    }
+                }
+            }
+
+            sleep(poll).await;
+
+            let mut still_present = Vec::new();
+            for pod in pending {
+                let ns = pod.metadata.namespace.as_deref().unwrap_or_default();
+                let name = pod.metadata.name.as_deref().unwrap_or_default();
+                match self.get_pod(name, Some(ns)).await? {
+                    // Same name, different UID: the pod we evicted is gone and a
+                    // controller created a replacement under the same name.
+                    Some(current)
+                        if pod.metadata.uid.is_some()
+                            && current.metadata.uid != pod.metadata.uid =>
+                    {
+                        debug!("Pod {ns}/{name} evicted (replaced by a new pod with the same name)")
+                    }
+                    Some(_) => still_present.push(pod),
+                    None => debug!("Pod {ns}/{name} evicted"),
+                }
+            }
+            pending = still_present;
+
+            if pending.is_empty() {
+                break;
+            }
+
+            if start.elapsed() > options.timeout {
+                match helper::prompt_drain_timeout_action(
+                    node_name,
+                    pending.len(),
+                    options.timeout,
+                )? {
+                    helper::DrainTimeoutAction::Accept => break,
+                    helper::DrainTimeoutAction::Retry => {
+                        // Restart the timeout window and go straight to the next round.
+                        start = tokio::time::Instant::now();
+                        continue;
+                    }
+                    helper::DrainTimeoutAction::Abort => {
+                        return Err(Error::Discovery(DiscoveryError::MissingResource(format!(
+                            "Drain aborted. {} pod(s) remaining on '{node_name}'",
+                            pending.len()
+                        ))));
+                    }
+                }
+            }
+            debug!("Waiting for {} pod(s) on '{node_name}'", pending.len());
+        }
+
+        debug!("'{node_name}' drained successfully");
+        Ok(())
+    }
+
+    /// Safely reboots a node: drain → reboot → wait for Ready → uncordon.
+    ///
+    /// The node's `boot_id` is recorded before and after the reboot and the
+    /// call fails if the two are equal. `timeout` is measured from the moment
+    /// draining has finished, so it covers the shutdown and ready phases but
+    /// not the drain (which is bounded by `drain_options`).
+    ///
+    /// The node is uncordoned only at the very end; on every error path it is
+    /// left in whatever state the failing step produced (cordoned, once
+    /// `drain_node` has run its cordon step).
+    ///
+    /// # Errors
+    /// Returns an error if the node cannot be fetched, has no `boot_id`, the
+    /// drain fails, either wait phase times out, or the `boot_id` is unchanged.
+    pub async fn reboot_node(
+        &self,
+        node_name: &str,
+        drain_options: &DrainOptions,
+        timeout: Duration,
+    ) -> Result<(), Error> {
+        info!("Starting reboot for '{node_name}'");
+        let node_api: Api<Node> = Api::all(self.client.clone());
+
+        // Remember the boot id so the reboot can be verified afterwards.
+        let boot_id_before = node_api
+            .get(node_name)
+            .await?
+            .status
+            .as_ref()
+            .and_then(|s| s.node_info.as_ref())
+            .map(|ni| ni.boot_id.clone())
+            .ok_or_else(|| {
+                Error::Discovery(DiscoveryError::MissingResource(format!(
+                    "Node '{node_name}' has no boot_id in status"
+                )))
+            })?;
+
+        info!("Draining '{node_name}'");
+        self.drain_node(node_name, drain_options).await?;
+
+        // The reboot timeout budget starts here, after the drain.
+        let start = tokio::time::Instant::now();
+
+        info!("Scheduling reboot for '{node_name}'");
+        let reboot_cmd =
+            "echo rebooting ; nohup bash -c 'sleep 5 && nsenter -t 1 -m -- systemctl reboot'";
+        // A failure of the command pod is only logged: the node going down can
+        // itself make the pod fail or become unreachable.
+        match self
+            .run_privileged_command_on_node(node_name, reboot_cmd)
+            .await
+        {
+            Ok(_) => debug!("Reboot command dispatched"),
+            Err(e) => debug!("Reboot command error (expected if node began shutdown): {e}"),
+        }
+
+        // Each wait phase gets whatever is left of the overall budget.
+        info!("Waiting for '{node_name}' to begin shutdown");
+        self.wait_for_node_not_ready(node_name, timeout.saturating_sub(start.elapsed()))
+            .await?;
+
+        if start.elapsed() > timeout {
+            return Err(Error::Discovery(DiscoveryError::MissingResource(format!(
+                "Timeout during reboot of '{node_name}' (shutdown phase)"
+            ))));
+        }
+
+        info!("Waiting for '{node_name}' to come back online");
+        self.wait_for_node_ready_with_timeout(node_name, timeout.saturating_sub(start.elapsed()))
+            .await?;
+
+        if start.elapsed() > timeout {
+            return Err(Error::Discovery(DiscoveryError::MissingResource(format!(
+                "Timeout during reboot of '{node_name}' (ready phase)"
+            ))));
+        }
+
+        let boot_id_after = node_api
+            .get(node_name)
+            .await?
+            .status
+            .as_ref()
+            .and_then(|s| s.node_info.as_ref())
+            .map(|ni| ni.boot_id.clone())
+            .ok_or_else(|| {
+                Error::Discovery(DiscoveryError::MissingResource(format!(
+                    "Node '{node_name}' has no boot_id after reboot"
+                )))
+            })?;
+
+        // An unchanged boot id means the NotReady → Ready transition happened
+        // without the machine actually restarting.
+        if boot_id_before == boot_id_after {
+            return Err(Error::Discovery(DiscoveryError::MissingResource(format!(
+                "Node '{node_name}' did not actually reboot (boot_id unchanged: {boot_id_before})"
+            ))));
+        }
+
+        info!("'{node_name}' rebooted ({boot_id_before} → {boot_id_after})");
+        self.uncordon_node(node_name).await?;
+        info!("'{node_name}' reboot complete ({:?})", start.elapsed());
+        Ok(())
+    }
+
+    /// Write a set of files to a node's filesystem via a privileged ephemeral pod.
+    ///
+    /// Each file's content is stored under key `f{i}` in a temporary ConfigMap
+    /// created in the client's default namespace. A privileged pod pinned to
+    /// `node_name` mounts that ConfigMap at `/payload` together with the volume
+    /// from `helper::host_root_volume`, then runs a `set -e` bash script that,
+    /// per file, creates the parent directory, copies the payload to
+    /// `/host{path}` and applies `mode` with `chmod`.
+    ///
+    /// Returns the output of `wait_for_pod_completion` for the writer pod.
+    /// The pod bundle and the ConfigMap are deleted on a best-effort basis
+    /// whether the write succeeded or not, including when applying the bundle
+    /// itself fails.
+    ///
+    /// NOTE(review): `file.path` is interpolated into the script inside double
+    /// quotes, so paths containing `"`, `$` or backticks would be interpreted
+    /// by the shell — confirm callers only pass trusted, plain paths.
+    pub async fn write_files_to_node(
+        &self,
+        node_name: &str,
+        files: &[NodeFile],
+    ) -> Result<String, Error> {
+        let ns = self.client.default_namespace();
+        let suffix = SystemTime::now()
+            .duration_since(UNIX_EPOCH)
+            .unwrap()
+            .as_millis();
+        let name = format!("harmony-k8s-writer-{suffix}");
+
+        debug!("Writing {} file(s) to '{node_name}'", files.len());
+
+        // Resolve the distribution before creating any cluster object, so a
+        // failure here cannot leave a ConfigMap behind.
+        let distribution = self.get_k8s_distribution().await?;
+
+        let mut data = BTreeMap::new();
+        let mut script = String::from("set -e\n");
+        for (i, file) in files.iter().enumerate() {
+            let key = format!("f{i}");
+            data.insert(key.clone(), file.content.clone());
+            script.push_str(&format!("mkdir -p \"$(dirname \"/host{}\")\"\n", file.path));
+            script.push_str(&format!("cp \"/payload/{key}\" \"/host{}\"\n", file.path));
+            script.push_str(&format!("chmod {:o} \"/host{}\"\n", file.mode, file.path));
+        }
+
+        let cm = ConfigMap {
+            metadata: ObjectMeta {
+                name: Some(name.clone()),
+                namespace: Some(ns.to_string()),
+                ..Default::default()
+            },
+            data: Some(data),
+            ..Default::default()
+        };
+
+        let cm_api: Api<ConfigMap> = Api::namespaced(self.client.clone(), ns);
+        cm_api.create(&PostParams::default(), &cm).await?;
+        debug!("Created ConfigMap '{name}'");
+
+        let (host_vol, host_mount) = helper::host_root_volume();
+        let payload_vol = Volume {
+            name: "payload".to_string(),
+            config_map: Some(ConfigMapVolumeSource {
+                name: name.clone(),
+                ..Default::default()
+            }),
+            ..Default::default()
+        };
+        let payload_mount = VolumeMount {
+            name: "payload".to_string(),
+            mount_path: "/payload".to_string(),
+            ..Default::default()
+        };
+
+        let bundle = helper::build_privileged_bundle(
+            PrivilegedPodConfig {
+                name: name.clone(),
+                namespace: ns.to_string(),
+                node_name: node_name.to_string(),
+                container_name: "writer".to_string(),
+                command: vec!["/bin/bash".to_string(), "-c".to_string(), script],
+                volumes: vec![payload_vol, host_vol],
+                volume_mounts: vec![payload_mount, host_mount],
+                host_pid: false,
+                host_network: false,
+            },
+            &distribution,
+        );
+
+        if let Err(e) = bundle.apply(self).await {
+            // Applying may have created part of the bundle; remove whatever
+            // exists plus the ConfigMap before reporting the failure.
+            debug!("Applying bundle '{name}' failed; cleaning up");
+            let _ = bundle.delete(self).await;
+            let _ = cm_api.delete(&name, &DeleteParams::default()).await;
+            // Same conversion `?` would perform on the apply error.
+            return Err(e.into());
+        }
+        debug!("Created privileged pod bundle '{name}'");
+
+        let result = self.wait_for_pod_completion(&name, ns).await;
+
+        // Cleanup is best-effort: deletion errors must not mask `result`.
+        debug!("Cleaning up '{name}'");
+        let _ = bundle.delete(self).await;
+        let _ = cm_api.delete(&name, &DeleteParams::default()).await;
+
+        result
+    }
+
+    /// Run a privileged command on a node via an ephemeral pod.
+    ///
+    /// Builds a privileged pod bundle pinned to `node_name` in the client's
+    /// default namespace, with host PID and host networking enabled and the
+    /// volume from `helper::host_root_volume` mounted. `command` is run as
+    /// `/bin/bash -c <command>`.
+    ///
+    /// Returns the output of `wait_for_pod_completion` for that pod. The
+    /// bundle is deleted on a best-effort basis afterwards, including when
+    /// applying it fails part-way.
+    pub async fn run_privileged_command_on_node(
+        &self,
+        node_name: &str,
+        command: &str,
+    ) -> Result<String, Error> {
+        let namespace = self.client.default_namespace();
+        let suffix = SystemTime::now()
+            .duration_since(UNIX_EPOCH)
+            .unwrap()
+            .as_millis();
+        let name = format!("harmony-k8s-cmd-{suffix}");
+
+        debug!("Running privileged command on '{node_name}': {command}");
+
+        let (host_vol, host_mount) = helper::host_root_volume();
+        let bundle = helper::build_privileged_bundle(
+            PrivilegedPodConfig {
+                name: name.clone(),
+                namespace: namespace.to_string(),
+                node_name: node_name.to_string(),
+                container_name: "runner".to_string(),
+                command: vec![
+                    "/bin/bash".to_string(),
+                    "-c".to_string(),
+                    command.to_string(),
+                ],
+                volumes: vec![host_vol],
+                volume_mounts: vec![host_mount],
+                host_pid: true,
+                host_network: true,
+            },
+            &self.get_k8s_distribution().await?,
+        );
+
+        if let Err(e) = bundle.apply(self).await {
+            // Applying may have created part of the bundle; remove whatever
+            // exists before reporting the failure.
+            debug!("Applying bundle '{name}' failed; cleaning up");
+            let _ = bundle.delete(self).await;
+            // Same conversion `?` would perform on the apply error.
+            return Err(e.into());
+        }
+        debug!("Privileged pod '{name}' created");
+
+        let result = self.wait_for_pod_completion(&name, namespace).await;
+
+        // Cleanup is best-effort: a deletion error must not mask `result`.
+        debug!("Cleaning up '{name}'");
+        let _ = bundle.delete(self).await;
+
+        result
+    }
+}
+
+// ── Tests ────────────────────────────────────────────────────────────────────
+
+#[cfg(test)]
+mod tests {
+    use k8s_openapi::api::core::v1::{EmptyDirVolumeSource, PodSpec, PodStatus, Volume};
+    use k8s_openapi::apimachinery::pkg::apis::meta::v1::{ObjectMeta, OwnerReference};
+
+    use super::*;
+
+    /// Plain pod fixture: given name and namespace, default spec, phase `Running`.
+    fn base_pod(name: &str, ns: &str) -> Pod {
+        Pod {
+            metadata: ObjectMeta {
+                name: Some(name.to_string()),
+                namespace: Some(ns.to_string()),
+                ..Default::default()
+            },
+            spec: Some(PodSpec::default()),
+            status: Some(PodStatus {
+                phase: Some("Running".to_string()),
+                ..Default::default()
+            }),
+        }
+    }
+
+    /// `base_pod` plus the `kubernetes.io/config.mirror` annotation.
+    fn mirror_pod(name: &str, ns: &str) -> Pod {
+        let mut pod = base_pod(name, ns);
+        pod.metadata.annotations = Some(std::collections::BTreeMap::from([(
+            "kubernetes.io/config.mirror".to_string(),
+            "abc123".to_string(),
+        )]));
+        pod
+    }
+
+    /// `base_pod` with a single owner reference of kind `DaemonSet`.
+    fn daemonset_pod(name: &str, ns: &str) -> Pod {
+        let mut pod = base_pod(name, ns);
+        pod.metadata.owner_references = Some(vec![OwnerReference {
+            api_version: "apps/v1".to_string(),
+            kind: "DaemonSet".to_string(),
+            name: "some-ds".to_string(),
+            uid: "uid-ds".to_string(),
+            ..Default::default()
+        }]);
+        pod
+    }
+
+    /// `base_pod` whose spec declares one emptyDir volume named `scratch`.
+    fn emptydir_pod(name: &str, ns: &str) -> Pod {
+        let mut pod = base_pod(name, ns);
+        pod.spec = Some(PodSpec {
+            volumes: Some(vec![Volume {
+                name: "scratch".to_string(),
+                empty_dir: Some(EmptyDirVolumeSource::default()),
+                ..Default::default()
+            }]),
+            ..Default::default()
+        });
+        pod
+    }
+
+    /// `base_pod` with its status phase replaced by `phase`.
+    fn completed_pod(name: &str, ns: &str, phase: &str) -> Pod {
+        let mut pod = base_pod(name, ns);
+        pod.status = Some(PodStatus {
+            phase: Some(phase.to_string()),
+            ..Default::default()
+        });
+        pod
+    }
+
+    /// `DrainOptions::default()`; tests override single fields with struct-update syntax.
+    fn default_opts() -> DrainOptions {
+        DrainOptions::default()
+    }
+
+    // Every test below exercises `K8sClient::classify_pods_for_drain`.
+
+    #[test]
+    fn empty_pod_list_returns_empty_vecs() {
+        let (e, s) = K8sClient::classify_pods_for_drain(&[], &default_opts()).unwrap();
+        assert!(e.is_empty());
+        assert!(s.is_empty());
+    }
+
+    #[test]
+    fn normal_pod_is_evictable() {
+        let pods = vec![base_pod("web", "default")];
+        let (e, s) = K8sClient::classify_pods_for_drain(&pods, &default_opts()).unwrap();
+        assert_eq!(e.len(), 1);
+        assert!(s.is_empty());
+    }
+
+    #[test]
+    fn mirror_pod_is_skipped() {
+        let pods = vec![mirror_pod("kube-apiserver", "kube-system")];
+        let (e, s) = K8sClient::classify_pods_for_drain(&pods, &default_opts()).unwrap();
+        assert!(e.is_empty());
+        assert!(s[0].contains("mirror pod"));
+    }
+
+    #[test]
+    fn completed_pods_are_skipped() {
+        for phase in ["Succeeded", "Failed"] {
+            let pods = vec![completed_pod("job", "batch", phase)];
+            let (e, s) = K8sClient::classify_pods_for_drain(&pods, &default_opts()).unwrap();
+            assert!(e.is_empty());
+            assert!(s[0].contains("completed"));
+        }
+    }
+
+    #[test]
+    fn daemonset_skipped_when_ignored() {
+        let pods = vec![daemonset_pod("fluentd", "logging")];
+        let opts = DrainOptions {
+            ignore_daemonsets: true,
+            ..default_opts()
+        };
+        let (e, s) = K8sClient::classify_pods_for_drain(&pods, &opts).unwrap();
+        assert!(e.is_empty());
+        assert!(s[0].contains("DaemonSet-managed"));
+    }
+
+    #[test]
+    fn daemonset_blocks_when_not_ignored() {
+        let pods = vec![daemonset_pod("fluentd", "logging")];
+        let opts = DrainOptions {
+            ignore_daemonsets: false,
+            ..default_opts()
+        };
+        let err = K8sClient::classify_pods_for_drain(&pods, &opts).unwrap_err();
+        assert!(err.contains("DaemonSet") && err.contains("logging/fluentd"));
+    }
+
+    #[test]
+    fn emptydir_blocks_without_flag() {
+        let pods = vec![emptydir_pod("cache", "default")];
+        let opts = DrainOptions {
+            delete_emptydir_data: false,
+            ..default_opts()
+        };
+        let err = K8sClient::classify_pods_for_drain(&pods, &opts).unwrap_err();
+        assert!(err.contains("emptyDir") && err.contains("default/cache"));
+    }
+
+    #[test]
+    fn emptydir_evictable_with_flag() {
+        let pods = vec![emptydir_pod("cache", "default")];
+        let opts = DrainOptions {
+            delete_emptydir_data: true,
+            ..default_opts()
+        };
+        let (e, s) = K8sClient::classify_pods_for_drain(&pods, &opts).unwrap();
+        assert_eq!(e.len(), 1);
+        assert!(s.is_empty());
+    }
+
+    #[test]
+    fn multiple_blocking_all_reported() {
+        let pods = vec![daemonset_pod("ds", "ns1"), emptydir_pod("ed", "ns2")];
+        let opts = DrainOptions {
+            ignore_daemonsets: false,
+            delete_emptydir_data: false,
+            ..default_opts()
+        };
+        let err = K8sClient::classify_pods_for_drain(&pods, &opts).unwrap_err();
+        assert!(err.contains("ns1/ds") && err.contains("ns2/ed"));
+    }
+
+    // Relies on the default options neither blocking on the DaemonSet pod nor
+    // on emptyDir volumes: the `unwrap()` below would panic otherwise.
+    #[test]
+    fn mixed_pods_classified_correctly() {
+        let pods = vec![
+            base_pod("web", "default"),
+            mirror_pod("kube-apiserver", "kube-system"),
+            daemonset_pod("fluentd", "logging"),
+            completed_pod("job", "batch", "Succeeded"),
+            base_pod("api", "default"),
+        ];
+        let (e, s) = K8sClient::classify_pods_for_drain(&pods, &default_opts()).unwrap();
+        let names: Vec<&str> = e
+            .iter()
+            .map(|p| p.metadata.name.as_deref().unwrap())
+            .collect();
+        assert_eq!(names, vec!["web", "api"]);
+        assert_eq!(s.len(), 3);
+    }
+
+    // The two tests below pin the order in which the classification checks run.
+
+    #[test]
+    fn mirror_checked_before_completed() {
+        let mut pod = mirror_pod("static-etcd", "kube-system");
+        pod.status = Some(PodStatus {
+            phase: Some("Succeeded".to_string()),
+            ..Default::default()
+        });
+        let (_, s) = K8sClient::classify_pods_for_drain(&[pod], &default_opts()).unwrap();
+        assert!(s[0].contains("mirror pod"), "got: {}", s[0]);
+    }
+
+    #[test]
+    fn completed_checked_before_daemonset() {
+        let mut pod = daemonset_pod("collector", "monitoring");
+        pod.status = Some(PodStatus {
+            phase: Some("Failed".to_string()),
+            ..Default::default()
+        });
+        let (_, s) = K8sClient::classify_pods_for_drain(&[pod], &default_opts()).unwrap();
+        assert!(s[0].contains("completed"), "got: {}", s[0]);
+    }
+}
--- a/harmony-k8s/src/pod.rs
+++ b/harmony-k8s/src/pod.rs
@@ -0,0 +1,193 @@
+use std::time::Duration;
+
+use k8s_openapi::api::core::v1::Pod;
+use kube::{
+    Error,
+    api::{Api, AttachParams, ListParams},
+    error::DiscoveryError,
+    runtime::reflector::Lookup,
+};
+use log::debug;
+use tokio::io::AsyncReadExt;
+use tokio::time::sleep;
+
+use crate::client::K8sClient;
+
+impl K8sClient {
+    /// Looks up the pod called `name`, in `namespace` when one is given and in
+    /// the client's default namespace otherwise.
+    ///
+    /// Returns the result of `Api::get_opt` unchanged.
+    pub async fn get_pod(&self, name: &str, namespace: Option<&str>) -> Result<Option<Pod>, Error> {
+        let client = self.client.clone();
+        let pods: Api<Pod> = if let Some(ns) = namespace {
+            Api::namespaced(client, ns)
+        } else {
+            Api::default_namespaced(client)
+        };
+        pods.get_opt(name).await
+    }
+
+    /// Polls a pod every 5 seconds until its status phase is `running`
+    /// (compared case-insensitively).
+    ///
+    /// The timeout is tracked by adding up the sleep intervals only: once 120
+    /// seconds of sleeping have accumulated and the pod is still not running,
+    /// an `Error::Discovery(DiscoveryError::MissingResource(..))` is returned.
+    /// Errors from fetching the pod are returned immediately.
+    pub async fn wait_for_pod_ready(
+        &self,
+        pod_name: &str,
+        namespace: Option<&str>,
+    ) -> Result<(), Error> {
+        let poll_every = Duration::from_secs(5);
+        let timeout_secs = 120u64;
+        let mut waited_secs = 0u64;
+
+        loop {
+            let phase = self
+                .get_pod(pod_name, namespace)
+                .await?
+                .and_then(|pod| pod.status)
+                .and_then(|status| status.phase);
+            let running = phase
+                .map(|p| p.to_lowercase() == "running")
+                .unwrap_or(false);
+            if running {
+                return Ok(());
+            }
+
+            if waited_secs >= timeout_secs {
+                return Err(Error::Discovery(DiscoveryError::MissingResource(format!(
+                    "Pod '{}' in '{}' did not become ready within {timeout_secs}s",
+                    pod_name,
+                    namespace.unwrap_or("<default>"),
+                ))));
+            }
+
+            sleep(poll_every).await;
+            waited_secs += poll_every.as_secs();
+        }
+    }
+
+    /// Waits for a pod to finish and hands back its logs.
+    ///
+    /// The pod is checked up to 60 times, sleeping 2 seconds before each check.
+    /// On phase `Succeeded` the logs are returned as `Ok`; on phase `Failed`
+    /// they are embedded in the returned error. If the logs cannot be fetched
+    /// an empty string is used instead. Any error from fetching the pod itself
+    /// ends the wait immediately. Used internally by node operations.
+    pub(crate) async fn wait_for_pod_completion(
+        &self,
+        name: &str,
+        namespace: &str,
+    ) -> Result<String, Error> {
+        let pods: Api<Pod> = Api::namespaced(self.client.clone(), namespace);
+        let pause = Duration::from_secs(2);
+
+        for _attempt in 0..60 {
+            sleep(pause).await;
+
+            let phase = pods.get(name).await?.status.and_then(|s| s.phase);
+            let succeeded = match phase.as_deref() {
+                Some("Succeeded") => true,
+                Some("Failed") => false,
+                // Still pending/running (or no phase yet): keep polling.
+                _ => continue,
+            };
+
+            let logs = pods
+                .logs(name, &Default::default())
+                .await
+                .unwrap_or_default();
+            if succeeded {
+                debug!("Pod {namespace}/{name} succeeded. Logs: {logs}");
+                return Ok(logs);
+            }
+            debug!("Pod {namespace}/{name} failed. Logs: {logs}");
+            return Err(Error::Discovery(DiscoveryError::MissingResource(format!(
+                "Pod '{name}' failed.\n{logs}"
+            ))));
+        }
+
+        Err(Error::Discovery(DiscoveryError::MissingResource(format!(
+            "Timed out waiting for pod '{name}'"
+        ))))
+    }
+
+    /// Execute a command in the first pod matching `{label}={name}`.
+    ///
+    /// The pod is looked up in `namespace`, or in the client's default
+    /// namespace when `namespace` is `None`. The command runs with stdout and
+    /// stderr attached; only stdout is captured.
+    ///
+    /// # Errors
+    /// Returns `Err` with a description when listing pods fails, no pod
+    /// matches the selector, the pod has no name, the exec call fails, no exec
+    /// status is reported, stdout cannot be read, or the reported status is
+    /// anything other than `Success` (the status string is the error).
+    /// These conditions are reported as errors rather than panics.
+    pub async fn exec_app_capture_output(
+        &self,
+        name: String,
+        label: String,
+        namespace: Option<&str>,
+        command: Vec<&str>,
+    ) -> Result<String, String> {
+        let api: Api<Pod> = match namespace {
+            Some(ns) => Api::namespaced(self.client.clone(), ns),
+            None => Api::default_namespaced(self.client.clone()),
+        };
+        let selector = format!("{label}={name}");
+        let pod_list = api
+            .list(&ListParams::default().labels(&selector))
+            .await
+            .map_err(|e| format!("Failed to list pods matching '{selector}': {e}"))?;
+
+        let pod_name = pod_list
+            .items
+            .first()
+            .ok_or_else(|| format!("No pod matching '{selector}'"))?
+            .name()
+            .ok_or_else(|| "Pod has no name".to_string())?
+            .into_owned();
+
+        let mut process = api
+            .exec(
+                &pod_name,
+                command,
+                &AttachParams::default().stdout(true).stderr(true),
+            )
+            .await
+            .map_err(|e| e.to_string())?;
+
+        // NOTE(review): the status is awaited before stdout is drained and
+        // stderr is never read — confirm that commands with large output
+        // cannot stall on the exec stream's internal buffers.
+        let status = process
+            .take_status()
+            .ok_or_else(|| "No status handle".to_string())?
+            .await
+            .ok_or_else(|| "Status channel closed".to_string())?;
+
+        let s = status
+            .status
+            .ok_or_else(|| "No inner status from pod exec".to_string())?;
+
+        let mut buf = String::new();
+        if let Some(mut stdout) = process.stdout() {
+            stdout
+                .read_to_string(&mut buf)
+                .await
+                .map_err(|e| format!("Failed to read stdout: {e}"))?;
+        }
+        debug!("exec status: {} - {:?}", s, status.details);
+        if s == "Success" { Ok(buf) } else { Err(s) }
+    }
+
+    /// Execute a command in the first pod matching
+    /// `app.kubernetes.io/name={name}`.
+    ///
+    /// The pod is looked up in `namespace`, or in the client's default
+    /// namespace when `namespace` is `None`. The command runs with default
+    /// `AttachParams` and no output is captured.
+    ///
+    /// # Errors
+    /// Returns `Err` with a description when listing pods fails, no pod
+    /// matches the selector, the pod has no name, the exec call fails, no exec
+    /// status is reported, or the reported status is anything other than
+    /// `Success` (the status string is the error). These conditions are
+    /// reported as errors rather than panics.
+    pub async fn exec_app(
+        &self,
+        name: String,
+        namespace: Option<&str>,
+        command: Vec<&str>,
+    ) -> Result<(), String> {
+        let api: Api<Pod> = match namespace {
+            Some(ns) => Api::namespaced(self.client.clone(), ns),
+            None => Api::default_namespaced(self.client.clone()),
+        };
+        let selector = format!("app.kubernetes.io/name={name}");
+        let pod_list = api
+            .list(&ListParams::default().labels(&selector))
+            .await
+            .map_err(|e| format!("Failed to list pods matching '{selector}': {e}"))?;
+
+        let pod_name = pod_list
+            .items
+            .first()
+            .ok_or_else(|| format!("No pod matching '{selector}'"))?
+            .name()
+            .ok_or_else(|| "Pod has no name".to_string())?
+            .into_owned();
+
+        let mut process = api
+            .exec(&pod_name, command, &AttachParams::default())
+            .await
+            .map_err(|e| e.to_string())?;
+
+        let status = process
+            .take_status()
+            .ok_or_else(|| "No status handle".to_string())?
+            .await
+            .ok_or_else(|| "Status channel closed".to_string())?;
+
+        let s = status
+            .status
+            .ok_or_else(|| "No inner status from pod exec".to_string())?;
+        debug!("exec status: {} - {:?}", s, status.details);
+        if s == "Success" { Ok(()) } else { Err(s) }
+    }
+}
--- a/harmony-k8s/src/resources.rs
+++ b/harmony-k8s/src/resources.rs
@@ -0,0 +1,316 @@
+use std::collections::HashMap;
+
+use k8s_openapi::api::{
+    apps::v1::Deployment,
+    core::v1::{Node, ServiceAccount},
+};
+use k8s_openapi::apiextensions_apiserver::pkg::apis::apiextensions::v1::CustomResourceDefinition;
+use kube::api::ApiResource;
+use kube::{
+    Error, Resource,
+    api::{Api, DynamicObject, GroupVersionKind, ListParams, ObjectList},
+    runtime::conditions,
+    runtime::wait::await_condition,
+};
+use log::debug;
+use serde::de::DeserializeOwned;
+use serde_json::Value;
+use std::time::Duration;
+
+use crate::client::K8sClient;
+use crate::types::ScopeResolver;
+
+impl K8sClient {
+    /// Reports whether `namespace` contains at least one healthy Deployment
+    /// selected by `label_selector`.
+    ///
+    /// A Deployment counts as healthy when its status reports more than zero
+    /// available replicas, or carries an `Available` condition whose status
+    /// is `True`. Listing errors are propagated.
+    pub async fn has_healthy_deployment_with_label(
+        &self,
+        namespace: &str,
+        label_selector: &str,
+    ) -> Result<bool, Error> {
+        let deployments: Api<Deployment> = Api::namespaced(self.client.clone(), namespace);
+        let params = ListParams::default().labels(label_selector);
+        let found = deployments.list(&params).await?;
+
+        let any_healthy = found.items.iter().any(|deployment| {
+            // A Deployment without a status can satisfy neither criterion.
+            let Some(status) = deployment.status.as_ref() else {
+                return false;
+            };
+            let has_replicas = status.available_replicas.unwrap_or(0) > 0;
+            has_replicas
+                || status.conditions.as_ref().is_some_and(|conds| {
+                    conds
+                        .iter()
+                        .any(|c| c.type_ == "Available" && c.status == "True")
+                })
+        });
+        Ok(any_healthy)
+    }
+
+    /// Lists the namespaces that hold at least one healthy Deployment
+    /// selected by `label_selector`, searching across all namespaces.
+    ///
+    /// Health is judged the same way as in
+    /// `has_healthy_deployment_with_label`: more than zero available
+    /// replicas, or an `Available` condition with status `True`. Each
+    /// namespace appears once; the order of the returned names is
+    /// unspecified because they are collected through a hash map.
+    pub async fn list_namespaces_with_healthy_deployments(
+        &self,
+        label_selector: &str,
+    ) -> Result<Vec<String>, Error> {
+        let deployments: Api<Deployment> = Api::all(self.client.clone());
+        let params = ListParams::default().labels(label_selector);
+        let found = deployments.list(&params).await?;
+
+        // The map is only used for de-duplicating namespace names.
+        let mut healthy_ns: HashMap<String, bool> = HashMap::new();
+        for deployment in found.items {
+            let Some(ns) = deployment.metadata.namespace.clone() else {
+                continue;
+            };
+            let status = deployment.status.as_ref();
+            let has_replicas = status
+                .and_then(|s| s.available_replicas)
+                .is_some_and(|count| count > 0);
+            // Only consult the conditions when the replica count is not conclusive.
+            let condition_available = || {
+                status
+                    .and_then(|s| s.conditions.as_ref())
+                    .is_some_and(|conds| {
+                        conds
+                            .iter()
+                            .any(|c| c.type_ == "Available" && c.status == "True")
+                    })
+            };
+            if has_replicas || condition_available() {
+                healthy_ns.insert(ns, true);
+            }
+        }
+        Ok(healthy_ns.into_keys().collect())
+    }
+
+    /// Returns the service account name configured on the pod template of the
+    /// first Deployment in `ns` labelled
+    /// `app.kubernetes.io/component=controller`.
+    ///
+    /// Yields `Ok(None)` when no such Deployment exists or when the first
+    /// match does not set a service account name.
+    pub async fn get_controller_service_account_name(
+        &self,
+        ns: &str,
+    ) -> Result<Option<String>, Error> {
+        let deployments: Api<Deployment> = Api::namespaced(self.client.clone(), ns);
+        let params = ListParams::default().labels("app.kubernetes.io/component=controller");
+        let controllers = deployments.list(&params).await?;
+
+        let service_account = controllers
+            .items
+            .first()
+            .and_then(|dep| dep.spec.as_ref())
+            .and_then(|spec| spec.template.spec.as_ref())
+            .and_then(|pod_spec| pod_spec.service_account_name.clone());
+        Ok(service_account)
+    }
+
+    /// Fetches every `rbac.authorization.k8s.io/v1` `ClusterRoleBinding` as a
+    /// raw JSON value.
+    ///
+    /// An object that cannot be converted to JSON is represented by
+    /// `Value::Null` rather than failing the whole call.
+    pub async fn list_clusterrolebindings_json(&self) -> Result<Vec<Value>, Error> {
+        let resource = ApiResource::from_gvk(&GroupVersionKind::gvk(
+            "rbac.authorization.k8s.io",
+            "v1",
+            "ClusterRoleBinding",
+        ));
+        let bindings: Api<DynamicObject> = Api::all_with(self.client.clone(), &resource);
+        let listed = bindings.list(&ListParams::default()).await?;
+
+        let mut values = Vec::with_capacity(listed.items.len());
+        for object in listed.items {
+            values.push(serde_json::to_value(&object).unwrap_or(Value::Null));
+        }
+        Ok(values)
+    }
+
+    /// Reports whether service account `sa` of namespace `ns` is a subject of
+    /// any ClusterRoleBinding.
+    ///
+    /// A subject matches either as a `ServiceAccount` with the same name and
+    /// namespace, or as a `User` named `system:serviceaccount:{ns}:{sa}`.
+    /// Missing subject fields are treated as empty strings.
+    pub async fn is_service_account_cluster_wide(&self, sa: &str, ns: &str) -> Result<bool, Error> {
+        let sa_user = format!("system:serviceaccount:{ns}:{sa}");
+        let bindings = self.list_clusterrolebindings_json().await?;
+
+        let is_bound = bindings
+            .iter()
+            .filter_map(|crb| crb.get("subjects").and_then(Value::as_array))
+            .flatten()
+            .any(|subject| {
+                let kind = subject.get("kind").and_then(Value::as_str).unwrap_or("");
+                let name = subject.get("name").and_then(Value::as_str).unwrap_or("");
+                let subject_ns = subject
+                    .get("namespace")
+                    .and_then(Value::as_str)
+                    .unwrap_or("");
+                match kind {
+                    "ServiceAccount" => name == sa && subject_ns == ns,
+                    "User" => name == sa_user,
+                    _ => false,
+                }
+            });
+        Ok(is_bound)
+    }
+
+    /// Reports whether a CustomResourceDefinition whose `metadata.name`
+    /// equals `name` is present, using a field-selector list query.
+    pub async fn has_crd(&self, name: &str) -> Result<bool, Error> {
+        let selector = format!("metadata.name={name}");
+        let params = ListParams::default().fields(&selector);
+        let definitions: Api<CustomResourceDefinition> = Api::all(self.client.clone());
+        let matching = definitions.list(&params).await?;
+        Ok(!matching.items.is_empty())
+    }
+
+    /// Builds an `Api<ServiceAccount>` handle bound to `namespace`.
+    ///
+    /// The body performs no awaiting; the function is `async` only as part of
+    /// its existing signature.
+    pub async fn service_account_api(&self, namespace: &str) -> Api<ServiceAccount> {
+        let client = self.client.clone();
+        Api::namespaced(client, namespace)
+    }
+
+    /// Fetches the object called `name` of the kind described by `gvk` as a
+    /// `DynamicObject`.
+    ///
+    /// The lookup happens in `namespace`, or in the client's default
+    /// namespace when `namespace` is `None`. Errors from the API call are
+    /// returned unchanged.
+    pub async fn get_resource_json_value(
+        &self,
+        name: &str,
+        namespace: Option<&str>,
+        gvk: &GroupVersionKind,
+    ) -> Result<DynamicObject, Error> {
+        let resource = ApiResource::from_gvk(gvk);
+        let client = self.client.clone();
+        let objects: Api<DynamicObject> = if let Some(ns) = namespace {
+            Api::namespaced_with(client, ns, &resource)
+        } else {
+            Api::default_namespaced_with(client, &resource)
+        };
+        objects.get(name).await
+    }
+
+    /// Fetches the core `v1` `Secret` called `name` as a `DynamicObject`.
+    ///
+    /// Thin wrapper around `get_resource_json_value` with the Secret
+    /// group/version/kind filled in (the core group is the empty string).
+    pub async fn get_secret_json_value(
+        &self,
+        name: &str,
+        namespace: Option<&str>,
+    ) -> Result<DynamicObject, Error> {
+        let secret_gvk = GroupVersionKind::gvk("", "v1", "Secret");
+        self.get_resource_json_value(name, namespace, &secret_gvk)
+            .await
+    }
+
+    /// Looks up the Deployment called `name`, returning the result of
+    /// `Api::get_opt` unchanged.
+    ///
+    /// Uses `namespace` when given, otherwise the client's default namespace.
+    /// The chosen scope is logged at debug level.
+    pub async fn get_deployment(
+        &self,
+        name: &str,
+        namespace: Option<&str>,
+    ) -> Result<Option<Deployment>, Error> {
+        let client = self.client.clone();
+        let deployments: Api<Deployment> = if let Some(ns) = namespace {
+            debug!("Getting namespaced deployment '{name}' in '{ns}'");
+            Api::namespaced(client, ns)
+        } else {
+            debug!("Getting deployment '{name}' in default namespace");
+            Api::default_namespaced(client)
+        };
+        deployments.get_opt(name).await
+    }
+
+    /// Sets the replica count of the Deployment called `name` to `replicas`
+    /// by merge-patching its scale subresource.
+    ///
+    /// Targets `namespace` when given, otherwise the client's default
+    /// namespace. The patched scale object returned by the API is discarded.
+    pub async fn scale_deployment(
+        &self,
+        name: &str,
+        namespace: Option<&str>,
+        replicas: u32,
+    ) -> Result<(), Error> {
+        use kube::api::{Patch, PatchParams};
+        use serde_json::json;
+
+        let client = self.client.clone();
+        let deployments: Api<Deployment> = if let Some(ns) = namespace {
+            Api::namespaced(client, ns)
+        } else {
+            Api::default_namespaced(client)
+        };
+        let body = json!({ "spec": { "replicas": replicas } });
+        deployments
+            .patch_scale(name, &PatchParams::default(), &Patch::Merge(&body))
+            .await
+            .map(|_| ())
+    }
+
+    /// Deletes the Deployment called `name` using default delete parameters.
+    ///
+    /// Targets `namespace` when given, otherwise the client's default
+    /// namespace. The API's response body is discarded; only errors are
+    /// reported.
+    pub async fn delete_deployment(
+        &self,
+        name: &str,
+        namespace: Option<&str>,
+    ) -> Result<(), Error> {
+        let client = self.client.clone();
+        let deployments: Api<Deployment> = if let Some(ns) = namespace {
+            Api::namespaced(client, ns)
+        } else {
+            Api::default_namespaced(client)
+        };
+        let params = kube::api::DeleteParams::default();
+        deployments.delete(name, &params).await.map(|_| ())
+    }
+
+    /// Waits until the Deployment called `name` satisfies
+    /// `conditions::is_deployment_completed()`.
+    ///
+    /// Targets `namespace` when given, otherwise the client's default
+    /// namespace. `timeout` defaults to 120 seconds when `None`.
+    ///
+    /// # Errors
+    ///
+    /// Returns `Err` when the timeout elapses first, or when the underlying
+    /// wait itself fails. The latter used to be silently reported as success
+    /// because the inner result of `await_condition` was discarded.
+    pub async fn wait_until_deployment_ready(
+        &self,
+        name: &str,
+        namespace: Option<&str>,
+        timeout: Option<Duration>,
+    ) -> Result<(), String> {
+        let api: Api<Deployment> = match namespace {
+            Some(ns) => Api::namespaced(self.client.clone(), ns),
+            None => Api::default_namespaced(self.client.clone()),
+        };
+        let timeout = timeout.unwrap_or(Duration::from_secs(120));
+        let establish = await_condition(api, name, conditions::is_deployment_completed());
+        // `timeout` wraps the future's own result: the outer layer signals
+        // expiry, the inner layer carries any failure of the wait itself.
+        match tokio::time::timeout(timeout, establish).await {
+            Ok(Ok(_)) => Ok(()),
+            Ok(Err(e)) => Err(format!("Failed waiting for deployment '{name}': {e}")),
+            Err(_) => Err("Timed out waiting for deployment".to_string()),
+        }
+    }
+
+    /// Fetches one resource of type `K` by `name`.
+    ///
+    /// The `Api` handle is produced by `K`'s scope through `ScopeResolver`,
+    /// so `namespace` is forwarded to the resolver and the result of
+    /// `Api::get_opt` is returned as-is.
+    pub async fn get_resource<K>(
+        &self,
+        name: &str,
+        namespace: Option<&str>,
+    ) -> Result<Option<K>, Error>
+    where
+        K: Resource + Clone + std::fmt::Debug + DeserializeOwned,
+        <K as Resource>::Scope: ScopeResolver<K>,
+        <K as Resource>::DynamicType: Default,
+    {
+        let scoped: Api<K> = <K::Scope as ScopeResolver<K>>::get_api(&self.client, namespace);
+        scoped.get_opt(name).await
+    }
+
+    /// Lists resources of type `K` through the `Api` handle chosen by `K`'s
+    /// scope via `ScopeResolver`.
+    ///
+    /// `list_params` falls back to `ListParams::default()` when `None`.
+    pub async fn list_resources<K>(
+        &self,
+        namespace: Option<&str>,
+        list_params: Option<ListParams>,
+    ) -> Result<ObjectList<K>, Error>
+    where
+        K: Resource + Clone + std::fmt::Debug + DeserializeOwned,
+        <K as Resource>::Scope: ScopeResolver<K>,
+        <K as Resource>::DynamicType: Default,
+    {
+        let params = list_params.unwrap_or_default();
+        let scoped: Api<K> = <K::Scope as ScopeResolver<K>>::get_api(&self.client, namespace);
+        scoped.list(&params).await
+    }
+
+    /// Lists every resource of type `K` matching the label selector `labels`,
+    /// using an `Api::all` handle, and returns the items of the resulting
+    /// list.
+    pub async fn list_all_resources_with_labels<K>(&self, labels: &str) -> Result<Vec<K>, Error>
+    where
+        K: Resource + Clone + std::fmt::Debug + DeserializeOwned,
+        <K as Resource>::DynamicType: Default,
+    {
+        let everywhere: Api<K> = Api::all(self.client.clone());
+        let params = ListParams::default().labels(labels);
+        let listed = everywhere.list(&params).await?;
+        Ok(listed.items)
+    }
+
+    /// Lists every resource of type `K` with default list parameters, using
+    /// an `Api::all` handle, and returns the items of the resulting list.
+    pub async fn get_all_resource_in_all_namespace<K>(&self) -> Result<Vec<K>, Error>
+    where
+        K: Resource + Clone + std::fmt::Debug + DeserializeOwned,
+        <K as Resource>::Scope: ScopeResolver<K>,
+        <K as Resource>::DynamicType: Default,
+    {
+        let everywhere: Api<K> = Api::all(self.client.clone());
+        let listed = everywhere.list(&Default::default()).await?;
+        Ok(listed.items)
+    }
+
+    /// Lists cluster `Node` objects by delegating to `list_resources` with no
+    /// namespace; `list_params` is forwarded untouched.
+    pub async fn get_nodes(
+        &self,
+        list_params: Option<ListParams>,
+    ) -> Result<ObjectList<Node>, Error> {
+        let nodes = self.list_resources::<Node>(None, list_params).await?;
+        Ok(nodes)
+    }
+}
--- a/harmony-k8s/src/types.rs
+++ b/harmony-k8s/src/types.rs
@@ -0,0 +1,100 @@
+use std::time::Duration;
+
+use k8s_openapi::{ClusterResourceScope, NamespaceResourceScope};
+use kube::{Api, Client, Resource};
+use serde::Serialize;
+
+/// Which Kubernetes distribution is running. Detected once at runtime via
+/// [`crate::K8sClient::get_k8s_distribution`].
+// NOTE(review): the link previously pointed at `crate::discovery::K8sClient`;
+// the other intra-doc links in this file use `crate::K8sClient` — confirm the
+// method is reachable through that path.
+#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
+pub enum KubernetesDistribution {
+    /// Presumably any cluster not recognised as one of the families below —
+    /// TODO confirm against the detection code.
+    Default,
+    /// OpenShift-family clusters.
+    OpenshiftFamily,
+    /// K3s-family clusters.
+    K3sFamily,
+}
+
+/// A file to be written to a node's filesystem.
+///
+/// Plain data holder: path, textual content and permission bits.
+#[derive(Debug, Clone)]
+pub struct NodeFile {
+    /// Absolute path on the host where the file should be written.
+    pub path: String,
+    /// Content of the file.
+    pub content: String,
+    /// UNIX permissions as a numeric mode, usually written in octal
+    /// (e.g. `0o600`).
+    pub mode: u32,
+}
+
+/// Tunables for a [`crate::K8sClient::drain_node`] operation.
+#[derive(Debug, Clone)]
+pub struct DrainOptions {
+    /// Evict pods that use `emptyDir` volumes (ephemeral data is lost).
+    /// Mirrors `kubectl drain --delete-emptydir-data`.
+    pub delete_emptydir_data: bool,
+    /// Silently skip DaemonSet-managed pods instead of blocking the drain.
+    /// Mirrors `kubectl drain --ignore-daemonsets`.
+    pub ignore_daemonsets: bool,
+    /// Maximum wall-clock time to wait for all evictions to complete.
+    pub timeout: Duration,
+}
+
+impl Default for DrainOptions {
+    /// Defaults: keep `emptyDir` pods, ignore DaemonSets, one-second timeout.
+    // NOTE(review): a 1 s timeout looks very short for a node drain — confirm
+    // this default is intentional.
+    fn default() -> Self {
+        DrainOptions {
+            ignore_daemonsets: true,
+            delete_emptydir_data: false,
+            timeout: Duration::from_secs(1),
+        }
+    }
+}
+
+impl DrainOptions {
+    /// Preset that ignores DaemonSets and also evicts pods using `emptyDir`
+    /// volumes; the timeout is taken from [`DrainOptions::default`].
+    pub fn default_ignore_daemonset_delete_emptydir_data() -> Self {
+        let Self { timeout, .. } = Self::default();
+        Self {
+            delete_emptydir_data: true,
+            ignore_daemonsets: true,
+            timeout,
+        }
+    }
+}
+
+/// Controls how [`crate::K8sClient::apply_with_strategy`] behaves when the
+/// resource already exists (or does not).
+///
+/// The type is a plain, field-less selector, so it is cheap to copy and
+/// compare; `WriteMode::default()` is [`WriteMode::CreateOrUpdate`].
+#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
+pub enum WriteMode {
+    /// Server-side apply; create if absent, update if present (default).
+    #[default]
+    CreateOrUpdate,
+    /// POST only; return an error if the resource already exists.
+    Create,
+    /// Server-side apply only; return an error if the resource does not exist.
+    Update,
+}
+
+// ── Scope resolution trait ───────────────────────────────────────────────────
+
+/// Resolves the correct [`kube::Api`] for a resource type based on its scope
+/// (cluster-wide vs. namespace-scoped).
+///
+/// Implemented on the scope marker types so that generic code can write
+/// `<K::Scope as ScopeResolver<K>>::get_api(..)` without knowing the scope of
+/// `K` in advance.
+pub trait ScopeResolver<K: Resource> {
+    /// Builds an `Api<K>` from `client`; `ns` is an optional namespace whose
+    /// interpretation is up to the implementing scope.
+    fn get_api(client: &Client, ns: Option<&str>) -> Api<K>;
+}
+
+/// Cluster-scoped resources always get an `Api::all` handle; the namespace
+/// argument is ignored.
+impl<K> ScopeResolver<K> for ClusterResourceScope
+where
+    K: Resource<Scope = ClusterResourceScope>,
+    <K as Resource>::DynamicType: Default,
+{
+    fn get_api(client: &Client, _ns: Option<&str>) -> Api<K> {
+        let owned_client = client.clone();
+        Api::all(owned_client)
+    }
+}
+
+/// Namespace-scoped resources get a handle for the requested namespace, or
+/// for the client's default namespace when none is given.
+impl<K> ScopeResolver<K> for NamespaceResourceScope
+where
+    K: Resource<Scope = NamespaceResourceScope>,
+    <K as Resource>::DynamicType: Default,
+{
+    fn get_api(client: &Client, ns: Option<&str>) -> Api<K> {
+        let owned_client = client.clone();
+        if let Some(namespace) = ns {
+            Api::namespaced(owned_client, namespace)
+        } else {
+            Api::default_namespaced(owned_client)
+        }
+    }
+}
--- a/harmony/Cargo.toml
+++ b/harmony/Cargo.toml
@@ -21,6 +21,8 @@ semver = "1.0.23"
 serde.workspace = true
 serde_json.workspace = true
 tokio.workspace = true
+tokio-retry.workspace = true
+tokio-util.workspace = true
 derive-new.workspace = true
 log.workspace = true
 env_logger.workspace = true
@@ -30,6 +32,8 @@ opnsense-config = { path = "../opnsense-config" }
 opnsense-config-xml = { path = "../opnsense-config-xml" }
 harmony_macros = { path = "../harmony_macros" }
 harmony_types = { path = "../harmony_types" }
+harmony_execution = { path = "../harmony_execution" }
+harmony-k8s = { path = "../harmony-k8s" }
 uuid.workspace = true
 url.workspace = true
 kube = { workspace = true, features = ["derive"] }
@@ -47,7 +51,7 @@ temp-file = "0.1.9"
 convert_case.workspace = true
 email_address = "0.2.9"
 chrono.workspace = true
-fqdn = { version = "0.4.6", features = [
+fqdn = { version = "0.5.2", features = [
  "domain-label-cannot-start-or-end-with-hyphen",
  "domain-label-length-limited-to-63",
  "domain-name-without-special-chars",
@@ -59,7 +63,6 @@ temp-dir = "0.1.14"
 dyn-clone = "1.0.19"
 similar.workspace = true
 futures-util = "0.3.31"
-tokio-util = "0.7.15"
 strum = { version = "0.27.1", features = ["derive"] }
 tempfile.workspace = true
 serde_with = "3.14.0"
@@ -79,6 +82,7 @@ sqlx.workspace = true
 inquire.workspace = true
 brocade = { path = "../brocade" }
 option-ext = "0.2.0"
+rand.workspace = true

 [dev-dependencies]
 pretty_assertions.workspace = true
--- a/harmony/src/domain/config/secret.rs
+++ b/harmony/src/domain/config/secret.rs
@@ -1,20 +1,21 @@
 use harmony_secret_derive::Secret;
+use schemars::JsonSchema;
 use serde::{Deserialize, Serialize};

-#[derive(Secret, Serialize, Deserialize, Debug, PartialEq)]
+#[derive(Secret, Serialize, Deserialize, JsonSchema, Debug, PartialEq)]
 pub struct OPNSenseFirewallCredentials {
    pub username: String,
    pub password: String,
 }

 // TODO we need a better way to handle multiple "instances" of the same secret structure.
-#[derive(Secret, Serialize, Deserialize, Debug, PartialEq)]
+#[derive(Secret, Serialize, Deserialize, JsonSchema, Debug, PartialEq)]
 pub struct SshKeyPair {
    pub private: String,
    pub public: String,
 }

-#[derive(Secret, Serialize, Deserialize, Debug, PartialEq)]
+#[derive(Secret, Serialize, Deserialize, JsonSchema, Debug, PartialEq)]
 pub struct RedhatSecret {
    pub pull_secret: String,
 }
--- a/harmony/src/domain/hardware/mod.rs
+++ b/harmony/src/domain/hardware/mod.rs
@@ -108,11 +108,18 @@ impl PhysicalHost {
            };

            let storage_summary = if drive_count > 1 {
+                let drive_sizes = self
+                    .storage
+                    .iter()
+                    .map(|d| format_storage(d.size_bytes))
+                    .collect::<Vec<_>>()
+                    .join(", ");
+
                format!(
-                    "{} Storage ({}x {})",
+                    "{} Storage ({} Disks [{}])",
                    format_storage(total_storage_bytes),
                    drive_count,
-                    first_drive_model
+                    drive_sizes
                )
            } else {
                format!(
--- a/harmony/src/domain/interpret/mod.rs
+++ b/harmony/src/domain/interpret/mod.rs
@@ -4,8 +4,6 @@ use std::error::Error;
 use async_trait::async_trait;
 use derive_new::new;

-use crate::inventory::HostRole;
-
 use super::{
    data::Version, executors::ExecutorError, inventory::Inventory, topology::PreparationError,
 };
--- a/harmony/src/domain/inventory/repository.rs
+++ b/harmony/src/domain/inventory/repository.rs
@@ -1,6 +1,8 @@
 use async_trait::async_trait;

-use crate::{hardware::PhysicalHost, interpret::InterpretError, inventory::HostRole};
+use crate::{
+    hardware::PhysicalHost, interpret::InterpretError, inventory::HostRole, topology::HostConfig,
+};

 /// Errors that can occur within the repository layer.
 #[derive(thiserror::Error, Debug)]
@@ -29,10 +31,14 @@ pub trait InventoryRepository: Send + Sync + 'static {
    async fn save(&self, host: &PhysicalHost) -> Result<(), RepoError>;
    async fn get_latest_by_id(&self, host_id: &str) -> Result<Option<PhysicalHost>, RepoError>;
    async fn get_all_hosts(&self) -> Result<Vec<PhysicalHost>, RepoError>;
-    async fn get_host_for_role(&self, role: &HostRole) -> Result<Vec<PhysicalHost>, RepoError>;
+    async fn get_hosts_for_role(
+        &self,
+        role: &HostRole,
+    ) -> Result<Vec<(PhysicalHost, HostConfig)>, RepoError>;
    async fn save_role_mapping(
        &self,
        role: &HostRole,
        host: &PhysicalHost,
+        installation_device: &String,
    ) -> Result<(), RepoError>;
 }
--- a/harmony/src/domain/topology/decentralized.rs
+++ b/harmony/src/domain/topology/decentralized.rs
@@ -0,0 +1,58 @@
+use async_trait::async_trait;
+use log::info;
+
+use crate::topology::{
+    K8sAnywhereConfig, K8sAnywhereTopology, PreparationError, PreparationOutcome, Topology,
+};
+
+/// A topology composed of several independent sites, each of which is itself
+/// a topology of type `T`.
+pub struct DecentralizedTopology<T> {
+    /// The member sites, in the order they are prepared by `ensure_ready`.
+    pub sites: Vec<T>,
+}
+
+#[async_trait]
+impl<T: Topology + Send + Sync> Topology for DecentralizedTopology<T> {
+    fn name(&self) -> &str {
+        "DecentralizedTopology"
+    }
+
+    /// Prepares every site in order and aggregates their outcome details
+    /// into one comma-separated `Success` message.
+    ///
+    /// Stops at the first site whose preparation fails and returns that
+    /// error. A site reporting `Noop` contributes a fixed placeholder detail.
+    async fn ensure_ready(&self) -> Result<PreparationOutcome, PreparationError> {
+        let mut per_site = Vec::new();
+
+        for site in &self.sites {
+            let detail = match site.ensure_ready().await? {
+                PreparationOutcome::Success { details } => details,
+                PreparationOutcome::Noop => {
+                    info!("site ready");
+                    "site ready Noop".to_string()
+                }
+            };
+            per_site.push(detail);
+        }
+
+        let details = per_site.join(",");
+        Ok(PreparationOutcome::Success { details })
+    }
+}
+
+impl DecentralizedTopology<K8sAnywhereTopology> {
+    /// Builds the site list from consecutively numbered environment
+    /// variables `HARMONY_DECENTRALIZED_TOPOLOGY_K8S_SITE_1`, `_2`, ...
+    ///
+    /// Scanning stops at the first index whose variable cannot be read, so a
+    /// gap in the numbering hides every later site. Each variable found is
+    /// handed by name to `K8sAnywhereConfig::remote_k8s_from_env_var`.
+    pub fn from_env() -> Self {
+        let sites = (1..)
+            .map(|index| format!("HARMONY_DECENTRALIZED_TOPOLOGY_K8S_SITE_{index}"))
+            .take_while(|var| std::env::var(var).is_ok())
+            .map(|var| {
+                let cfg = K8sAnywhereConfig::remote_k8s_from_env_var(&var);
+                K8sAnywhereTopology::with_config(cfg)
+            })
+            .collect();
+
+        Self { sites }
+    }
+}
--- a/harmony/src/domain/topology/ha_cluster.rs
+++ b/harmony/src/domain/topology/ha_cluster.rs
@@ -1,5 +1,5 @@
 use async_trait::async_trait;
-use brocade::PortOperatingMode;
+use harmony_k8s::K8sClient;
 use harmony_macros::ip;
 use harmony_types::{
    id::Id,
@@ -9,17 +9,20 @@ use harmony_types::{
 use log::debug;
 use log::info;

+use crate::topology::{HelmCommand, PxeOptions};
 use crate::{data::FileContent, executors::ExecutorError, topology::node_exporter::NodeExporter};
 use crate::{infra::network_manager::OpenShiftNmStateNetworkManager, topology::PortConfig};
-use crate::{modules::inventory::HarmonyDiscoveryStrategy, topology::PxeOptions};

 use super::{
    DHCPStaticEntry, DhcpServer, DnsRecord, DnsRecordType, DnsServer, Firewall, HostNetworkConfig,
    HttpServer, IpAddress, K8sclient, LoadBalancer, LoadBalancerService, LogicalHost, NetworkError,
    NetworkManager, PreparationError, PreparationOutcome, Router, Switch, SwitchClient,
-    SwitchError, TftpServer, Topology, k8s::K8sClient,
+    SwitchError, TftpServer, Topology,
+};
+use std::{
+    process::Command,
+    sync::{Arc, OnceLock},
 };
-use std::sync::{Arc, OnceLock};

 #[derive(Debug, Clone)]
 pub struct HAClusterTopology {
@@ -53,6 +56,30 @@ impl Topology for HAClusterTopology {
    }
 }

+impl HelmCommand for HAClusterTopology {
+    /// Builds a `helm` command, adding `--kubeconfig <path>` when this
+    /// topology has a kubeconfig set. The resulting command is logged at
+    /// info level before being returned.
+    fn get_helm_command(&self) -> Command {
+        let mut helm = Command::new("helm");
+        if let Some(kubeconfig) = self.kubeconfig.as_ref() {
+            helm.arg("--kubeconfig").arg(kubeconfig);
+        }
+
+        // FIXME: a kube context should be supported wherever there is a
+        // k8sclient (i.e. passing `--kube-context <context>` here). That
+        // logic likely belongs in the k8sclient itself, extracted to a
+        // separate crate; helm — and kustomize — could well be features of
+        // that external k8s client.
+
+        info!("Using helm command {helm:?}");
+        helm
+    }
+}
+
 #[async_trait]
 impl K8sclient for HAClusterTopology {
    async fn k8s_client(&self) -> Result<Arc<K8sClient>, String> {
@@ -301,10 +328,10 @@ impl Switch for HAClusterTopology {
        Ok(())
    }

-    async fn clear_port_channel(&self, ids: &Vec<Id>) -> Result<(), SwitchError> {
+    async fn clear_port_channel(&self, _ids: &Vec<Id>) -> Result<(), SwitchError> {
        todo!()
    }
-    async fn configure_interface(&self, ports: &Vec<PortConfig>) -> Result<(), SwitchError> {
+    async fn configure_interface(&self, _ports: &Vec<PortConfig>) -> Result<(), SwitchError> {
        todo!()
    }
 }
@@ -322,7 +349,15 @@ impl NetworkManager for HAClusterTopology {
        self.network_manager().await.configure_bond(config).await
    }

-    //TODO add snmp here
+    async fn configure_bond_on_primary_interface(
+        &self,
+        config: &HostNetworkConfig,
+    ) -> Result<(), NetworkError> {
+        self.network_manager()
+            .await
+            .configure_bond_on_primary_interface(config)
+            .await
+    }
 }

 #[async_trait]
@@ -562,10 +597,10 @@ impl SwitchClient for DummyInfra {
    ) -> Result<u8, SwitchError> {
        unimplemented!("{}", UNIMPLEMENTED_DUMMY_INFRA)
    }
-    async fn clear_port_channel(&self, ids: &Vec<Id>) -> Result<(), SwitchError> {
+    async fn clear_port_channel(&self, _ids: &Vec<Id>) -> Result<(), SwitchError> {
        todo!()
    }
-    async fn configure_interface(&self, ports: &Vec<PortConfig>) -> Result<(), SwitchError> {
+    async fn configure_interface(&self, _ports: &Vec<PortConfig>) -> Result<(), SwitchError> {
        todo!()
    }
 }
--- a/harmony/src/domain/topology/helm_command.rs
+++ b/harmony/src/domain/topology/helm_command.rs
@@ -1 +1,5 @@
-pub trait HelmCommand {}
+use std::process::Command;
+
+/// Capability of a topology to supply a ready-to-extend command for invoking
+/// Helm.
+pub trait HelmCommand {
+    /// Returns a [`Command`] for running Helm; implementors decide which
+    /// arguments to pre-populate.
+    fn get_helm_command(&self) -> Command;
+}
--- a/harmony/src/domain/topology/host_binding.rs
+++ b/harmony/src/domain/topology/host_binding.rs
@@ -7,12 +7,17 @@ use super::LogicalHost;

 /// Represents the binding between a LogicalHost and a PhysicalHost.
 ///
+///
 /// This is the only construct that directly maps a logical host to a physical host.
 /// It serves as a bridge between the logical cluster structure and the physical infrastructure.
 #[derive(Debug, new, Clone, Serialize)]
 pub struct HostBinding {
-    /// Reference to the LogicalHost
    pub logical_host: LogicalHost,
-    /// Reference to the PhysicalHost
    pub physical_host: PhysicalHost,
+    pub host_config: HostConfig,
+}
+
+#[derive(Debug, new, Clone, Serialize)]
+pub struct HostConfig {
+    pub installation_device: Option<String>,
 }
--- a/harmony/src/domain/topology/k8s.rs
+++ b/harmony/src/domain/topology/k8s.rs
--- a/harmony/src/domain/topology/k8s_anywhere/k8s_anywhere.rs
+++ b/harmony/src/domain/topology/k8s_anywhere/k8s_anywhere.rs
@@ -1,45 +1,35 @@
-use std::{collections::BTreeMap, process::Command, sync::Arc, time::Duration};
+use std::{collections::BTreeMap, process::Command, sync::Arc};

 use async_trait::async_trait;
 use base64::{Engine, engine::general_purpose};
+use harmony_k8s::{K8sClient, KubernetesDistribution};
 use harmony_types::rfc1123::Rfc1123Name;
 use k8s_openapi::api::{
-    core::v1::Secret,
+    core::v1::{Pod, Secret},
    rbac::v1::{ClusterRoleBinding, RoleRef, Subject},
 };
-use kube::api::{DynamicObject, GroupVersionKind, ObjectMeta};
+use kube::api::{GroupVersionKind, ObjectMeta};
 use log::{debug, info, trace, warn};
 use serde::Serialize;
 use tokio::sync::OnceCell;

 use crate::{
    executors::ExecutorError,
-    interpret::InterpretStatus,
+    interpret::{InterpretStatus, Outcome},
    inventory::Inventory,
    modules::{
-        k3d::K3DInstallationScore,
-        k8s::ingress::{K8sIngressScore, PathType},
-        monitoring::{
-            grafana::{grafana::Grafana, helm::helm_grafana::grafana_helm_chart_score},
-            kube_prometheus::crd::{
-                crd_alertmanager_config::CRDPrometheus,
-                crd_grafana::{
-                    Grafana as GrafanaCRD, GrafanaCom, GrafanaDashboard,
-                    GrafanaDashboardDatasource, GrafanaDashboardSpec, GrafanaDatasource,
-                    GrafanaDatasourceConfig, GrafanaDatasourceJsonData,
-                    GrafanaDatasourceSecureJsonData, GrafanaDatasourceSpec, GrafanaSpec,
-                },
-                crd_prometheuses::LabelSelector,
-                prometheus_operator::prometheus_operator_helm_chart_score,
-                rhob_alertmanager_config::RHOBObservability,
-                service_monitor::ServiceMonitor,
+        cert_manager::{
+            capability::{CertificateManagement, CertificateManagementConfig},
+            crd::{
+                certificate::Certificate, issuer::Issuer,
+                score_k8s_certificate::K8sCertManagerCertificateScore,
+                score_k8s_issuer::K8sCertManagerIssuerScore,
            },
+            operator::CertManagerOperatorScore,
+            score_cert_management::CertificateManagementScore,
        },
-        okd::route::OKDTlsPassthroughScore,
-        prometheus::{
-            k8s_prometheus_alerting_score::K8sPrometheusCRDAlertingScore,
-            prometheus::PrometheusMonitoring, rhob_alerting_score::RHOBAlertingScore,
-        },
+        k3d::K3DInstallationScore,
+        okd::{crd::ingresses_config::Ingress as IngressResource, route::OKDTlsPassthroughScore},
    },
    score::Score,
    topology::{TlsRoute, TlsRouter, ingress::Ingress},
@@ -48,8 +38,6 @@ use crate::{
 use super::super::{
    DeploymentTarget, HelmCommand, K8sclient, MultiTargetTopology, PreparationError,
    PreparationOutcome, Topology,
-    k8s::K8sClient,
-    oberservability::monitoring::AlertReceiver,
    tenant::{
        TenantConfig, TenantManager,
        k8s::K8sTenantManager,
@@ -66,13 +54,6 @@ struct K8sState {
    message: String,
 }

-#[derive(Debug, Clone)]
-pub enum KubernetesDistribution {
-    OpenshiftFamily,
-    K3sFamily,
-    Default,
-}
-
 #[derive(Debug, Clone)]
 enum K8sSource {
    LocalK3d,
@@ -83,7 +64,6 @@ enum K8sSource {
 pub struct K8sAnywhereTopology {
    k8s_state: Arc<OnceCell<Option<K8sState>>>,
    tenant_manager: Arc<OnceCell<K8sTenantManager>>,
-    k8s_distribution: Arc<OnceCell<KubernetesDistribution>>,
    config: Arc<K8sAnywhereConfig>,
 }

@@ -107,8 +87,32 @@ impl K8sclient for K8sAnywhereTopology {

 #[async_trait]
 impl TlsRouter for K8sAnywhereTopology {
-    async fn get_wildcard_domain(&self) -> Result<Option<String>, String> {
-        todo!()
+    /// Returns the cluster's internal domain, when one can be determined.
+    ///
+    /// For the OpenShift family the domain is read from the `cluster`
+    /// ingress config resource; `Ok(None)` is returned when that resource is
+    /// absent. The K3s and default families are not implemented yet and
+    /// panic via `todo!()`.
+    async fn get_internal_domain(&self) -> Result<Option<String>, String> {
+        let distribution = self.get_k8s_distribution().await.map_err(|e| {
+            format!("Could not get internal domain, error getting k8s distribution : {e}")
+        })?;
+
+        match distribution {
+            KubernetesDistribution::OpenshiftFamily => {
+                let client = self.k8s_client().await?;
+                let lookup = client
+                    .get_resource::<IngressResource>("cluster", None)
+                    .await
+                    .map_err(|e| format!("Error attempting to get ingress config : {e}"))?;
+                match lookup {
+                    Some(ingress_config) => {
+                        debug!("Found ingress config {:?}", ingress_config.spec);
+                        Ok(ingress_config.spec.domain.clone())
+                    }
+                    None => {
+                        warn!("Could not find a domain configured in this cluster");
+                        Ok(None)
+                    }
+                }
+            }
+            KubernetesDistribution::K3sFamily => todo!(),
+            KubernetesDistribution::Default => todo!(),
+        }
+    }

    /// Returns the port that this router exposes externally.
@@ -140,216 +144,6 @@ impl TlsRouter for K8sAnywhereTopology {
    }
 }

-#[async_trait]
-impl Grafana for K8sAnywhereTopology {
-    async fn ensure_grafana_operator(
-        &self,
-        inventory: &Inventory,
-    ) -> Result<PreparationOutcome, PreparationError> {
-        debug!("ensure grafana operator");
-        let client = self.k8s_client().await.unwrap();
-        let grafana_gvk = GroupVersionKind {
-            group: "grafana.integreatly.org".to_string(),
-            version: "v1beta1".to_string(),
-            kind: "Grafana".to_string(),
-        };
-        let name = "grafanas.grafana.integreatly.org";
-        let ns = "grafana";
-
-        let grafana_crd = client
-            .get_resource_json_value(name, Some(ns), &grafana_gvk)
-            .await;
-        match grafana_crd {
-            Ok(_) => {
-                return Ok(PreparationOutcome::Success {
-                    details: "Found grafana CRDs in cluster".to_string(),
-                });
-            }
-
-            Err(_) => {
-                return self
-                    .install_grafana_operator(inventory, Some("grafana"))
-                    .await;
-            }
-        };
-    }
-    async fn install_grafana(&self) -> Result<PreparationOutcome, PreparationError> {
-        let ns = "grafana";
-
-        let mut label = BTreeMap::new();
-
-        label.insert("dashboards".to_string(), "grafana".to_string());
-
-        let label_selector = LabelSelector {
-            match_labels: label.clone(),
-            match_expressions: vec![],
-        };
-
-        let client = self.k8s_client().await?;
-
-        let grafana = self.build_grafana(ns, &label);
-
-        client.apply(&grafana, Some(ns)).await?;
-        //TODO change this to a ensure ready or something better than just a timeout
-        client
-            .wait_until_deployment_ready(
-                "grafana-grafana-deployment",
-                Some("grafana"),
-                Some(Duration::from_secs(30)),
-            )
-            .await?;
-
-        let sa_name = "grafana-grafana-sa";
-        let token_secret_name = "grafana-sa-token-secret";
-
-        let sa_token_secret = self.build_sa_token_secret(token_secret_name, sa_name, ns);
-
-        client.apply(&sa_token_secret, Some(ns)).await?;
-        let secret_gvk = GroupVersionKind {
-            group: "".to_string(),
-            version: "v1".to_string(),
-            kind: "Secret".to_string(),
-        };
-
-        let secret = client
-            .get_resource_json_value(token_secret_name, Some(ns), &secret_gvk)
-            .await?;
-
-        let token = format!(
-            "Bearer {}",
-            self.extract_and_normalize_token(&secret).unwrap()
-        );
-
-        debug!("creating grafana clusterrole binding");
-
-        let clusterrolebinding =
-            self.build_cluster_rolebinding(sa_name, "cluster-monitoring-view", ns);
-
-        client.apply(&clusterrolebinding, Some(ns)).await?;
-
-        debug!("creating grafana datasource crd");
-
-        let thanos_url = format!(
-            "https://{}",
-            self.get_domain("thanos-querier-openshift-monitoring")
-                .await
-                .unwrap()
-        );
-
-        let thanos_openshift_datasource = self.build_grafana_datasource(
-            "thanos-openshift-monitoring",
-            ns,
-            &label_selector,
-            &thanos_url,
-            &token,
-        );
-
-        client.apply(&thanos_openshift_datasource, Some(ns)).await?;
-
-        debug!("creating grafana dashboard crd");
-        let dashboard = self.build_grafana_dashboard(ns, &label_selector);
-
-        client.apply(&dashboard, Some(ns)).await?;
-        debug!("creating grafana ingress");
-        let grafana_ingress = self.build_grafana_ingress(ns).await;
-
-        grafana_ingress
-            .interpret(&Inventory::empty(), self)
-            .await
-            .map_err(|e| PreparationError::new(e.to_string()))?;
-
-        Ok(PreparationOutcome::Success {
-            details: "Installed grafana composants".to_string(),
-        })
-    }
-}
-
-#[async_trait]
-impl PrometheusMonitoring<CRDPrometheus> for K8sAnywhereTopology {
-    async fn install_prometheus(
-        &self,
-        sender: &CRDPrometheus,
-        _inventory: &Inventory,
-        _receivers: Option<Vec<Box<dyn AlertReceiver<CRDPrometheus>>>>,
-    ) -> Result<PreparationOutcome, PreparationError> {
-        let client = self.k8s_client().await?;
-
-        for monitor in sender.service_monitor.iter() {
-            client
-                .apply(monitor, Some(&sender.namespace))
-                .await
-                .map_err(|e| PreparationError::new(e.to_string()))?;
-        }
-        Ok(PreparationOutcome::Success {
-            details: "successfuly installed prometheus components".to_string(),
-        })
-    }
-
-    async fn ensure_prometheus_operator(
-        &self,
-        sender: &CRDPrometheus,
-        _inventory: &Inventory,
-    ) -> Result<PreparationOutcome, PreparationError> {
-        let po_result = self.ensure_prometheus_operator(sender).await?;
-
-        match po_result {
-            PreparationOutcome::Success { details: _ } => {
-                debug!("Detected prometheus crds operator present in cluster.");
-                return Ok(po_result);
-            }
-            PreparationOutcome::Noop => {
-                debug!("Skipping Prometheus CR installation due to missing operator.");
-                return Ok(po_result);
-            }
-        }
-    }
-}
-
-#[async_trait]
-impl PrometheusMonitoring<RHOBObservability> for K8sAnywhereTopology {
-    async fn install_prometheus(
-        &self,
-        sender: &RHOBObservability,
-        inventory: &Inventory,
-        receivers: Option<Vec<Box<dyn AlertReceiver<RHOBObservability>>>>,
-    ) -> Result<PreparationOutcome, PreparationError> {
-        let po_result = self.ensure_cluster_observability_operator(sender).await?;
-
-        if po_result == PreparationOutcome::Noop {
-            debug!("Skipping Prometheus CR installation due to missing operator.");
-            return Ok(po_result);
-        }
-
-        let result = self
-            .get_cluster_observability_operator_prometheus_application_score(
-                sender.clone(),
-                receivers,
-            )
-            .await
-            .interpret(inventory, self)
-            .await;
-
-        match result {
-            Ok(outcome) => match outcome.status {
-                InterpretStatus::SUCCESS => Ok(PreparationOutcome::Success {
-                    details: outcome.message,
-                }),
-                InterpretStatus::NOOP => Ok(PreparationOutcome::Noop),
-                _ => Err(PreparationError::new(outcome.message)),
-            },
-            Err(err) => Err(PreparationError::new(err.to_string())),
-        }
-    }
-
-    async fn ensure_prometheus_operator(
-        &self,
-        sender: &RHOBObservability,
-        inventory: &Inventory,
-    ) -> Result<PreparationOutcome, PreparationError> {
-        todo!()
-    }
-}
-
 impl Serialize for K8sAnywhereTopology {
    fn serialize<S>(&self, _serializer: S) -> Result<S::Ok, S::Error>
    where
@@ -359,12 +153,157 @@ impl Serialize for K8sAnywhereTopology {
    }
 }

+#[async_trait]
+impl CertificateManagement for K8sAnywhereTopology {
+    async fn install(&self) -> Result<Outcome, ExecutorError> {
+        let cert_management_operator = CertManagerOperatorScore::default();
+
+        cert_management_operator
+            .interpret(&Inventory::empty(), self)
+            .await
+            .map_err(|e| ExecutorError::UnexpectedError(e.to_string()))?;
+
+        Ok(Outcome::success(format!(
+            "Installed cert-manager into ns: {}",
+            cert_management_operator.namespace
+        )))
+    }
+
+    async fn ensure_certificate_management_ready(&self) -> Result<Outcome, ExecutorError> {
+        let k8s_client = self.k8s_client().await.unwrap();
+        let pods = k8s_client
+            .list_all_resources_with_labels::<Pod>(
+                "app.kubernetes.io/component=controller,\
+         app.kubernetes.io/name=cert-manager",
+            )
+            .await
+            .map_err(|e| ExecutorError::UnexpectedError(format!("{}", e)))?;
+
+        if pods.is_empty() {
+            info!("cert-manager not installed (no controller pods found)");
+            self.install().await
+        } else {
+            trace!("cert-manager controller pods found: {:#?}", pods);
+            info!("cert-manager controller pods found");
+            Ok(Outcome::success("Certificate Management Ready".to_string()))
+        }
+    }
+
+    async fn create_issuer(
+        &self,
+        issuer_name: String,
+        config: &CertificateManagementConfig,
+    ) -> Result<Outcome, ExecutorError> {
+        let issuer_score = K8sCertManagerIssuerScore {
+            issuer_name: issuer_name.clone(),
+            config: config.clone(),
+        };
+
+        issuer_score
+            .interpret(&Inventory::empty(), self)
+            .await
+            .map_err(|e| ExecutorError::UnexpectedError(e.to_string()))?;
+
+        Ok(Outcome::success(format!(
+            "issuer of kind {} is ready",
+            issuer_name
+        )))
+    }
+
+    async fn create_certificate(
+        &self,
+        cert_name: String,
+        issuer_name: String,
+        common_name: Option<String>,
+        dns_names: Option<Vec<String>>,
+        is_ca: Option<bool>,
+        config: &CertificateManagementConfig,
+    ) -> Result<Outcome, ExecutorError> {
+        self.certificate_issuer_ready(
+            issuer_name.clone(),
+            self.k8s_client().await.unwrap(),
+            config,
+        )
+        .await?;
+
+        let cert = K8sCertManagerCertificateScore {
+            cert_name: cert_name,
+            issuer_name,
+            common_name,
+            is_ca,
+            dns_names,
+            config: config.clone(),
+        };
+        cert.interpret(&Inventory::empty(), self)
+            .await
+            .map_err(|e| ExecutorError::UnexpectedError(e.to_string()))?;
+
+        Ok(Outcome::success(format!(
+            "Created cert into ns: {:#?}",
+            config.namespace.clone()
+        )))
+    }
+
+    async fn get_ca_certificate(
+        &self,
+        cert_name: String,
+        config: &CertificateManagementConfig,
+    ) -> Result<String, ExecutorError> {
+        let namespace = config.namespace.clone().unwrap();
+
+        let client = self.k8s_client().await.unwrap();
+
+        if let Some(certificate) = client
+            .get_resource::<Certificate>(&cert_name, Some(&namespace))
+            .await
+            .map_err(|e| ExecutorError::UnexpectedError(format!("{}", e)))?
+        {
+            let secret_name = certificate.spec.secret_name.clone();
+
+            debug!("Secret Name {:#?}", secret_name);
+            if let Some(secret) = client
+                .get_resource::<Secret>(&secret_name, Some(&namespace))
+                .await
+                .map_err(|e| {
+                    ExecutorError::UnexpectedError(format!(
+                        "secret {} not found in namespace {}: {}",
+                        secret_name, namespace, e
+                    ))
+                })?
+            {
+                let ca_cert = secret
+                    .data
+                    .as_ref()
+                    .and_then(|d| d.get("ca.crt"))
+                    .ok_or_else(|| {
+                        ExecutorError::UnexpectedError("Secret missing key 'ca.crt'".into())
+                    })?;
+
+                let ca_cert = String::from_utf8(ca_cert.0.clone()).map_err(|_| {
+                    ExecutorError::UnexpectedError("ca.crt is not valid UTF-8".into())
+                })?;
+
+                return Ok(ca_cert);
+            } else {
+                Err(ExecutorError::UnexpectedError(format!(
+                    "Error getting secret associated with cert_name: {}, secret_name: {}",
+                    cert_name, secret_name
+                )))
+            }
+        } else {
+            return Err(ExecutorError::UnexpectedError(format!(
+                "Certificate {} not found in namespace {}",
+                cert_name, namespace
+            )));
+        }
+    }
+}
+
 impl K8sAnywhereTopology {
    pub fn from_env() -> Self {
        Self {
            k8s_state: Arc::new(OnceCell::new()),
            tenant_manager: Arc::new(OnceCell::new()),
-            k8s_distribution: Arc::new(OnceCell::new()),
            config: Arc::new(K8sAnywhereConfig::from_env()),
        }
    }
@@ -373,61 +312,50 @@ impl K8sAnywhereTopology {
        Self {
            k8s_state: Arc::new(OnceCell::new()),
            tenant_manager: Arc::new(OnceCell::new()),
-            k8s_distribution: Arc::new(OnceCell::new()),
            config: Arc::new(config),
        }
    }

-    pub async fn get_k8s_distribution(&self) -> Result<&KubernetesDistribution, PreparationError> {
-        self.k8s_distribution
-            .get_or_try_init(async || {
-                debug!("Trying to detect k8s distribution");
-                let client = self.k8s_client().await.unwrap();
+    pub async fn certificate_issuer_ready(
+        &self,
+        issuer_name: String,
+        k8s_client: Arc<K8sClient>,
+        config: &CertificateManagementConfig,
+    ) -> Result<Outcome, ExecutorError> {
+        let ns = config
+            .namespace
+            .clone()
+            .ok_or_else(|| ExecutorError::UnexpectedError("namespace is required".to_string()))?;

-                let discovery = client.discovery().await.map_err(|e| {
-                    PreparationError::new(format!("Could not discover API groups: {}", e))
-                })?;
-
-                let version = client.get_apiserver_version().await.map_err(|e| {
-                    PreparationError::new(format!("Could not get server version: {}", e))
-                })?;
-
-                // OpenShift / OKD
-                if discovery
-                    .groups()
-                    .any(|g| g.name() == "project.openshift.io")
-                {
-                    info!("Found KubernetesDistribution OpenshiftFamily");
-                    return Ok(KubernetesDistribution::OpenshiftFamily);
-                }
-
-                // K3d / K3s
-                if version.git_version.contains("k3s") {
-                    info!("Found KubernetesDistribution K3sFamily");
-                    return Ok(KubernetesDistribution::K3sFamily);
-                }
-
-                info!("Could not identify KubernetesDistribution, using Default");
-                return Ok(KubernetesDistribution::Default);
-            })
+        match k8s_client
+            .get_resource::<Issuer>(&issuer_name, Some(&ns))
            .await
+        {
+            Ok(Some(_cert_issuer)) => Ok(Outcome::success(format!(
+                "issuer of kind {} is ready",
+                issuer_name
+            ))),
+
+            Ok(None) => Err(ExecutorError::UnexpectedError(format!(
+                "Issuer {} not present in namespace {}",
+                issuer_name, ns
+            ))),
+
+            Err(e) => Err(ExecutorError::UnexpectedError(format!(
+                "Failed to fetch Issuer {}: {}",
+                issuer_name, e
+            ))),
+        }
    }

-    fn extract_and_normalize_token(&self, secret: &DynamicObject) -> Option<String> {
-        let token_b64 = secret
-            .data
-            .get("token")
-            .or_else(|| secret.data.get("data").and_then(|d| d.get("token")))
-            .and_then(|v| v.as_str())?;
-
-        let bytes = general_purpose::STANDARD.decode(token_b64).ok()?;
-
-        let s = String::from_utf8(bytes).ok()?;
-
-        let cleaned = s
-            .trim_matches(|c: char| c.is_whitespace() || c == '\0')
-            .to_string();
-        Some(cleaned)
+    pub async fn get_k8s_distribution(&self) -> Result<KubernetesDistribution, PreparationError> {
+        self.k8s_client()
+            .await?
+            .get_k8s_distribution()
+            .await
+            .map_err(|e| {
+                PreparationError::new(format!("Failed to get k8s distribution from client : {e}"))
+            })
    }

    pub fn build_cluster_rolebinding(
@@ -479,141 +407,6 @@ impl K8sAnywhereTopology {
        }
    }

-    fn build_grafana_datasource(
-        &self,
-        name: &str,
-        ns: &str,
-        label_selector: &LabelSelector,
-        url: &str,
-        token: &str,
-    ) -> GrafanaDatasource {
-        let mut json_data = BTreeMap::new();
-        json_data.insert("timeInterval".to_string(), "5s".to_string());
-
-        GrafanaDatasource {
-            metadata: ObjectMeta {
-                name: Some(name.to_string()),
-                namespace: Some(ns.to_string()),
-                ..Default::default()
-            },
-            spec: GrafanaDatasourceSpec {
-                instance_selector: label_selector.clone(),
-                allow_cross_namespace_import: Some(true),
-                values_from: None,
-                datasource: GrafanaDatasourceConfig {
-                    access: "proxy".to_string(),
-                    name: name.to_string(),
-                    r#type: "prometheus".to_string(),
-                    url: url.to_string(),
-                    database: None,
-                    json_data: Some(GrafanaDatasourceJsonData {
-                        time_interval: Some("60s".to_string()),
-                        http_header_name1: Some("Authorization".to_string()),
-                        tls_skip_verify: Some(true),
-                        oauth_pass_thru: Some(true),
-                    }),
-                    secure_json_data: Some(GrafanaDatasourceSecureJsonData {
-                        http_header_value1: Some(format!("Bearer {token}")),
-                    }),
-                    is_default: Some(false),
-                    editable: Some(true),
-                },
-            },
-        }
-    }
-
-    fn build_grafana_dashboard(
-        &self,
-        ns: &str,
-        label_selector: &LabelSelector,
-    ) -> GrafanaDashboard {
-        let graf_dashboard = GrafanaDashboard {
-            metadata: ObjectMeta {
-                name: Some(format!("grafana-dashboard-{}", ns)),
-                namespace: Some(ns.to_string()),
-                ..Default::default()
-            },
-            spec: GrafanaDashboardSpec {
-                resync_period: Some("30s".to_string()),
-                instance_selector: label_selector.clone(),
-                datasources: Some(vec![GrafanaDashboardDatasource {
-                    input_name: "DS_PROMETHEUS".to_string(),
-                    datasource_name: "thanos-openshift-monitoring".to_string(),
-                }]),
-                json: None,
-                grafana_com: Some(GrafanaCom {
-                    id: 17406,
-                    revision: None,
-                }),
-            },
-        };
-        graf_dashboard
-    }
-
-    fn build_grafana(&self, ns: &str, labels: &BTreeMap<String, String>) -> GrafanaCRD {
-        let grafana = GrafanaCRD {
-            metadata: ObjectMeta {
-                name: Some(format!("grafana-{}", ns)),
-                namespace: Some(ns.to_string()),
-                labels: Some(labels.clone()),
-                ..Default::default()
-            },
-            spec: GrafanaSpec {
-                config: None,
-                admin_user: None,
-                admin_password: None,
-                ingress: None,
-                persistence: None,
-                resources: None,
-            },
-        };
-        grafana
-    }
-
-    async fn build_grafana_ingress(&self, ns: &str) -> K8sIngressScore {
-        let domain = self.get_domain(&format!("grafana-{}", ns)).await.unwrap();
-        let name = format!("{}-grafana", ns);
-        let backend_service = format!("grafana-{}-service", ns);
-
-        K8sIngressScore {
-            name: fqdn::fqdn!(&name),
-            host: fqdn::fqdn!(&domain),
-            backend_service: fqdn::fqdn!(&backend_service),
-            port: 3000,
-            path: Some("/".to_string()),
-            path_type: Some(PathType::Prefix),
-            namespace: Some(fqdn::fqdn!(&ns)),
-            ingress_class_name: Some("openshift-default".to_string()),
-        }
-    }
-
-    async fn get_cluster_observability_operator_prometheus_application_score(
-        &self,
-        sender: RHOBObservability,
-        receivers: Option<Vec<Box<dyn AlertReceiver<RHOBObservability>>>>,
-    ) -> RHOBAlertingScore {
-        RHOBAlertingScore {
-            sender,
-            receivers: receivers.unwrap_or_default(),
-            service_monitors: vec![],
-            prometheus_rules: vec![],
-        }
-    }
-
-    async fn get_k8s_prometheus_application_score(
-        &self,
-        sender: CRDPrometheus,
-        receivers: Option<Vec<Box<dyn AlertReceiver<CRDPrometheus>>>>,
-        service_monitors: Option<Vec<ServiceMonitor>>,
-    ) -> K8sPrometheusCRDAlertingScore {
-        return K8sPrometheusCRDAlertingScore {
-            sender,
-            receivers: receivers.unwrap_or_default(),
-            service_monitors: service_monitors.unwrap_or_default(),
-            prometheus_rules: vec![],
-        };
-    }
-
    async fn openshift_ingress_operator_available(&self) -> Result<(), PreparationError> {
        let client = self.k8s_client().await?;
        let gvk = GroupVersionKind {
@@ -779,137 +572,6 @@ impl K8sAnywhereTopology {
            )),
        }
    }
-
-    async fn ensure_cluster_observability_operator(
-        &self,
-        sender: &RHOBObservability,
-    ) -> Result<PreparationOutcome, PreparationError> {
-        let status = Command::new("sh")
-            .args(["-c", "kubectl get crd -A | grep -i rhobs"])
-            .status()
-            .map_err(|e| PreparationError::new(format!("could not connect to cluster: {}", e)))?;
-
-        if !status.success() {
-            if let Some(Some(k8s_state)) = self.k8s_state.get() {
-                match k8s_state.source {
-                    K8sSource::LocalK3d => {
-                        warn!(
-                            "Installing observability operator is not supported on LocalK3d source"
-                        );
-                        return Ok(PreparationOutcome::Noop);
-                        debug!("installing cluster observability operator");
-                        todo!();
-                        let op_score =
-                            prometheus_operator_helm_chart_score(sender.namespace.clone());
-                        let result = op_score.interpret(&Inventory::empty(), self).await;
-
-                        return match result {
-                            Ok(outcome) => match outcome.status {
-                                InterpretStatus::SUCCESS => Ok(PreparationOutcome::Success {
-                                    details: "installed cluster observability operator".into(),
-                                }),
-                                InterpretStatus::NOOP => Ok(PreparationOutcome::Noop),
-                                _ => Err(PreparationError::new(
-                                    "failed to install cluster observability operator (unknown error)".into(),
-                                )),
-                            },
-                            Err(err) => Err(PreparationError::new(err.to_string())),
-                        };
-                    }
-                    K8sSource::Kubeconfig => {
-                        debug!(
-                            "unable to install cluster observability operator, contact cluster admin"
-                        );
-                        return Ok(PreparationOutcome::Noop);
-                    }
-                }
-            } else {
-                warn!(
-                    "Unable to detect k8s_state. Skipping Cluster Observability Operator install."
-                );
-                return Ok(PreparationOutcome::Noop);
-            }
-        }
-
-        debug!("Cluster Observability Operator is already present, skipping install");
-
-        Ok(PreparationOutcome::Success {
-            details: "cluster observability operator present in cluster".into(),
-        })
-    }
-
-    async fn ensure_prometheus_operator(
-        &self,
-        sender: &CRDPrometheus,
-    ) -> Result<PreparationOutcome, PreparationError> {
-        let status = Command::new("sh")
-            .args(["-c", "kubectl get crd -A | grep -i prometheuses"])
-            .status()
-            .map_err(|e| PreparationError::new(format!("could not connect to cluster: {}", e)))?;
-
-        if !status.success() {
-            if let Some(Some(k8s_state)) = self.k8s_state.get() {
-                match k8s_state.source {
-                    K8sSource::LocalK3d => {
-                        debug!("installing prometheus operator");
-                        let op_score =
-                            prometheus_operator_helm_chart_score(sender.namespace.clone());
-                        let result = op_score.interpret(&Inventory::empty(), self).await;
-
-                        return match result {
-                            Ok(outcome) => match outcome.status {
-                                InterpretStatus::SUCCESS => Ok(PreparationOutcome::Success {
-                                    details: "installed prometheus operator".into(),
-                                }),
-                                InterpretStatus::NOOP => Ok(PreparationOutcome::Noop),
-                                _ => Err(PreparationError::new(
-                                    "failed to install prometheus operator (unknown error)".into(),
-                                )),
-                            },
-                            Err(err) => Err(PreparationError::new(err.to_string())),
-                        };
-                    }
-                    K8sSource::Kubeconfig => {
-                        debug!("unable to install prometheus operator, contact cluster admin");
-                        return Ok(PreparationOutcome::Noop);
-                    }
-                }
-            } else {
-                warn!("Unable to detect k8s_state. Skipping Prometheus Operator install.");
-                return Ok(PreparationOutcome::Noop);
-            }
-        }
-
-        debug!("Prometheus operator is already present, skipping install");
-
-        Ok(PreparationOutcome::Success {
-            details: "prometheus operator present in cluster".into(),
-        })
-    }
-
-    async fn install_grafana_operator(
-        &self,
-        inventory: &Inventory,
-        ns: Option<&str>,
-    ) -> Result<PreparationOutcome, PreparationError> {
-        let namespace = ns.unwrap_or("grafana");
-        info!("installing grafana operator in ns {namespace}");
-        let tenant = self.get_k8s_tenant_manager()?.get_tenant_config().await;
-        let mut namespace_scope = false;
-        if tenant.is_some() {
-            namespace_scope = true;
-        }
-        let _grafana_operator_score = grafana_helm_chart_score(namespace, namespace_scope)
-            .interpret(inventory, self)
-            .await
-            .map_err(|e| PreparationError::new(e.to_string()));
-        Ok(PreparationOutcome::Success {
-            details: format!(
-                "Successfully installed grafana operator in ns {}",
-                ns.unwrap()
-            ),
-        })
-    }
 }

 #[derive(Clone, Debug)]
@@ -1064,6 +726,12 @@ impl Topology for K8sAnywhereTopology {
            .await
            .map_err(PreparationError::new)?;

+        let cert_mgmt = CertificateManagementScore {};
+        cert_mgmt
+            .interpret(&Inventory::empty(), self)
+            .await
+            .map_err(|e| PreparationError::new(format!("{}", e)))?;
+
        match self.is_helm_available() {
            Ok(()) => Ok(PreparationOutcome::Success {
                details: format!("{} + helm available", k8s_state.message.clone()),
@@ -1087,7 +755,21 @@ impl MultiTargetTopology for K8sAnywhereTopology {
    }
 }

-impl HelmCommand for K8sAnywhereTopology {}
+impl HelmCommand for K8sAnywhereTopology {
+    fn get_helm_command(&self) -> Command {
+        let mut cmd = Command::new("helm");
+        if let Some(k) = &self.config.kubeconfig {
+            cmd.args(["--kubeconfig", k]);
+        }
+
+        if let Some(c) = &self.config.k8s_context {
+            cmd.args(["--kube-context", c]);
+        }
+
+        info!("Using helm command {cmd:?}");
+        cmd
+    }
+}

 #[async_trait]
 impl TenantManager for K8sAnywhereTopology {
@@ -1108,7 +790,7 @@ impl TenantManager for K8sAnywhereTopology {
 #[async_trait]
 impl Ingress for K8sAnywhereTopology {
    async fn get_domain(&self, service: &str) -> Result<String, PreparationError> {
-        use log::{debug, trace, warn};
+        use log::{trace, warn};

        let client = self.k8s_client().await?;

--- a/Show More
+++ b/Show More
				`@@ -0,0 +1 @@`
				`pub const PRIVILEGED_POD_IMAGE: &str = "hub.nationtech.io/redhat/ubi10:latest";`