Compare commits
108 Commits
fix/monito
...
feat/broca
| Author | SHA1 | Date | |
|---|---|---|---|
| a646f1f4d0 | |||
| 2728fc8989 | |||
| 8c8baaf9cc | |||
| a1c9bfeabd | |||
| d8dab12834 | |||
| 7422534018 | |||
| b67275662d | |||
| 6237e1d877 | |||
| 88e6990051 | |||
| 8e9f8ce405 | |||
| d87aa3c7e9 | |||
| 90ec2b524a | |||
| 5572f98d5f | |||
| 8024e0d5c3 | |||
| 238e7da175 | |||
| bf84bffd57 | |||
| d4613e42d3 | |||
| 6a57361356 | |||
| d0d4f15122 | |||
| 93b83b8161 | |||
| 6ca8663422 | |||
| f6ce0c6d4f | |||
| 8a1eca21f7 | |||
| 9d2308eca6 | |||
| ccc26e07eb | |||
| 9a67bcc96f | |||
| a377fc1404 | |||
| c9977fee12 | |||
| 64bf585e07 | |||
| 44e2c45435 | |||
| cdccbc8939 | |||
| 9830971d05 | |||
| e1183ef6de | |||
| 444fea81b8 | |||
| 907ae04195 | |||
| 64582caa64 | |||
| f5736fcc37 | |||
| 7a1e84fb68 | |||
| 8499f4d1b7 | |||
| 231d9b878e | |||
| ee2dade0be | |||
| aa07f4c8ad | |||
| 77bb138497 | |||
| a16879b1b6 | |||
| f57e6f5957 | |||
| 7605d05de3 | |||
| b244127843 | |||
| 67c3265286 | |||
| d10598d01e | |||
| 61ba7257d0 | |||
| 8798110bf3 | |||
| 1508d431c0 | |||
| caf6f0c67b | |||
| b0e9594d92 | |||
| 2a7fa466cc | |||
| f463cd1e94 | |||
| e1da7949ec | |||
| d0a1a73710 | |||
| bc2b328296 | |||
| a93896707f | |||
| 0e9b23a320 | |||
| f532ba2b40 | |||
| fafca31798 | |||
| 5412c34957 | |||
| 787cc8feab | |||
| ce041f495b | |||
| bfb86f63ce | |||
| 55de206523 | |||
| 64893a84f5 | |||
| f941672662 | |||
| a98113dd40 | |||
| 5db1a31d33 | |||
| f5aac67af8 | |||
| d7e5bf11d5 | |||
| 2e1f1b8447 | |||
| 2b157ad7fd | |||
| a0c0905c3b | |||
| d920de34cf | |||
| 4276b9137b | |||
| 6ab88ab8d9 | |||
| fe52f69473 | |||
| d8338ad12c | |||
| ac9fedf853 | |||
| fd3705e382 | |||
| 4840c7fdc2 | |||
| 20172a7801 | |||
| 6bb33c5845 | |||
| d9357adad3 | |||
| a25ca86bdf | |||
| 646c5e723e | |||
| 69c382e8c6 | |||
| dca764395d | |||
| 53d0704a35 | |||
| 2738985edb | |||
| d9a21bf94b | |||
| 8f8bd34168 | |||
| b5e971b3b6 | |||
| a1c0e0e246 | |||
| d084cee8d5 | |||
| 63ef1c0ea7 | |||
| de49e9ebcc | |||
| d8ab9d52a4 | |||
| 2cb7aeefc0 | |||
| 16016febcf | |||
| e709de531d | |||
| 6ab0f3a6ab | |||
| 724ab0b888 | |||
| 8b6ce8d069 |
@@ -15,4 +15,4 @@ jobs:
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Run check script
|
||||
run: bash check.sh
|
||||
run: bash build/check.sh
|
||||
|
||||
3
.gitignore
vendored
3
.gitignore
vendored
@@ -29,3 +29,6 @@ Cargo.lock
|
||||
|
||||
# Useful to create ignore folders for temp files and notes
|
||||
ignore
|
||||
|
||||
# Generated book
|
||||
book
|
||||
|
||||
12
.gitmodules
vendored
12
.gitmodules
vendored
@@ -1,3 +1,15 @@
|
||||
[submodule "examples/try_rust_webapp/tryrust.org"]
|
||||
path = examples/try_rust_webapp/tryrust.org
|
||||
url = https://github.com/rust-dd/tryrust.org.git
|
||||
[submodule "/home/jeangab/work/nationtech/harmony2/opnsense-codegen/vendor/core"]
|
||||
path = /home/jeangab/work/nationtech/harmony2/opnsense-codegen/vendor/core
|
||||
url = https://github.com/opnsense/core.git
|
||||
[submodule "/home/jeangab/work/nationtech/harmony2/opnsense-codegen/vendor/plugins"]
|
||||
path = /home/jeangab/work/nationtech/harmony2/opnsense-codegen/vendor/plugins
|
||||
url = https://github.com/opnsense/plugins.git
|
||||
[submodule "opnsense-codegen/vendor/core"]
|
||||
path = opnsense-codegen/vendor/core
|
||||
url = https://github.com/opnsense/core.git
|
||||
[submodule "opnsense-codegen/vendor/plugins"]
|
||||
path = opnsense-codegen/vendor/plugins
|
||||
url = https://github.com/opnsense/plugins.git
|
||||
|
||||
3256
Cargo.lock
generated
3256
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
17
Cargo.toml
17
Cargo.toml
@@ -1,8 +1,8 @@
|
||||
[workspace]
|
||||
resolver = "2"
|
||||
members = [
|
||||
"private_repos/*",
|
||||
"examples/*",
|
||||
"private_repos/*",
|
||||
"harmony",
|
||||
"harmony_types",
|
||||
"harmony_macros",
|
||||
@@ -16,10 +16,16 @@ members = [
|
||||
"harmony_inventory_agent",
|
||||
"harmony_secret_derive",
|
||||
"harmony_secret",
|
||||
"adr/agent_discovery/mdns",
|
||||
"examples/kvm_okd_ha_cluster",
|
||||
"examples/example_linux_vm",
|
||||
"harmony_config_derive",
|
||||
"harmony_config",
|
||||
"brocade",
|
||||
"harmony_agent",
|
||||
"harmony_agent/deploy",
|
||||
"harmony_node_readiness",
|
||||
"harmony-k8s",
|
||||
"harmony_assets", "opnsense-codegen", "opnsense-api",
|
||||
]
|
||||
|
||||
[workspace.package]
|
||||
@@ -34,10 +40,13 @@ derive-new = "0.7"
|
||||
async-trait = "0.1"
|
||||
tokio = { version = "1.40", features = [
|
||||
"io-std",
|
||||
"io-util",
|
||||
"fs",
|
||||
"macros",
|
||||
"rt-multi-thread",
|
||||
] }
|
||||
tokio-retry = "0.3.0"
|
||||
tokio-util = "0.7.15"
|
||||
cidr = { features = ["serde"], version = "0.2" }
|
||||
russh = "0.45"
|
||||
russh-keys = "0.45"
|
||||
@@ -68,6 +77,7 @@ base64 = "0.22.1"
|
||||
tar = "0.4.44"
|
||||
lazy_static = "1.5.0"
|
||||
directories = "6.0.0"
|
||||
futures-util = "0.3"
|
||||
thiserror = "2.0.14"
|
||||
serde = { version = "1.0.209", features = ["derive", "rc"] }
|
||||
serde_json = "1.0.127"
|
||||
@@ -81,3 +91,6 @@ reqwest = { version = "0.12", features = [
|
||||
"json",
|
||||
], default-features = false }
|
||||
assertor = "0.0.4"
|
||||
tokio-test = "0.4"
|
||||
anyhow = "1.0"
|
||||
clap = { version = "4", features = ["derive"] }
|
||||
|
||||
272
README.md
272
README.md
@@ -1,101 +1,121 @@
|
||||
# Harmony
|
||||
|
||||
Open-source infrastructure orchestration that treats your platform like first-class code.
|
||||
**Infrastructure orchestration that treats your platform like first-class code.**
|
||||
|
||||
In other words, Harmony is a **next-generation platform engineering framework**.
|
||||
Harmony is an open-source framework that brings the rigor of software engineering to infrastructure management. Write Rust code to define what you want, and Harmony handles the rest — from local development to production clusters.
|
||||
|
||||
_By [NationTech](https://nationtech.io)_
|
||||
|
||||
[](https://git.nationtech.io/nationtech/harmony)
|
||||
[](https://git.nationtech.io/NationTech/harmony)
|
||||
[](LICENSE)
|
||||
|
||||
### Unify
|
||||
---
|
||||
|
||||
- **Project Scaffolding**
|
||||
- **Infrastructure Provisioning**
|
||||
- **Application Deployment**
|
||||
- **Day-2 operations**
|
||||
## The Problem Harmony Solves
|
||||
|
||||
All in **one strongly-typed Rust codebase**.
|
||||
Modern infrastructure is messy. Your Kubernetes cluster needs monitoring. Your bare-metal servers need provisioning. Your applications need deployments. Each comes with its own tooling, its own configuration format, and its own failure modes.
|
||||
|
||||
### Deploy anywhere
|
||||
**What if you could describe your entire platform in one consistent language?**
|
||||
|
||||
From a **developer laptop** to a **global production cluster**, a single **source of truth** drives the **full software lifecycle.**
|
||||
That's Harmony. It unifies project scaffolding, infrastructure provisioning, application deployment, and day-2 operations into a single strongly-typed Rust codebase.
|
||||
|
||||
## The Harmony Philosophy
|
||||
---
|
||||
|
||||
Infrastructure is essential, but it shouldn’t be your core business. Harmony is built on three guiding principles that make modern platforms reliable, repeatable, and easy to reason about.
|
||||
## Three Principles That Make the Difference
|
||||
|
||||
| Principle | What it means for you |
|
||||
| -------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| **Infrastructure as Resilient Code** | Replace sprawling YAML and bash scripts with type-safe Rust. Test, refactor, and version your platform just like application code. |
|
||||
| **Prove It Works — Before You Deploy** | Harmony uses the compiler to verify that your application’s needs match the target environment’s capabilities at **compile-time**, eliminating an entire class of runtime outages. |
|
||||
| **One Unified Model** | Software and infrastructure are a single system. Harmony models them together, enabling deep automation—from bare-metal servers to Kubernetes workloads—with zero context switching. |
|
||||
| Principle | What It Means |
|
||||
|-----------|---------------|
|
||||
| **Infrastructure as Resilient Code** | Stop fighting with YAML and bash. Write type-safe Rust that you can test, version, and refactor like any other code. |
|
||||
| **Prove It Works Before You Deploy** | Harmony verifies at _compile time_ that your application can actually run on your target infrastructure. No more "the config looks right but it doesn't work" surprises. |
|
||||
| **One Unified Model** | Software and infrastructure are one system. Deploy from laptop to production cluster without switching contexts or tools. |
|
||||
|
||||
These principles surface as simple, ergonomic Rust APIs that let teams focus on their product while trusting the platform underneath.
|
||||
---
|
||||
|
||||
## Where to Start
|
||||
## How It Works: The Core Concepts
|
||||
|
||||
We have a comprehensive set of documentation right here in the repository.
|
||||
Harmony is built around three concepts that work together:
|
||||
|
||||
| I want to... | Start Here |
|
||||
| ----------------- | ------------------------------------------------------------------ |
|
||||
| Get Started | [Getting Started Guide](./docs/guides/getting-started.md) |
|
||||
| See an Example | [Use Case: Deploy a Rust Web App](./docs/use-cases/rust-webapp.md) |
|
||||
| Explore | [Documentation Hub](./docs/README.md) |
|
||||
| See Core Concepts | [Core Concepts Explained](./docs/concepts.md) |
|
||||
### Score — "What You Want"
|
||||
|
||||
## Quick Look: Deploy a Rust Webapp
|
||||
A `Score` is a declarative description of desired state. Think of it as a "recipe" that says _what_ you want without specifying _how_ to get there.
|
||||
|
||||
The snippet below spins up a complete **production-grade Rust + Leptos Webapp** with monitoring. Swap it for your own scores to deploy anything from microservices to machine-learning pipelines.
|
||||
```rust
|
||||
// "I want a PostgreSQL cluster running with default settings"
|
||||
let postgres = PostgreSQLScore {
|
||||
config: PostgreSQLConfig {
|
||||
cluster_name: "harmony-postgres-example".to_string(),
|
||||
namespace: "harmony-postgres-example".to_string(),
|
||||
..Default::default()
|
||||
},
|
||||
};
|
||||
```
|
||||
|
||||
### Topology — "Where It Goes"
|
||||
|
||||
A `Topology` represents your infrastructure environment and its capabilities. It answers the question: "What can this environment actually do?"
|
||||
|
||||
```rust
|
||||
// Deploy to a local K3D cluster, or any Kubernetes cluster via environment variables
|
||||
K8sAnywhereTopology::from_env()
|
||||
```
|
||||
|
||||
### Interpret — "How It Happens"
|
||||
|
||||
An `Interpret` is the execution logic that connects your `Score` to your `Topology`. It translates "what you want" into "what the infrastructure does."
|
||||
|
||||
**The Compile-Time Check:** Before your code ever runs, Harmony verifies that your `Score` is compatible with your `Topology`. If your application needs a feature your infrastructure doesn't provide, you get a compile error — not a runtime failure.
|
||||
|
||||
---
|
||||
|
||||
## What You Can Deploy
|
||||
|
||||
Harmony ships with ready-made Scores for:
|
||||
|
||||
**Data Services**
|
||||
- PostgreSQL clusters (via CloudNativePG operator)
|
||||
- Multi-site PostgreSQL with failover
|
||||
|
||||
**Kubernetes**
|
||||
- Namespaces, Deployments, Ingress
|
||||
- Helm charts
|
||||
- cert-manager for TLS
|
||||
- Monitoring (Prometheus, alerting, ntfy)
|
||||
|
||||
**Bare Metal / Infrastructure**
|
||||
- OKD clusters from scratch
|
||||
- OPNsense firewalls
|
||||
- Network services (DNS, DHCP, TFTP)
|
||||
- Brocade switch configuration
|
||||
|
||||
**And more:** Application deployment, tenant management, and load balancing.
|
||||
|
||||
---
|
||||
|
||||
## Quick Start: Deploy a PostgreSQL Cluster
|
||||
|
||||
This example provisions a local Kubernetes cluster (K3D) and deploys a PostgreSQL cluster on it — no external infrastructure required.
|
||||
|
||||
```rust
|
||||
use harmony::{
|
||||
inventory::Inventory,
|
||||
modules::{
|
||||
application::{
|
||||
ApplicationScore, RustWebFramework, RustWebapp,
|
||||
features::{PackagingDeployment, rhob_monitoring::Monitoring},
|
||||
},
|
||||
monitoring::alert_channel::discord_alert_channel::DiscordWebhook,
|
||||
},
|
||||
modules::postgresql::{PostgreSQLScore, capability::PostgreSQLConfig},
|
||||
topology::K8sAnywhereTopology,
|
||||
};
|
||||
use harmony_macros::hurl;
|
||||
use std::{path::PathBuf, sync::Arc};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() {
|
||||
let application = Arc::new(RustWebapp {
|
||||
name: "harmony-example-leptos".to_string(),
|
||||
project_root: PathBuf::from(".."), // <== Your project root, usually .. if you use the standard `/harmony` folder
|
||||
framework: Some(RustWebFramework::Leptos),
|
||||
service_port: 8080,
|
||||
});
|
||||
|
||||
// Define your Application deployment and the features you want
|
||||
let app = ApplicationScore {
|
||||
features: vec![
|
||||
Box::new(PackagingDeployment {
|
||||
application: application.clone(),
|
||||
}),
|
||||
Box::new(Monitoring {
|
||||
application: application.clone(),
|
||||
alert_receiver: vec![
|
||||
Box::new(DiscordWebhook {
|
||||
name: "test-discord".to_string(),
|
||||
url: hurl!("https://discord.doesnt.exist.com"), // <== Get your discord webhook url
|
||||
}),
|
||||
],
|
||||
}),
|
||||
],
|
||||
application,
|
||||
let postgres = PostgreSQLScore {
|
||||
config: PostgreSQLConfig {
|
||||
cluster_name: "harmony-postgres-example".to_string(),
|
||||
namespace: "harmony-postgres-example".to_string(),
|
||||
..Default::default()
|
||||
},
|
||||
};
|
||||
|
||||
harmony_cli::run(
|
||||
Inventory::autoload(),
|
||||
K8sAnywhereTopology::from_env(), // <== Deploy to an automatically provisioned local k3d cluster by default, or connect to any Kubernetes cluster
|
||||
vec![Box::new(app)],
|
||||
K8sAnywhereTopology::from_env(),
|
||||
vec![Box::new(postgres)],
|
||||
None,
|
||||
)
|
||||
.await
|
||||
@@ -103,40 +123,128 @@ async fn main() {
|
||||
}
|
||||
```
|
||||
|
||||
To run this:
|
||||
### What this actually does
|
||||
|
||||
- Clone the repository: `git clone https://git.nationtech.io/nationtech/harmony`
|
||||
- Install dependencies: `cargo build --release`
|
||||
- Run the example: `cargo run --example try_rust_webapp`
|
||||
When you compile and run this program:
|
||||
|
||||
1. **Compiles** the Harmony Score into an executable
|
||||
2. **Connects** to `K8sAnywhereTopology` — which auto-provisions a local K3D cluster if none exists
|
||||
3. **Installs** the CloudNativePG operator into the cluster (one-time setup)
|
||||
4. **Creates** a PostgreSQL cluster with 1 instance and 1 GiB of storage
|
||||
5. **Exposes** the PostgreSQL instance as a Kubernetes Service
|
||||
|
||||
### Prerequisites
|
||||
|
||||
- [Rust](https://rust-lang.org/tools/install) (edition 2024)
|
||||
- [Docker](https://docs.docker.com/get-docker/) (for the local K3D cluster)
|
||||
- [kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl/) (optional, for inspecting the cluster)
|
||||
|
||||
### Run it
|
||||
|
||||
```bash
|
||||
# Clone the repository
|
||||
git clone https://git.nationtech.io/nationtech/harmony
|
||||
cd harmony
|
||||
|
||||
# Build the project
|
||||
cargo build --release
|
||||
|
||||
# Run the example
|
||||
cargo run -p example-postgresql
|
||||
```
|
||||
|
||||
Harmony will print its progress as it sets up the cluster and deploys PostgreSQL. When complete, you can inspect the deployment:
|
||||
|
||||
```bash
|
||||
kubectl get pods -n harmony-postgres-example
|
||||
kubectl get secret -n harmony-postgres-example harmony-postgres-example-db-user -o jsonpath='{.data.password}' | base64 -d
|
||||
```
|
||||
|
||||
To connect to the database, forward the port:
|
||||
```bash
|
||||
kubectl port-forward -n harmony-postgres-example svc/harmony-postgres-example-rw 5432:5432
|
||||
psql -h localhost -p 5432 -U postgres
|
||||
```
|
||||
|
||||
To clean up, delete the K3D cluster:
|
||||
```bash
|
||||
k3d cluster delete harmony-postgres-example
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Environment Variables
|
||||
|
||||
`K8sAnywhereTopology::from_env()` reads the following environment variables to determine where and how to connect:
|
||||
|
||||
| Variable | Default | Description |
|
||||
|----------|---------|-------------|
|
||||
| `KUBECONFIG` | `~/.kube/config` | Path to your kubeconfig file |
|
||||
| `HARMONY_AUTOINSTALL` | `true` | Auto-provision a local K3D cluster if none found |
|
||||
| `HARMONY_USE_LOCAL_K3D` | `true` | Always prefer local K3D over remote clusters |
|
||||
| `HARMONY_PROFILE` | `dev` | Deployment profile: `dev`, `staging`, or `prod` |
|
||||
| `HARMONY_K8S_CONTEXT` | _none_ | Use a specific kubeconfig context |
|
||||
| `HARMONY_PUBLIC_DOMAIN` | _none_ | Public domain for ingress endpoints |
|
||||
|
||||
To connect to an existing Kubernetes cluster instead of provisioning K3D:
|
||||
|
||||
```bash
|
||||
# Point to your kubeconfig
|
||||
export KUBECONFIG=/path/to/your/kubeconfig
|
||||
export HARMONY_USE_LOCAL_K3D=false
|
||||
export HARMONY_AUTOINSTALL=false
|
||||
|
||||
# Then run
|
||||
cargo run -p example-postgresql
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Documentation
|
||||
|
||||
All documentation is in the `/docs` directory.
|
||||
| I want to... | Start here |
|
||||
|--------------|------------|
|
||||
| Understand the core concepts | [Core Concepts](./docs/concepts.md) |
|
||||
| Deploy my first application | [Getting Started Guide](./docs/guides/getting-started.md) |
|
||||
| Explore available components | [Scores Catalog](./docs/catalogs/scores.md) · [Topologies Catalog](./docs/catalogs/topologies.md) |
|
||||
| See a complete bare-metal deployment | [OKD on Bare Metal](./docs/use-cases/okd-on-bare-metal.md) |
|
||||
| Build my own Score or Topology | [Developer Guide](./docs/guides/developer-guide.md) |
|
||||
|
||||
- [Documentation Hub](./docs/README.md): The main entry point for all documentation.
|
||||
- [Core Concepts](./docs/concepts.md): A detailed look at Score, Topology, Capability, Inventory, and Interpret.
|
||||
- [Component Catalogs](./docs/catalogs/README.md): Discover all available Scores, Topologies, and Capabilities.
|
||||
- [Developer Guide](./docs/guides/developer-guide.md): Learn how to write your own Scores and Topologies.
|
||||
---
|
||||
|
||||
## Architectural Decision Records
|
||||
## Why Rust?
|
||||
|
||||
- [ADR-001 · Why Rust](adr/001-rust.md)
|
||||
- [ADR-003 · Infrastructure Abstractions](adr/003-infrastructure-abstractions.md)
|
||||
- [ADR-006 · Secret Management](adr/006-secret-management.md)
|
||||
- [ADR-011 · Multi-Tenant Cluster](adr/011-multi-tenant-cluster.md)
|
||||
We chose Rust for the same reason you might: **reliability through type safety**.
|
||||
|
||||
## Contribute
|
||||
Infrastructure code runs in production. It needs to be correct. Rust's ownership model and type system let us build a framework where:
|
||||
|
||||
Discussions and roadmap live in [Issues](https://git.nationtech.io/nationtech/harmony/-/issues). PRs, ideas, and feedback are welcome!
|
||||
- Invalid configurations fail at compile time, not at 3 AM
|
||||
- Refactoring infrastructure is as safe as refactoring application code
|
||||
- The compiler verifies that your platform can actually fulfill your requirements
|
||||
|
||||
See [ADR-001 · Why Rust](./adr/001-rust.md) for our full rationale.
|
||||
|
||||
---
|
||||
|
||||
## Architecture Decisions
|
||||
|
||||
Harmony's design is documented through Architecture Decision Records (ADRs):
|
||||
|
||||
- [ADR-001 · Why Rust](./adr/001-rust.md)
|
||||
- [ADR-003 · Infrastructure Abstractions](./adr/003-infrastructure-abstractions.md)
|
||||
- [ADR-006 · Secret Management](./adr/006-secret-management.md)
|
||||
- [ADR-011 · Multi-Tenant Cluster](./adr/011-multi-tenant-cluster.md)
|
||||
|
||||
---
|
||||
|
||||
## License
|
||||
|
||||
Harmony is released under the **GNU AGPL v3**.
|
||||
|
||||
> We choose a strong copyleft license to ensure the project—and every improvement to it—remains open and benefits the entire community. Fork it, enhance it, even out-innovate us; just keep it open.
|
||||
> We choose a strong copyleft license to ensure the project—and every improvement to it—remains open and benefits the entire community.
|
||||
|
||||
See [LICENSE](LICENSE) for the full text.
|
||||
|
||||
---
|
||||
|
||||
_Made with ❤️ & 🦀 by the NationTech and the Harmony community_
|
||||
_Made with ❤️ & 🦀 by NationTech and the Harmony community_
|
||||
|
||||
29
ROADMAP.md
Normal file
29
ROADMAP.md
Normal file
@@ -0,0 +1,29 @@
|
||||
# Harmony Roadmap
|
||||
|
||||
Six phases to take Harmony from working prototype to production-ready open-source project.
|
||||
|
||||
| # | Phase | Status | Depends On | Detail |
|
||||
|---|-------|--------|------------|--------|
|
||||
| 1 | [Harden `harmony_config`](ROADMAP/01-config-crate.md) | Not started | — | Test every source, add SQLite backend, wire Zitadel + OpenBao, validate zero-setup UX |
|
||||
| 2 | [Migrate to `harmony_config`](ROADMAP/02-refactor-harmony-config.md) | Not started | 1 | Replace all 19 `SecretManager` call sites, deprecate direct `harmony_secret` usage |
|
||||
| 3 | [Complete `harmony_assets`](ROADMAP/03-assets-crate.md) | Not started | 1, 2 | Test, refactor k3d and OKD to use it, implement `Url::Url`, remove LFS |
|
||||
| 4 | [Publish to GitHub](ROADMAP/04-publish-github.md) | Not started | 3 | Clean history, set up GitHub as community hub, CI on self-hosted runners |
|
||||
| 5 | [E2E tests: PostgreSQL & RustFS](ROADMAP/05-e2e-tests-simple.md) | Not started | 1 | k3d-based test harness, two passing E2E tests, CI job |
|
||||
| 6 | [E2E tests: OKD HA on KVM](ROADMAP/06-e2e-tests-kvm.md) | Not started | 5 | KVM test infrastructure, full OKD installation test, nightly CI |
|
||||
|
||||
## Current State (as of branch `feature/kvm-module`)
|
||||
|
||||
- `harmony_config` crate exists with `EnvSource`, `LocalFileSource`, `PromptSource`, `StoreSource`. 12 unit tests. **Zero consumers** in workspace — everything still uses `harmony_secret::SecretManager` directly (19 call sites).
|
||||
- `harmony_assets` crate exists with `Asset`, `LocalCache`, `LocalStore`, `S3Store`. **No tests. Zero consumers.** The `k3d` crate has its own `DownloadableAsset` with identical functionality and full test coverage.
|
||||
- `harmony_secret` has `LocalFileSecretStore`, `OpenbaoSecretStore` (token/userpass only), `InfisicalSecretStore`. Works but no Zitadel OIDC integration.
|
||||
- KVM module exists on this branch with `KvmExecutor`, VM lifecycle, ISO download, two examples (`example_linux_vm`, `kvm_okd_ha_cluster`).
|
||||
- RustFS module exists on `feat/rustfs` branch (2 commits ahead of master).
|
||||
- 39 example crates, **zero E2E tests**. Unit tests pass across workspace (~240 tests).
|
||||
- CI runs `cargo check`, `fmt`, `clippy`, `test` on Gitea. No E2E job.
|
||||
|
||||
## Guiding Principles
|
||||
|
||||
- **Zero-setup first**: A new user clones, runs `cargo run`, gets prompted for config, values persist to local SQLite. No env vars, no external services required.
|
||||
- **Progressive disclosure**: Local SQLite → OpenBao → Zitadel SSO. Each layer is opt-in.
|
||||
- **Test what ships**: Every example that works should have an E2E test proving it works.
|
||||
- **Community over infrastructure**: GitHub for engagement, self-hosted runners for CI.
|
||||
623
ROADMAP/01-config-crate.md
Normal file
623
ROADMAP/01-config-crate.md
Normal file
@@ -0,0 +1,623 @@
|
||||
# Phase 1: Harden `harmony_config`, Validate UX, Zero-Setup Starting Point
|
||||
|
||||
## Goal
|
||||
|
||||
Make `harmony_config` production-ready with a seamless first-run experience: clone, run, get prompted, values persist locally. Then progressively add team-scale backends (OpenBao, Zitadel SSO) without changing any calling code.
|
||||
|
||||
## Current State
|
||||
|
||||
`harmony_config` now has:
|
||||
|
||||
- `Config` trait + `#[derive(Config)]` macro
|
||||
- `ConfigManager` with ordered source chain
|
||||
- Five `ConfigSource` implementations:
|
||||
- `EnvSource` — reads `HARMONY_CONFIG_{KEY}` env vars
|
||||
- `LocalFileSource` — reads/writes `{key}.json` files from a directory
|
||||
- `SqliteSource` — **NEW** reads/writes to SQLite database
|
||||
- `PromptSource` — returns `None` / no-op on set (placeholder for TUI integration)
|
||||
- `StoreSource<S: SecretStore>` — wraps any `harmony_secret::SecretStore` backend
|
||||
- 26 unit tests (mock source, env, local file, sqlite, prompt, integration, store graceful fallback)
|
||||
- Global `CONFIG_MANAGER` static with `init()`, `get()`, `get_or_prompt()`, `set()`
|
||||
- Two examples: `basic` and `prompting` in `harmony_config/examples/`
|
||||
- **Zero workspace consumers** — nothing calls `harmony_config` yet
|
||||
|
||||
## Tasks
|
||||
|
||||
### 1.1 Add `SqliteSource` as the default zero-setup backend ✅
|
||||
|
||||
**Status**: Implemented
|
||||
|
||||
**Implementation Details**:
|
||||
|
||||
- Database location: `~/.local/share/harmony/config/config.db` (directory is auto-created)
|
||||
- Schema: `config(key TEXT PRIMARY KEY, value TEXT NOT NULL, updated_at TEXT NOT NULL DEFAULT (datetime('now')))`
|
||||
- Uses `sqlx` with SQLite runtime
|
||||
- `SqliteSource::open(path)` - opens/creates database at given path
|
||||
- `SqliteSource::default()` - uses default Harmony data directory
|
||||
|
||||
**Files**:
|
||||
- `harmony_config/src/source/sqlite.rs` - new file
|
||||
- `harmony_config/Cargo.toml` - added `sqlx = { workspace = true, features = ["runtime-tokio", "sqlite"] }`
|
||||
- `Cargo.toml` - added `anyhow = "1.0"` to workspace dependencies
|
||||
|
||||
**Tests** (all passing):
|
||||
- `test_sqlite_set_and_get` — round-trip a `TestConfig` struct
|
||||
- `test_sqlite_get_returns_none_when_missing` — key not in DB
|
||||
- `test_sqlite_overwrites_on_set` — set twice, get returns latest
|
||||
- `test_sqlite_concurrent_access` — two tasks writing different keys simultaneously
|
||||
|
||||
### 1.1.1 Add Config example to show exact DX and confirm functionality ✅
|
||||
|
||||
**Status**: Implemented
|
||||
|
||||
**Examples created**:
|
||||
|
||||
1. `harmony_config/examples/basic.rs` - demonstrates:
|
||||
- Zero-setup SQLite backend (auto-creates directory)
|
||||
- Using the `#[derive(Config)]` macro
|
||||
- Environment variable override (`HARMONY_CONFIG_TestConfig` overrides SQLite)
|
||||
- Direct set/get operations
|
||||
- Persistence verification
|
||||
|
||||
2. `harmony_config/examples/prompting.rs` - demonstrates:
|
||||
- Config with no defaults (requires user input via `inquire`)
|
||||
- `get()` flow: env > sqlite > prompt fallback
|
||||
- `get_or_prompt()` for interactive configuration
|
||||
- Full resolution chain
|
||||
- Persistence of prompted values
|
||||
|
||||
### 1.2 Make `PromptSource` functional ✅
|
||||
|
||||
**Status**: Implemented with design improvement
|
||||
|
||||
**Key Finding - Bug Fixed During Implementation**:
|
||||
|
||||
The original design had a critical bug in `get_or_prompt()`:
|
||||
```rust
|
||||
// OLD (BUGGY) - breaks on first source where set() returns Ok(())
|
||||
for source in &self.sources {
|
||||
if source.set(T::KEY, &value).await.is_ok() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Since `EnvSource.set()` returns `Ok(())` (successfully sets env var), the loop would break immediately and never write to `SqliteSource`. Prompted values were never persisted!
|
||||
|
||||
**Solution - Added `should_persist()` method to ConfigSource trait**:
|
||||
|
||||
```rust
|
||||
#[async_trait]
|
||||
pub trait ConfigSource: Send + Sync {
|
||||
async fn get(&self, key: &str) -> Result<Option<serde_json::Value>, ConfigError>;
|
||||
async fn set(&self, key: &str, value: &serde_json::Value) -> Result<(), ConfigError>;
|
||||
fn should_persist(&self) -> bool {
|
||||
true
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
- `EnvSource::should_persist()` returns `false` - shouldn't persist prompted values to env vars
|
||||
- `PromptSource::should_persist()` returns `false` - doesn't persist anyway
|
||||
- `get_or_prompt()` now skips sources where `should_persist()` is `false`
|
||||
|
||||
**Updated `get_or_prompt()`**:
|
||||
```rust
|
||||
for source in &self.sources {
|
||||
if !source.should_persist() {
|
||||
continue;
|
||||
}
|
||||
if source.set(T::KEY, &value).await.is_ok() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Tests**:
|
||||
- `test_prompt_source_always_returns_none`
|
||||
- `test_prompt_source_set_is_noop`
|
||||
- `test_prompt_source_does_not_persist`
|
||||
- `test_full_chain_with_prompt_source_falls_through_to_prompt`
|
||||
|
||||
### 1.3 Integration test: full resolution chain ✅
|
||||
|
||||
**Status**: Implemented
|
||||
|
||||
**Tests**:
|
||||
- `test_full_resolution_chain_sqlite_fallback` — env not set, sqlite has value, get() returns sqlite
|
||||
- `test_full_resolution_chain_env_overrides_sqlite` — env set, sqlite has value, get() returns env
|
||||
- `test_branch_switching_scenario_deserialization_error` — old struct shape in sqlite returns Deserialization error
|
||||
|
||||
### 1.4 Validate Zitadel + OpenBao integration path ⏳
|
||||
|
||||
**Status**: Planning phase - detailed execution plan below
|
||||
|
||||
**Background**: ADR 020-1 documents the target architecture for Zitadel OIDC + OpenBao integration. This task validates the full chain by deploying Zitadel and OpenBao on a local k3d cluster and demonstrating an end-to-end example.
|
||||
|
||||
**Architecture Overview**:
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────┐
|
||||
│ Harmony CLI / App │
|
||||
│ │
|
||||
│ ConfigManager: │
|
||||
│ 1. EnvSource ← HARMONY_CONFIG_* env vars (highest priority) │
|
||||
│ 2. SqliteSource ← ~/.local/share/harmony/config/config.db │
|
||||
│ 3. StoreSource ← OpenBao (team-scale, via Zitadel OIDC) │
|
||||
│ │
|
||||
│ When StoreSource fails (OpenBao unreachable): │
|
||||
│ → returns Ok(None), chain falls through to SqliteSource │
|
||||
└─────────────────────────────────────────────────────────────────────┘
|
||||
|
||||
┌──────────────────┐ ┌──────────────────┐
|
||||
│ Zitadel │ │ OpenBao │
|
||||
│ (IdP + OIDC) │ │ (Secret Store) │
|
||||
│ │ │ │
|
||||
│ Device Auth │────JWT──▶│ JWT Auth │
|
||||
│ Flow (RFC 8628)│ │ Method │
|
||||
└──────────────────┘ └──────────────────┘
|
||||
```
|
||||
|
||||
**Prerequisites**:
|
||||
- Docker running (for k3d)
|
||||
- Rust toolchain (edition 2024)
|
||||
- Network access to download Helm charts
|
||||
- `kubectl` (installed automatically with k3d, or pre-installed)
|
||||
|
||||
**Step-by-Step Execution Plan**:
|
||||
|
||||
#### Step 1: Create k3d cluster for local development
|
||||
|
||||
When you run `cargo run -p example-zitadel` (or any example using `K8sAnywhereTopology::from_env()`), Harmony automatically provisions a k3d cluster if one does not exist. By default:
|
||||
|
||||
- `use_local_k3d = true` (env: `HARMONY_USE_LOCAL_K3D`, default `true`)
|
||||
- `autoinstall = true` (env: `HARMONY_AUTOINSTALL`, default `true`)
|
||||
- Cluster name: **`harmony`** (hardcoded in `K3DInstallationScore::default()`)
|
||||
- k3d binary is downloaded to `~/.local/share/harmony/k3d/`
|
||||
- Kubeconfig is merged into `~/.kube/config`, context set to `k3d-harmony`
|
||||
|
||||
No manual `k3d cluster create` is needed. If you want to create the cluster manually first:
|
||||
|
||||
```bash
|
||||
# Install k3d (requires sudo or install to user path)
|
||||
curl -s https://raw.githubusercontent.com/k3d-io/k3d/main/install.sh | bash
|
||||
|
||||
# Create the cluster with the same name Harmony expects
|
||||
k3d cluster create harmony
|
||||
kubectl cluster-info --context k3d-harmony
|
||||
```
|
||||
|
||||
**Validation**: `kubectl get nodes --context k3d-harmony` shows 1 server node (k3d default)
|
||||
|
||||
**Note**: The existing examples use hardcoded external hostnames (e.g., `sso.sto1.nationtech.io`) for ingress. On a local k3d cluster, these hostnames are not routable. For local development you must either:
|
||||
- Use `kubectl port-forward` to access services directly
|
||||
- Configure `/etc/hosts` entries pointing to `127.0.0.1`
|
||||
- Use a k3d loadbalancer with `--port` mappings
|
||||
|
||||
#### Step 2: Deploy Zitadel
|
||||
|
||||
Zitadel requires the topology to implement `Topology + K8sclient + HelmCommand + PostgreSQL`. The `K8sAnywhereTopology` satisfies all four.
|
||||
|
||||
```bash
|
||||
cargo run -p example-zitadel
|
||||
```
|
||||
|
||||
**What happens internally** (see `harmony/src/modules/zitadel/mod.rs`):
|
||||
|
||||
1. Creates `zitadel` namespace via `K8sResourceScore`
|
||||
2. Deploys a CNPG PostgreSQL cluster:
|
||||
- Name: `zitadel-pg`
|
||||
- Instances: **2** (not 1)
|
||||
- Storage: 10Gi
|
||||
- Namespace: `zitadel`
|
||||
3. Resolves the internal DB endpoint (`host:port`) from the CNPG cluster
|
||||
4. Generates a 32-byte alphanumeric masterkey, stores it as Kubernetes Secret `zitadel-masterkey` (idempotent: skips if it already exists)
|
||||
5. Generates a 16-char admin password (guaranteed 1+ uppercase, lowercase, digit, symbol)
|
||||
6. Deploys Zitadel Helm chart (`zitadel/zitadel` from `https://charts.zitadel.com`):
|
||||
- `chart_version: None` -- **uses latest chart version** (not pinned)
|
||||
- No `--wait` flag -- returns before pods are ready
|
||||
- Ingress annotations are **OpenShift-oriented** (`route.openshift.io/termination: edge`, `cert-manager.io/cluster-issuer: letsencrypt-prod`). On k3d these annotations are silently ignored.
|
||||
- Ingress includes TLS config with `secretName: "{host}-tls"`, which requires cert-manager. Without cert-manager, TLS termination does not happen at the ingress level.
|
||||
|
||||
**Key Helm values set by ZitadelScore**:
|
||||
- `zitadel.configmapConfig.ExternalDomain`: the `host` field (e.g., `sso.sto1.nationtech.io`)
|
||||
- `zitadel.configmapConfig.ExternalSecure: true`
|
||||
- `zitadel.configmapConfig.TLS.Enabled: false` (TLS at ingress, not in Zitadel)
|
||||
- Admin user: `UserName: "admin"`, Email: **`admin@zitadel.example.com`** (hardcoded, not derived from host)
|
||||
- Database credentials: injected via `env[].valueFrom.secretKeyRef` from secret `zitadel-pg-superuser` (both user and admin use the same superuser -- there is a TODO to fix this)
|
||||
|
||||
**Expected output**:
|
||||
```
|
||||
===== ZITADEL DEPLOYMENT COMPLETE =====
|
||||
Login URL: https://sso.sto1.nationtech.io
|
||||
Username: admin@zitadel.sso.sto1.nationtech.io
|
||||
Password: <generated 16-char password>
|
||||
```
|
||||
|
||||
**Note on the success message**: The printed username `admin@zitadel.{host}` does not match the actual configured email `admin@zitadel.example.com`. The actual login username in Zitadel is `admin` (the `UserName` field). This discrepancy exists in the current code.
|
||||
|
||||
**Validation on k3d**:
|
||||
```bash
|
||||
# Wait for pods to be ready (Helm returns before readiness)
|
||||
kubectl wait --for=condition=ready pod -l app.kubernetes.io/name=zitadel -n zitadel --timeout=300s
|
||||
|
||||
# Port-forward to access Zitadel (ingress won't work without proper DNS/TLS on k3d)
|
||||
kubectl port-forward svc/zitadel -n zitadel 8080:8080
|
||||
|
||||
# Access at http://localhost:8080 (note: ExternalSecure=true may cause redirect issues)
|
||||
```
|
||||
|
||||
**Known issues for k3d deployment**:
|
||||
- `ExternalSecure: true` tells Zitadel to expect HTTPS, but a k3d port-forward serves plain HTTP, which may cause redirect loops. For local development, modify the example to set `ExternalSecure: false`.
|
||||
- The CNPG operator must be installed on the cluster. `K8sAnywhereTopology` handles this via the `PostgreSQL` trait implementation, which deploys the operator first.
|
||||
|
||||
#### Step 3: Deploy OpenBao
|
||||
|
||||
OpenBao requires only `Topology + K8sclient + HelmCommand` (no PostgreSQL dependency).
|
||||
|
||||
```bash
|
||||
cargo run -p example-openbao
|
||||
```
|
||||
|
||||
**What happens internally** (see `harmony/src/modules/openbao/mod.rs`):
|
||||
|
||||
1. `OpenbaoScore` directly delegates to `HelmChartScore.create_interpret()` -- there is no custom `execute()` logic, no namespace creation step, no secret generation
|
||||
2. Deploys OpenBao Helm chart (`openbao/openbao` from `https://openbao.github.io/openbao-helm`):
|
||||
- `chart_version: None` -- **uses latest chart version** (not pinned)
|
||||
- `create_namespace: true` -- the `openbao` namespace is created by Helm
|
||||
- `install_only: false` -- uses `helm upgrade --install`
|
||||
|
||||
**Exact Helm values set by OpenbaoScore**:
|
||||
```yaml
|
||||
global:
|
||||
openshift: true # <-- PROBLEM: hardcoded, see below
|
||||
server:
|
||||
standalone:
|
||||
enabled: true
|
||||
config: |
|
||||
ui = true
|
||||
listener "tcp" {
|
||||
tls_disable = true
|
||||
address = "[::]:8200"
|
||||
cluster_address = "[::]:8201"
|
||||
}
|
||||
storage "file" {
|
||||
path = "/openbao/data"
|
||||
}
|
||||
service:
|
||||
enabled: true
|
||||
ingress:
|
||||
enabled: true
|
||||
hosts:
|
||||
- host: <host field> # e.g., openbao.sebastien.sto1.nationtech.io
|
||||
dataStorage:
|
||||
enabled: true
|
||||
size: 10Gi
|
||||
storageClass: null # uses cluster default
|
||||
accessMode: ReadWriteOnce
|
||||
auditStorage:
|
||||
enabled: true
|
||||
size: 10Gi
|
||||
storageClass: null
|
||||
accessMode: ReadWriteOnce
|
||||
ui:
|
||||
enabled: true
|
||||
```
|
||||
|
||||
**Critical issue: `global.openshift: true` is hardcoded.** The OpenBao Helm chart default is `global.openshift: false`. When set to `true`, the chart adjusts security contexts and may create OpenShift Routes instead of standard Kubernetes Ingress resources. **On k3d (vanilla k8s), this will produce resources that may not work correctly.** Before deploying on k3d, this must be overridden.
|
||||
|
||||
**Fix required for k3d**: Either:
|
||||
1. Modify `OpenbaoScore` to accept an `openshift: bool` field (preferred long-term fix)
|
||||
2. Or for this example, create a custom example that passes `values_overrides` with `global.openshift=false`
|
||||
|
||||
**Post-deployment initialization** (manual -- the TODO in `mod.rs` acknowledges this is not automated):
|
||||
|
||||
OpenBao starts in a sealed state. You must initialize and unseal it manually. See https://openbao.org/docs/platform/k8s/helm/run/
|
||||
|
||||
```bash
|
||||
# Initialize OpenBao (generates unseal keys + root token)
|
||||
kubectl exec -n openbao openbao-0 -- bao operator init
|
||||
|
||||
# Save the output! It contains 5 unseal keys and the root token.
|
||||
# Example output:
|
||||
# Unseal Key 1: abc123...
|
||||
# Unseal Key 2: def456...
|
||||
# ...
|
||||
# Initial Root Token: hvs.xxxxx
|
||||
|
||||
# Unseal (requires 3 of 5 keys by default)
|
||||
kubectl exec -n openbao openbao-0 -- bao operator unseal <key1>
|
||||
kubectl exec -n openbao openbao-0 -- bao operator unseal <key2>
|
||||
kubectl exec -n openbao openbao-0 -- bao operator unseal <key3>
|
||||
```
|
||||
|
||||
**Validation**:
|
||||
```bash
|
||||
kubectl exec -n openbao openbao-0 -- bao status
|
||||
# Should show "Sealed: false"
|
||||
```
|
||||
|
||||
**Note**: The ingress has **no TLS configuration** (unlike Zitadel's ingress). Access is HTTP-only unless you configure TLS separately.
|
||||
|
||||
#### Step 4: Configure OpenBao for Harmony
|
||||
|
||||
Two paths are available depending on the authentication method:
|
||||
|
||||
##### Path A: Userpass auth (simpler, for local dev)
|
||||
|
||||
The current `OpenbaoSecretStore` supports **token** and **userpass** authentication. It does NOT yet implement the JWT/OIDC device flow described in ADR 020-1.
|
||||
|
||||
```bash
|
||||
# Port-forward to access OpenBao API
|
||||
kubectl port-forward svc/openbao -n openbao 8200:8200 &
|
||||
|
||||
export BAO_ADDR="http://127.0.0.1:8200"
|
||||
export BAO_TOKEN="<root token from init>"
|
||||
|
||||
# Enable KV v2 secrets engine (default mount "secret")
|
||||
bao secrets enable -path=secret kv-v2
|
||||
|
||||
# Enable userpass auth method
|
||||
bao auth enable userpass
|
||||
|
||||
# Create a user for Harmony
|
||||
bao write auth/userpass/users/harmony password="harmony-dev-password"
|
||||
|
||||
# Create policy granting read/write on harmony/* paths
|
||||
cat <<'EOF' | bao policy write harmony-dev -
|
||||
path "secret/data/harmony/*" {
|
||||
capabilities = ["create", "read", "update", "delete", "list"]
|
||||
}
|
||||
path "secret/metadata/harmony/*" {
|
||||
capabilities = ["list", "read", "delete"]
|
||||
}
|
||||
EOF
|
||||
|
||||
# Create the user with the policy attached
|
||||
bao write auth/userpass/users/harmony \
|
||||
password="harmony-dev-password" \
|
||||
policies="harmony-dev"
|
||||
```
|
||||
|
||||
**Bug in `OpenbaoSecretStore::authenticate_userpass()`**: The `kv_mount` parameter (default `"secret"`) is passed to `vaultrs::auth::userpass::login()` as the auth mount path. This means it calls `POST /v1/auth/secret/login/{username}` instead of the correct `POST /v1/auth/userpass/login/{username}`. **The auth mount and KV mount are conflated into one parameter.**
|
||||
|
||||
**Workaround**: Set `OPENBAO_KV_MOUNT=userpass` so the auth call hits the correct mount path. But then KV operations would use mount `userpass` instead of `secret`, which is wrong.
|
||||
|
||||
**Proper fix needed**: Split `kv_mount` into two separate parameters: one for the KV v2 engine mount (`secret`) and one for the auth mount (`userpass`). This is a bug in `harmony_secret/src/store/openbao.rs:234`.
|
||||
|
||||
**For this example**: Use **token auth** instead of userpass to sidestep the bug:
|
||||
|
||||
```bash
|
||||
# Set env vars for the example
|
||||
export OPENBAO_URL="http://127.0.0.1:8200"
|
||||
export OPENBAO_TOKEN="<root token from init>"
|
||||
export OPENBAO_KV_MOUNT="secret"
|
||||
```
|
||||
|
||||
##### Path B: JWT auth with Zitadel (target architecture, per ADR 020-1)
|
||||
|
||||
This is the production path described in the ADR. It requires the device flow code that is **not yet implemented** in `OpenbaoSecretStore`. The current code only supports token and userpass.
|
||||
|
||||
When implemented, the flow will be:
|
||||
1. Enable JWT auth method in OpenBao
|
||||
2. Configure it to trust Zitadel's OIDC discovery URL
|
||||
3. Create a role that maps Zitadel JWT claims to OpenBao policies
|
||||
|
||||
```bash
|
||||
# Enable JWT auth
|
||||
bao auth enable jwt
|
||||
|
||||
# Configure JWT auth to trust Zitadel
|
||||
bao write auth/jwt/config \
|
||||
oidc_discovery_url="https://<zitadel-host>" \
|
||||
bound_issuer="https://<zitadel-host>"
|
||||
|
||||
# Create role for Harmony developers
|
||||
bao write auth/jwt/role/harmony-developer \
|
||||
role_type="jwt" \
|
||||
bound_audiences="<harmony_client_id>" \
|
||||
user_claim="email" \
|
||||
groups_claim="urn:zitadel:iam:org:project:roles" \
|
||||
policies="harmony-dev" \
|
||||
ttl="4h" \
|
||||
max_ttl="24h" \
|
||||
token_type="service"
|
||||
```
|
||||
|
||||
**Zitadel application setup** (in Zitadel console):
|
||||
1. Create project: `Harmony`
|
||||
2. Add application: `Harmony CLI` (Native app type)
|
||||
3. Enable Device Authorization grant type
|
||||
4. Set scopes: `openid email profile offline_access`
|
||||
5. Note the `client_id`
|
||||
|
||||
This path is deferred until the device flow is implemented in `OpenbaoSecretStore`.
|
||||
|
||||
#### Step 5: Write end-to-end example
|
||||
|
||||
The example uses `StoreSource<OpenbaoSecretStore>` with token auth to avoid the userpass mount bug.
|
||||
|
||||
**Environment variables required** (from `harmony_secret/src/config.rs`):
|
||||
|
||||
| Variable | Required | Default | Notes |
|
||||
|---|---|---|---|
|
||||
| `OPENBAO_URL` | Yes | None | Falls back to `VAULT_ADDR` |
|
||||
| `OPENBAO_TOKEN` | For token auth | None | Root or user token |
|
||||
| `OPENBAO_USERNAME` | For userpass | None | Requires `OPENBAO_PASSWORD` too |
|
||||
| `OPENBAO_PASSWORD` | For userpass | None | |
|
||||
| `OPENBAO_KV_MOUNT` | No | `"secret"` | KV v2 engine mount path. **Also used as userpass auth mount -- this is a bug.** |
|
||||
| `OPENBAO_SKIP_TLS` | No | `false` | Set `"true"` to disable TLS verification |
|
||||
|
||||
**Note**: `OpenbaoSecretStore::new()` is `async` and **requires a running OpenBao** at construction time (it validates the token if using cached auth). If OpenBao is unreachable during construction, the call will fail. The graceful fallback only applies to `StoreSource::get()` calls after construction -- the `ConfigManager` must be built with a live store, or the store must be wrapped in a lazy initialization pattern.
|
||||
|
||||
```rust
|
||||
// harmony_config/examples/openbao_chain.rs
|
||||
use harmony_config::{ConfigManager, EnvSource, SqliteSource, StoreSource};
|
||||
use harmony_secret::OpenbaoSecretStore;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::sync::Arc;
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, schemars::JsonSchema, PartialEq)]
|
||||
struct AppConfig {
|
||||
host: String,
|
||||
port: u16,
|
||||
}
|
||||
|
||||
impl harmony_config::Config for AppConfig {
|
||||
const KEY: &'static str = "AppConfig";
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
env_logger::init();
|
||||
|
||||
// Build the source chain
|
||||
let env_source: Arc<dyn harmony_config::ConfigSource> = Arc::new(EnvSource);
|
||||
|
||||
let sqlite = Arc::new(
|
||||
SqliteSource::default()
|
||||
.await
|
||||
.expect("Failed to open SQLite"),
|
||||
);
|
||||
|
||||
// OpenBao store -- requires OPENBAO_URL and OPENBAO_TOKEN env vars
|
||||
// Falls back gracefully if OpenBao is unreachable at query time
|
||||
let openbao_url = std::env::var("OPENBAO_URL")
|
||||
.or(std::env::var("VAULT_ADDR"))
|
||||
.ok();
|
||||
|
||||
let sources: Vec<Arc<dyn harmony_config::ConfigSource>> = if let Some(url) = openbao_url {
|
||||
let kv_mount = std::env::var("OPENBAO_KV_MOUNT")
|
||||
.unwrap_or_else(|_| "secret".to_string());
|
||||
let skip_tls = std::env::var("OPENBAO_SKIP_TLS")
|
||||
.map(|v| v == "true")
|
||||
.unwrap_or(false);
|
||||
|
||||
match OpenbaoSecretStore::new(
|
||||
url,
|
||||
kv_mount,
|
||||
skip_tls,
|
||||
std::env::var("OPENBAO_TOKEN").ok(),
|
||||
std::env::var("OPENBAO_USERNAME").ok(),
|
||||
std::env::var("OPENBAO_PASSWORD").ok(),
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(store) => {
|
||||
let store_source = Arc::new(StoreSource::new("harmony".to_string(), store));
|
||||
vec![env_source, Arc::clone(&sqlite) as _, store_source]
|
||||
}
|
||||
Err(e) => {
|
||||
eprintln!("Warning: OpenBao unavailable ({e}), using local sources only");
|
||||
vec![env_source, sqlite]
|
||||
}
|
||||
}
|
||||
} else {
|
||||
println!("No OPENBAO_URL set, using local sources only");
|
||||
vec![env_source, sqlite]
|
||||
};
|
||||
|
||||
let manager = ConfigManager::new(sources);
|
||||
|
||||
// Scenario 1: get() with nothing stored -- returns NotFound
|
||||
let result = manager.get::<AppConfig>().await;
|
||||
println!("Get (empty): {:?}", result);
|
||||
|
||||
// Scenario 2: set() then get()
|
||||
let config = AppConfig {
|
||||
host: "production.example.com".to_string(),
|
||||
port: 443,
|
||||
};
|
||||
manager.set(&config).await?;
|
||||
println!("Set: {:?}", config);
|
||||
|
||||
let retrieved = manager.get::<AppConfig>().await?;
|
||||
println!("Get (after set): {:?}", retrieved);
|
||||
assert_eq!(config, retrieved);
|
||||
|
||||
println!("End-to-end chain validated!");
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
|
||||
**Key behaviors demonstrated**:
|
||||
1. **Graceful construction fallback**: If `OPENBAO_URL` is not set or OpenBao is unreachable at startup, the chain is built without it
|
||||
2. **Graceful query fallback**: `StoreSource::get()` returns `Ok(None)` on any error, so the chain continues to SQLite
|
||||
3. **Environment override**: `HARMONY_CONFIG_AppConfig='{"host":"env-host","port":9090}'` bypasses all backends
|
||||
|
||||
#### Step 6: Validate graceful fallback
|
||||
|
||||
Already validated via unit tests (26 tests pass):
|
||||
|
||||
- `test_store_source_error_falls_through_to_sqlite` -- `StoreSource` backed by `AlwaysErrorStore` (whose lookups fail with a connection error) returns `Ok(None)`, so the chain falls through to `SqliteSource`
|
||||
- `test_store_source_not_found_falls_through_to_sqlite` -- the underlying store returns `NotFound`, `StoreSource` maps it to `Ok(None)`, and the chain falls through to `SqliteSource`
|
||||
|
||||
**Code path (FIXED in `harmony_config/src/source/store.rs`)**:
|
||||
```rust
|
||||
// StoreSource::get() -- returns Ok(None) on ANY error, allowing chain to continue
|
||||
match self.store.get_raw(&self.namespace, key).await {
|
||||
Ok(bytes) => { /* deserialize and return */ Ok(Some(value)) }
|
||||
Err(SecretStoreError::NotFound { .. }) => Ok(None),
|
||||
Err(_) => Ok(None), // Connection errors, timeouts, etc.
|
||||
}
|
||||
```
|
||||
|
||||
#### Step 7: Known issues and blockers
|
||||
|
||||
| Issue | Location | Severity | Status |
|
||||
|---|---|---|---|
|
||||
| `global.openshift: true` hardcoded | `harmony/src/modules/openbao/mod.rs:32` | **Blocker for k3d** | ✅ Fixed: Added `openshift: bool` field to `OpenbaoScore` (defaults to `false`) |
|
||||
| `kv_mount` used as auth mount path | `harmony_secret/src/store/openbao.rs:234` | **Bug** | ✅ Fixed: Added separate `auth_mount` parameter; added `OPENBAO_AUTH_MOUNT` env var |
|
||||
| Admin email hardcoded `admin@zitadel.example.com` | `harmony/src/modules/zitadel/mod.rs:314` | Minor | Cosmetic mismatch with success message |
|
||||
| `ExternalSecure: true` hardcoded | `harmony/src/modules/zitadel/mod.rs:306` | **Issue for k3d** | ✅ Fixed: Zitadel now detects Kubernetes distribution and uses appropriate settings (OpenShift = TLS + cert-manager annotations, k3d = plain nginx ingress without TLS) |
|
||||
| No Helm chart version pinning | Both modules | Risk | Non-deterministic deploys |
|
||||
| No `--wait` on Helm install | `harmony/src/modules/helm/chart.rs` | UX | Must manually wait for readiness |
|
||||
| `get_version()`/`get_status()` are `todo!()` | Both modules | Panic risk | Do not call these methods |
|
||||
| JWT/OIDC device flow not implemented | `harmony_secret/src/store/openbao.rs` | **Gap** | ✅ Implemented: `ZitadelOidcAuth` in `harmony_secret/src/store/zitadel.rs` |
|
||||
| `HARMONY_SECRET_NAMESPACE` panics if not set | `harmony_secret/src/config.rs:5` | Runtime panic | Only affects `SecretManager`, not `StoreSource` directly |
|
||||
|
||||
**Remaining work**:
|
||||
- [x] `StoreSource<OpenbaoSecretStore>` integration validates compilation
|
||||
- [x] `StoreSource` returns `Ok(None)` on connection error (not `Err`)
|
||||
- [x] Graceful fallback tests pass when OpenBao is unreachable (2 new tests)
|
||||
- [x] Fix `global.openshift: true` in `OpenbaoScore` for k3d compatibility
|
||||
- [x] Fix `kv_mount` / auth mount conflation bug in `OpenbaoSecretStore`
|
||||
- [x] Create and test `harmony_config/examples/openbao_chain.rs` against real k3d deployment
|
||||
- [x] Implement JWT/OIDC device flow in `OpenbaoSecretStore` (ADR 020-1) — `ZitadelOidcAuth` implemented and wired into `OpenbaoSecretStore::new()` auth chain
|
||||
- [x] Fix Zitadel distribution detection — Zitadel now uses `k8s_client.get_k8s_distribution()` to detect OpenShift vs k3d and applies appropriate Helm values (TLS + cert-manager for OpenShift, plain nginx for k3d)
|
||||
|
||||
### 1.5 UX validation checklist ⏳
|
||||
|
||||
**Status**: Partially complete - manual verification needed
|
||||
|
||||
- [ ] `cargo run --example postgresql` with no env vars → prompts for nothing
|
||||
- [ ] An example that uses `SecretManager` today (e.g., `brocade_snmp_server`) → when migrated to `harmony_config`, first run prompts, second run reads from SQLite
|
||||
- [ ] Setting `HARMONY_CONFIG_BrocadeSwitchAuth='{"host":"...","user":"...","password":"..."}'` → skips prompt, uses env value
|
||||
- [ ] Deleting `~/.local/share/harmony/config/` directory → re-prompts on next run
|
||||
|
||||
## Deliverables
|
||||
|
||||
- [x] `SqliteSource` implementation with tests
|
||||
- [x] Functional `PromptSource` with `should_persist()` design
|
||||
- [x] Fix `get_or_prompt` to persist to first writable source (via `should_persist()`), not all sources
|
||||
- [x] Integration tests for full resolution chain
|
||||
- [x] Branch-switching deserialization failure test
|
||||
- [x] `StoreSource<OpenbaoSecretStore>` integration validated (compiles, graceful fallback)
|
||||
- [x] ADR for Zitadel OIDC target architecture
|
||||
- [ ] Update docs to reflect final implementation and behavior
|
||||
|
||||
## Key Implementation Notes
|
||||
|
||||
1. **SQLite path**: `~/.local/share/harmony/config/config.db` (not `~/.local/share/harmony/config.db`)
|
||||
|
||||
2. **Auto-create directory**: `SqliteSource::open()` creates parent directories if they don't exist
|
||||
|
||||
3. **Default path**: `SqliteSource::default()` uses `directories::ProjectDirs` to find the correct data directory
|
||||
|
||||
4. **Env var precedence**: Environment variables always take precedence over SQLite in the resolution chain
|
||||
|
||||
5. **Testing**: All tests use `tempfile::NamedTempFile` for temporary database paths, ensuring test isolation
|
||||
|
||||
6. **Graceful fallback**: `StoreSource::get()` returns `Ok(None)` on any error (connection refused, timeout, etc.), allowing the chain to fall through to the next source. This ensures OpenBao unavailability doesn't break the config chain.
|
||||
|
||||
7. **StoreSource errors don't block chain**: When OpenBao is unreachable, `StoreSource::get()` returns `Ok(None)` and the `ConfigManager` continues to the next source (typically `SqliteSource`). This is validated by `test_store_source_error_falls_through_to_sqlite` and `test_store_source_not_found_falls_through_to_sqlite`.
|
||||
112
ROADMAP/02-refactor-harmony-config.md
Normal file
112
ROADMAP/02-refactor-harmony-config.md
Normal file
@@ -0,0 +1,112 @@
|
||||
# Phase 2: Migrate Workspace to `harmony_config`
|
||||
|
||||
## Goal
|
||||
|
||||
Replace every direct `harmony_secret::SecretManager` call with `harmony_config` equivalents. After this phase, modules and examples depend only on `harmony_config`. `harmony_secret` becomes an internal implementation detail behind `StoreSource`.
|
||||
|
||||
## Current State
|
||||
|
||||
19 call sites use `SecretManager::get_or_prompt::<T>()` across:
|
||||
|
||||
| Location | Secret Types | Call Sites |
|
||||
|----------|-------------|------------|
|
||||
| `harmony/src/modules/brocade/brocade_snmp.rs` | `BrocadeSnmpAuth`, `BrocadeSwitchAuth` | 2 |
|
||||
| `harmony/src/modules/nats/score_nats_k8s.rs` | `NatsAdmin` | 1 |
|
||||
| `harmony/src/modules/okd/bootstrap_02_bootstrap.rs` | `RedhatSecret`, `SshKeyPair` | 2 |
|
||||
| `harmony/src/modules/application/features/monitoring.rs` | `NtfyAuth` | 1 |
|
||||
| `brocade/examples/main.rs` | `BrocadeSwitchAuth` | 1 |
|
||||
| `examples/okd_installation/src/main.rs` + `topology.rs` | `SshKeyPair`, `BrocadeSwitchAuth`, `OPNSenseFirewallConfig` | 3 |
|
||||
| `examples/okd_pxe/src/main.rs` + `topology.rs` | `SshKeyPair`, `BrocadeSwitchAuth`, `OPNSenseFirewallCredentials` | 3 |
|
||||
| `examples/opnsense/src/main.rs` | `OPNSenseFirewallCredentials` | 1 |
|
||||
| `examples/sttest/src/main.rs` + `topology.rs` | `SshKeyPair`, `OPNSenseFirewallConfig` | 2 |
|
||||
| `examples/opnsense_node_exporter/` | (has dep but unclear usage) | ~1 |
|
||||
| `examples/okd_cluster_alerts/` | (has dep but unclear usage) | ~1 |
|
||||
| `examples/brocade_snmp_server/` | (has dep but unclear usage) | ~1 |
|
||||
|
||||
## Tasks
|
||||
|
||||
### 2.1 Bootstrap `harmony_config` in CLI and TUI entry points
|
||||
|
||||
Add `harmony_config::init()` as the first thing that happens in `harmony_cli::run()` and `harmony_tui::run()`.
|
||||
|
||||
```rust
|
||||
// harmony_cli/src/lib.rs — inside run()
|
||||
pub async fn run<T: Topology + Send + Sync + 'static>(
|
||||
inventory: Inventory,
|
||||
topology: T,
|
||||
scores: Vec<Box<dyn Score<T>>>,
|
||||
args_struct: Option<Args>,
|
||||
) -> Result<(), Box<dyn std::error::Error>> {
|
||||
// Initialize config system with default source chain
|
||||
let sqlite = Arc::new(SqliteSource::default().await?);
|
||||
let env = Arc::new(EnvSource);
|
||||
harmony_config::init(vec![env, sqlite]).await;
|
||||
|
||||
// ... rest of run()
|
||||
}
|
||||
```
|
||||
|
||||
This replaces the implicit `SecretManager` lazy initialization that currently happens on first `get_or_prompt` call.
|
||||
|
||||
### 2.2 Migrate each secret type from `Secret` to `Config`
|
||||
|
||||
For each secret struct, change:
|
||||
|
||||
```rust
|
||||
// Before
|
||||
use harmony_secret::Secret;
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, InteractiveParse, Secret)]
|
||||
struct BrocadeSwitchAuth { ... }
|
||||
|
||||
// After
|
||||
use harmony_config::Config;
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, InteractiveParse, Config)]
|
||||
struct BrocadeSwitchAuth { ... }
|
||||
```
|
||||
|
||||
At each call site, change:
|
||||
|
||||
```rust
|
||||
// Before
|
||||
let config = SecretManager::get_or_prompt::<BrocadeSwitchAuth>().await.unwrap();
|
||||
|
||||
// After
|
||||
let config = harmony_config::get_or_prompt::<BrocadeSwitchAuth>().await.unwrap();
|
||||
```
|
||||
|
||||
### 2.3 Migration order (low risk to high risk)
|
||||
|
||||
1. **`brocade/examples/main.rs`** — 1 call site, isolated example, easy to test manually
|
||||
2. **`examples/opnsense/src/main.rs`** — 1 call site, isolated
|
||||
3. **`harmony/src/modules/brocade/brocade_snmp.rs`** — 2 call sites, core module but straightforward
|
||||
4. **`harmony/src/modules/nats/score_nats_k8s.rs`** — 1 call site
|
||||
5. **`harmony/src/modules/application/features/monitoring.rs`** — 1 call site
|
||||
6. **`examples/sttest/`** — 2 call sites, has both main.rs and topology.rs patterns
|
||||
7. **`examples/okd_installation/`** — 3 call sites, complex topology setup
|
||||
8. **`examples/okd_pxe/`** — 3 call sites, similar to okd_installation
|
||||
9. **`harmony/src/modules/okd/bootstrap_02_bootstrap.rs`** — 2 call sites, critical OKD bootstrap path
|
||||
|
||||
### 2.4 Remove `harmony_secret` from direct dependencies
|
||||
|
||||
After all call sites are migrated:
|
||||
|
||||
1. Remove `harmony_secret` from `Cargo.toml` of: `harmony`, `brocade`, and all examples that had it
|
||||
2. `harmony_config` keeps `harmony_secret` as a dependency (for `StoreSource`)
|
||||
3. The `Secret` trait and `SecretManager` remain in `harmony_secret` but are not used directly anymore
|
||||
|
||||
### 2.5 Backward compatibility for existing local secrets
|
||||
|
||||
Users who already have secrets stored via `LocalFileSecretStore` (JSON files in `~/.local/share/harmony/secrets/`) need a migration path:
|
||||
|
||||
- On first run after upgrade, if SQLite has no entry for a key but the old JSON file exists, read from JSON and write to SQLite
|
||||
- Or: add `LocalFileSource` as a fallback source at the end of the chain (read-only) for one release cycle
|
||||
- Log a deprecation warning when reading from old JSON files
|
||||
|
||||
## Deliverables
|
||||
|
||||
- [ ] `harmony_config::init()` called in `harmony_cli::run()` and `harmony_tui::run()`
|
||||
- [ ] All 19 call sites migrated from `SecretManager` to `harmony_config`
|
||||
- [ ] `harmony_secret` removed from direct dependencies of `harmony`, `brocade`, and all examples
|
||||
- [ ] Backward compatibility for existing local JSON secrets
|
||||
- [ ] All existing unit tests still pass
|
||||
- [ ] Manual verification: one migrated example works end-to-end (prompt → persist → read)
|
||||
141
ROADMAP/03-assets-crate.md
Normal file
141
ROADMAP/03-assets-crate.md
Normal file
@@ -0,0 +1,141 @@
|
||||
# Phase 3: Complete `harmony_assets`, Refactor Consumers
|
||||
|
||||
## Goal
|
||||
|
||||
Make `harmony_assets` the single way to manage downloadable binaries and images across Harmony. Eliminate `k3d::DownloadableAsset` duplication, implement `Url::Url` in OPNsense infra, remove LFS-tracked files from git.
|
||||
|
||||
## Current State
|
||||
|
||||
- `harmony_assets` exists with `Asset`, `LocalCache`, `LocalStore`, `S3Store` (behind feature flag). CLI with `upload`, `download`, `checksum`, `verify` commands. **No tests. Zero consumers.**
|
||||
- `k3d/src/downloadable_asset.rs` has the same functionality with full test coverage (httptest mock server, checksum verification, cache hit, 404 handling, checksum failure).
|
||||
- `Url::Url` variant in `harmony_types/src/net.rs` exists but is `todo!()` in OPNsense TFTP and HTTP infra layers.
|
||||
- OKD modules hardcode `./data/...` paths (`bootstrap_02_bootstrap.rs:84-88`, `ipxe.rs:73`).
|
||||
- `data/` directory contains ~3GB of LFS-tracked files (OKD binaries, PXE images, SCOS images).
|
||||
|
||||
## Tasks
|
||||
|
||||
### 3.1 Port k3d tests to `harmony_assets`
|
||||
|
||||
The k3d crate has 5 well-written tests in `downloadable_asset.rs`. Port them to test `harmony_assets::LocalStore`:
|
||||
|
||||
```rust
|
||||
// harmony_assets/tests/local_store.rs (or in src/ as unit tests)
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_fetch_downloads_and_verifies_checksum() {
|
||||
// Start httptest server serving a known file
|
||||
// Create Asset with URL pointing to mock server
|
||||
// Fetch via LocalStore
|
||||
// Assert file exists at expected cache path
|
||||
// Assert checksum matches
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_fetch_returns_cached_file_when_present() {
|
||||
// Pre-populate cache with correct file
|
||||
// Fetch — assert no HTTP request made (mock server not hit)
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_fetch_fails_on_404() { ... }
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_fetch_fails_on_checksum_mismatch() { ... }
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_fetch_with_progress_callback() {
|
||||
// Assert progress callback is called with (bytes_received, total_size)
|
||||
}
|
||||
```
|
||||
|
||||
Add `httptest` to `[dev-dependencies]` of `harmony_assets`.
|
||||
|
||||
### 3.2 Refactor `k3d` to use `harmony_assets`
|
||||
|
||||
Replace `k3d/src/downloadable_asset.rs` with calls to `harmony_assets`:
|
||||
|
||||
```rust
|
||||
// k3d/src/lib.rs — in download_latest_release()
|
||||
use harmony_assets::{Asset, LocalCache, LocalStore, ChecksumAlgo};
|
||||
|
||||
let asset = Asset::new(
|
||||
binary_url,
|
||||
checksum,
|
||||
ChecksumAlgo::SHA256,
|
||||
K3D_BIN_FILE_NAME.to_string(),
|
||||
);
|
||||
let cache = LocalCache::new(self.base_dir.clone());
|
||||
let store = LocalStore::new();
|
||||
let path = store.fetch(&asset, &cache, None).await
|
||||
.map_err(|e| format!("Failed to download k3d: {}", e))?;
|
||||
```
|
||||
|
||||
Delete `k3d/src/downloadable_asset.rs`. Update k3d's `Cargo.toml` to depend on `harmony_assets`.
|
||||
|
||||
### 3.3 Define asset metadata as config structs
|
||||
|
||||
Following `plan.md` Phase 2, create typed config for OKD assets using `harmony_config`:
|
||||
|
||||
```rust
|
||||
// harmony/src/modules/okd/config.rs
|
||||
#[derive(Config, Serialize, Deserialize, JsonSchema, InteractiveParse)]
|
||||
struct OkdInstallerConfig {
|
||||
pub openshift_install_url: String,
|
||||
pub openshift_install_sha256: String,
|
||||
pub scos_kernel_url: String,
|
||||
pub scos_kernel_sha256: String,
|
||||
pub scos_initramfs_url: String,
|
||||
pub scos_initramfs_sha256: String,
|
||||
pub scos_rootfs_url: String,
|
||||
pub scos_rootfs_sha256: String,
|
||||
}
|
||||
```
|
||||
|
||||
First run prompts for URLs/checksums (or uses compiled-in defaults). Values persist to SQLite. Can be overridden via env vars or OpenBao.
|
||||
|
||||
### 3.4 Implement `Url::Url` in OPNsense infra layer
|
||||
|
||||
In `harmony/src/infra/opnsense/http.rs` and `tftp.rs`, implement the `Url::Url(url)` match arm:
|
||||
|
||||
```rust
|
||||
// Instead of SCP-ing files to OPNsense:
|
||||
// SSH into OPNsense, run: fetch -o /usr/local/http/{path} {url}
|
||||
// (FreeBSD-native HTTP client, no extra deps on OPNsense)
|
||||
```
|
||||
|
||||
This eliminates the manual `scp` workaround and the `inquire::Confirm` prompts in `ipxe.rs:126` and `bootstrap_02_bootstrap.rs:230`.
|
||||
|
||||
### 3.5 Refactor OKD modules to use assets + config
|
||||
|
||||
In `bootstrap_02_bootstrap.rs`:
|
||||
- `openshift-install`: Resolve `OkdInstallerConfig` from `harmony_config`, download via `harmony_assets`, invoke from cache.
|
||||
- SCOS images: Pass `Url::Url(scos_kernel_url)` etc. to `StaticFilesHttpScore`. OPNsense fetches from S3 directly.
|
||||
- Remove `oc` and `kubectl` from `data/okd/bin/` (never used by code).
|
||||
|
||||
In `ipxe.rs`:
|
||||
- Replace the folder-to-serve SCP workaround with individual `Url::Url` entries.
|
||||
- Remove the `inquire::Confirm` SCP prompts.
|
||||
|
||||
### 3.6 Upload assets to S3
|
||||
|
||||
- Upload all current `data/` binaries to Ceph S3 bucket with path scheme: `harmony-assets/okd/v{version}/openshift-install`, `harmony-assets/pxe/centos-stream-9/install.img`, etc.
|
||||
- Set public-read ACL or configure presigned URL generation.
|
||||
- Record S3 URLs and SHA256 checksums as defaults in the config structs.
|
||||
|
||||
### 3.7 Remove LFS, clean git
|
||||
|
||||
- Remove all LFS-tracked files from the repo.
|
||||
- Update `.gitattributes` to remove LFS filters.
|
||||
- Keep `data/` in `.gitignore` (it becomes a local cache directory).
|
||||
- Use `git filter-repo` or BFG to strip LFS objects from history. This can be deferred while the repo stays internal, but it is required before the Phase 4 GitHub publish.
|
||||
|
||||
## Deliverables
|
||||
|
||||
- [ ] `harmony_assets` has tests ported from k3d pattern (5+ tests with httptest)
|
||||
- [ ] `k3d::DownloadableAsset` replaced by `harmony_assets` usage
|
||||
- [ ] `OkdInstallerConfig` struct using `harmony_config`
|
||||
- [ ] `Url::Url` implemented in OPNsense HTTP and TFTP infra
|
||||
- [ ] OKD bootstrap refactored to use lazy-download pattern
|
||||
- [ ] Assets uploaded to S3 with documented URLs/checksums
|
||||
- [ ] LFS removed, git history cleaned
|
||||
- [ ] Repo size small enough for GitHub (~code + templates only)
|
||||
110
ROADMAP/04-publish-github.md
Normal file
110
ROADMAP/04-publish-github.md
Normal file
@@ -0,0 +1,110 @@
|
||||
# Phase 4: Publish to GitHub
|
||||
|
||||
## Goal
|
||||
|
||||
Make Harmony publicly available on GitHub as the primary community hub for issues, pull requests, and discussions. CI runs on self-hosted runners.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- Phase 3 complete: LFS removed, git history cleaned, repo is small
|
||||
- README polished with quick-start, architecture overview, examples
|
||||
- All existing tests pass
|
||||
|
||||
## Tasks
|
||||
|
||||
### 4.1 Clean git history
|
||||
|
||||
```bash
|
||||
# Option A: git filter-repo (preferred)
|
||||
git filter-repo --strip-blobs-bigger-than 10M
|
||||
|
||||
# Option B: BFG Repo Cleaner
|
||||
bfg --strip-blobs-bigger-than 10M
|
||||
git reflog expire --expire=now --all
|
||||
git gc --prune=now --aggressive
|
||||
```
|
||||
|
||||
Verify final repo size is reasonable (target: <50MB including all code, docs, templates).
|
||||
|
||||
### 4.2 Create GitHub repository
|
||||
|
||||
- Create `NationTech/harmony` (or chosen org/name) on GitHub
|
||||
- Push cleaned repo as initial commit
|
||||
- Set default branch to `main` (rename from `master` if desired)
|
||||
|
||||
### 4.3 Set up CI on self-hosted runners
|
||||
|
||||
GitHub is the community hub, but CI runs on your own infrastructure. Options:
|
||||
|
||||
**Option A: GitHub Actions with self-hosted runners**
|
||||
- Register your Gitea runner machines as GitHub Actions self-hosted runners
|
||||
- Port `.gitea/workflows/check.yml` to `.github/workflows/check.yml`
|
||||
- Same Docker image (`hub.nationtech.io/harmony/harmony_composer:latest`), same commands
|
||||
- Pro: native GitHub PR checks, no external service needed
|
||||
- Con: runners need outbound access to GitHub API
|
||||
|
||||
**Option B: External CI (Woodpecker, Drone, Jenkins)**
|
||||
- Use any CI that supports webhooks from GitHub
|
||||
- Report status back to GitHub via commit status API / checks API
|
||||
- Pro: fully self-hosted, no GitHub dependency for builds
|
||||
- Con: extra integration work
|
||||
|
||||
**Option C: Keep Gitea CI, mirror from GitHub**
|
||||
- GitHub repo has a webhook that triggers Gitea CI on push
|
||||
- Gitea reports back to GitHub via commit status API
|
||||
- Pro: no migration of CI config
|
||||
- Con: fragile webhook chain
|
||||
|
||||
**Recommendation**: Option A. GitHub Actions self-hosted runners are straightforward and give the best contributor UX (native PR checks). The workflow files are nearly identical to Gitea workflows.
|
||||
|
||||
```yaml
|
||||
# .github/workflows/check.yml
|
||||
name: Check
|
||||
on: [push, pull_request]
|
||||
jobs:
|
||||
check:
|
||||
runs-on: self-hosted
|
||||
container:
|
||||
image: hub.nationtech.io/harmony/harmony_composer:latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- run: bash build/check.sh
|
||||
```
|
||||
|
||||
### 4.4 Polish documentation
|
||||
|
||||
- **README.md**: Quick-start (clone → run → get prompted → see result), architecture diagram (Score → Interpret → Topology), link to docs and examples
|
||||
- **CONTRIBUTING.md**: Already exists. Review for GitHub-specific guidance (fork workflow, PR template)
|
||||
- **docs/**: Already comprehensive. Verify links work on GitHub rendering
|
||||
- **Examples**: Ensure each example has a one-line description in its `Cargo.toml` and a comment block in `main.rs`
|
||||
|
||||
### 4.5 License and legal
|
||||
|
||||
- Verify workspace `license` field in root `Cargo.toml` is set correctly
|
||||
- Add `LICENSE` file at repo root if not present
|
||||
- Scan for any proprietary dependencies or hardcoded internal URLs
|
||||
|
||||
### 4.6 GitHub repository configuration
|
||||
|
||||
- Branch protection on `main`: require PR review, require CI to pass
|
||||
- Issue templates: bug report, feature request
|
||||
- PR template: checklist (tests pass, docs updated, etc.)
|
||||
- Topics/tags: `rust`, `infrastructure-as-code`, `kubernetes`, `orchestration`, `bare-metal`
|
||||
- Repository description: "Infrastructure orchestration framework. Declare what you want (Score), describe your infrastructure (Topology), let Harmony figure out how."
|
||||
|
||||
### 4.7 Gitea as internal mirror
|
||||
|
||||
- Set up Gitea to mirror from GitHub (pull mirror)
|
||||
- Internal CI can continue running on Gitea for private/experimental branches
|
||||
- Public contributions flow through GitHub
|
||||
|
||||
## Deliverables
|
||||
|
||||
- [ ] Git history cleaned, repo size <50MB
|
||||
- [ ] Public GitHub repository created
|
||||
- [ ] CI running on self-hosted runners with GitHub Actions
|
||||
- [ ] Branch protection enabled
|
||||
- [ ] README polished with quick-start guide
|
||||
- [ ] Issue and PR templates created
|
||||
- [ ] LICENSE file present
|
||||
- [ ] Gitea configured as mirror
|
||||
255
ROADMAP/05-e2e-tests-simple.md
Normal file
255
ROADMAP/05-e2e-tests-simple.md
Normal file
@@ -0,0 +1,255 @@
|
||||
# Phase 5: E2E Tests for PostgreSQL & RustFS
|
||||
|
||||
## Goal
|
||||
|
||||
Establish an automated E2E test pipeline that proves working examples actually work. Start with the two simplest k8s-based examples: PostgreSQL and RustFS.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- Phase 1 complete (config crate works, bootstrap is clean)
|
||||
- `feat/rustfs` branch merged
|
||||
|
||||
## Architecture
|
||||
|
||||
### Test harness: `tests/e2e/`
|
||||
|
||||
A dedicated workspace member crate at `tests/e2e/` that contains:
|
||||
|
||||
1. **Shared k3d utilities** — create/destroy clusters, wait for readiness
|
||||
2. **Per-example test modules** — each example gets a `#[tokio::test]` function
|
||||
3. **Assertion helpers** — wait for pods, check CRDs exist, verify services
|
||||
|
||||
```
|
||||
tests/
|
||||
e2e/
|
||||
Cargo.toml
|
||||
src/
|
||||
lib.rs # Shared test utilities
|
||||
k3d.rs # k3d cluster lifecycle
|
||||
k8s_assert.rs # K8s assertion helpers
|
||||
tests/
|
||||
postgresql.rs # PostgreSQL E2E test
|
||||
rustfs.rs # RustFS E2E test
|
||||
```
|
||||
|
||||
### k3d cluster lifecycle
|
||||
|
||||
```rust
|
||||
// tests/e2e/src/k3d.rs
|
||||
use k3d_rs::K3d;
|
||||
|
||||
pub struct TestCluster {
|
||||
pub name: String,
|
||||
pub k3d: K3d,
|
||||
pub client: kube::Client,
|
||||
reuse: bool,
|
||||
}
|
||||
|
||||
impl TestCluster {
|
||||
/// Creates a k3d cluster for testing.
|
||||
/// If HARMONY_E2E_REUSE_CLUSTER=1, the cluster is not deleted on drop, so later runs can reuse it.
|
||||
pub async fn ensure(name: &str) -> Result<Self, String> {
|
||||
let reuse = std::env::var("HARMONY_E2E_REUSE_CLUSTER")
|
||||
.map(|v| v == "1")
|
||||
.unwrap_or(false);
|
||||
|
||||
let base_dir = PathBuf::from("/tmp/harmony-e2e");
|
||||
let k3d = K3d::new(base_dir, Some(name.to_string()));
|
||||
|
||||
let client = k3d.ensure_installed().await?;
|
||||
|
||||
Ok(Self { name: name.to_string(), k3d, client, reuse })
|
||||
}
|
||||
|
||||
/// Returns the kubeconfig path for this cluster.
|
||||
pub fn kubeconfig_path(&self) -> String { ... }
|
||||
}
|
||||
|
||||
impl Drop for TestCluster {
|
||||
fn drop(&mut self) {
|
||||
if !self.reuse {
|
||||
// Best-effort cleanup
|
||||
let _ = self.k3d.run_k3d_command(["cluster", "delete", &self.name]);
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### K8s assertion helpers
|
||||
|
||||
```rust
|
||||
// tests/e2e/src/k8s_assert.rs
|
||||
|
||||
/// Wait until a pod matching the label selector is Running in the namespace.
|
||||
/// Times out after `timeout` duration.
|
||||
pub async fn wait_for_pod_running(
|
||||
client: &kube::Client,
|
||||
namespace: &str,
|
||||
label_selector: &str,
|
||||
timeout: Duration,
|
||||
) -> Result<(), String>
|
||||
|
||||
/// Assert a CRD instance exists.
|
||||
pub async fn assert_resource_exists<K: kube::Resource>(
|
||||
client: &kube::Client,
|
||||
name: &str,
|
||||
namespace: Option<&str>,
|
||||
) -> Result<(), String>
|
||||
|
||||
/// Install a Helm chart. Returns when all pods in the release are running.
|
||||
pub async fn helm_install(
|
||||
release_name: &str,
|
||||
chart: &str,
|
||||
namespace: &str,
|
||||
repo_url: Option<&str>,
|
||||
timeout: Duration,
|
||||
) -> Result<(), String>
|
||||
```
|
||||
|
||||
## Tasks
|
||||
|
||||
### 5.1 Create the `tests/e2e/` crate
|
||||
|
||||
Add to workspace `Cargo.toml`:
|
||||
|
||||
```toml
|
||||
[workspace]
|
||||
members = [
|
||||
# ... existing members
|
||||
"tests/e2e",
|
||||
]
|
||||
```
|
||||
|
||||
`tests/e2e/Cargo.toml`:
|
||||
|
||||
```toml
|
||||
[package]
|
||||
name = "harmony-e2e-tests"
|
||||
edition = "2024"
|
||||
publish = false
|
||||
|
||||
[dependencies]
|
||||
harmony = { path = "../../harmony" }
|
||||
harmony_cli = { path = "../../harmony_cli" }
|
||||
harmony_types = { path = "../../harmony_types" }
|
||||
k3d_rs = { path = "../../k3d", package = "k3d_rs" }
|
||||
kube = { workspace = true }
|
||||
k8s-openapi = { workspace = true }
|
||||
tokio = { workspace = true }
|
||||
log = { workspace = true }
|
||||
env_logger = { workspace = true }
|
||||
|
||||
[dev-dependencies]
|
||||
pretty_assertions = { workspace = true }
|
||||
```
|
||||
|
||||
### 5.2 PostgreSQL E2E test
|
||||
|
||||
```rust
|
||||
// tests/e2e/tests/postgresql.rs
|
||||
use harmony::modules::postgresql::{PostgreSQLScore, capability::PostgreSQLConfig};
|
||||
use harmony::topology::K8sAnywhereTopology;
|
||||
use harmony::inventory::Inventory;
|
||||
use harmony::maestro::Maestro;
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_postgresql_deploys_on_k3d() {
|
||||
let cluster = TestCluster::ensure("harmony-e2e-pg").await.unwrap();
|
||||
|
||||
// Install CNPG operator via Helm
|
||||
// (K8sAnywhereTopology::ensure_ready() now handles this since
|
||||
// commit e1183ef "K8s postgresql score now ensures cnpg is installed")
|
||||
// But we may need the Helm chart for non-OKD:
|
||||
helm_install(
|
||||
"cnpg",
|
||||
"cloudnative-pg",
|
||||
"cnpg-system",
|
||||
Some("https://cloudnative-pg.github.io/charts"),
|
||||
Duration::from_secs(120),
|
||||
).await.unwrap();
|
||||
|
||||
// Configure topology pointing to test cluster
|
||||
let config = K8sAnywhereConfig {
|
||||
kubeconfig: Some(cluster.kubeconfig_path()),
|
||||
use_local_k3d: false,
|
||||
autoinstall: false,
|
||||
use_system_kubeconfig: false,
|
||||
harmony_profile: "dev".to_string(),
|
||||
k8s_context: None,
|
||||
};
|
||||
let topology = K8sAnywhereTopology::with_config(config);
|
||||
|
||||
// Create and run the score
|
||||
let score = PostgreSQLScore {
|
||||
config: PostgreSQLConfig {
|
||||
cluster_name: "e2e-test-pg".to_string(),
|
||||
namespace: "e2e-pg-test".to_string(),
|
||||
..Default::default()
|
||||
},
|
||||
};
|
||||
|
||||
let mut maestro = Maestro::initialize(Inventory::autoload(), topology).await.unwrap();
|
||||
maestro.register_all(vec![Box::new(score)]);
|
||||
|
||||
let scores = maestro.scores().read().unwrap().first().unwrap().clone_box();
|
||||
let result = maestro.interpret(scores).await;
|
||||
assert!(result.is_ok(), "PostgreSQL score failed: {:?}", result.err());
|
||||
|
||||
// Assert: CNPG Cluster resource exists
|
||||
// (the Cluster CRD is applied — pod readiness may take longer)
|
||||
let client = cluster.client.clone();
|
||||
// ... assert the CNPG Cluster custom resource exists in the e2e-pg-test namespace
|
||||
}
|
||||
```
|
||||
|
||||
### 5.3 RustFS E2E test
|
||||
|
||||
Similar structure. Details depend on what the RustFS score deploys (likely a Helm chart or k8s resources for MinIO/RustFS).
|
||||
|
||||
```rust
|
||||
#[tokio::test]
|
||||
async fn test_rustfs_deploys_on_k3d() {
|
||||
let cluster = TestCluster::ensure("harmony-e2e-rustfs").await.unwrap();
|
||||
// ... similar pattern: configure topology, create score, interpret, assert
|
||||
}
|
||||
```
|
||||
|
||||
### 5.4 CI job for E2E tests
|
||||
|
||||
New workflow file (Gitea or GitHub Actions):
|
||||
|
||||
```yaml
|
||||
# .gitea/workflows/e2e.yml (or .github/workflows/e2e.yml)
|
||||
name: E2E Tests
|
||||
on:
|
||||
push:
|
||||
branches: [master, main]
|
||||
# Don't run on every PR — too slow. Runs on pushes to the main branch or via manual trigger.
|
||||
workflow_dispatch:
|
||||
|
||||
jobs:
|
||||
e2e:
|
||||
runs-on: self-hosted # Must have Docker available for k3d
|
||||
timeout-minutes: 15
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Install k3d
|
||||
run: curl -s https://raw.githubusercontent.com/k3d-io/k3d/main/install.sh | bash
|
||||
|
||||
- name: Run E2E tests
|
||||
run: cargo test -p harmony-e2e-tests -- --test-threads=1
|
||||
env:
|
||||
RUST_LOG: info
|
||||
```
|
||||
|
||||
Note `--test-threads=1`: E2E tests create k3d clusters and should not run in parallel (port conflicts, resource contention).
|
||||
|
||||
## Deliverables
|
||||
|
||||
- [ ] `tests/e2e/` crate added to workspace
|
||||
- [ ] Shared test utilities: `TestCluster`, `wait_for_pod_running`, `helm_install`
|
||||
- [ ] PostgreSQL E2E test passing
|
||||
- [ ] RustFS E2E test passing (after `feat/rustfs` merge)
|
||||
- [ ] CI job running E2E tests on push to main
|
||||
- [ ] `HARMONY_E2E_REUSE_CLUSTER=1` supported for fast local iteration
|
||||
214
ROADMAP/06-e2e-tests-kvm.md
Normal file
214
ROADMAP/06-e2e-tests-kvm.md
Normal file
@@ -0,0 +1,214 @@
|
||||
# Phase 6: E2E Tests for OKD HA Cluster on KVM
|
||||
|
||||
## Goal
|
||||
|
||||
Prove the full OKD bare-metal installation flow works end-to-end using KVM virtual machines. This is the ultimate validation of Harmony's core value proposition: declare an OKD cluster, point it at infrastructure, watch it materialize.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- Phase 5 complete (test harness exists, k3d tests passing)
|
||||
- `feature/kvm-module` merged to main
|
||||
- A CI runner with libvirt/KVM access and nested virtualization support
|
||||
|
||||
## Architecture
|
||||
|
||||
The KVM branch already has a `kvm_okd_ha_cluster` example that creates:
|
||||
|
||||
```
|
||||
Host bridge (WAN)
|
||||
|
|
||||
+--------------------+
|
||||
| OPNsense | 192.168.100.1
|
||||
| gateway + PXE |
|
||||
+--------+-----------+
|
||||
|
|
||||
harmonylan (192.168.100.0/24)
|
||||
+---------+---------+---------+---------+
|
||||
| | | | |
|
||||
+----+---+ +---+---+ +---+---+ +---+---+ +--+----+
|
||||
| cp0 | | cp1 | | cp2 | |worker0| |worker1|
|
||||
| .10 | | .11 | | .12 | | .20 | | .21 |
|
||||
+--------+ +-------+ +-------+ +-------+ +---+---+
|
||||
|
|
||||
+-----+----+
|
||||
| worker2 |
|
||||
| .22 |
|
||||
+----------+
|
||||
```
|
||||
|
||||
The test needs to orchestrate this entire setup, wait for OKD to converge, and assert the cluster is healthy.
|
||||
|
||||
## Tasks
|
||||
|
||||
### 6.1 Start with `example_linux_vm` — the simplest KVM test
|
||||
|
||||
Before tackling the full OKD stack, validate the KVM module itself with the simplest possible test:
|
||||
|
||||
```rust
|
||||
// tests/e2e/tests/kvm_linux_vm.rs
|
||||
|
||||
#[tokio::test]
|
||||
#[ignore] // Requires libvirt access — run with: cargo test -- --ignored
|
||||
async fn test_linux_vm_boots_from_iso() {
|
||||
let executor = KvmExecutor::from_env().unwrap();
|
||||
|
||||
// Create isolated network
|
||||
let network = NetworkConfig {
|
||||
name: "e2e-test-net".to_string(),
|
||||
bridge: "virbr200".to_string(),
|
||||
// ...
|
||||
};
|
||||
executor.ensure_network(&network).await.unwrap();
|
||||
|
||||
// Define and start VM
|
||||
let vm_config = VmConfig::builder("e2e-linux-test")
|
||||
.vcpus(1)
|
||||
.memory_gb(1)
|
||||
.disk(5)
|
||||
.network(NetworkRef::named("e2e-test-net"))
|
||||
.cdrom("https://releases.ubuntu.com/24.04/ubuntu-24.04-live-server-amd64.iso")
|
||||
.boot_order([BootDevice::Cdrom, BootDevice::Disk])
|
||||
.build();
|
||||
|
||||
executor.ensure_vm(&vm_config).await.unwrap();
|
||||
executor.start_vm("e2e-linux-test").await.unwrap();
|
||||
|
||||
// Assert VM is running
|
||||
let status = executor.vm_status("e2e-linux-test").await.unwrap();
|
||||
assert_eq!(status, VmStatus::Running);
|
||||
|
||||
// Cleanup
|
||||
executor.destroy_vm("e2e-linux-test").await.unwrap();
|
||||
executor.undefine_vm("e2e-linux-test").await.unwrap();
|
||||
executor.delete_network("e2e-test-net").await.unwrap();
|
||||
}
|
||||
```
|
||||
|
||||
This test validates:
|
||||
- ISO download works (via `harmony_assets` if refactored, or built-in KVM module download)
|
||||
- libvirt XML generation is correct
|
||||
- VM lifecycle (define → start → status → destroy → undefine)
|
||||
- Network creation/deletion
|
||||
|
||||
### 6.2 OKD HA Cluster E2E test
|
||||
|
||||
The full integration test. This is long-running (30-60 minutes) and should only run nightly or on-demand.
|
||||
|
||||
```rust
|
||||
// tests/e2e/tests/kvm_okd_ha.rs
|
||||
|
||||
#[tokio::test]
|
||||
#[ignore] // Requires KVM + significant resources. Run nightly.
|
||||
async fn test_okd_ha_cluster_on_kvm() {
|
||||
// 1. Create virtual infrastructure
|
||||
// - OPNsense gateway VM
|
||||
// - 3 control plane VMs
|
||||
// - 3 worker VMs
|
||||
// - Virtual network (harmonylan)
|
||||
|
||||
// 2. Run OKD installation scores
|
||||
// (the kvm_okd_ha_cluster example, but as a test)
|
||||
|
||||
// 3. Wait for OKD API server to become reachable
|
||||
// - Poll https://api.okd.harmonylan:6443 until it responds
|
||||
// - Timeout: 30 minutes
|
||||
|
||||
// 4. Assert cluster health
|
||||
// - All nodes in Ready state
|
||||
// - ClusterVersion reports Available=True
|
||||
// - Sample workload (nginx) deploys and pod reaches Running
|
||||
|
||||
// 5. Cleanup
|
||||
// - Destroy all VMs
|
||||
// - Delete virtual networks
|
||||
// - Clean up disk images
|
||||
}
|
||||
```
|
||||
|
||||
### 6.3 CI runner requirements
|
||||
|
||||
The KVM E2E test needs a runner with:
|
||||
|
||||
- **Hardware**: 32GB+ RAM, 8+ CPU cores, 100GB+ disk
|
||||
- **Software**: libvirt, QEMU/KVM, `virsh`, nested virtualization enabled
|
||||
- **Network**: Outbound internet access (to download ISOs, OKD images)
|
||||
- **Permissions**: User in `libvirt` group, or root access
|
||||
|
||||
Options:
|
||||
- **Dedicated bare-metal machine** registered as a self-hosted GitHub Actions runner
|
||||
- **Cloud VM with nested virt** (e.g., GCP n2-standard-8 with `--enable-nested-virtualization`)
|
||||
- **Manual trigger only** — developer runs locally, CI just tracks pass/fail
|
||||
|
||||
### 6.4 Nightly CI job
|
||||
|
||||
```yaml
|
||||
# .github/workflows/e2e-kvm.yml
|
||||
name: E2E KVM Tests
|
||||
on:
|
||||
schedule:
|
||||
- cron: '0 2 * * *' # 2 AM daily
|
||||
workflow_dispatch: # Manual trigger
|
||||
|
||||
jobs:
|
||||
kvm-tests:
|
||||
runs-on: [self-hosted, kvm] # Label for KVM-capable runners
|
||||
timeout-minutes: 90
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Run KVM E2E tests
|
||||
run: cargo test -p harmony-e2e-tests -- --ignored --test-threads=1
|
||||
env:
|
||||
RUST_LOG: info
|
||||
HARMONY_KVM_URI: qemu:///system
|
||||
|
||||
- name: Cleanup VMs on failure
|
||||
if: failure()
|
||||
run: |
|
||||
virsh list --all --name | grep e2e | xargs -I {} virsh destroy {} || true
|
||||
virsh list --all --name | grep e2e | xargs -I {} virsh undefine {} --remove-all-storage || true
|
||||
```
|
||||
|
||||
### 6.5 Test resource management
|
||||
|
||||
KVM tests create real resources that must be cleaned up even on failure. Implement a test fixture pattern:
|
||||
|
||||
```rust
|
||||
struct KvmTestFixture {
|
||||
executor: KvmExecutor,
|
||||
vms: Vec<String>,
|
||||
networks: Vec<String>,
|
||||
}
|
||||
|
||||
impl KvmTestFixture {
|
||||
fn track_vm(&mut self, name: &str) { self.vms.push(name.to_string()); }
|
||||
fn track_network(&mut self, name: &str) { self.networks.push(name.to_string()); }
|
||||
}
|
||||
|
||||
impl Drop for KvmTestFixture {
|
||||
fn drop(&mut self) {
|
||||
// Best-effort cleanup of all tracked resources
|
||||
for vm in &self.vms {
|
||||
let _ = std::process::Command::new("virsh")
|
||||
.args(["destroy", vm]).output();
|
||||
let _ = std::process::Command::new("virsh")
|
||||
.args(["undefine", vm, "--remove-all-storage"]).output();
|
||||
}
|
||||
for net in &self.networks {
|
||||
let _ = std::process::Command::new("virsh")
|
||||
.args(["net-destroy", net]).output();
|
||||
let _ = std::process::Command::new("virsh")
|
||||
.args(["net-undefine", net]).output();
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Deliverables
|
||||
|
||||
- [ ] `test_linux_vm_boots_from_iso` — passing KVM smoke test
|
||||
- [ ] `test_okd_ha_cluster_on_kvm` — full OKD installation test
|
||||
- [ ] `KvmTestFixture` with resource cleanup on test failure
|
||||
- [ ] Nightly CI job on KVM-capable runner
|
||||
- [ ] Force-cleanup script for leaked VMs/networks
|
||||
- [ ] Documentation: how to set up a KVM runner for E2E tests
|
||||
9
book.toml
Normal file
9
book.toml
Normal file
@@ -0,0 +1,9 @@
|
||||
[book]
|
||||
title = "Harmony"
|
||||
description = "Infrastructure orchestration that treats your platform like first-class code"
|
||||
src = "docs"
|
||||
build-dir = "book"
|
||||
authors = ["NationTech"]
|
||||
|
||||
[output.html]
|
||||
mathjax-support = false
|
||||
4
brocade/examples/env.sh
Normal file
4
brocade/examples/env.sh
Normal file
@@ -0,0 +1,4 @@
|
||||
export HARMONY_SECRET_NAMESPACE=brocade-example
|
||||
export HARMONY_SECRET_STORE=file
|
||||
export HARMONY_DATABASE_URL=sqlite://harmony_brocade_example.sqlite
|
||||
export RUST_LOG=info
|
||||
@@ -1,6 +1,6 @@
|
||||
use std::net::{IpAddr, Ipv4Addr};
|
||||
|
||||
use brocade::{BrocadeOptions, ssh};
|
||||
use brocade::{BrocadeOptions, Vlan, ssh};
|
||||
use harmony_secret::{Secret, SecretManager};
|
||||
use harmony_types::switch::PortLocation;
|
||||
use schemars::JsonSchema;
|
||||
@@ -17,9 +17,12 @@ async fn main() {
|
||||
env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info")).init();
|
||||
|
||||
// let ip = IpAddr::V4(Ipv4Addr::new(10, 0, 0, 250)); // old brocade @ ianlet
|
||||
let ip = IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1)); // brocade @ sto1
|
||||
// let ip = IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1)); // brocade @ sto1
|
||||
// let ip = IpAddr::V4(Ipv4Addr::new(192, 168, 4, 11)); // brocade @ st
|
||||
let switch_addresses = vec![ip];
|
||||
//let switch_addresses = vec![ip];
|
||||
let ip0 = IpAddr::V4(Ipv4Addr::new(192, 168, 12, 147)); // brocade @ test
|
||||
let ip1 = IpAddr::V4(Ipv4Addr::new(192, 168, 12, 109)); // brocade @ test
|
||||
let switch_addresses = vec![ip0, ip1];
|
||||
|
||||
let config = SecretManager::get_or_prompt::<BrocadeSwitchAuth>()
|
||||
.await
|
||||
@@ -32,7 +35,7 @@ async fn main() {
|
||||
&BrocadeOptions {
|
||||
dry_run: true,
|
||||
ssh: ssh::SshOptions {
|
||||
port: 2222,
|
||||
port: 22,
|
||||
..Default::default()
|
||||
},
|
||||
..Default::default()
|
||||
@@ -58,12 +61,32 @@ async fn main() {
|
||||
}
|
||||
|
||||
println!("--------------");
|
||||
todo!();
|
||||
println!("Creating VLAN 100 (test-vlan)...");
|
||||
brocade
|
||||
.create_vlan(&Vlan {
|
||||
id: 100,
|
||||
name: "test-vlan".to_string(),
|
||||
})
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
println!("--------------");
|
||||
println!("Deleting VLAN 100...");
|
||||
brocade
|
||||
.delete_vlan(&Vlan {
|
||||
id: 100,
|
||||
name: "test-vlan".to_string(),
|
||||
})
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
println!("--------------");
|
||||
todo!("STOP!");
|
||||
let channel_name = "1";
|
||||
brocade.clear_port_channel(channel_name).await.unwrap();
|
||||
|
||||
println!("--------------");
|
||||
let channel_id = brocade.find_available_channel_id().await.unwrap();
|
||||
let channel_id = 1;
|
||||
|
||||
println!("--------------");
|
||||
let channel_name = "HARMONY_LAG";
|
||||
|
||||
242
brocade/examples/main_vlan_demo.rs
Normal file
242
brocade/examples/main_vlan_demo.rs
Normal file
@@ -0,0 +1,242 @@
|
||||
use std::io::{self, Write};
|
||||
|
||||
use brocade::{
|
||||
BrocadeOptions, InterfaceConfig, InterfaceSpeed, InterfaceType, PortOperatingMode,
|
||||
SwitchInterface, Vlan, VlanList, ssh,
|
||||
};
|
||||
use harmony_secret::{Secret, SecretManager};
|
||||
use harmony_types::switch::PortLocation;
|
||||
use schemars::JsonSchema;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// Username/password pair used to log in to the Brocade switches.
///
/// `main` obtains it via `SecretManager::get_or_prompt` and passes both
/// fields to `brocade::init`.
#[derive(Secret, Clone, Debug, JsonSchema, Serialize, Deserialize)]
|
||||
struct BrocadeSwitchAuth {
|
||||
username: String,
|
||||
password: String,
|
||||
}
|
||||
|
||||
/// Prints a "press ENTER" prompt and blocks until one line has been read
/// from standard input.
///
/// The line that was read is discarded.
///
/// # Panics
///
/// Panics if flushing stdout or reading from stdin fails.
fn wait_for_enter() {
    println!("\n--- Press ENTER to continue ---");

    // Push the prompt out before blocking on input.
    let mut stdout = io::stdout();
    stdout.flush().unwrap();

    let mut discarded = String::new();
    io::stdin().read_line(&mut discarded).unwrap();
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() {
|
||||
env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info")).init();
|
||||
|
||||
let ip0 = std::net::IpAddr::V4(std::net::Ipv4Addr::new(192, 168, 12, 147));
|
||||
let ip1 = std::net::IpAddr::V4(std::net::Ipv4Addr::new(192, 168, 12, 109));
|
||||
let switch_addresses = vec![ip0, ip1];
|
||||
|
||||
let config = SecretManager::get_or_prompt::<BrocadeSwitchAuth>()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let brocade = brocade::init(
|
||||
&switch_addresses,
|
||||
&config.username,
|
||||
&config.password,
|
||||
&BrocadeOptions {
|
||||
dry_run: false,
|
||||
ssh: ssh::SshOptions {
|
||||
port: 22,
|
||||
..Default::default()
|
||||
},
|
||||
..Default::default()
|
||||
},
|
||||
)
|
||||
.await
|
||||
.expect("Brocade client failed to connect");
|
||||
|
||||
println!("=== Connecting to Brocade switches ===");
|
||||
let version = brocade.version().await.unwrap();
|
||||
println!("Version: {version:?}");
|
||||
let entries = brocade.get_stack_topology().await.unwrap();
|
||||
println!("Stack topology: {entries:#?}");
|
||||
|
||||
println!("\n=== Creating VLANs 100, 200, 300 ===");
|
||||
brocade
|
||||
.create_vlan(&Vlan {
|
||||
id: 100,
|
||||
name: "vlan100".to_string(),
|
||||
})
|
||||
.await
|
||||
.unwrap();
|
||||
println!("Created VLAN 100 (vlan100)");
|
||||
brocade
|
||||
.create_vlan(&Vlan {
|
||||
id: 200,
|
||||
name: "vlan200".to_string(),
|
||||
})
|
||||
.await
|
||||
.unwrap();
|
||||
println!("Created VLAN 200 (vlan200)");
|
||||
brocade
|
||||
.create_vlan(&Vlan {
|
||||
id: 300,
|
||||
name: "vlan300".to_string(),
|
||||
})
|
||||
.await
|
||||
.unwrap();
|
||||
println!("Created VLAN 300 (vlan300)");
|
||||
|
||||
println!("\n=== Press ENTER to continue to port configuration tests ---");
|
||||
wait_for_enter();
|
||||
|
||||
println!("\n=== TEST 1: Trunk port (all VLANs, speed 10Gbps) on TenGigabitEthernet 1/0/1 ===");
|
||||
println!("Configuring port as trunk with all VLANs and speed 10Gbps...");
|
||||
let configs = vec![InterfaceConfig {
|
||||
interface: SwitchInterface::Ethernet(
|
||||
InterfaceType::TenGigabitEthernet,
|
||||
PortLocation(1, 0, 1),
|
||||
),
|
||||
mode: PortOperatingMode::Trunk,
|
||||
access_vlan: None,
|
||||
trunk_vlans: Some(VlanList::All),
|
||||
speed: Some(InterfaceSpeed::Gbps10),
|
||||
}];
|
||||
brocade.configure_interfaces(&configs).await.unwrap();
|
||||
println!("Querying interfaces...");
|
||||
let interfaces = brocade.get_interfaces().await.unwrap();
|
||||
for iface in &interfaces {
|
||||
if iface.name.contains("1/0/1") {
|
||||
println!(" {iface:?}");
|
||||
}
|
||||
}
|
||||
wait_for_enter();
|
||||
|
||||
println!("\n=== TEST 2: Trunk port (specific VLANs) on TenGigabitEthernet 1/0/2 ===");
|
||||
println!("Configuring port as trunk with VLANs 100, 200...");
|
||||
let configs = vec![InterfaceConfig {
|
||||
interface: SwitchInterface::Ethernet(
|
||||
InterfaceType::TenGigabitEthernet,
|
||||
PortLocation(1, 0, 2),
|
||||
),
|
||||
mode: PortOperatingMode::Trunk,
|
||||
access_vlan: None,
|
||||
trunk_vlans: Some(VlanList::Specific(vec![
|
||||
Vlan {
|
||||
id: 100,
|
||||
name: "vlan100".to_string(),
|
||||
},
|
||||
Vlan {
|
||||
id: 200,
|
||||
name: "vlan200".to_string(),
|
||||
},
|
||||
])),
|
||||
speed: None,
|
||||
}];
|
||||
brocade.configure_interfaces(&configs).await.unwrap();
|
||||
println!("Querying interfaces...");
|
||||
let interfaces = brocade.get_interfaces().await.unwrap();
|
||||
for iface in &interfaces {
|
||||
if iface.name.contains("1/0/2") {
|
||||
println!(" {iface:?}");
|
||||
}
|
||||
}
|
||||
wait_for_enter();
|
||||
|
||||
println!("\n=== TEST 3: Access port (default VLAN 1) on TenGigabitEthernet 1/0/3 ===");
|
||||
println!("Configuring port as access (default VLAN 1)...");
|
||||
let configs = vec![InterfaceConfig {
|
||||
interface: SwitchInterface::Ethernet(
|
||||
InterfaceType::TenGigabitEthernet,
|
||||
PortLocation(1, 0, 3),
|
||||
),
|
||||
mode: PortOperatingMode::Access,
|
||||
access_vlan: None,
|
||||
trunk_vlans: None,
|
||||
speed: None,
|
||||
}];
|
||||
brocade.configure_interfaces(&configs).await.unwrap();
|
||||
println!("Querying interfaces...");
|
||||
let interfaces = brocade.get_interfaces().await.unwrap();
|
||||
for iface in &interfaces {
|
||||
if iface.name.contains("1/0/3") {
|
||||
println!(" {iface:?}");
|
||||
}
|
||||
}
|
||||
wait_for_enter();
|
||||
|
||||
println!("\n=== TEST 4: Access port (custom VLAN 100) on TenGigabitEthernet 1/0/4 ===");
|
||||
println!("Configuring port as access with VLAN 100...");
|
||||
let configs = vec![InterfaceConfig {
|
||||
interface: SwitchInterface::Ethernet(
|
||||
InterfaceType::TenGigabitEthernet,
|
||||
PortLocation(1, 0, 4),
|
||||
),
|
||||
mode: PortOperatingMode::Access,
|
||||
access_vlan: Some(100),
|
||||
trunk_vlans: None,
|
||||
speed: None,
|
||||
}];
|
||||
brocade.configure_interfaces(&configs).await.unwrap();
|
||||
println!("Querying interfaces...");
|
||||
let interfaces = brocade.get_interfaces().await.unwrap();
|
||||
for iface in &interfaces {
|
||||
if iface.name.contains("1/0/4") {
|
||||
println!(" {iface:?}");
|
||||
}
|
||||
}
|
||||
wait_for_enter();
|
||||
|
||||
println!("\n=== TEST 5: Port-channel on TenGigabitEthernet 1/0/5 and 1/0/6 ===");
|
||||
let channel_id = 1;
|
||||
println!("Using channel ID: {channel_id}");
|
||||
println!("Creating port-channel with ports 1/0/5 and 1/0/6...");
|
||||
let ports = [PortLocation(1, 0, 5), PortLocation(1, 0, 6)];
|
||||
brocade
|
||||
.create_port_channel(channel_id, "HARMONY_LAG", &ports)
|
||||
.await
|
||||
.unwrap();
|
||||
println!("Port-channel created.");
|
||||
println!("Querying port-channel summary...");
|
||||
let interfaces = brocade.get_interfaces().await.unwrap();
|
||||
for iface in &interfaces {
|
||||
if iface.name.contains("1/0/5") || iface.name.contains("1/0/6") {
|
||||
println!(" {iface:?}");
|
||||
}
|
||||
}
|
||||
wait_for_enter();
|
||||
|
||||
println!("\n=== TEARDOWN: Clearing port-channels and deleting VLANs ===");
|
||||
println!("Clearing port-channel {channel_id}...");
|
||||
brocade
|
||||
.clear_port_channel(&channel_id.to_string())
|
||||
.await
|
||||
.unwrap();
|
||||
println!("Resetting interfaces...");
|
||||
for port in 1..=6 {
|
||||
let interface = format!("TenGigabitEthernet 1/0/{port}");
|
||||
println!(" Resetting {interface}...");
|
||||
brocade.reset_interface(&interface).await.unwrap();
|
||||
}
|
||||
|
||||
println!("Deleting VLAN 100...");
|
||||
brocade
|
||||
.delete_vlan(&Vlan {
|
||||
id: 100,
|
||||
name: "vlan100".to_string(),
|
||||
})
|
||||
.await
|
||||
.unwrap();
|
||||
println!("Deleting VLAN 200...");
|
||||
brocade
|
||||
.delete_vlan(&Vlan {
|
||||
id: 200,
|
||||
name: "vlan200".to_string(),
|
||||
})
|
||||
.await
|
||||
.unwrap();
|
||||
println!("Deleting VLAN 300...");
|
||||
brocade
|
||||
.delete_vlan(&Vlan {
|
||||
id: 300,
|
||||
name: "vlan300".to_string(),
|
||||
})
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
println!("\n=== DONE ===");
|
||||
}
|
||||
@@ -1,7 +1,7 @@
|
||||
use super::BrocadeClient;
|
||||
use crate::{
|
||||
BrocadeInfo, Error, ExecutionMode, InterSwitchLink, InterfaceInfo, MacAddressEntry,
|
||||
PortChannelId, PortOperatingMode, SecurityLevel, parse_brocade_mac_address,
|
||||
BrocadeInfo, Error, ExecutionMode, InterSwitchLink, InterfaceConfig, InterfaceInfo,
|
||||
MacAddressEntry, PortChannelId, PortOperatingMode, Vlan, parse_brocade_mac_address,
|
||||
shell::BrocadeShell,
|
||||
};
|
||||
|
||||
@@ -139,10 +139,15 @@ impl BrocadeClient for FastIronClient {
|
||||
todo!()
|
||||
}
|
||||
|
||||
async fn configure_interfaces(
|
||||
&self,
|
||||
_interfaces: &Vec<(String, PortOperatingMode)>,
|
||||
) -> Result<(), Error> {
|
||||
async fn configure_interfaces(&self, _interfaces: &Vec<InterfaceConfig>) -> Result<(), Error> {
|
||||
todo!()
|
||||
}
|
||||
|
||||
async fn create_vlan(&self, _vlan: &Vlan) -> Result<(), Error> {
|
||||
todo!()
|
||||
}
|
||||
|
||||
async fn delete_vlan(&self, _vlan: &Vlan) -> Result<(), Error> {
|
||||
todo!()
|
||||
}
|
||||
|
||||
@@ -195,6 +200,25 @@ impl BrocadeClient for FastIronClient {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn reset_interface(&self, interface: &str) -> Result<(), Error> {
|
||||
info!("[Brocade] Resetting interface: {interface}");
|
||||
|
||||
let commands = vec![
|
||||
"configure terminal".into(),
|
||||
format!("interface {interface}"),
|
||||
"no switchport".into(),
|
||||
"no speed".into(),
|
||||
"exit".into(),
|
||||
];
|
||||
|
||||
self.shell
|
||||
.run_commands(commands, ExecutionMode::Privileged)
|
||||
.await?;
|
||||
|
||||
info!("[Brocade] Interface '{interface}' reset.");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn clear_port_channel(&self, channel_name: &str) -> Result<(), Error> {
|
||||
info!("[Brocade] Clearing port-channel: {channel_name}");
|
||||
|
||||
|
||||
@@ -76,6 +76,74 @@ pub struct MacAddressEntry {
|
||||
|
||||
pub type PortChannelId = u8;
|
||||
|
||||
/// A VLAN described by its numeric id and a name.
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
|
||||
pub struct Vlan {
|
||||
/// Numeric VLAN id; this is the value placed into the switch commands
/// (e.g. `interface Vlan <id>`).
pub id: u16,
|
||||
/// Name given to the VLAN when it is created (`name <name>`).
pub name: String,
|
||||
}
|
||||
|
||||
/// The set of VLANs allowed on a trunk port.
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
|
||||
pub enum VlanList {
|
||||
/// Every VLAN is allowed.
All,
|
||||
/// Only the listed VLANs are allowed.
Specific(Vec<Vlan>),
|
||||
}
|
||||
|
||||
/// Identifies an interface that can be configured on a switch.
///
/// The `Display` implementation renders the value as the text that follows the
/// `interface` keyword in a switch command.
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
|
||||
pub enum SwitchInterface {
|
||||
/// An Ethernet port: its interface type plus its port location.
Ethernet(InterfaceType, PortLocation),
|
||||
/// A port-channel, identified by its channel id.
PortChannel(PortChannelId),
|
||||
}
|
||||
|
||||
impl fmt::Display for SwitchInterface {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match self {
|
||||
SwitchInterface::Ethernet(itype, loc) => write!(f, "{itype} {loc}"),
|
||||
SwitchInterface::PortChannel(id) => write!(f, "port-channel {id}"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Speed setting that can be forced on an interface.
///
/// Each variant's `Display` output is the argument written after the `speed` command.
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
|
||||
pub enum InterfaceSpeed {
|
||||
/// Displayed as `100`.
Mbps100,
|
||||
/// Displayed as `1000`.
Gbps1,
|
||||
/// Displayed as `1000-auto`.
Gbps1Auto,
|
||||
/// Displayed as `10000`.
Gbps10,
|
||||
/// Displayed as `auto`.
Auto,
|
||||
}
|
||||
|
||||
impl fmt::Display for InterfaceSpeed {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match self {
|
||||
InterfaceSpeed::Mbps100 => write!(f, "100"),
|
||||
InterfaceSpeed::Gbps1 => write!(f, "1000"),
|
||||
InterfaceSpeed::Gbps1Auto => write!(f, "1000-auto"),
|
||||
InterfaceSpeed::Gbps10 => write!(f, "10000"),
|
||||
InterfaceSpeed::Auto => write!(f, "auto"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Desired configuration of a single interface, as consumed by
/// `BrocadeClient::configure_interfaces`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
|
||||
pub struct InterfaceConfig {
|
||||
/// The interface to configure.
pub interface: SwitchInterface,
|
||||
/// Operating mode to put the interface into.
pub mode: PortOperatingMode,
|
||||
/// VLAN id for access mode. The Network OS client falls back to VLAN 1 when this is `None`.
pub access_vlan: Option<u16>,
|
||||
/// VLANs allowed in trunk mode. The Network OS client allows all VLANs when this is `None`.
pub trunk_vlans: Option<VlanList>,
|
||||
/// Optional speed override; when `None` no `speed` command is issued.
pub speed: Option<InterfaceSpeed>,
|
||||
}
|
||||
|
||||
/// Description of a port-channel: its identity, member ports, and switchport settings.
///
/// NOTE(review): no consumer of this type is visible in this change; the field notes
/// below are inferred from the matching fields of `InterfaceConfig` — confirm.
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
|
||||
pub struct PortChannelConfig {
|
||||
/// Port-channel id.
pub id: PortChannelId,
|
||||
/// Port-channel name.
pub name: String,
|
||||
/// Locations of the member ports.
pub ports: Vec<PortLocation>,
|
||||
/// Operating mode of the port-channel.
pub mode: PortOperatingMode,
|
||||
/// Access VLAN; note this is a full `Vlan`, unlike `InterfaceConfig::access_vlan` (`u16`).
pub access_vlan: Option<Vlan>,
|
||||
/// VLANs allowed when trunking.
pub trunk_vlans: Option<VlanList>,
|
||||
/// Optional speed override.
pub speed: Option<InterfaceSpeed>,
|
||||
}
|
||||
|
||||
/// Represents a single physical or logical link connecting two switches within a stack or fabric.
|
||||
///
|
||||
/// This structure provides a standardized view of the topology regardless of the
|
||||
@@ -104,16 +172,17 @@ pub struct InterfaceInfo {
|
||||
}
|
||||
|
||||
/// Categorizes the functional type of a switch interface.
|
||||
#[derive(Debug, PartialEq, Eq, Clone)]
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Serialize)]
|
||||
pub enum InterfaceType {
|
||||
/// Physical or virtual Ethernet interface (e.g., TenGigabitEthernet, FortyGigabitEthernet).
|
||||
Ethernet(String),
|
||||
TenGigabitEthernet,
|
||||
FortyGigabitEthernet,
|
||||
}
|
||||
|
||||
impl fmt::Display for InterfaceType {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match self {
|
||||
InterfaceType::Ethernet(name) => write!(f, "{name}"),
|
||||
InterfaceType::TenGigabitEthernet => write!(f, "TenGigabitEthernet"),
|
||||
InterfaceType::FortyGigabitEthernet => write!(f, "FortyGigabitEthernet"),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -206,10 +275,13 @@ pub trait BrocadeClient: std::fmt::Debug {
|
||||
async fn get_interfaces(&self) -> Result<Vec<InterfaceInfo>, Error>;
|
||||
|
||||
/// Configures a set of interfaces to be operated with a specified mode (access ports, ISL, etc.).
|
||||
async fn configure_interfaces(
|
||||
&self,
|
||||
interfaces: &Vec<(String, PortOperatingMode)>,
|
||||
) -> Result<(), Error>;
|
||||
async fn configure_interfaces(&self, interfaces: &Vec<InterfaceConfig>) -> Result<(), Error>;
|
||||
|
||||
/// Creates a new VLAN on the switch.
|
||||
async fn create_vlan(&self, vlan: &Vlan) -> Result<(), Error>;
|
||||
|
||||
/// Deletes a VLAN from the switch.
|
||||
async fn delete_vlan(&self, vlan: &Vlan) -> Result<(), Error>;
|
||||
|
||||
/// Scans the existing configuration to find the next available (unused)
|
||||
/// Port-Channel ID (`lag` or `trunk`) for assignment.
|
||||
@@ -246,6 +318,9 @@ pub trait BrocadeClient: std::fmt::Debug {
|
||||
/// * `des`: The Data Encryption Standard algorithm key
|
||||
async fn enable_snmp(&self, user_name: &str, auth: &str, des: &str) -> Result<(), Error>;
|
||||
|
||||
/// Resets an interface to its default state by removing switchport configuration.
|
||||
async fn reset_interface(&self, interface: &str) -> Result<(), Error>;
|
||||
|
||||
/// Removes all configuration associated with the specified Port-Channel name.
|
||||
///
|
||||
/// This operation should be idempotent; attempting to clear a non-existent
|
||||
|
||||
@@ -6,9 +6,10 @@ use log::{debug, info};
|
||||
use regex::Regex;
|
||||
|
||||
use crate::{
|
||||
BrocadeClient, BrocadeInfo, Error, ExecutionMode, InterSwitchLink, InterfaceInfo,
|
||||
InterfaceStatus, InterfaceType, MacAddressEntry, PortChannelId, PortOperatingMode,
|
||||
SecurityLevel, parse_brocade_mac_address, shell::BrocadeShell,
|
||||
BrocadeClient, BrocadeInfo, Error, ExecutionMode, InterSwitchLink, InterfaceConfig,
|
||||
InterfaceInfo, InterfaceStatus, InterfaceType, MacAddressEntry, PortChannelId,
|
||||
PortOperatingMode, SwitchInterface, Vlan, VlanList, parse_brocade_mac_address,
|
||||
shell::BrocadeShell,
|
||||
};
|
||||
|
||||
#[derive(Debug)]
|
||||
@@ -84,8 +85,8 @@ impl NetworkOperatingSystemClient {
|
||||
}
|
||||
|
||||
let interface_type = match parts[0] {
|
||||
"Fo" => InterfaceType::Ethernet("FortyGigabitEthernet".to_string()),
|
||||
"Te" => InterfaceType::Ethernet("TenGigabitEthernet".to_string()),
|
||||
"Fo" => InterfaceType::FortyGigabitEthernet,
|
||||
"Te" => InterfaceType::TenGigabitEthernet,
|
||||
_ => return None,
|
||||
};
|
||||
let port_location = PortLocation::from_str(parts[1]).ok()?;
|
||||
@@ -185,18 +186,20 @@ impl BrocadeClient for NetworkOperatingSystemClient {
|
||||
.collect()
|
||||
}
|
||||
|
||||
async fn configure_interfaces(
|
||||
&self,
|
||||
interfaces: &Vec<(String, PortOperatingMode)>,
|
||||
) -> Result<(), Error> {
|
||||
async fn configure_interfaces(&self, interfaces: &Vec<InterfaceConfig>) -> Result<(), Error> {
|
||||
info!("[Brocade] Configuring {} interface(s)...", interfaces.len());
|
||||
|
||||
let mut commands = vec!["configure terminal".to_string()];
|
||||
|
||||
for interface in interfaces {
|
||||
commands.push(format!("interface {}", interface.0));
|
||||
debug!(
|
||||
"[Brocade] Configuring interface {} as {:?}",
|
||||
interface.interface, interface.mode
|
||||
);
|
||||
|
||||
match interface.1 {
|
||||
commands.push(format!("interface {}", interface.interface));
|
||||
|
||||
match interface.mode {
|
||||
PortOperatingMode::Fabric => {
|
||||
commands.push("fabric isl enable".into());
|
||||
commands.push("fabric trunk enable".into());
|
||||
@@ -204,23 +207,50 @@ impl BrocadeClient for NetworkOperatingSystemClient {
|
||||
PortOperatingMode::Trunk => {
|
||||
commands.push("switchport".into());
|
||||
commands.push("switchport mode trunk".into());
|
||||
commands.push("switchport trunk allowed vlan all".into());
|
||||
match &interface.trunk_vlans {
|
||||
Some(VlanList::All) => {
|
||||
commands.push("switchport trunk allowed vlan all".into());
|
||||
}
|
||||
Some(VlanList::Specific(vlans)) => {
|
||||
for vlan in vlans {
|
||||
commands.push(format!("switchport trunk allowed vlan add {}", vlan.id));
|
||||
}
|
||||
}
|
||||
None => {
|
||||
commands.push("switchport trunk allowed vlan all".into());
|
||||
}
|
||||
}
|
||||
commands.push("no switchport trunk tag native-vlan".into());
|
||||
commands.push("spanning-tree shutdown".into());
|
||||
commands.push("no fabric isl enable".into());
|
||||
commands.push("no fabric trunk enable".into());
|
||||
commands.push("no shutdown".into());
|
||||
if matches!(interface.interface, SwitchInterface::Ethernet(..)) {
|
||||
commands.push("spanning-tree shutdown".into());
|
||||
commands.push("no fabric isl enable".into());
|
||||
commands.push("no fabric trunk enable".into());
|
||||
}
|
||||
}
|
||||
PortOperatingMode::Access => {
|
||||
commands.push("switchport".into());
|
||||
commands.push("switchport mode access".into());
|
||||
commands.push("switchport access vlan 1".into());
|
||||
commands.push("no spanning-tree shutdown".into());
|
||||
commands.push("no fabric isl enable".into());
|
||||
commands.push("no fabric trunk enable".into());
|
||||
let access_vlan = interface.access_vlan.unwrap_or(1);
|
||||
commands.push(format!("switchport access vlan {access_vlan}"));
|
||||
if matches!(interface.interface, SwitchInterface::Ethernet(..)) {
|
||||
commands.push("no spanning-tree shutdown".into());
|
||||
commands.push("no fabric isl enable".into());
|
||||
commands.push("no fabric trunk enable".into());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(speed) = &interface.speed {
|
||||
info!(
|
||||
"[Brocade] Overriding speed on {} to {speed}",
|
||||
interface.interface
|
||||
);
|
||||
if matches!(interface.interface, SwitchInterface::PortChannel(..)) {
|
||||
commands.push("shutdown".into());
|
||||
}
|
||||
commands.push(format!("speed {speed}"));
|
||||
}
|
||||
|
||||
commands.push("no shutdown".into());
|
||||
commands.push("exit".into());
|
||||
}
|
||||
@@ -235,6 +265,40 @@ impl BrocadeClient for NetworkOperatingSystemClient {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn create_vlan(&self, vlan: &Vlan) -> Result<(), Error> {
|
||||
info!("[Brocade] Creating VLAN {} ({})", vlan.id, vlan.name);
|
||||
|
||||
let commands = vec![
|
||||
"configure terminal".into(),
|
||||
format!("interface Vlan {}", vlan.id),
|
||||
format!("name {}", vlan.name),
|
||||
"exit".into(),
|
||||
];
|
||||
|
||||
self.shell
|
||||
.run_commands(commands, ExecutionMode::Regular)
|
||||
.await?;
|
||||
|
||||
info!("[Brocade] VLAN {} ({}) created.", vlan.id, vlan.name);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn delete_vlan(&self, vlan: &Vlan) -> Result<(), Error> {
|
||||
info!("[Brocade] Deleting VLAN {}", vlan.id);
|
||||
|
||||
let commands = vec![
|
||||
"configure terminal".into(),
|
||||
format!("no interface Vlan {}", vlan.id),
|
||||
];
|
||||
|
||||
self.shell
|
||||
.run_commands(commands, ExecutionMode::Regular)
|
||||
.await?;
|
||||
|
||||
info!("[Brocade] VLAN {} deleted.", vlan.id);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn find_available_channel_id(&self) -> Result<PortChannelId, Error> {
|
||||
info!("[Brocade] Finding next available channel id...");
|
||||
|
||||
@@ -283,22 +347,20 @@ impl BrocadeClient for NetworkOperatingSystemClient {
|
||||
.join(", ")
|
||||
);
|
||||
|
||||
let interfaces = self.get_interfaces().await?;
|
||||
|
||||
let mut commands = vec![
|
||||
"configure terminal".into(),
|
||||
format!("interface port-channel {}", channel_id),
|
||||
"no shutdown".into(),
|
||||
format!("description {channel_name}"),
|
||||
"exit".into(),
|
||||
];
|
||||
|
||||
for port in ports {
|
||||
let interface = interfaces.iter().find(|i| i.port_location == *port);
|
||||
let Some(interface) = interface else {
|
||||
continue;
|
||||
};
|
||||
|
||||
commands.push(format!("interface {}", interface.name));
|
||||
debug!(
|
||||
"[Brocade] Adding port TenGigabitEthernet {} to channel-group {}",
|
||||
port, channel_id
|
||||
);
|
||||
commands.push(format!("interface TenGigabitEthernet {}", port));
|
||||
commands.push("no switchport".into());
|
||||
commands.push("no ip address".into());
|
||||
commands.push("no fabric isl enable".into());
|
||||
@@ -317,6 +379,25 @@ impl BrocadeClient for NetworkOperatingSystemClient {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn reset_interface(&self, interface: &str) -> Result<(), Error> {
|
||||
info!("[Brocade] Resetting interface: {interface}");
|
||||
|
||||
let commands = vec![
|
||||
"configure terminal".into(),
|
||||
format!("interface {interface}"),
|
||||
"no switchport".into(),
|
||||
"no speed".into(),
|
||||
"exit".into(),
|
||||
];
|
||||
|
||||
self.shell
|
||||
.run_commands(commands, ExecutionMode::Regular)
|
||||
.await?;
|
||||
|
||||
info!("[Brocade] Interface '{interface}' reset.");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn clear_port_channel(&self, channel_name: &str) -> Result<(), Error> {
|
||||
info!("[Brocade] Clearing port-channel: {channel_name}");
|
||||
|
||||
|
||||
11
build/book.sh
Executable file
11
build/book.sh
Executable file
@@ -0,0 +1,11 @@
|
||||
#!/bin/sh
# Builds the mdBook documentation and verifies that the key pages were generated.
set -e

# Work from the parent of this script's directory, wherever the script is invoked from.
cd "$(dirname "$0")/.."

cargo install mdbook --locked
mdbook build

# Fail (exit status 1) when an expected page is missing; diagnostics go to stderr.
for page in index.html concepts.html guides/getting-started.html; do
    if [ ! -f "book/$page" ]; then
        echo "ERROR: book/$page not found" >&2
        exit 1
    fi
done
|
||||
@@ -1,6 +1,11 @@
|
||||
#!/bin/sh
|
||||
set -e
|
||||
|
||||
cd "$(dirname "$0")/.."
|
||||
|
||||
git submodule init
|
||||
git submodule update
|
||||
|
||||
rustc --version
|
||||
cargo check --all-targets --all-features --keep-going
|
||||
cargo fmt --check
|
||||
16
build/ci.sh
Executable file
16
build/ci.sh
Executable file
@@ -0,0 +1,16 @@
|
||||
#!/bin/sh
# CI entry point: runs the code checks, then builds the documentation book.
# Usage: build/ci.sh [BRANCH]   (BRANCH defaults to "main")
|
||||
# Abort on the first failing command.
set -e
|
||||
|
||||
# Work from the parent of this script's directory, wherever the script is invoked from.
cd "$(dirname "$0")/.."
|
||||
|
||||
# Within this script the branch name is only used in the banner below.
BRANCH="${1:-main}"
|
||||
|
||||
echo "=== Running CI for branch: $BRANCH ==="
|
||||
|
||||
echo "--- Checking code ---"
|
||||
./build/check.sh
|
||||
|
||||
echo "--- Building book ---"
|
||||
./build/book.sh
|
||||
|
||||
# Only reached when both steps succeeded (because of `set -e`).
echo "=== CI passed ==="
|
||||
@@ -13,8 +13,8 @@ If you're new to Harmony, start here:
|
||||
|
||||
See how to use Harmony to solve real-world problems.
|
||||
|
||||
- [**PostgreSQL on Local K3D**](./use-cases/postgresql-on-local-k3d.md): Deploy a production-grade PostgreSQL cluster on a local K3D cluster. The fastest way to get started.
|
||||
- [**OKD on Bare Metal**](./use-cases/okd-on-bare-metal.md): A detailed walkthrough of bootstrapping a high-availability OKD cluster from physical hardware.
|
||||
- [**Deploy a Rust Web App**](./use-cases/deploy-rust-webapp.md): A quick guide to deploying a monitored, containerized web application to a Kubernetes cluster.
|
||||
|
||||
## 3. Component Catalogs
|
||||
|
||||
@@ -31,3 +31,7 @@ Ready to build your own components? These guides show you how.
|
||||
- [**Writing a Score**](./guides/writing-a-score.md): Learn how to create your own `Score` and `Interpret` logic to define a new desired state.
|
||||
- [**Writing a Topology**](./guides/writing-a-topology.md): Learn how to model a new environment (like AWS, GCP, or custom hardware) as a `Topology`.
|
||||
- [**Adding Capabilities**](./guides/adding-capabilities.md): See how to add a `Capability` to your custom `Topology`.
|
||||
|
||||
## 5. Architecture Decision Records
|
||||
|
||||
Harmony's design is documented through Architecture Decision Records (ADRs). See the [ADR Overview](./adr/README.md) for a complete index of all decisions.
|
||||
|
||||
53
docs/SUMMARY.md
Normal file
53
docs/SUMMARY.md
Normal file
@@ -0,0 +1,53 @@
|
||||
# Summary
|
||||
|
||||
[Harmony Documentation](./README.md)
|
||||
|
||||
- [Core Concepts](./concepts.md)
|
||||
- [Getting Started Guide](./guides/getting-started.md)
|
||||
|
||||
## Use Cases
|
||||
|
||||
- [PostgreSQL on Local K3D](./use-cases/postgresql-on-local-k3d.md)
|
||||
- [OKD on Bare Metal](./use-cases/okd-on-bare-metal.md)
|
||||
|
||||
## Component Catalogs
|
||||
|
||||
- [Scores Catalog](./catalogs/scores.md)
|
||||
- [Topologies Catalog](./catalogs/topologies.md)
|
||||
- [Capabilities Catalog](./catalogs/capabilities.md)
|
||||
|
||||
## Developer Guides
|
||||
|
||||
- [Developer Guide](./guides/developer-guide.md)
|
||||
- [Writing a Score](./guides/writing-a-score.md)
|
||||
- [Writing a Topology](./guides/writing-a-topology.md)
|
||||
- [Adding Capabilities](./guides/adding-capabilities.md)
|
||||
|
||||
## Configuration
|
||||
|
||||
- [Configuration](./concepts/configuration.md)
|
||||
|
||||
## Architecture Decision Records
|
||||
|
||||
- [ADR Overview](./adr/README.md)
|
||||
- [000 · ADR Template](./adr/000-ADR-Template.md)
|
||||
- [001 · Why Rust](./adr/001-rust.md)
|
||||
- [002 · Hexagonal Architecture](./adr/002-hexagonal-architecture.md)
|
||||
- [003 · Infrastructure Abstractions](./adr/003-infrastructure-abstractions.md)
|
||||
- [004 · iPXE](./adr/004-ipxe.md)
|
||||
- [005 · Interactive Project](./adr/005-interactive-project.md)
|
||||
- [006 · Secret Management](./adr/006-secret-management.md)
|
||||
- [007 · Default Runtime](./adr/007-default-runtime.md)
|
||||
- [008 · Score Display Formatting](./adr/008-score-display-formatting.md)
|
||||
- [009 · Helm and Kustomize Handling](./adr/009-helm-and-kustomize-handling.md)
|
||||
- [010 · Monitoring and Alerting](./adr/010-monitoring-and-alerting.md)
|
||||
- [011 · Multi-Tenant Cluster](./adr/011-multi-tenant-cluster.md)
|
||||
- [012 · Project Delivery Automation](./adr/012-project-delivery-automation.md)
|
||||
- [013 · Monitoring Notifications](./adr/013-monitoring-notifications.md)
|
||||
- [015 · Higher Order Topologies](./adr/015-higher-order-topologies.md)
|
||||
- [016 · Harmony Agent and Global Mesh](./adr/016-Harmony-Agent-And-Global-Mesh-For-Decentralized-Workload-Management.md)
|
||||
- [017-1 · NATS Clusters Interconnection](./adr/017-1-Nats-Clusters-Interconnection-Topology.md)
|
||||
- [018 · Template Hydration for Workload Deployment](./adr/018-Template-Hydration-For-Workload-Deployment.md)
|
||||
- [019 · Network Bond Setup](./adr/019-Network-bond-setup.md)
|
||||
- [020 · Interactive Configuration Crate](./adr/020-interactive-configuration-crate.md)
|
||||
- [020-1 · Zitadel + OpenBao Secure Config Store](./adr/020-1-zitadel-openbao-secure-config-store.md)
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
## Status
|
||||
|
||||
Proposed
|
||||
Rejected: see [ADR 020](./020-interactive-configuration-crate.md)
|
||||
|
||||
### TODO [#3](https://git.nationtech.io/NationTech/harmony/issues/3):
|
||||
|
||||
238
docs/adr/017-2-reviewed-staleness-detection-algorithm.md
Normal file
238
docs/adr/017-2-reviewed-staleness-detection-algorithm.md
Normal file
@@ -0,0 +1,238 @@
|
||||
Here are some rough notes on the previous design :
|
||||
|
||||
- We found an issue where there could be primary flapping when network latency is larger than the primary self fencing timeout.
|
||||
- e.g. network latency to get nats ack is 30 seconds (extreme but can happen), and self-fencing happens after 50 seconds. Then at second 50 self-fencing would occur, and then at second 60 ack comes in. At this point we reject the ack as already failed because of timeout. Self fencing happens. But then network latency comes back down to 5 seconds and lets one successful heartbeat through, this means the primary comes back to healthy, and the same thing repeats, so the primary flaps.
|
||||
- At least this does not cause split brain since the replica never times out and wins the leadership write since we validate strict write ordering and we force consensus on writes.
|
||||
|
||||
Also, we were seeing that the implementation became more complex. There are a lot of timers to handle, and that becomes hard to reason about for edge cases.
|
||||
|
||||
So, we came up with a slightly different approach, inspired by k8s liveness probes.
|
||||
|
||||
We now want to use failure and success threshold counters. However, on the replica side, all we can do is use a timer. The timer we can use is the time elapsed since the jetstream metadata timestamp of the last primary heartbeat. We could also try and mitigate clock skew by measuring time between internal clock and jetstream metadata timestamp when writing our own heartbeat (not for now, but worth thinking about, though I feel like it is useless).
|
||||
|
||||
So the current working design is this :
|
||||
|
||||
configure :
|
||||
- number of consecutive success to mark the node as UP
|
||||
- number of consecutive failures to mark the node as DOWN
|
||||
- note that success/failure must be consecutive. One success in a row of failures is enough to keep service up. This allows for various configuration profiles, from very strict availability to very lenient depending on the number of failures tolerated and successes required to keep the service up.
|
||||
- failure_threshold at 100 will let a service fail (or timeout) 99/100 and stay up
|
||||
- success_threshold at 100 will not bring back up a service until it has succeeded 100 heartbeats in a row
|
||||
- failure threshold at 1 will fail the service at the slightest network latency spike/packet loss
|
||||
- success threshold at 1 will bring the service up very quickly and may cause flapping in unstable network conditions
|
||||
|
||||
|
||||
```
|
||||
# heartbeat session log
|
||||
# failure threshold : 3
|
||||
# success threshold : 2
|
||||
|
||||
STATUS UP :
|
||||
t=1 probe : fail f=1 s=0
|
||||
t=2 probe : fail f=2 s=0
|
||||
t=3 probe : ok f=0 s=1
|
||||
t=4 probe : fail f=1 s=0
|
||||
```
|
||||
|
||||
Scenario :
|
||||
|
||||
failure threshold = 2
|
||||
heartbeat timeout = 1s
|
||||
total before fencing = 2 * 1 = 2s
|
||||
|
||||
staleness detection timer = 2*total before fencing
|
||||
|
||||
Open question: can we use this simple multiplication, i.e. set the staleness detection timer (the time the replica waits since the last primary heartbeat before promoting itself) to double the time the primary will take before starting the fencing process?
|
||||
|
||||
---
|
||||
|
||||
### Context
|
||||
We are designing a **Staleness-Based Failover Algorithm** for the Harmony Agent. The goal is to manage High Availability (HA) for stateful workloads (like PostgreSQL) across decentralized, variable-quality networks ("Micro Data Centers").
|
||||
|
||||
We are moving away from complex, synchronized clocks in favor of a **Counter-Based Liveness** approach (inspired by Kubernetes probes) for the Primary, and a **Time-Based Watchdog** for the Replica.
|
||||
|
||||
### 1. The Algorithm
|
||||
|
||||
#### The Primary (Self-Health & Fencing)
|
||||
The Primary validates its own "License to Operate" via a heartbeat loop.
|
||||
* **Loop:** Every `heartbeat_interval` (e.g., 1s), it attempts to write a heartbeat to NATS and check the local DB.
|
||||
* **Counters:** It maintains `consecutive_failures` and `consecutive_successes`.
|
||||
* **State Transition:**
|
||||
* **To UNHEALTHY:** If `consecutive_failures >= failure_threshold`, the Primary **Fences Self** (stops DB, releases locks).
|
||||
* **To HEALTHY:** If `consecutive_successes >= success_threshold`, the Primary **Un-fences** (starts DB, acquires locks).
|
||||
* **Reset Logic:** A single success resets the failure counter to 0, and vice versa.
|
||||
|
||||
#### The Replica (Staleness Detection)
|
||||
The Replica acts as a passive watchdog observing the NATS stream.
|
||||
* **Calculation:** It calculates a `MaxStaleness` timeout.
|
||||
$$ \text{MaxStaleness} = (\text{failure\_threshold} \times \text{heartbeat\_interval}) \times \text{SafetyMultiplier} $$
|
||||
*(We use a SafetyMultiplier of 2 to ensure the Primary has definitely fenced itself before we take over).*
|
||||
* **Action:** If `Time.now() - LastPrimaryHeartbeat > MaxStaleness`, the Replica assumes the Primary is dead and **Promotes Self**.
|
||||
|
||||
---
|
||||
|
||||
### 2. Configuration Trade-offs
|
||||
|
||||
The separation of `success` and `failure` thresholds allows us to tune the "personality" of the cluster.
|
||||
|
||||
#### Scenario A: The "Nervous" Cluster (High Sensitivity)
|
||||
* **Config:** `failure_threshold: 1`, `success_threshold: 1`
|
||||
* **Behavior:** Fails over immediately upon a single missed packet or slow disk write.
|
||||
* **Pros:** Maximum availability for perfect networks.
|
||||
* **Cons:** **High Flapping Risk.** In a residential network, a microwave turning on might cause a failover.
|
||||
|
||||
#### Scenario B: The "Tank" Cluster (High Stability)
|
||||
* **Config:** `failure_threshold: 10`, `success_threshold: 1`
|
||||
* **Behavior:** The node must be consistently broken for 10 seconds (assuming 1s interval) to give up.
|
||||
* **Pros:** Extremely stable on bad networks (e.g., Starlink, 4G). Ignores transient spikes.
|
||||
* **Cons:** **Slow Failover.** Users experience 10+ seconds of downtime before the Replica even *thinks* about taking over.
|
||||
|
||||
#### Scenario C: The "Sticky" Cluster (Hysteresis)
|
||||
* **Config:** `failure_threshold: 5`, `success_threshold: 5`
|
||||
* **Behavior:** Hard to kill, hard to bring back.
|
||||
* **Pros:** Prevents "Yo-Yo" effects. If a node fails, it must prove it is *really* stable (5 clean checks in a row) before re-joining the cluster.
|
||||
|
||||
---
|
||||
|
||||
### 3. Failure Modes & Behavior Analysis
|
||||
|
||||
Here is how the algorithm handles specific edge cases:
|
||||
|
||||
#### Case 1: Immediate Outage (Power Cut / Kernel Panic)
|
||||
* **Event:** Primary vanishes instantly. No more writes to NATS.
|
||||
* **Primary:** Does nothing (it's dead).
|
||||
* **Replica:** Sees the `LastPrimaryHeartbeat` timestamp age. Once it crosses `MaxStaleness`, it promotes itself.
|
||||
* **Outcome:** Clean failover after the timeout duration.
|
||||
|
||||
#### Case 2: Network Instability (Packet Loss / Jitter)
|
||||
* **Event:** The Primary fails to write to NATS for 2 cycles due to Wi-Fi interference, then succeeds on the 3rd.
|
||||
* **Config:** `failure_threshold: 5`.
|
||||
* **Primary:**
|
||||
* $t=1$: Fail (Counter=1)
|
||||
* $t=2$: Fail (Counter=2)
|
||||
* $t=3$: Success (Counter resets to 0). **State remains HEALTHY.**
|
||||
* **Replica:** Sees a gap in heartbeats but the timestamp never exceeds `MaxStaleness`.
|
||||
* **Outcome:** No downtime, no failover. The system correctly identified this as noise, not failure.
|
||||
|
||||
#### Case 3: High Latency (The "Slow Death")
|
||||
* **Event:** Primary is under heavy load; heartbeats take 1.5s to complete (interval is 1s).
|
||||
* **Primary:** The `timeout` on the heartbeat logic triggers. `consecutive_failures` rises. Eventually, it hits `failure_threshold` and fences itself to prevent data corruption.
|
||||
* **Replica:** Sees the heartbeats stop (or arrive too late). The timestamp ages out.
|
||||
* **Outcome:** Primary fences self -> Replica waits for safety buffer -> Replica promotes. **Split-brain is avoided** because the Primary killed itself *before* the Replica acted (due to the SafetyMultiplier).
|
||||
|
||||
#### Case 4: Replica Network Partition
|
||||
* **Event:** Replica loses internet connection; Primary is fine.
|
||||
* **Replica:** Sees `LastPrimaryHeartbeat` age out (because it can't reach NATS). It *wants* to promote itself.
|
||||
* **Constraint:** To promote, the Replica must write to NATS. Since it is partitioned, the NATS write fails.
|
||||
* **Outcome:** The Replica remains in Standby (or fails to promote). The Primary continues serving traffic. **Cluster integrity is preserved.**
|
||||
|
||||
|
||||
----
|
||||
|
||||
|
||||
### Context & Use Case
|
||||
We are implementing a High Availability (HA) Failover Strategy for decentralized "Micro Data Centers." The core challenge is managing stateful workloads (PostgreSQL) over unreliable networks.
|
||||
|
||||
We solve this using a **Local Fencing First** approach, backed by **NATS JetStream Strict Ordering** for the final promotion authority.
|
||||
|
||||
In CAP theorem terms, we are developing a CP system, intentionally sacrificing availability. In practical terms, we expect an average of two primary outages per year, with a failover delay of around 2 minutes. This translates to an uptime of over five nines. To be precise, 2 outages * 2 minutes = 4 minutes per year = 99.99924% uptime.
|
||||
|
||||
### The Algorithm: Local Fencing & Remote Promotion
|
||||
|
||||
The safety (data consistency) of the system relies on the time gap between the **Primary giving up (Fencing)** and the **Replica taking over (Promotion)**.
|
||||
|
||||
To avoid clock skew issues between agents and datastore (nats), all timestamp comparisons will be done using jetstream metadata, i.e. a harmony agent will never use `Instant::now()` to get a timestamp, it will use `my_last_heartbeat.metadata.timestamp` (conceptually).
|
||||
|
||||
#### 1. Configuration
|
||||
* `heartbeat_timeout` (e.g., 1s): Max time allowed for a NATS write/DB check.
|
||||
* `failure_threshold` (e.g., 2): Consecutive failures before self-fencing.
|
||||
* `failover_timeout` (e.g., 5s): Time since last NATS update of Primary heartbeat before Replica promotes.
|
||||
* This timeout must be carefully configured to allow enough time for the primary to fence itself (after `heartbeat_timeout * failure_threshold`) BEFORE the replica gets promoted to avoid a split brain with two primaries.
|
||||
* Implementing this will rely on the actual deployment configuration. For example, a CNPG based PostgreSQL cluster might require a longer gap (such as 30s) than other technologies.
|
||||
* Expires when `replica_heartbeat.metadata.timestamp - primary_heartbeat.metadata.timestamp > failover_timeout`
|
||||
|
||||
#### 2. The Primary (Self-Preservation)
|
||||
|
||||
The Primary is aggressive about killing itself.
|
||||
|
||||
* It attempts a heartbeat.
|
||||
* If the network latency > `heartbeat_timeout`, the attempt is **cancelled locally** because the heartbeat did not make it back in time.
|
||||
* This counts as a failure and increments the `consecutive_failures` counter.
|
||||
* If `consecutive_failures` hit the threshold, **FENCING occurs immediately**. The database is stopped.
|
||||
|
||||
This means that the Primary will fence itself after `heartbeat_timeout * failure_threshold`.
|
||||
|
||||
#### 3. The Replica (The Watchdog)
|
||||
|
||||
The Replica is patient.
|
||||
|
||||
* It watches the NATS stream to measure if `replica_heartbeat.metadata.timestamp - primary_heartbeat.metadata.timestamp > failover_timeout`
|
||||
* It only attempts promotion if the `failover_timeout` (5s) has passed.
|
||||
* **Crucial:** Careful configuration of the failover_timeout is required. This is the only way to avoid a split brain in case of a network partition where the Primary cannot write its heartbeats in time anymore.
|
||||
* In short, `failover_timeout` should be tuned to be `heartbeat_timeout * failure_threshold + safety_margin`. This `safety_margin` will vary by use case. For example, a CNPG cluster may need 30 seconds to demote a Primary to Replica when fencing is triggered, so `safety_margin` should be at least 30s in that setup.
|
||||
|
||||
Since we forcibly fail timeouts after `heartbeat_timeout`, we are guaranteed that the primary will have **started** the fencing process after `heartbeat_timeout * failure_threshold`.
|
||||
|
||||
But, in a network split scenario where the failed primary is still accessible by clients but cannot write its heartbeat successfully, there is no way to know if the demotion has actually **completed**.
|
||||
|
||||
For example, in a CNPG cluster, the failed Primary agent will attempt to change the CNPG cluster state to read-only. But if anything fails after that attempt (permission error, k8s api failure, CNPG bug, etc) it is possible that the PostgreSQL instance keeps accepting writes.
|
||||
|
||||
While this is not a theoretical failure of the agent's algorithm, this is a practical failure where data corruption occurs.
|
||||
|
||||
This can be fixed by detecting the demotion failure and escalating the fencing procedure aggressiveness. Harmony being an infrastructure orchestrator, it can easily exert radical measures if given the proper credentials, such as forcibly powering off a server, disconnecting its network in the switch configuration, forcibly killing a pod/container/process, etc.
|
||||
|
||||
However, these details are out of scope of this algorithm, as they simply fall under the "fencing procedure".
|
||||
|
||||
The implementation of the fencing procedure itself is not relevant. This algorithm's responsibility stops at calling the fencing procedure in the appropriate situation.
|
||||
|
||||
#### 4. The Demotion Handshake (Return to Normalcy)
|
||||
|
||||
When the original Primary recovers:
|
||||
|
||||
1. It becomes healthy locally but sees `current_primary = Replica`. It waits.
|
||||
2. The Replica (current leader) detects the Original Primary is back (via NATS heartbeats).
|
||||
3. Replica performs a **Clean Demotion**:
|
||||
* Stops DB.
|
||||
* Writes `current_primary = None` to NATS.
|
||||
4. Original Primary sees `current_primary = None` and can launch the promotion procedure.
|
||||
|
||||
Depending on the implementation, the promotion procedure may require a transition phase. Typically, for a PostgreSQL use case the promoting primary will make sure it has caught up on WAL replication before starting to accept writes.
|
||||
|
||||
---
|
||||
|
||||
### Failure Modes & Behavior Analysis
|
||||
|
||||
#### Case 1: Immediate Outage (Power Cut)
|
||||
|
||||
* **Primary:** Dies instantly. Fencing is implicit (machine is off).
|
||||
* **Replica:** Waits for `failover_timeout` (5s). Sees staleness. Promotes self.
|
||||
* **Outcome:** Clean failover after 5s.
|
||||
|
||||
// TODO detail what happens when the primary comes back up. We will likely have to tie PostgreSQL's lifecycle (liveness/readiness probes) with the agent to ensure it does not come back up as primary.
|
||||
|
||||
#### Case 2: High Network Latency on the Primary (The "Split Brain" Trap)
|
||||
|
||||
* **Scenario:** Network latency spikes to 5s on the Primary, still below `heartbeat_timeout` on the Replica.
|
||||
* **T=0 to T=2 (Primary):** Tries to write. Latency (5s) > Timeout (1s). Fails twice.
|
||||
* **T=2 (Primary):** `consecutive_failures` = 2. **Primary Fences Self.** (Service is DOWN).
|
||||
* **T=2 to T=5 (Cluster):** **Read-Only Phase.** No Primary exists.
|
||||
* **T=5 (Replica):** `failover_timeout` reached. Replica promotes self.
|
||||
* **Outcome:** Safe failover. The "Read-Only Gap" (T=2 to T=5) ensures no Split Brain occurred.
|
||||
|
||||
#### Case 3: Replica Network Lag (False Positive)
|
||||
|
||||
* **Scenario:** Replica has high latency, greater than `failover_timeout`; Primary is fine.
|
||||
* **Replica:** Thinks Primary is dead. Tries to promote by setting `cluster_state.current_primary = replica_id`.
|
||||
* **NATS:** Rejects the write because the Primary is still updating the sequence numbers successfully.
|
||||
* **Outcome:** Promotion denied. Primary stays leader.
|
||||
|
||||
#### Case 4: Network Instability (Flapping)
|
||||
|
||||
* **Scenario:** Intermittent packet loss.
|
||||
* **Primary:** Fails 1 heartbeat, succeeds the next. `consecutive_failures` resets.
|
||||
* **Replica:** Sees a slight delay in updates, but never reaches `failover_timeout`.
|
||||
* **Outcome:** No Fencing, No Promotion. System rides out the noise.
|
||||
|
||||
## Contextual notes
|
||||
|
||||
* Clock skew : Tokio relies on monotonic clocks. This means that `tokio::time::sleep(...)` will not be affected by system clock corrections (such as NTP). But monotonic clocks are known to jump forward in some cases such as VM live migrations. This could mean a false timeout of a single heartbeat. If `failure_threshold = 1`, this can mean a false negative on the node's health, and a potentially useless demotion.
|
||||
107
docs/adr/017-3-revised-staleness-inspired-by-kubernetes.md
Normal file
107
docs/adr/017-3-revised-staleness-inspired-by-kubernetes.md
Normal file
@@ -0,0 +1,107 @@
|
||||
### Context & Use Case
|
||||
We are implementing a High Availability (HA) Failover Strategy for decentralized "Micro Data Centers." The core challenge is managing stateful workloads (PostgreSQL) over unreliable networks.
|
||||
|
||||
We solve this using a **Local Fencing First** approach, backed by **NATS JetStream Strict Ordering** for the final promotion authority.
|
||||
|
||||
In CAP theorem terms, we are developing a CP system, intentionally sacrificing availability. In practical terms, we expect an average of two primary outages per year, with a failover delay of around 2 minutes. This translates to an uptime of over five nines. To be precise, 2 outages * 2 minutes = 4 minutes per year = 99.99924% uptime.
|
||||
|
||||
### The Algorithm: Local Fencing & Remote Promotion
|
||||
|
||||
The safety (data consistency) of the system relies on the time gap between the **Primary giving up (Fencing)** and the **Replica taking over (Promotion)**.
|
||||
|
||||
To avoid clock skew issues between agents and datastore (nats), all timestamp comparisons will be done using jetstream metadata, i.e. a harmony agent will never use `Instant::now()` to get a timestamp, it will use `my_last_heartbeat.metadata.timestamp` (conceptually).
|
||||
|
||||
#### 1. Configuration
|
||||
* `heartbeat_timeout` (e.g., 1s): Max time allowed for a NATS write/DB check.
|
||||
* `failure_threshold` (e.g., 2): Consecutive failures before self-fencing.
|
||||
* `failover_timeout` (e.g., 5s): Time since last NATS update of Primary heartbeat before Replica promotes.
|
||||
* This timeout must be carefully configured to allow enough time for the primary to fence itself (after `heartbeat_timeout * failure_threshold`) BEFORE the replica gets promoted to avoid a split brain with two primaries.
|
||||
* Implementing this will rely on the actual deployment configuration. For example, a CNPG based PostgreSQL cluster might require a longer gap (such as 30s) than other technologies.
|
||||
* Expires when `replica_heartbeat.metadata.timestamp - primary_heartbeat.metadata.timestamp > failover_timeout`
|
||||
|
||||
#### 2. The Primary (Self-Preservation)
|
||||
|
||||
The Primary is aggressive about killing itself.
|
||||
|
||||
* It attempts a heartbeat.
|
||||
* If the network latency > `heartbeat_timeout`, the attempt is **cancelled locally** because the heartbeat did not make it back in time.
|
||||
* This counts as a failure and increments the `consecutive_failures` counter.
|
||||
* If `consecutive_failures` hit the threshold, **FENCING occurs immediately**. The database is stopped.
|
||||
|
||||
This means that the Primary will fence itself after `heartbeat_timeout * failure_threshold`.
|
||||
|
||||
#### 3. The Replica (The Watchdog)
|
||||
|
||||
The Replica is patient.
|
||||
|
||||
* It watches the NATS stream to measure if `replica_heartbeat.metadata.timestamp - primary_heartbeat.metadata.timestamp > failover_timeout`
|
||||
* It only attempts promotion if the `failover_timeout` (5s) has passed.
|
||||
* **Crucial:** Careful configuration of the failover_timeout is required. This is the only way to avoid a split brain in case of a network partition where the Primary cannot write its heartbeats in time anymore.
|
||||
* In short, `failover_timeout` should be tuned to be `heartbeat_timeout * failure_threshold + safety_margin`. This `safety_margin` will vary by use case. For example, a CNPG cluster may need 30 seconds to demote a Primary to Replica when fencing is triggered, so `safety_margin` should be at least 30s in that setup.
|
||||
|
||||
Since we forcibly fail timeouts after `heartbeat_timeout`, we are guaranteed that the primary will have **started** the fencing process after `heartbeat_timeout * failure_threshold`.
|
||||
|
||||
But, in a network split scenario where the failed primary is still accessible by clients but cannot write its heartbeat successfully, there is no way to know if the demotion has actually **completed**.
|
||||
|
||||
For example, in a CNPG cluster, the failed Primary agent will attempt to change the CNPG cluster state to read-only. But if anything fails after that attempt (permission error, k8s api failure, CNPG bug, etc) it is possible that the PostgreSQL instance keeps accepting writes.
|
||||
|
||||
While this is not a theoretical failure of the agent's algorithm, this is a practical failure where data corruption occurs.
|
||||
|
||||
This can be fixed by detecting the demotion failure and escalating the fencing procedure aggressiveness. Harmony being an infrastructure orchestrator, it can easily exert radical measures if given the proper credentials, such as forcibly powering off a server, disconnecting its network in the switch configuration, forcibly killing a pod/container/process, etc.
|
||||
|
||||
However, these details are out of scope of this algorithm, as they simply fall under the "fencing procedure".
|
||||
|
||||
The implementation of the fencing procedure itself is not relevant. This algorithm's responsibility stops at calling the fencing procedure in the appropriate situation.
|
||||
|
||||
#### 4. The Demotion Handshake (Return to Normalcy)
|
||||
|
||||
When the original Primary recovers:
|
||||
|
||||
1. It becomes healthy locally but sees `current_primary = Replica`. It waits.
|
||||
2. The Replica (current leader) detects the Original Primary is back (via NATS heartbeats).
|
||||
3. Replica performs a **Clean Demotion**:
|
||||
* Stops DB.
|
||||
* Writes `current_primary = None` to NATS.
|
||||
4. Original Primary sees `current_primary = None` and can launch the promotion procedure.
|
||||
|
||||
Depending on the implementation, the promotion procedure may require a transition phase. Typically, for a PostgreSQL use case the promoting primary will make sure it has caught up on WAL replication before starting to accept writes.
|
||||
|
||||
---
|
||||
|
||||
### Failure Modes & Behavior Analysis
|
||||
|
||||
#### Case 1: Immediate Outage (Power Cut)
|
||||
|
||||
* **Primary:** Dies instantly. Fencing is implicit (machine is off).
|
||||
* **Replica:** Waits for `failover_timeout` (5s). Sees staleness. Promotes self.
|
||||
* **Outcome:** Clean failover after 5s.
|
||||
|
||||
// TODO detail what happens when the primary comes back up. We will likely have to tie PostgreSQL's lifecycle (liveness/readiness probes) with the agent to ensure it does not come back up as primary.
|
||||
|
||||
#### Case 2: High Network Latency on the Primary (The "Split Brain" Trap)
|
||||
|
||||
* **Scenario:** Network latency spikes to 5s on the Primary, still below `heartbeat_timeout` on the Replica.
|
||||
* **T=0 to T=2 (Primary):** Tries to write. Latency (5s) > Timeout (1s). Fails twice.
|
||||
* **T=2 (Primary):** `consecutive_failures` = 2. **Primary Fences Self.** (Service is DOWN).
|
||||
* **T=2 to T=5 (Cluster):** **Read-Only Phase.** No Primary exists.
|
||||
* **T=5 (Replica):** `failover_timeout` reached. Replica promotes self.
|
||||
* **Outcome:** Safe failover. The "Read-Only Gap" (T=2 to T=5) ensures no Split Brain occurred.
|
||||
|
||||
#### Case 3: Replica Network Lag (False Positive)
|
||||
|
||||
* **Scenario:** Replica has high latency, greater than `failover_timeout`; Primary is fine.
|
||||
* **Replica:** Thinks Primary is dead. Tries to promote by setting `cluster_state.current_primary = replica_id`.
|
||||
* **NATS:** Rejects the write because the Primary is still updating the sequence numbers successfully.
|
||||
* **Outcome:** Promotion denied. Primary stays leader.
|
||||
|
||||
#### Case 4: Network Instability (Flapping)
|
||||
|
||||
* **Scenario:** Intermittent packet loss.
|
||||
* **Primary:** Fails 1 heartbeat, succeeds the next. `consecutive_failures` resets.
|
||||
* **Replica:** Sees a slight delay in updates, but never reaches `failover_timeout`.
|
||||
* **Outcome:** No Fencing, No Promotion. System rides out the noise.
|
||||
|
||||
## Contextual notes
|
||||
|
||||
* Clock skew : Tokio relies on monotonic clocks. This means that `tokio::time::sleep(...)` will not be affected by system clock corrections (such as NTP). But monotonic clocks are known to jump forward in some cases such as VM live migrations. This could mean a false timeout of a single heartbeat. If `failure_threshold = 1`, this can mean a false negative on the node's health, and a potentially useless demotion.
|
||||
* `heartbeat_timeout == heartbeat_interval` : We intentionally do not provide two separate settings for the timeout before considering a heartbeat failed and the interval between heartbeats. It could make sense in some configurations where low network latency is required to have a small `heartbeat_timeout = 50ms` and a larger `heartbeat_interval = 2s`, but we do not have a practical use case for it yet. And having a timeout larger than the interval does not make sense in any situation we can think of at the moment. So we decided to have a single value for both, which makes the algorithm easier to reason about and implement.
|
||||
95
docs/adr/017-staleness-detection-for-failover.md
Normal file
95
docs/adr/017-staleness-detection-for-failover.md
Normal file
@@ -0,0 +1,95 @@
|
||||
# Architecture Decision Record: Staleness-Based Failover Mechanism & Observability
|
||||
|
||||
**Status:** Proposed
|
||||
**Date:** 2026-01-09
|
||||
**Preceded by:** [016-Harmony-Agent-And-Global-Mesh-For-Decentralized-Workload-Management.md](https://git.nationtech.io/NationTech/harmony/raw/branch/master/adr/016-Harmony-Agent-And-Global-Mesh-For-Decentralized-Workload-Management.md)
|
||||
|
||||
## Context
|
||||
|
||||
In ADR 016, we established the **Harmony Agent** and the **Global Orchestration Mesh** (powered by NATS JetStream) as the foundation for our decentralized infrastructure. We defined the high-level need for a `FailoverStrategy` that can support both financial consistency (CP) and AI availability (AP).
|
||||
|
||||
However, a specific implementation challenge remains: **How do we reliably detect node failure without losing the ability to debug the event later?**
|
||||
|
||||
Standard distributed systems often use "Key Expiration" (TTL) for heartbeats. If a key disappears, the node is presumed dead. While simple, this approach is catastrophic for post-mortem analysis. When the key expires, the evidence of *when* and *how* the failure occurred evaporates.
|
||||
|
||||
For NationTech’s vision of **Humane Computing**—where micro datacenters might be heating a family home or running a local business—reliability and diagnosability are paramount. If a cluster fails over, we owe it to the user to provide a clear, historical log of exactly what happened. We cannot build a "wonderful future for computers" on ephemeral, untraceable errors.
|
||||
|
||||
## Decision
|
||||
|
||||
We will implement a **Staleness Detection** mechanism rather than a Key Expiration mechanism. We will leverage NATS JetStream Key-Value (KV) stores with **History Enabled** to create an immutable audit trail of cluster health.
|
||||
|
||||
### 1. The "Black Box" Flight Recorder (NATS Configuration)
|
||||
We will utilize a persistent NATS KV bucket named `harmony_failover`.
|
||||
* **Storage:** File (Persistent).
|
||||
* **History:** Set to `64` (or higher). This allows us to query the last 64 heartbeat entries to visualize the exact degradation of the primary node before failure.
|
||||
* **TTL:** None. Data never disappears; it only becomes "stale."
|
||||
|
||||
### 2. Data Structures
|
||||
We will define two primary schemas to manage the state.
|
||||
|
||||
|
||||
**A. The Rules of Engagement (`cluster_config`)**
|
||||
This persistent key defines the behavior of the mesh. It allows us to tune failover sensitivity dynamically without redeploying the Agent binary.
|
||||
|
||||
```json
|
||||
{
|
||||
"primary_site_id": "site-a-basement",
|
||||
"replica_site_id": "site-b-cloud",
|
||||
"failover_timeout_ms": 5000, // Time before Replica takes over
|
||||
"heartbeat_interval_ms": 1000 // Frequency of Primary updates
|
||||
}
|
||||
```
|
||||
|
||||
> **Note :** The location for this configuration data structure is TBD. See https://git.nationtech.io/NationTech/harmony/issues/206
|
||||
|
||||
**B. The Heartbeat (`primary_heartbeat`)**
|
||||
The Primary writes this; the Replica watches it.
|
||||
|
||||
```json
|
||||
{
|
||||
"site_id": "site-a-basement",
|
||||
"status": "HEALTHY",
|
||||
"counter": 10452,
|
||||
"timestamp": 1704661549000
|
||||
}
|
||||
```
|
||||
|
||||
### 3. The Failover Algorithm
|
||||
|
||||
**The Primary (Site A) Logic:**
|
||||
The Primary's ability to write to the mesh is its "License to Operate."
|
||||
1. **Write Loop:** Attempts to write `primary_heartbeat` every `heartbeat_interval_ms`.
|
||||
2. **Self-Preservation (Fencing):** If the write fails (NATS Ack timeout or NATS unreachable), the Primary **immediately self-demotes**. It assumes it is network-isolated. This prevents Split Brain scenarios where a partitioned Primary continues to accept writes while the Replica promotes itself.
|
||||
|
||||
**The Replica (Site B) Logic:**
|
||||
The Replica acts as the watchdog.
|
||||
1. **Watch:** Subscribes to updates on `primary_heartbeat`.
|
||||
2. **Staleness Check:** Maintains a local timer. Every time a heartbeat arrives, the timer resets.
|
||||
3. **Promotion:** If the timer exceeds `failover_timeout_ms`, the Replica declares the Primary dead and promotes itself to Leader.
|
||||
4. **Yielding:** If the Replica is Leader, but suddenly receives a valid, new heartbeat from the configured `primary_site_id` (indicating the Primary has recovered), the Replica will voluntarily **demote** itself to restore the preferred topology.
|
||||
|
||||
## Rationale
|
||||
|
||||
**Observability as a First-Class Citizen**
|
||||
By keeping the last 64 heartbeats, we can run `nats kv history` to see the exact timeline. Did the Primary stop suddenly (crash)? or did the heartbeats become erratic and slow before stopping (network congestion)? This data is critical for optimizing the "Micro Data Centers" described in our vision, where internet connections in residential areas may vary in quality.
|
||||
|
||||
**Energy Efficiency & Resource Optimization**
|
||||
NationTech aims to "maximize the value of our energy." A "flapping" cluster (constantly failing over and back) wastes immense energy in data re-synchronization and startup costs. By making the `failover_timeout_ms` configurable via `cluster_config`, we can tune a cluster heating a greenhouse to be less sensitive (slower failover is fine) compared to a cluster running a payment gateway.
|
||||
|
||||
**Decentralized Trust**
|
||||
This architecture relies on NATS as the consensus engine. If the Primary is part of the NATS majority, it lives. If it isn't, it dies. This removes ambiguity and allows us to scale to thousands of independent sites without a central "God mode" controller managing every single failover.
|
||||
|
||||
## Consequences
|
||||
|
||||
**Positive**
|
||||
* **Auditability:** Every failover event leaves a permanent trace in the KV history.
|
||||
* **Safety:** The "Write Ack" check on the Primary provides a strong guarantee against Split Brain in `AbsoluteConsistency` mode.
|
||||
* **Dynamic Tuning:** We can adjust timeouts for specific environments (e.g., high-latency satellite links) by updating a JSON key, requiring no downtime.
|
||||
|
||||
**Negative**
|
||||
* **Storage Overhead:** Keeping history requires marginally more disk space on the NATS servers, though for 64 small JSON payloads, this is negligible.
|
||||
* **Clock Skew:** While we rely on NATS server-side timestamps for ordering, extreme clock skew on the client side could confuse the debug logs (though not the failover logic itself).
|
||||
|
||||
## Alignment with Vision
|
||||
This architecture supports the NationTech goal of a **"Beautifully Integrated Design."** It takes the complex, high-stakes problem of distributed consensus and wraps it in a mechanism that is robust enough for enterprise banking yet flexible enough to manage a basement server heating a swimming pool. It bridges the gap between the reliability of Web2 clouds and the decentralized nature of Web3 infrastructure.
|
||||
|
||||
233
docs/adr/020-1-zitadel-openbao-secure-config-store.md
Normal file
233
docs/adr/020-1-zitadel-openbao-secure-config-store.md
Normal file
@@ -0,0 +1,233 @@
|
||||
# ADR 020-1: Zitadel OIDC and OpenBao Integration for the Config Store
|
||||
|
||||
Author: Jean-Gabriel Gill-Couture
|
||||
|
||||
Date: 2026-03-18
|
||||
|
||||
## Status
|
||||
|
||||
Proposed
|
||||
|
||||
## Context
|
||||
|
||||
ADR 020 defines a unified `harmony_config` crate with a `ConfigStore` trait. The default team-oriented backend is OpenBao, which provides encrypted storage, versioned KV, audit logging, and fine-grained access control.
|
||||
|
||||
OpenBao requires authentication. The question is how developers authenticate without introducing new credentials to manage.
|
||||
|
||||
The goals are:
|
||||
|
||||
- **Zero new credentials.** Developers log in with their existing corporate identity (Google Workspace, GitHub, or Microsoft Entra ID / Azure AD).
|
||||
- **Headless compatibility.** The flow must work over SSH, inside containers, and in CI — environments with no browser or localhost listener.
|
||||
- **Minimal friction.** After a one-time login, authentication should be invisible for weeks of active use.
|
||||
- **Centralized offboarding.** Revoking a user in the identity provider must immediately revoke their access to the config store.
|
||||
|
||||
## Decision
|
||||
|
||||
Developers authenticate to OpenBao through a two-step process: first, they obtain an OIDC token from Zitadel (`sso.nationtech.io`) using the OAuth 2.0 Device Authorization Grant (RFC 8628); then, they exchange that token for a short-lived OpenBao client token via OpenBao's JWT auth method.
|
||||
|
||||
### The authentication flow
|
||||
|
||||
#### Step 1: Trigger
|
||||
|
||||
The `ConfigManager` attempts to resolve a value via the `StoreSource`. The `StoreSource` checks for a cached OpenBao token in `~/.local/share/harmony/session.json`. If the token is missing or expired, authentication begins.
|
||||
|
||||
#### Step 2: Device Authorization Request
|
||||
|
||||
Harmony sends a `POST` to Zitadel's device authorization endpoint:
|
||||
|
||||
```
|
||||
POST https://sso.nationtech.io/oauth/v2/device_authorization
|
||||
Content-Type: application/x-www-form-urlencoded
|
||||
|
||||
client_id=<harmony_client_id>&scope=openid email profile offline_access
|
||||
```
|
||||
|
||||
Zitadel responds with:
|
||||
|
||||
```json
|
||||
{
|
||||
"device_code": "dOcbPeysDhT26ZatRh9n7Q",
|
||||
"user_code": "GQWC-FWFK",
|
||||
"verification_uri": "https://sso.nationtech.io/device",
|
||||
"verification_uri_complete": "https://sso.nationtech.io/device?user_code=GQWC-FWFK",
|
||||
"expires_in": 300,
|
||||
"interval": 5
|
||||
}
|
||||
```
|
||||
|
||||
#### Step 3: User prompt
|
||||
|
||||
Harmony prints the code and URL to the terminal:
|
||||
|
||||
```
|
||||
[Harmony] To authenticate, open your browser to:
|
||||
https://sso.nationtech.io/device
|
||||
and enter code: GQWC-FWFK
|
||||
|
||||
Or visit: https://sso.nationtech.io/device?user_code=GQWC-FWFK
|
||||
```
|
||||
|
||||
If a desktop environment is detected, Harmony also calls `open` / `xdg-open` to launch the browser automatically. The `verification_uri_complete` URL pre-fills the code, so the user only needs to click "Confirm" after logging in.
|
||||
|
||||
There is no localhost HTTP listener. The CLI does not need to bind a port or receive a callback. This is what makes the device flow work over SSH, in containers, and through corporate firewalls — unlike the `oc login` approach which spins up a temporary web server to catch a redirect.
|
||||
|
||||
#### Step 4: User login
|
||||
|
||||
The developer logs in through Zitadel's web UI using one of the configured identity providers:
|
||||
|
||||
- **Google Workspace** — for teams using Google as their corporate identity.
|
||||
- **GitHub** — for open-source or GitHub-centric teams.
|
||||
- **Microsoft Entra ID (Azure AD)** — for enterprise clients, particularly common in Quebec and the broader Canadian public sector.
|
||||
|
||||
Zitadel federates the login to the chosen provider. The developer authenticates with their existing corporate credentials. No new password is created.
|
||||
|
||||
#### Step 5: Polling
|
||||
|
||||
While the user is authenticating in the browser, Harmony polls Zitadel's token endpoint at the interval specified in the device authorization response (typically 5 seconds):
|
||||
|
||||
```
|
||||
POST https://sso.nationtech.io/oauth/v2/token
|
||||
Content-Type: application/x-www-form-urlencoded
|
||||
|
||||
grant_type=urn:ietf:params:oauth:grant-type:device_code
|
||||
&device_code=dOcbPeysDhT26ZatRh9n7Q
|
||||
&client_id=<harmony_client_id>
|
||||
```
|
||||
|
||||
Before the user completes login, Zitadel responds with `authorization_pending`. Once the user consents, Zitadel returns:
|
||||
|
||||
```json
|
||||
{
|
||||
"access_token": "...",
|
||||
"token_type": "Bearer",
|
||||
"expires_in": 3600,
|
||||
"refresh_token": "...",
|
||||
"id_token": "eyJhbGciOiJSUzI1NiIs..."
|
||||
}
|
||||
```
|
||||
|
||||
The `offline_access` scope included in the initial device authorization request is what causes Zitadel to issue a `refresh_token`.
|
||||
|
||||
#### Step 6: OpenBao JWT exchange
|
||||
|
||||
Harmony sends the `id_token` (a JWT signed by Zitadel) to OpenBao's JWT auth method:
|
||||
|
||||
```
|
||||
POST https://secrets.nationtech.io/v1/auth/jwt/login
|
||||
Content-Type: application/json
|
||||
|
||||
{
|
||||
"role": "harmony-developer",
|
||||
"jwt": "eyJhbGciOiJSUzI1NiIs..."
|
||||
}
|
||||
```
|
||||
|
||||
OpenBao validates the JWT:
|
||||
|
||||
1. It fetches Zitadel's public keys from `https://sso.nationtech.io/oauth/v2/keys` (the JWKS endpoint).
|
||||
2. It verifies the JWT signature.
|
||||
3. It reads the claims (`email`, `groups`, and any custom claims mapped from the upstream identity provider, such as Azure AD tenant or Google Workspace org).
|
||||
4. It evaluates the claims against the `bound_claims` and `bound_audiences` configured on the `harmony-developer` role.
|
||||
5. If validation passes, OpenBao returns a client token:
|
||||
|
||||
```json
|
||||
{
|
||||
"auth": {
|
||||
"client_token": "hvs.CAES...",
|
||||
"policies": ["harmony-dev"],
|
||||
"metadata": { "role": "harmony-developer" },
|
||||
"lease_duration": 14400,
|
||||
"renewable": true
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Harmony caches the OpenBao token, the OIDC refresh token, and the token expiry timestamps to `~/.local/share/harmony/session.json` with `0600` file permissions.
|
||||
|
||||
### OpenBao storage structure
|
||||
|
||||
All configuration and secret state is stored in an OpenBao Versioned KV v2 engine.
|
||||
|
||||
Path taxonomy:
|
||||
|
||||
```
|
||||
harmony/<organization>/<project>/<environment>/<key>
|
||||
```
|
||||
|
||||
Examples:
|
||||
|
||||
```
|
||||
harmony/nationtech/my-app/staging/PostgresConfig
|
||||
harmony/nationtech/my-app/production/PostgresConfig
|
||||
harmony/nationtech/my-app/local-shared/PostgresConfig
|
||||
```
|
||||
|
||||
The `ConfigClass` (Standard vs. Secret) can influence OpenBao policy structure — for example, `Secret`-class paths could require stricter ACLs or additional audit backends — but the path taxonomy itself does not change. This is an operational concern configured in OpenBao policies, not a structural one enforced by path naming.
|
||||
|
||||
### Token lifecycle and silent refresh
|
||||
|
||||
The system manages three tokens with different lifetimes:
|
||||
|
||||
| Token | TTL | Max TTL | Purpose |
|
||||
|---|---|---|---|
|
||||
| OpenBao client token | 4 hours | 24 hours | Read/write config store |
|
||||
| OIDC ID token | 1 hour | — | Exchange for OpenBao token |
|
||||
| OIDC refresh token | 90 days absolute, 30 days inactivity | — | Obtain new ID tokens silently |
|
||||
|
||||
The refresh flow, from the developer's perspective:
|
||||
|
||||
1. **Same session (< 4 hours since last use).** The cached OpenBao token is still valid. No network call to Zitadel. Fastest path.
|
||||
2. **Next day (OpenBao token expired, refresh token valid).** Harmony uses the OIDC `refresh_token` to request a new `id_token` from Zitadel's token endpoint (`grant_type=refresh_token`). It then exchanges the new `id_token` for a fresh OpenBao token. This happens silently. The developer sees no prompt.
|
||||
3. **OpenBao token near max TTL (approaching 24 hours of cumulative renewals).** Instead of renewing, Harmony re-authenticates using the refresh token to get a completely fresh OpenBao token. Transparent to the user.
|
||||
4. **After 30 days of inactivity.** The OIDC refresh token expires. Harmony falls back to the device flow (Step 2 above) and prompts the user to re-authenticate in the browser. This is the only scenario where a returning developer sees a login prompt.
|
||||
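5. **User offboarded.** An administrator revokes the user's account or group membership in Zitadel. The next time the refresh token is used, Zitadel rejects it. The device flow also fails because the user can no longer authenticate. No new OpenBao token can be obtained, so access is terminated without any action needed on the OpenBao side. Note that an OpenBao client token issued before the revocation remains usable (and renewable) until it expires — at most the 24-hour max TTL — so it must be revoked explicitly in OpenBao if access has to end immediately.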
|
||||
|
||||
OpenBao token renewal uses the `/auth/token/renew-self` endpoint with the `X-Vault-Token` header. Harmony renews proactively at ~75% of the TTL to avoid race conditions.
|
||||
|
||||
### OpenBao role configuration
|
||||
|
||||
The OpenBao JWT auth role for Harmony developers:
|
||||
|
||||
```bash
|
||||
bao write auth/jwt/config \
|
||||
oidc_discovery_url="https://sso.nationtech.io" \
|
||||
bound_issuer="https://sso.nationtech.io"
|
||||
|
||||
bao write auth/jwt/role/harmony-developer \
|
||||
role_type="jwt" \
|
||||
bound_audiences="<harmony_client_id>" \
|
||||
user_claim="email" \
|
||||
groups_claim="urn:zitadel:iam:org:project:roles" \
|
||||
policies="harmony-dev" \
|
||||
ttl="4h" \
|
||||
max_ttl="24h" \
|
||||
token_type="service"
|
||||
```
|
||||
|
||||
The `bound_audiences` parameter ties the role to the specific Harmony Zitadel application by requiring the JWT's `aud` claim to match the Harmony client ID. The `groups_claim` allows mapping Zitadel project roles to OpenBao policies for per-team or per-project access control.
|
||||
|
||||
### Self-hosted deployments
|
||||
|
||||
For organizations running their own infrastructure, the same architecture applies. The operator deploys Zitadel and OpenBao using Harmony's existing `ZitadelScore` and `OpenbaoScore`. The only configuration needed is three environment variables (or their equivalents in the bootstrap config):
|
||||
|
||||
- `HARMONY_SSO_URL` — the Zitadel instance URL.
|
||||
- `HARMONY_SECRETS_URL` — the OpenBao instance URL.
|
||||
- `HARMONY_SSO_CLIENT_ID` — the Zitadel application client ID.
|
||||
|
||||
None of these are secrets. They can be committed to an infrastructure repository or distributed via any convenient channel.
|
||||
|
||||
## Consequences
|
||||
|
||||
### Positive
|
||||
|
||||
- Developers authenticate with existing corporate credentials. No new passwords, no static tokens to distribute.
|
||||
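- The device flow works in any environment where a person can read the terminal output — local terminal, SSH, containers, interactive CI jobs, corporate VPNs — because it needs neither a local browser nor a localhost listener. (Unattended first runs are covered under Negative below.)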
|
||||
- Silent token refresh keeps developers authenticated for weeks without any manual intervention.
|
||||
- User offboarding is a single action in Zitadel. No OpenBao token rotation or manual revocation required.
|
||||
- Azure AD / Microsoft Entra ID support addresses the enterprise and public sector market.
|
||||
|
||||
### Negative
|
||||
|
||||
- The OAuth state machine (device code polling, token refresh, error handling) adds implementation complexity compared to a static token approach.
|
||||
- Developers must have network access to `sso.nationtech.io` and `secrets.nationtech.io` to pull or push configuration state. True offline work falls back to the local file store, which does not sync with the team.
|
||||
- The first login per machine requires a browser interaction. Fully headless first-run scenarios (e.g., a fresh CI runner with no pre-seeded tokens) must use `EnvSource` overrides or a service account JWT.
|
||||
177
docs/adr/020-interactive-configuration-crate.md
Normal file
177
docs/adr/020-interactive-configuration-crate.md
Normal file
@@ -0,0 +1,177 @@
|
||||
# ADR 020: Unified Configuration and Secret Management
|
||||
|
||||
Author: Jean-Gabriel Gill-Couture
|
||||
|
||||
Date: 2026-03-18
|
||||
|
||||
## Status
|
||||
|
||||
Proposed
|
||||
|
||||
## Context
|
||||
|
||||
Harmony's orchestration logic depends on runtime data that falls into two categories:
|
||||
|
||||
1. **Secrets** — credentials, tokens, private keys.
|
||||
2. **Operational configuration** — deployment targets, host selections, port assignments, reboot decisions, and similar contextual choices.
|
||||
|
||||
Both categories share the same fundamental lifecycle: a value must be acquired before execution can proceed, it may come from several backends (environment variable, remote store, interactive prompt), and it must be shareable across a team without polluting the Git repository.
|
||||
|
||||
Treating these categories as separate subsystems forces developers to choose between a "config API" and a "secret API" at every call site. The only meaningful difference between the two is how the storage backend handles the data (plaintext vs. encrypted, audited vs. unaudited) and how the CLI displays it (visible vs. masked). That difference belongs in the backend, not in the application code.
|
||||
|
||||
Three concrete problems drive this change:
|
||||
|
||||
- **Async terminal corruption.** `inquire` prompts assume exclusive terminal ownership. Background tokio tasks emitting log output during a prompt corrupt the terminal state. This is inherent to Harmony's concurrent orchestration model.
|
||||
- **Untestable code paths.** Any function containing an inline `inquire` call requires a real TTY to execute. Such functions cannot be unit-tested; their tests currently have to be marked as ignored.
|
||||
- **No backend integration.** Inline prompts cannot be answered from a remote store, an environment variable, or a CI pipeline. Every automated deployment that passes through a prompting code path requires a human operator at a terminal.
|
||||
|
||||
## Decision
|
||||
|
||||
A single workspace crate, `harmony_config`, provides all configuration and secret acquisition for Harmony. It replaces both `harmony_secret` and all inline `inquire` usage.
|
||||
|
||||
### Schema in Git, state in the store
|
||||
|
||||
The Rust type system serves as the configuration schema. Developers declare what configuration is needed by defining structs:
|
||||
|
||||
```rust
|
||||
#[derive(Config, Serialize, Deserialize, JsonSchema, InteractiveParse)]
|
||||
struct PostgresConfig {
|
||||
pub host: String,
|
||||
pub port: u16,
|
||||
#[config(secret)]
|
||||
pub password: String,
|
||||
}
|
||||
```
|
||||
|
||||
These structs live in Git and evolve with the code. When a branch introduces a new field, Git tracks that schema change. The actual values live in an external store — OpenBao by default. No `.env` files, no JSON config files, no YAML in the repository.
|
||||
|
||||
### Data classification
|
||||
|
||||
```rust
|
||||
/// Tells the storage backend how to handle the data.
|
||||
pub enum ConfigClass {
|
||||
/// Plaintext storage is acceptable.
|
||||
Standard,
|
||||
/// Must be encrypted at rest, masked in UI, subject to audit logging.
|
||||
Secret,
|
||||
}
|
||||
```
|
||||
|
||||
Classification is determined at the struct level. A struct with no `#[config(secret)]` fields has `ConfigClass::Standard`. A struct with one or more `#[config(secret)]` fields is elevated to `ConfigClass::Secret`. The struct is always stored as a single cohesive JSON blob; field-level splitting across backends is not a concern of the trait.
|
||||
|
||||
The `#[config(secret)]` attribute also instructs the `PromptSource` to mask terminal input for that field during interactive prompting.
|
||||
|
||||
### The Config trait
|
||||
|
||||
```rust
|
||||
pub trait Config: Serialize + DeserializeOwned + JsonSchema + InteractiveParseObj + Sized {
|
||||
/// Stable lookup key. By default, the struct name.
|
||||
const KEY: &'static str;
|
||||
|
||||
/// How the backend should treat this data.
|
||||
const CLASS: ConfigClass;
|
||||
}
|
||||
```
|
||||
|
||||
A `#[derive(Config)]` proc macro generates the implementation. The macro inspects field attributes to determine `CLASS`.
|
||||
|
||||
### The ConfigStore trait
|
||||
|
||||
```rust
|
||||
#[async_trait]
|
||||
pub trait ConfigStore: Send + Sync {
|
||||
async fn get(
|
||||
&self,
|
||||
class: ConfigClass,
|
||||
namespace: &str,
|
||||
key: &str,
|
||||
) -> Result<Option<serde_json::Value>, ConfigError>;
|
||||
|
||||
async fn set(
|
||||
&self,
|
||||
class: ConfigClass,
|
||||
namespace: &str,
|
||||
key: &str,
|
||||
value: &serde_json::Value,
|
||||
) -> Result<(), ConfigError>;
|
||||
}
|
||||
```
|
||||
|
||||
The `class` parameter is a hint. The store implementation decides what to do with it. An OpenBao store may route `Secret` data to a different path prefix or apply stricter ACLs. A future store could split fields across backends — that is an implementation concern, not a trait concern.
|
||||
|
||||
### Resolution chain
|
||||
|
||||
The `ConfigManager` tries sources in priority order:
|
||||
|
||||
1. **`EnvSource`** — reads `HARMONY_CONFIG_{KEY}` as a JSON string. Serves as an override (escape hatch) for CI/CD pipelines and containerized environments.
|
||||
2. **`StoreSource`** — wraps a `ConfigStore` implementation. For teams, this is the OpenBao backend authenticated via Zitadel OIDC (see ADR 020-1).
|
||||
3. **`PromptSource`** — presents an `interactive-parse` prompt on the terminal. Acquires a process-wide async mutex before rendering to prevent log output corruption.
|
||||
|
||||
When `PromptSource` obtains a value, the `ConfigManager` persists it back to the `StoreSource` so that subsequent runs — by the same developer or any teammate — resolve without prompting.
|
||||
|
||||
Callers that do not include `PromptSource` in their source list never block on a TTY. Test code passes empty source lists and constructs config structs directly.
|
||||
|
||||
### Schema versioning
|
||||
|
||||
The Rust struct is the schema. When a developer renames a field, removes a field, or changes a type on a branch, the store may still contain data shaped for a previous version of the struct. Conversely, once that developer has written a value in the new shape, a team member who does not yet have that commit will read data that no longer matches their struct. In either case, `serde_json::from_value` will fail on the mismatched entry.
|
||||
|
||||
In the initial implementation, the resolution chain handles this gracefully: a deserialization failure is treated as a cache miss, and the `PromptSource` fires. The prompted value overwrites the stale entry in the store.
|
||||
|
||||
This is sufficient for small teams working on short-lived branches. It is not sufficient at scale, where silent re-prompting could mask real configuration drift.
|
||||
|
||||
A future iteration will introduce a compile-time schema migration mechanism, similar to how `sqlx` verifies queries against a live database at compile time. The mechanism will:
|
||||
|
||||
- Detect schema drift between the Rust struct and the stored JSON.
|
||||
- Apply named, ordered migration functions to transform stored data forward.
|
||||
- Reject ambiguous migrations at compile time rather than silently corrupting state.
|
||||
|
||||
Until that mechanism exists, teams should treat store entries as soft caches: the struct definition is always authoritative, and the store is best-effort.
|
||||
|
||||
## Rationale
|
||||
|
||||
**Why merge secrets and config into one crate?** Separate crates with nearly identical trait shapes (`Secret` vs `Config`, `SecretStore` vs `ConfigStore`) force developers to make a classification decision at every call site. A unified crate with a `ConfigClass` discriminator moves that decision to the struct definition, where it belongs.
|
||||
|
||||
**Why OpenBao as the default backend?** OpenBao is a fully open-source Vault fork under the Linux Foundation. It runs on-premises with no phone-home requirement — a hard constraint for private cloud and regulated environments. Harmony already deploys OpenBao for clients (`OpenbaoScore`), so no new infrastructure is introduced.
|
||||
|
||||
**Why not store values in Git (e.g., encrypted YAML)?** Git-tracked config files create merge conflicts, require re-encryption on team membership changes, and leak metadata (file names, key names) even when values are encrypted. Storing state in OpenBao avoids all of these issues and provides audit logging, access control, and versioned KV out of the box.
|
||||
|
||||
**Why keep `PromptSource`?** Removing interactive prompts entirely would break the zero-infrastructure bootstrapping path and eliminate human-confirmation safety gates for destructive operations (interface reconfiguration, node reboot). The problem was never that prompts exist — it is that they were unavoidable and untestable. Making `PromptSource` an explicit, opt-in entry in the source list restores control.
|
||||
|
||||
## Consequences
|
||||
|
||||
### Positive
|
||||
|
||||
- A single API surface for all runtime data acquisition.
|
||||
- All currently-ignored tests become runnable without TTY access.
|
||||
- Async terminal corruption is eliminated by the process-wide prompt mutex.
|
||||
- The bootstrapping path requires no infrastructure for a first run; `PromptSource` alone is sufficient.
|
||||
- The team path (OpenBao + Zitadel) reuses infrastructure Harmony already deploys.
|
||||
- User offboarding is a single Zitadel action.
|
||||
|
||||
### Negative
|
||||
|
||||
- Migrating all inline `inquire` and `harmony_secret` call sites is a significant refactoring effort.
|
||||
- Until the schema migration mechanism is built, store entries for renamed or removed fields become stale and must be re-prompted.
|
||||
- The Zitadel device flow introduces a browser step on first login per machine.
|
||||
|
||||
## Implementation Plan
|
||||
|
||||
### Phase 1: Trait design and crate restructure
|
||||
|
||||
Refactor `harmony_config` to define the final `Config` and `ConfigStore` traits and the `ConfigClass` enum. Update the derive macro to support `#[config(secret)]` and generate the correct `CLASS` constant. Implement `EnvSource` and `PromptSource` against the new traits. Write comprehensive unit tests using mock stores.
|
||||
|
||||
### Phase 2: Absorb `harmony_secret`
|
||||
|
||||
Migrate the `OpenbaoSecretStore`, `InfisicalSecretStore`, and `LocalFileSecretStore` implementations from `harmony_secret` into `harmony_config` as `ConfigStore` backends. Update all call sites that use `SecretManager::get`, `SecretManager::get_or_prompt`, or `SecretManager::set` to use `harmony_config` equivalents.
|
||||
|
||||
### Phase 3: Migrate inline prompts
|
||||
|
||||
Replace all inline `inquire` call sites in the `harmony` crate (`infra/brocade.rs`, `infra/network_manager.rs`, `modules/okd/host_network.rs`, and others) with `harmony_config` structs and `get_or_prompt` calls. Un-ignore the affected tests.
|
||||
|
||||
### Phase 4: Zitadel and OpenBao integration
|
||||
|
||||
Implement the authentication flow described in ADR 020-1. Wire `StoreSource` to use Zitadel OIDC tokens for OpenBao access. Implement token caching and silent refresh.
|
||||
|
||||
### Phase 5: Remove `harmony_secret`
|
||||
|
||||
Delete the `harmony_secret` and `harmony_secret_derive` crates from the workspace. All functionality now lives in `harmony_config`.
|
||||
63
docs/adr/README.md
Normal file
63
docs/adr/README.md
Normal file
@@ -0,0 +1,63 @@
|
||||
# Architecture Decision Records
|
||||
|
||||
An Architecture Decision Record (ADR) documents a significant architectural decision made during the development of Harmony — along with its context, rationale, and consequences.
|
||||
|
||||
## Why We Use ADRs
|
||||
|
||||
As a platform engineering framework used by a team, Harmony accumulates technical decisions over time. ADRs help us:
|
||||
|
||||
- **Track rationale** — understand _why_ a decision was made, not just _what_ was decided
|
||||
- **Onboard new contributors** — the "why" is preserved even when team membership changes
|
||||
- **Avoid repeating past mistakes** — previous decisions and their context are searchable
|
||||
- **Manage technical debt** — ADRs make it easier to revisit and revise past choices
|
||||
|
||||
An ADR captures a decision at a point in time. It is not a specification — it is a record of reasoning.
|
||||
|
||||
## ADR Format
|
||||
|
||||
Every ADR follows this structure:
|
||||
|
||||
| Section | Purpose |
|
||||
|---------|---------|
|
||||
| **Status** | Proposed / Pending / Accepted / Implemented / Deprecated |
|
||||
| **Context** | The problem or background — the "why" behind this decision |
|
||||
| **Decision** | The chosen solution or direction |
|
||||
| **Rationale** | Reasoning behind the decision |
|
||||
| **Consequences** | Both positive and negative outcomes |
|
||||
| **Alternatives considered** | Other options that were evaluated |
|
||||
| **Additional Notes** | Supplementary context, links, or open questions |
|
||||
|
||||
## ADR Index
|
||||
|
||||
| Number | Title | Status |
|
||||
|--------|-------|--------|
|
||||
| [000](./000-ADR-Template.md) | ADR Template | Reference |
|
||||
| [001](./001-rust.md) | Why Rust | Accepted |
|
||||
| [002](./002-hexagonal-architecture.md) | Hexagonal Architecture | Accepted |
|
||||
| [003](./003-infrastructure-abstractions.md) | Infrastructure Abstractions | Accepted |
|
||||
| [004](./004-ipxe.md) | iPXE | Accepted |
|
||||
| [005](./005-interactive-project.md) | Interactive Project | Proposed |
|
||||
| [006](./006-secret-management.md) | Secret Management | Accepted |
|
||||
| [007](./007-default-runtime.md) | Default Runtime | Accepted |
|
||||
| [008](./008-score-display-formatting.md) | Score Display Formatting | Proposed |
|
||||
| [009](./009-helm-and-kustomize-handling.md) | Helm and Kustomize Handling | Accepted |
|
||||
| [010](./010-monitoring-and-alerting.md) | Monitoring and Alerting | Accepted |
|
||||
| [011](./011-multi-tenant-cluster.md) | Multi-Tenant Cluster | Accepted |
|
||||
| [012](./012-project-delivery-automation.md) | Project Delivery Automation | Proposed |
|
||||
| [013](./013-monitoring-notifications.md) | Monitoring Notifications | Accepted |
|
||||
| [015](./015-higher-order-topologies.md) | Higher Order Topologies | Proposed |
|
||||
| [016](./016-Harmony-Agent-And-Global-Mesh-For-Decentralized-Workload-Management.md) | Harmony Agent and Global Mesh | Proposed |
|
||||
| [017-1](./017-1-Nats-Clusters-Interconnection-Topology.md) | NATS Clusters Interconnection Topology | Proposed |
|
||||
| [018](./018-Template-Hydration-For-Workload-Deployment.md) | Template Hydration for Workload Deployment | Proposed |
|
||||
| [019](./019-Network-bond-setup.md) | Network Bond Setup | Proposed |
|
||||
| [020-1](./020-1-zitadel-openbao-secure-config-store.md) | Zitadel + OpenBao Secure Config Store | Proposed |
|
||||
| [020](./020-interactive-configuration-crate.md) | Unified Configuration and Secret Management | Proposed |
|
||||
|
||||
## Contributing
|
||||
|
||||
When making a significant technical change:
|
||||
|
||||
1. **Check existing ADRs** — the decision may already be documented
|
||||
2. **Create a new ADR** using the [template](./000-ADR-Template.md) if the change warrants architectural discussion
|
||||
3. **Set status to Proposed** and open it for team review
|
||||
4. Once accepted and implemented, update the status accordingly
|
||||
@@ -84,7 +84,7 @@ Network services that run inside the cluster or as part of the topology.
|
||||
- **OKDLoadBalancerScore**: Configures the high-availability load balancers for the OKD API and ingress.
|
||||
- **OKDBootstrapLoadBalancerScore**: Configures the load balancer specifically for the bootstrap-time API endpoint.
|
||||
- **K8sIngressScore**: Configures an Ingress controller or resource.
|
||||
- [HighAvailabilityHostNetworkScore](../../harmony/src/modules/okd/host_network.rs): Configures network bonds on a host and the corresponding port-channels on the switch stack for high-availability.
|
||||
- **HighAvailabilityHostNetworkScore**: Configures network bonds on a host and the corresponding port-channels on the switch stack for high-availability.
|
||||
|
||||
## Tenant Management
|
||||
|
||||
|
||||
229
docs/coding-guide.md
Normal file
229
docs/coding-guide.md
Normal file
@@ -0,0 +1,229 @@
|
||||
# Harmony Coding Guide
|
||||
|
||||
Harmony is an infrastructure automation framework. It is **code-first and code-only**: operators write Rust programs to declare and drive infrastructure, rather than YAML files or DSL configs. Good code here means a good operator experience.
|
||||
|
||||
### Concrete context
|
||||
|
||||
This guide uses the KVM module as a running example to illustrate the coding style. The same principles should translate well to the other modules and contexts managed by Harmony, such as OPNsense and Kubernetes.
|
||||
|
||||
## Core Philosophy
|
||||
|
||||
### High-level functions over raw primitives
|
||||
|
||||
Callers should not need to know about underlying protocols, XML schemas, or API quirks. A function that deploys a VM should accept meaningful parameters like CPU count, memory, and network name — not XML strings.
|
||||
|
||||
```rust
|
||||
// Bad: caller constructs XML and passes it to a thin wrapper
|
||||
let xml = format!(r#"<domain type='kvm'>...</domain>"#, name, memory_kb, ...);
|
||||
executor.create_vm(&xml).await?;
|
||||
|
||||
// Good: caller describes intent, the module handles representation
|
||||
executor.define_vm(&VmConfig::builder("my-vm")
|
||||
.cpu(4)
|
||||
.memory_gb(8)
|
||||
.disk(DiskConfig::new(50))
|
||||
.network(NetworkRef::named("mylan"))
|
||||
.boot_order([BootDevice::Network, BootDevice::Disk])
|
||||
.build())
|
||||
.await?;
|
||||
```
|
||||
|
||||
The module owns the XML, the virsh invocations, the API calls — not the caller.
|
||||
|
||||
### Use the right abstraction layer
|
||||
|
||||
Prefer native library bindings over shelling out to CLI tools. The `virt` crate provides direct libvirt bindings and should be used instead of spawning `virsh` subprocesses.
|
||||
|
||||
- CLI subprocess calls are fragile: stdout/stderr parsing, exit codes, quoting, PATH differences
|
||||
- Native bindings give typed errors, no temp files, no shell escaping
|
||||
- `virt::connect::Connect` opens a connection; `virt::domain::Domain` manages VMs; `virt::network::Network` manages virtual networks
|
||||
|
||||
### Keep functions small and well-named
|
||||
|
||||
Each function should do one thing. If a function is doing two conceptually separate things, split it. Function names should read like plain English: `ensure_network_active`, `define_vm`, `vm_is_running`.
|
||||
|
||||
### Prefer short modules over large files
|
||||
|
||||
Group related types and functions by concept. A module that handles one resource (e.g., network, domain, storage) is better than a single file for everything.
|
||||
|
||||
---
|
||||
|
||||
## Error Handling
|
||||
|
||||
### Use `thiserror` for all error types
|
||||
|
||||
Define error types with `thiserror::Error`. This removes the boilerplate of implementing `Display` and `std::error::Error` by hand, keeps error messages close to their variants, and makes types easy to extend.
|
||||
|
||||
```rust
|
||||
// Bad: hand-rolled Display + std::error::Error
|
||||
#[derive(Debug)]
|
||||
pub enum KVMError {
|
||||
ConnectionError(String),
|
||||
VMNotFound(String),
|
||||
}
|
||||
|
||||
impl std::fmt::Display for KVMError { ... }
|
||||
impl std::error::Error for KVMError {}
|
||||
|
||||
// Good: derive Display via thiserror
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub enum KVMError {
|
||||
#[error("connection failed: {0}")]
|
||||
ConnectionFailed(String),
|
||||
#[error("VM not found: {name}")]
|
||||
VmNotFound { name: String },
|
||||
}
|
||||
```
|
||||
|
||||
### Make bubbling errors easy with `?` and `From`
|
||||
|
||||
`?` works on any error type for which there is a `From` impl. Add `From` conversions from lower-level errors into your module's error type so callers can use `?` without boilerplate.
|
||||
|
||||
With `thiserror`, wrapping a foreign error is one line:
|
||||
|
||||
```rust
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub enum KVMError {
|
||||
#[error("libvirt error: {0}")]
|
||||
Libvirt(#[from] virt::error::Error),
|
||||
|
||||
#[error("IO error: {0}")]
|
||||
Io(#[from] std::io::Error),
|
||||
}
|
||||
```
|
||||
|
||||
This means a call that returns `virt::error::Error` can be `?`-propagated into a `Result<_, KVMError>` without any `.map_err(...)`.
|
||||
|
||||
### Typed errors over stringly-typed errors
|
||||
|
||||
Avoid `Box<dyn Error>` or `String` as error return types in library code. Callers need to distinguish errors programmatically — `KVMError::VmAlreadyExists` is actionable, `"VM already exists: foo"` as a `String` is not.
|
||||
|
||||
At binary entry points (e.g., `main`) it is acceptable to convert to `String` or `anyhow::Error` for display.
|
||||
|
||||
---
|
||||
|
||||
## Logging
|
||||
|
||||
### Use the `log` crate macros
|
||||
|
||||
All log output must go through the `log` crate. Never use `println!`, `eprintln!`, or `dbg!` in library code. This makes output compatible with any logging backend (env_logger, tracing, structured logging, etc.).
|
||||
|
||||
```rust
|
||||
// Bad
|
||||
println!("Creating VM: {}", name);
|
||||
|
||||
// Good
|
||||
use log::{info, debug, warn};
|
||||
info!("Creating VM: {name}");
|
||||
debug!("VM XML:\n{xml}");
|
||||
warn!("Network already active, skipping creation");
|
||||
```
|
||||
|
||||
Use the right level:
|
||||
|
||||
| Level | When to use |
|
||||
|---------|-------------|
|
||||
| `error` | Unrecoverable failures (before returning Err) |
|
||||
| `warn` | Recoverable issues, skipped steps |
|
||||
| `info` | High-level progress events visible in normal operation |
|
||||
| `debug` | Detailed operational info useful for debugging |
|
||||
| `trace` | Very granular, per-iteration or per-call data |
|
||||
|
||||
Log before significant operations and after unexpected conditions. Do not log inside tight loops at `info` level.
|
||||
|
||||
---
|
||||
|
||||
## Types and Builders
|
||||
|
||||
### Derive `Serialize` on all public domain types
|
||||
|
||||
All public structs and enums that represent configuration or state should derive `serde::Serialize`. Add `Deserialize` when round-trip serialization is needed.
|
||||
|
||||
### Builder pattern for complex configs
|
||||
|
||||
When a type has more than three fields, or has any optional fields, provide a builder. The builder pattern allows named, incremental construction without positional arguments.
|
||||
|
||||
```rust
|
||||
let config = VmConfig::builder("bootstrap")
|
||||
.cpu(4)
|
||||
.memory_gb(8)
|
||||
.disk(DiskConfig::new(50).labeled("os"))
|
||||
.disk(DiskConfig::new(100).labeled("data"))
|
||||
.network(NetworkRef::named("harmonylan"))
|
||||
.boot_order([BootDevice::Network, BootDevice::Disk])
|
||||
.build();
|
||||
```
|
||||
|
||||
### Avoid `pub` fields on config structs
|
||||
|
||||
Expose data through methods or the builder, not raw field access. This preserves the ability to validate, rename, or change representation without breaking callers.
|
||||
|
||||
---
|
||||
|
||||
## Async
|
||||
|
||||
### Use `tokio` for all async runtime needs
|
||||
|
||||
All async code runs on tokio. Use `tokio::spawn`, `tokio::time`, etc. Use `#[async_trait]` for traits with async methods.
|
||||
|
||||
### No blocking in async context
|
||||
|
||||
Never call blocking I/O (file I/O, network, process spawn) directly in an async function. Use `tokio::fs`, `tokio::process`, or `tokio::task::spawn_blocking` as appropriate.
|
||||
|
||||
---
|
||||
|
||||
## Module Structure
|
||||
|
||||
### Follow the `Score` / `Interpret` pattern
|
||||
|
||||
Modules that represent deployable infrastructure should implement `Score<T: Topology>` and `Interpret<T>`:
|
||||
|
||||
- `Score` is the serializable, clonable configuration declaring *what* to deploy
|
||||
- `Interpret` does the actual work when `execute()` is called
|
||||
|
||||
```rust
|
||||
pub struct KvmScore {
|
||||
network: NetworkConfig,
|
||||
vms: Vec<VmConfig>,
|
||||
}
|
||||
|
||||
impl<T: Topology + KvmHost> Score<T> for KvmScore {
|
||||
fn create_interpret(&self) -> Box<dyn Interpret<T>> {
|
||||
Box::new(KvmInterpret::new(self.clone()))
|
||||
}
|
||||
fn name(&self) -> String { "KvmScore".to_string() }
|
||||
}
|
||||
```
|
||||
|
||||
### Flatten the public API in `mod.rs`
|
||||
|
||||
Internal submodules are implementation detail. Re-export what callers need at the module root:
|
||||
|
||||
```rust
|
||||
// modules/kvm/mod.rs
|
||||
mod connection;
|
||||
mod domain;
|
||||
mod network;
|
||||
mod error;
|
||||
mod xml;
|
||||
|
||||
pub use connection::KvmConnection;
|
||||
pub use domain::{VmConfig, VmConfigBuilder, VmStatus, DiskConfig, BootDevice};
|
||||
pub use error::KvmError;
|
||||
pub use network::NetworkConfig;
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Commit Style
|
||||
|
||||
Follow [Conventional Commits](https://www.conventionalcommits.org/en/v1.0.0/):
|
||||
|
||||
```
|
||||
feat(kvm): add network isolation support
|
||||
fix(kvm): correct memory unit conversion for libvirt
|
||||
refactor(kvm): replace virsh subprocess calls with virt crate bindings
|
||||
docs: add coding guide
|
||||
```
|
||||
|
||||
Keep pull requests small and single-purpose (under ~200 lines excluding generated code). Do not mix refactoring, bug fixes, and new features in one PR.
|
||||
@@ -28,6 +28,11 @@ Harmony's design is based on a few key concepts. Understanding them is the key t
|
||||
- **What it is:** An **Inventory** is the physical material (the "what") used in a cluster. This is most relevant for bare-metal or on-premise topologies.
|
||||
- **Example:** A list of nodes with their roles (control plane, worker), CPU, RAM, and network interfaces. For the `K8sAnywhereTopology`, the inventory might be empty or autoloaded, as the infrastructure is more abstract.
|
||||
|
||||
### 6. Configuration & Secrets
|
||||
|
||||
- **What it is:** Configuration represents the runtime data required to deploy your `Scores`. This includes both non-sensitive state (like cluster hostnames, deployment profiles) and sensitive secrets (like API keys, database passwords).
|
||||
- **How it works:** See the [Configuration Concept Guide](./concepts/configuration.md) to understand Harmony's unified approach to managing schema in Git and state in OpenBao.
|
||||
|
||||
---
|
||||
|
||||
### How They Work Together (The Compile-Time Check)
|
||||
|
||||
107
docs/concepts/configuration.md
Normal file
107
docs/concepts/configuration.md
Normal file
@@ -0,0 +1,107 @@
|
||||
# Configuration and Secrets
|
||||
|
||||
Harmony treats configuration and secrets as a single concern. Developers use one crate, `harmony_config`, to declare, store, and retrieve all runtime data — whether it is a public hostname or a database password.
|
||||
|
||||
## The mental model: schema in Git, state in the store
|
||||
|
||||
### Schema
|
||||
|
||||
In Harmony, the Rust code is the configuration schema. You declare what your module needs by defining a struct:
|
||||
|
||||
```rust
|
||||
#[derive(Config, Serialize, Deserialize, JsonSchema, InteractiveParse)]
|
||||
struct PostgresConfig {
|
||||
pub host: String,
|
||||
pub port: u16,
|
||||
#[config(secret)]
|
||||
pub password: String,
|
||||
}
|
||||
```
|
||||
|
||||
This struct is tracked in Git. When a branch adds a new field, Git tracks that the branch requires a new value. When a branch removes a field, the old value in the store becomes irrelevant. The struct is always authoritative.
|
||||
|
||||
### State
|
||||
|
||||
The actual values live in a config store — by default, OpenBao. No `.env` files, no JSON, no YAML in the repository.
|
||||
|
||||
When you run your code, Harmony reads the struct (schema) and resolves values from the store (state):
|
||||
|
||||
- If the store has the value, it is injected seamlessly.
|
||||
- If the store does not have it, Harmony prompts you in the terminal. Your answer is pushed back to the store automatically.
|
||||
- When a teammate runs the same code, they are not prompted — you already provided the value.
|
||||
|
||||
### How branch switching works
|
||||
|
||||
Because the schema is just Rust code tracked in Git, branch switching works naturally:
|
||||
|
||||
1. You check out `feat/redis`. The code now requires `RedisConfig`.
|
||||
2. You run `cargo run`. Harmony detects that `RedisConfig` has no value in the store. It prompts you.
|
||||
3. You provide the values. Harmony pushes them to OpenBao.
|
||||
4. Your teammate checks out `feat/redis` and runs `cargo run`. No prompt — the values are already in the store.
|
||||
5. You switch back to `main`. `RedisConfig` does not exist in that branch's code. The store entry is ignored.
|
||||
|
||||
## Secrets vs. standard configuration
|
||||
|
||||
From your application code, there is no difference. You always call `harmony_config::get_or_prompt::<T>()`.
|
||||
|
||||
The difference is in the struct definition:
|
||||
|
||||
```rust
|
||||
// Standard config — stored in plaintext, displayed during prompting.
|
||||
#[derive(Config)]
|
||||
struct ClusterConfig {
|
||||
pub api_url: String,
|
||||
pub namespace: String,
|
||||
}
|
||||
|
||||
// Contains a secret field — the entire struct is stored encrypted,
|
||||
// and the password field is masked during terminal prompting.
|
||||
#[derive(Config)]
|
||||
struct DatabaseConfig {
|
||||
pub host: String,
|
||||
#[config(secret)]
|
||||
pub password: String,
|
||||
}
|
||||
```
|
||||
|
||||
If a struct contains any `#[config(secret)]` field, Harmony elevates the entire struct to `ConfigClass::Secret`. The storage backend decides what that means in practice — in the case of OpenBao, it may route the data to a path with stricter ACLs or audit policies.
|
||||
|
||||
## Authentication and team sharing
|
||||
|
||||
Harmony uses Zitadel (hosted at `sso.nationtech.io`) for identity and OpenBao (hosted at `secrets.nationtech.io`) for storage.
|
||||
|
||||
**First run on a new machine:**
|
||||
|
||||
1. Harmony detects that you are not logged in.
|
||||
2. It prints a short code and URL to your terminal, and opens your browser if possible.
|
||||
3. You log in with your corporate identity (Google, GitHub, or Microsoft Entra ID / Azure AD).
|
||||
4. Harmony receives an OIDC token, exchanges it for an OpenBao token, and caches the session locally.
|
||||
|
||||
**Subsequent runs:**
|
||||
|
||||
- Harmony silently refreshes your tokens in the background. You do not need to log in again for up to 90 days of active use.
|
||||
- If you are inactive for 30 days, or if an administrator revokes your access in Zitadel, you will be prompted to re-authenticate.
|
||||
|
||||
**Offboarding:**
|
||||
|
||||
Revoking a user in Zitadel immediately invalidates their ability to refresh tokens or obtain new ones. No manual secret rotation is required.
|
||||
|
||||
## Resolution chain
|
||||
|
||||
When Harmony resolves a config value, it tries sources in order:
|
||||
|
||||
1. **Environment variable** (`HARMONY_CONFIG_{KEY}`) — highest priority. Use this in CI/CD to override any value without touching the store.
|
||||
2. **Config store** (OpenBao for teams, local file for solo/offline use) — the primary source for shared team state.
|
||||
3. **Interactive prompt** — last resort. Prompts the developer and persists the answer back to the store.
|
||||
|
||||
## Schema versioning
|
||||
|
||||
The Rust struct is the single source of truth for what configuration looks like. If a developer renames or removes a field on a branch, the store may still contain data shaped for the old version of the struct, so deserialization fails for that developer. Once they provide a new value, the store holds data in the new shape, and deserialization then fails for any developer whose branch does not have that change.
|
||||
|
||||
In the current implementation, this is handled gracefully: a deserialization failure is treated as a miss, and Harmony re-prompts. The new answer overwrites the stale entry.
|
||||
|
||||
A compile-time migration mechanism is planned for a future release to handle this more rigorously at scale.
|
||||
|
||||
## Offline and local development
|
||||
|
||||
If you are working offline or evaluating Harmony without a team OpenBao instance, the `StoreSource` falls back to a local file store at `~/.local/share/harmony/config/`. The developer experience is identical — prompting, caching, and resolution all work the same way. The only difference is that the state is local to your machine and not shared with teammates.
|
||||
135
docs/guides/adding-capabilities.md
Normal file
135
docs/guides/adding-capabilities.md
Normal file
@@ -0,0 +1,135 @@
|
||||
# Adding Capabilities
|
||||
|
||||
`Capabilities` are traits that a `Topology` implements to expose functionality to Scores. They are the "how" — the specific APIs and features that let a Score translate intent into infrastructure actions.
|
||||
|
||||
## How Capabilities Work
|
||||
|
||||
When a Score declares it needs certain Capabilities:
|
||||
|
||||
```rust
|
||||
impl<T: Topology + K8sclient + HelmCommand> Score<T> for MyScore {
|
||||
// ...
|
||||
}
|
||||
```
|
||||
|
||||
The compiler verifies that the target `Topology` implements both `K8sclient` and `HelmCommand`. If it doesn't, compilation fails. This is the compile-time safety check that prevents invalid configurations from reaching production.
|
||||
|
||||
## Built-in Capabilities
|
||||
|
||||
Harmony provides a set of standard Capabilities:
|
||||
|
||||
| Capability | What it provides |
|
||||
|------------|------------------|
|
||||
| `K8sclient` | A Kubernetes API client |
|
||||
| `HelmCommand` | A configured `helm` CLI invocation |
|
||||
| `TlsRouter` | TLS certificate management |
|
||||
| `NetworkManager` | Host network configuration |
|
||||
| `SwitchClient` | Network switch configuration |
|
||||
| `CertificateManagement` | Certificate issuance via cert-manager |
|
||||
|
||||
## Implementing a Capability
|
||||
|
||||
A Capability is a trait; you provide it by implementing that trait on your Topology:
|
||||
|
||||
```rust
|
||||
use std::sync::Arc;
|
||||
use harmony_k8s::K8sClient;
|
||||
use harmony::topology::K8sclient;
|
||||
|
||||
pub struct MyTopology {
|
||||
kubeconfig: Option<String>,
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl K8sclient for MyTopology {
|
||||
async fn k8s_client(&self) -> Result<Arc<K8sClient>, String> {
|
||||
let client = match &self.kubeconfig {
|
||||
Some(path) => K8sClient::from_kubeconfig(path).await?,
|
||||
None => K8sClient::try_default().await?,
|
||||
};
|
||||
Ok(Arc::new(client))
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Adding a Custom Capability
|
||||
|
||||
For specialized infrastructure needs, add your own Capability as a trait:
|
||||
|
||||
```rust
|
||||
use async_trait::async_trait;
|
||||
use crate::executors::ExecutorError;
|
||||
|
||||
/// A capability for configuring network switches
|
||||
#[async_trait]
|
||||
pub trait SwitchClient: Send + Sync {
|
||||
async fn configure_port(
|
||||
&self,
|
||||
switch: &str,
|
||||
port: &str,
|
||||
vlan: u16,
|
||||
) -> Result<(), ExecutorError>;
|
||||
|
||||
async fn configure_port_channel(
|
||||
&self,
|
||||
switch: &str,
|
||||
name: &str,
|
||||
ports: &[&str],
|
||||
) -> Result<(), ExecutorError>;
|
||||
}
|
||||
```
|
||||
|
||||
Then implement it on your Topology:
|
||||
|
||||
```rust
|
||||
use harmony_infra::brocade::BrocadeClient;
|
||||
|
||||
pub struct MyTopology {
|
||||
switch_client: Arc<dyn SwitchClient>,
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl SwitchClient for MyTopology {
|
||||
async fn configure_port(&self, switch: &str, port: &str, vlan: u16) -> Result<(), ExecutorError> {
|
||||
self.switch_client.configure_port(switch, port, vlan).await
|
||||
}
|
||||
|
||||
async fn configure_port_channel(&self, switch: &str, name: &str, ports: &[&str]) -> Result<(), ExecutorError> {
|
||||
self.switch_client.configure_port_channel(switch, name, ports).await
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Now Scores that need `SwitchClient` can run on `MyTopology`.
|
||||
|
||||
## Capability Composition
|
||||
|
||||
Topologies often compose multiple Capabilities to support complex Scores:
|
||||
|
||||
```rust
|
||||
pub struct HAClusterTopology {
|
||||
pub kubeconfig: Option<String>,
|
||||
pub router: Arc<dyn Router>,
|
||||
pub load_balancer: Arc<dyn LoadBalancer>,
|
||||
pub switch_client: Arc<dyn SwitchClient>,
|
||||
pub dhcp_server: Arc<dyn DhcpServer>,
|
||||
pub dns_server: Arc<dyn DnsServer>,
|
||||
// ...
|
||||
}
|
||||
|
||||
impl K8sclient for HAClusterTopology { ... }
|
||||
impl HelmCommand for HAClusterTopology { ... }
|
||||
impl SwitchClient for HAClusterTopology { ... }
|
||||
impl DhcpServer for HAClusterTopology { ... }
|
||||
impl DnsServer for HAClusterTopology { ... }
|
||||
impl Router for HAClusterTopology { ... }
|
||||
impl LoadBalancer for HAClusterTopology { ... }
|
||||
```
|
||||
|
||||
A Score that needs all of these can run on `HAClusterTopology` because the Topology provides all of them.
|
||||
|
||||
## Best Practices
|
||||
|
||||
- **Keep Capabilities focused** — one Capability per concern (Kubernetes client, Helm, switch config)
|
||||
- **Return meaningful errors** — use specific error types so Scores can handle failures appropriately
|
||||
- **Make Capabilities optional where sensible** — not every Topology needs every Capability; use `Option<T>` or a separate trait for optional features
|
||||
- **Document preconditions** — if a Capability requires the infrastructure to be in a specific state, document it in the trait doc comments
|
||||
40
docs/guides/developer-guide.md
Normal file
40
docs/guides/developer-guide.md
Normal file
@@ -0,0 +1,40 @@
|
||||
# Developer Guide
|
||||
|
||||
This section covers how to extend Harmony by building your own `Score`, `Topology`, and `Capability` implementations.
|
||||
|
||||
## Writing a Score
|
||||
|
||||
A `Score` is a declarative description of desired state. To create your own:
|
||||
|
||||
1. Define a struct that represents your desired state
|
||||
2. Implement the `Score<T>` trait, where `T` is your target `Topology`
|
||||
3. Implement the `Interpret<T>` trait to define how the Score translates to infrastructure actions
|
||||
|
||||
See the [Writing a Score](./writing-a-score.md) guide for a step-by-step walkthrough.
|
||||
|
||||
## Writing a Topology
|
||||
|
||||
A `Topology` models your infrastructure environment. To create your own:
|
||||
|
||||
1. Define a struct that holds your infrastructure configuration
|
||||
2. Implement the `Topology` trait
|
||||
3. Implement the `Capability` traits your Score needs
|
||||
|
||||
See the [Writing a Topology](./writing-a-topology.md) guide for details.
|
||||
|
||||
## Adding Capabilities
|
||||
|
||||
`Capabilities` are the specific APIs or features a `Topology` exposes. They are the bridge between Scores and the actual infrastructure.
|
||||
|
||||
See the [Adding Capabilities](./adding-capabilities.md) guide for details on implementing and exposing Capabilities.
|
||||
|
||||
## Core Traits Reference
|
||||
|
||||
| Trait | Purpose |
|
||||
|-------|---------|
|
||||
| `Score<T>` | Declares desired state ("what") |
|
||||
| `Topology` | Represents infrastructure ("where") |
|
||||
| `Interpret<T>` | Execution logic ("how") |
|
||||
| `Capability` | A feature exposed by a Topology |
|
||||
|
||||
See [Core Concepts](../concepts.md) for the conceptual foundation.
|
||||
@@ -1,42 +1,230 @@
|
||||
# Getting Started Guide
|
||||
|
||||
Welcome to Harmony! This guide will walk you through installing the Harmony framework, setting up a new project, and deploying your first application.
|
||||
This guide walks you through deploying your first application with Harmony — a PostgreSQL cluster on a local Kubernetes cluster (K3D). By the end, you'll understand the core workflow: compile a Score, run it through the Harmony CLI, and verify the result.
|
||||
|
||||
We will build and deploy the "Rust Web App" example, which automatically:
|
||||
## What you'll deploy
|
||||
|
||||
1. Provisions a local K3D (Kubernetes in Docker) cluster.
|
||||
2. Deploys a sample Rust web application.
|
||||
3. Sets up monitoring for the application.
|
||||
A fully functional PostgreSQL cluster running in a local K3D cluster, managed by the CloudNativePG operator. This demonstrates the full Harmony pattern:
|
||||
|
||||
1. Provision a local Kubernetes cluster (K3D)
|
||||
2. Install the required operator (CloudNativePG)
|
||||
3. Create a PostgreSQL cluster
|
||||
4. Expose it as a Kubernetes Service
|
||||
|
||||
## Prerequisites
|
||||
|
||||
Before you begin, you'll need a few tools installed on your system:
|
||||
Before you begin, install the following tools:
|
||||
|
||||
- **Rust & Cargo:** [Install Rust](https://www.rust-lang.org/tools/install)
|
||||
- **Docker:** [Install Docker](https://docs.docker.com/get-docker/) (Required for the K3D local cluster)
|
||||
- **kubectl:** [Install kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl/) (For inspecting the cluster)
|
||||
- **Rust & Cargo:** [Install Rust](https://rust-lang.org/tools/install) (Rust 1.85 or later, required for edition 2024)
|
||||
- **Docker:** [Install Docker](https://docs.docker.com/get-docker/) (required for the local K3D cluster)
|
||||
- **kubectl:** [Install kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl/) (optional, for inspecting the cluster)
|
||||
|
||||
## 1. Install Harmony
|
||||
|
||||
First, clone the Harmony repository and build the project. This gives you the `harmony` CLI and all the core libraries.
|
||||
## Step 1: Clone and build
|
||||
|
||||
```bash
|
||||
# Clone the main repository
|
||||
# Clone the repository
|
||||
git clone https://git.nationtech.io/nationtech/harmony
|
||||
cd harmony
|
||||
|
||||
# Build the project (this may take a few minutes)
|
||||
# Build the project (this may take a few minutes on first run)
|
||||
cargo build --release
|
||||
```
|
||||
|
||||
...
|
||||
## Step 2: Run the PostgreSQL example
|
||||
|
||||
## Next Steps
|
||||
```bash
|
||||
cargo run -p example-postgresql
|
||||
```
|
||||
|
||||
Congratulations, you've just deployed an application using true infrastructure-as-code!
|
||||
Harmony will output its progress as it:
|
||||
|
||||
From here, you can:
|
||||
1. **Creates a K3D cluster** named `harmony-postgres-example` (first run only)
|
||||
2. **Installs the CloudNativePG operator** into the cluster
|
||||
3. **Creates a PostgreSQL cluster** with 1 instance and 1 GiB of storage
|
||||
4. **Prints connection details** for your new database
|
||||
|
||||
- [Explore the Catalogs](../catalogs/README.md): See what other [Scores](../catalogs/scores.md) and [Topologies](../catalogs/topologies.md) are available.
|
||||
- [Read the Use Cases](../use-cases/README.md): Check out the [OKD on Bare Metal](./use-cases/okd-on-bare-metal.md) guide for a more advanced scenario.
|
||||
- [Write your own Score](../guides/writing-a-score.md): Dive into the [Developer Guide](./guides/developer-guide.md) to start building your own components.
|
||||
Expected output (abbreviated):
|
||||
|
||||
```
|
||||
[+] Cluster created
|
||||
[+] Installing CloudNativePG operator
|
||||
[+] Creating PostgreSQL cluster
|
||||
[+] PostgreSQL cluster is ready
|
||||
Namespace: harmony-postgres-example
|
||||
Service: harmony-postgres-example-rw
|
||||
Username: postgres
|
||||
Password: <stored in secret harmony-postgres-example-db-user>
|
||||
```
|
||||
|
||||
## Step 3: Verify the deployment
|
||||
|
||||
Check that the PostgreSQL pods are running:
|
||||
|
||||
```bash
|
||||
kubectl get pods -n harmony-postgres-example
|
||||
```
|
||||
|
||||
You should see something like:
|
||||
|
||||
```
|
||||
NAME READY STATUS RESTARTS AGE
|
||||
harmony-postgres-example-1 1/1 Running 0 2m
|
||||
```
|
||||
|
||||
Get the database password:
|
||||
|
||||
```bash
|
||||
kubectl get secret -n harmony-postgres-example harmony-postgres-example-db-user -o jsonpath='{.data.password}' | base64 -d
|
||||
```
|
||||
|
||||
## Step 4: Connect to the database
|
||||
|
||||
Forward the PostgreSQL port to your local machine:
|
||||
|
||||
```bash
|
||||
kubectl port-forward -n harmony-postgres-example svc/harmony-postgres-example-rw 5432:5432
|
||||
```
|
||||
|
||||
In another terminal, connect with `psql`:
|
||||
|
||||
```bash
|
||||
psql -h localhost -p 5432 -U postgres
|
||||
# Enter the password from Step 3 when prompted
|
||||
```
|
||||
|
||||
Try a simple query:
|
||||
|
||||
```sql
|
||||
SELECT version();
|
||||
```
|
||||
|
||||
## Step 5: Clean up
|
||||
|
||||
To delete the PostgreSQL cluster and the local K3D cluster:
|
||||
|
||||
```bash
|
||||
k3d cluster delete harmony-postgres-example
|
||||
```
|
||||
|
||||
Alternatively, just delete the PostgreSQL cluster without removing K3D:
|
||||
|
||||
```bash
|
||||
kubectl delete namespace harmony-postgres-example
|
||||
```
|
||||
|
||||
## How it works
|
||||
|
||||
The example code (`examples/postgresql/src/main.rs`) is straightforward:
|
||||
|
||||
```rust
|
||||
use harmony::{
|
||||
inventory::Inventory,
|
||||
modules::postgresql::{PostgreSQLScore, capability::PostgreSQLConfig},
|
||||
topology::K8sAnywhereTopology,
|
||||
};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() {
|
||||
let postgres = PostgreSQLScore {
|
||||
config: PostgreSQLConfig {
|
||||
cluster_name: "harmony-postgres-example".to_string(),
|
||||
namespace: "harmony-postgres-example".to_string(),
|
||||
..Default::default()
|
||||
},
|
||||
};
|
||||
|
||||
harmony_cli::run(
|
||||
Inventory::autoload(),
|
||||
K8sAnywhereTopology::from_env(),
|
||||
vec![Box::new(postgres)],
|
||||
None,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
```
|
||||
|
||||
- **`Inventory::autoload()`** discovers the local environment (or uses an existing inventory)
|
||||
- **`K8sAnywhereTopology::from_env()`** connects to K3D if `HARMONY_AUTOINSTALL=true` (the default), or to any Kubernetes cluster via `KUBECONFIG`
|
||||
- **`harmony_cli::run(...)`** executes the Score against the Topology, managing the full lifecycle
|
||||
|
||||
## Connecting to an existing cluster
|
||||
|
||||
By default, Harmony provisions a local K3D cluster. To use an existing Kubernetes cluster instead:
|
||||
|
||||
```bash
|
||||
export KUBECONFIG=/path/to/your/kubeconfig
|
||||
export HARMONY_USE_LOCAL_K3D=false
|
||||
export HARMONY_AUTOINSTALL=false
|
||||
|
||||
cargo run -p example-postgresql
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Docker is not running
|
||||
|
||||
```
|
||||
Error: could not create cluster: docker is not running
|
||||
```
|
||||
|
||||
Start Docker and try again.
|
||||
|
||||
### K3D cluster creation fails
|
||||
|
||||
```
|
||||
Error: failed to create k3d cluster
|
||||
```
|
||||
|
||||
Ensure you have at least 2 CPU cores and 4 GiB of RAM available for Docker.
|
||||
|
||||
### `kubectl` cannot connect to the cluster
|
||||
|
||||
```
|
||||
error: unable to connect to a kubernetes cluster
|
||||
```
|
||||
|
||||
After Harmony creates the cluster, it writes the kubeconfig to `~/.kube/config` or to the path in `KUBECONFIG`. Verify:
|
||||
|
||||
```bash
|
||||
kubectl cluster-info --context k3d-harmony-postgres-example
|
||||
```
|
||||
|
||||
### Port forward fails
|
||||
|
||||
```
|
||||
error: unable to forward port
|
||||
```
|
||||
|
||||
Make sure no other process is using port 5432, or use a different local port:
|
||||
|
||||
```bash
|
||||
kubectl port-forward -n harmony-postgres-example svc/harmony-postgres-example-rw 15432:5432
|
||||
psql -h localhost -p 15432 -U postgres
|
||||
```
|
||||
|
||||
## Next steps
|
||||
|
||||
- [Explore the Scores Catalog](../catalogs/scores.md): See what other Scores are available
|
||||
- [Explore the Topologies Catalog](../catalogs/topologies.md): See what infrastructure Topologies are supported
|
||||
- [Read the Core Concepts](../concepts.md): Understand the Score / Topology / Interpret pattern in depth
|
||||
- [OKD on Bare Metal](../use-cases/okd-on-bare-metal.md): See a complete bare-metal deployment example
|
||||
|
||||
## Advanced examples
|
||||
|
||||
Once you're comfortable with the basics, these examples demonstrate more advanced use cases. Note that some require specific infrastructure (existing Kubernetes clusters, bare-metal hardware, or multi-cluster environments):
|
||||
|
||||
| Example | Description | Prerequisites |
|
||||
|---------|-------------|---------------|
|
||||
| `monitoring` | Deploy Prometheus alerting with Discord webhooks | Existing K8s cluster |
|
||||
| `ntfy` | Deploy ntfy notification server | Existing K8s cluster |
|
||||
| `tenant` | Create a multi-tenant namespace with quotas | Existing K8s cluster |
|
||||
| `cert_manager` | Provision TLS certificates | Existing K8s cluster |
|
||||
| `validate_ceph_cluster_health` | Check Ceph cluster health | Existing Rook/Ceph cluster |
|
||||
| `okd_pxe` / `okd_installation` | Provision OKD on bare metal | HAClusterTopology, bare-metal hardware |
|
||||
|
||||
To run any example:
|
||||
|
||||
```bash
|
||||
cargo run -p example-<example_name>
|
||||
```
|
||||
|
||||
158
docs/guides/kubernetes-ingress.md
Normal file
158
docs/guides/kubernetes-ingress.md
Normal file
@@ -0,0 +1,158 @@
|
||||
# Ingress Resources in Harmony
|
||||
|
||||
Harmony generates standard Kubernetes `networking.k8s.io/v1` Ingress resources. This ensures your deployments are portable across any Kubernetes distribution (vanilla K8s, OKD/OpenShift, K3s, etc.) without requiring vendor-specific configurations.
|
||||
|
||||
By default, Harmony does **not** set `spec.ingressClassName`. This allows the cluster's default ingress controller to automatically claim the resource, which is the correct approach for most single-controller clusters.
|
||||
|
||||
---
|
||||
|
||||
## TLS Configurations
|
||||
|
||||
There are two portable TLS modes for Ingress resources. Use only these in your Harmony deployments.
|
||||
|
||||
### 1. Plain HTTP (No TLS)
|
||||
|
||||
Omit the `tls` block entirely. The Ingress serves traffic over plain HTTP. Use this for local development or when TLS is terminated elsewhere (e.g., by a service mesh or external load balancer).
|
||||
|
||||
```yaml
|
||||
apiVersion: networking.k8s.io/v1
|
||||
kind: Ingress
|
||||
metadata:
|
||||
name: my-app
|
||||
namespace: my-ns
|
||||
spec:
|
||||
rules:
|
||||
- host: app.example.com
|
||||
http:
|
||||
paths:
|
||||
- path: /
|
||||
pathType: Prefix
|
||||
backend:
|
||||
service:
|
||||
name: my-app
|
||||
port:
|
||||
number: 8080
|
||||
```
|
||||
|
||||
### 2. HTTPS with a Named TLS Secret
|
||||
|
||||
Provide a `tls` block with both `hosts` and a `secretName`. The ingress controller will use that Secret for TLS termination. The Secret must be a `kubernetes.io/tls` type in the same namespace as the Ingress.
|
||||
|
||||
There are two ways to provide this Secret.
|
||||
|
||||
#### Option A: Manual Secret
|
||||
|
||||
Create the TLS Secret yourself before deploying the Ingress. This is suitable when certificates are issued outside the cluster or managed by another system.
|
||||
|
||||
```yaml
|
||||
apiVersion: networking.k8s.io/v1
|
||||
kind: Ingress
|
||||
metadata:
|
||||
name: my-app
|
||||
namespace: my-ns
|
||||
spec:
|
||||
rules:
|
||||
- host: app.example.com
|
||||
http:
|
||||
paths:
|
||||
- path: /
|
||||
pathType: Prefix
|
||||
backend:
|
||||
service:
|
||||
name: my-app
|
||||
port:
|
||||
number: 8080
|
||||
tls:
|
||||
- hosts:
|
||||
- app.example.com
|
||||
secretName: app-example-com-tls
|
||||
```
|
||||
|
||||
#### Option B: Automated via cert-manager (Recommended)
|
||||
|
||||
Add the `cert-manager.io/cluster-issuer` annotation to the Ingress. cert-manager will automatically perform the ACME challenge, generate the certificate, store it in the named Secret, and handle renewal. You do not create the Secret yourself.
|
||||
|
||||
```yaml
|
||||
apiVersion: networking.k8s.io/v1
|
||||
kind: Ingress
|
||||
metadata:
|
||||
name: my-app
|
||||
namespace: my-ns
|
||||
annotations:
|
||||
cert-manager.io/cluster-issuer: letsencrypt-prod
|
||||
spec:
|
||||
rules:
|
||||
- host: app.example.com
|
||||
http:
|
||||
paths:
|
||||
- path: /
|
||||
pathType: Prefix
|
||||
backend:
|
||||
service:
|
||||
name: my-app
|
||||
port:
|
||||
number: 8080
|
||||
tls:
|
||||
- hosts:
|
||||
- app.example.com
|
||||
secretName: app-example-com-tls
|
||||
```
|
||||
|
||||
If you use a namespace-scoped `Issuer` instead of a `ClusterIssuer`, replace the annotation with `cert-manager.io/issuer: <name>`.
|
||||
|
||||
---
|
||||
|
||||
## Do Not Use: TLS Without `secretName`
|
||||
|
||||
Avoid TLS entries that omit `secretName`:
|
||||
|
||||
```yaml
|
||||
# ⚠️ Non-portable — do not use
|
||||
tls:
|
||||
- hosts:
|
||||
- app.example.com
|
||||
```
|
||||
|
||||
Behavior for this pattern is **controller-specific and not portable**. On OKD/OpenShift, the ingress-to-route translation rejects it as incomplete. On other controllers, it may silently serve a self-signed fallback or fail in unpredictable ways. Harmony does not support this pattern.
|
||||
|
||||
---
|
||||
|
||||
## Prerequisites for cert-manager
|
||||
|
||||
To use automated certificates (Option B above):
|
||||
|
||||
1. **cert-manager** must be installed on the cluster.
|
||||
2. A `ClusterIssuer` or `Issuer` must exist. A typical Let's Encrypt production issuer:
|
||||
|
||||
```yaml
|
||||
apiVersion: cert-manager.io/v1
|
||||
kind: ClusterIssuer
|
||||
metadata:
|
||||
name: letsencrypt-prod
|
||||
spec:
|
||||
acme:
|
||||
server: https://acme-v02.api.letsencrypt.org/directory
|
||||
email: team@example.com
|
||||
privateKeySecretRef:
|
||||
name: letsencrypt-prod-account-key
|
||||
solvers:
|
||||
- http01:
|
||||
ingress: {}
|
||||
```
|
||||
|
||||
3. **DNS must already resolve** to the cluster's ingress endpoint before the Ingress is created. The HTTP01 challenge requires this routing to be active.
|
||||
|
||||
For wildcard certificates (e.g. `*.example.com`), HTTP01 cannot be used — configure a DNS01 solver with credentials for your DNS provider instead.
|
||||
|
||||
---
|
||||
|
||||
## OKD / OpenShift Notes
|
||||
|
||||
On OKD, standard Ingress resources are automatically translated into OpenShift `Route` objects. The default TLS termination mode is `edge`, which is correct for most HTTP applications. To control this explicitly, add:
|
||||
|
||||
```yaml
|
||||
annotations:
|
||||
route.openshift.io/termination: edge # or passthrough / reencrypt
|
||||
```
|
||||
|
||||
This annotation is ignored on non-OpenShift clusters and is safe to include unconditionally.
|
||||
164
docs/guides/writing-a-score.md
Normal file
164
docs/guides/writing-a-score.md
Normal file
@@ -0,0 +1,164 @@
|
||||
# Writing a Score
|
||||
|
||||
A `Score` declares _what_ you want to achieve. It is decoupled from _how_ it is achieved — that logic lives in an `Interpret`.
|
||||
|
||||
## The Pattern
|
||||
|
||||
A Score consists of two parts:
|
||||
|
||||
1. **A struct** — holds the configuration for your desired state
|
||||
2. **A `Score<T>` implementation** — returns an `Interpret` that knows how to execute
|
||||
|
||||
An `Interpret` contains the actual execution logic and connects your Score to the capabilities exposed by a `Topology`.
|
||||
|
||||
## Example: A Simple Score
|
||||
|
||||
Here's a simplified version of `NtfyScore` from the `ntfy` module:
|
||||
|
||||
```rust
|
||||
use async_trait::async_trait;
|
||||
use harmony::{
|
||||
interpret::{Interpret, InterpretError, Outcome},
|
||||
inventory::Inventory,
|
||||
score::Score,
|
||||
topology::{HelmCommand, K8sclient, Topology},
|
||||
};
|
||||
|
||||
/// MyScore declares "I want to install the ntfy server"
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct MyScore {
|
||||
pub namespace: String,
|
||||
pub host: String,
|
||||
}
|
||||
|
||||
impl<T: Topology + HelmCommand + K8sclient> Score<T> for MyScore {
|
||||
fn create_interpret(&self) -> Box<dyn Interpret<T>> {
|
||||
Box::new(MyInterpret { score: self.clone() })
|
||||
}
|
||||
|
||||
fn name(&self) -> String {
|
||||
"ntfy [MyScore]".into()
|
||||
}
|
||||
}
|
||||
|
||||
/// MyInterpret knows _how_ to install ntfy using the Topology's capabilities
|
||||
#[derive(Debug)]
|
||||
pub struct MyInterpret {
|
||||
pub score: MyScore,
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl<T: Topology + HelmCommand + K8sclient> Interpret<T> for MyInterpret {
|
||||
async fn execute(
|
||||
&self,
|
||||
inventory: &Inventory,
|
||||
topology: &T,
|
||||
) -> Result<Outcome, InterpretError> {
|
||||
// 1. Get a Kubernetes client from the Topology
|
||||
let client = topology.k8s_client().await?;
|
||||
|
||||
// 2. Use Helm to install the ntfy chart
|
||||
// (via topology's HelmCommand capability)
|
||||
|
||||
// 3. Wait for the deployment to be ready
|
||||
client
|
||||
.wait_until_deployment_ready("ntfy", Some(&self.score.namespace), None)
|
||||
.await?;
|
||||
|
||||
Ok(Outcome::success("ntfy installed".to_string()))
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## The Compile-Time Safety Check
|
||||
|
||||
The generic `Score<T>` trait is bounded by `T: Topology`. This means the compiler enforces that your Score only runs on Topologies that expose the capabilities your Interpret needs:
|
||||
|
||||
```rust
|
||||
// This only compiles if K8sAnywhereTopology (or any T)
|
||||
// implements HelmCommand and K8sclient
|
||||
impl<T: Topology + HelmCommand + K8sclient> Score<T> for MyScore { ... }
|
||||
```
|
||||
|
||||
If you try to run this Score against a Topology that doesn't expose `HelmCommand`, you get a compile error — before any code runs.
|
||||
|
||||
## Using Your Score
|
||||
|
||||
Once defined, your Score integrates with the Harmony CLI:
|
||||
|
||||
```rust
|
||||
use harmony::{
|
||||
inventory::Inventory,
|
||||
topology::K8sAnywhereTopology,
|
||||
};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() {
|
||||
let my_score = MyScore {
|
||||
namespace: "monitoring".to_string(),
|
||||
host: "ntfy.example.com".to_string(),
|
||||
};
|
||||
|
||||
harmony_cli::run(
|
||||
Inventory::autoload(),
|
||||
K8sAnywhereTopology::from_env(),
|
||||
vec![Box::new(my_score)],
|
||||
None,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
```
|
||||
|
||||
## Key Patterns
|
||||
|
||||
### Composing Scores
|
||||
|
||||
Scores can include other Scores via features:
|
||||
|
||||
```rust
|
||||
let app = ApplicationScore {
|
||||
features: vec![
|
||||
Box::new(PackagingDeployment { application: app.clone() }),
|
||||
Box::new(Monitoring { application: app.clone(), alert_receiver: vec![] }),
|
||||
],
|
||||
application: app,
|
||||
};
|
||||
```
|
||||
|
||||
### Reusing Interpret Logic
|
||||
|
||||
Many Scores delegate to shared `Interpret` implementations. For example, `HelmChartScore` provides a reusable Interpret for any Helm-based deployment. Your Score can wrap it:
|
||||
|
||||
```rust
|
||||
impl<T: Topology + HelmCommand> Score<T> for MyScore {
|
||||
fn create_interpret(&self) -> Box<dyn Interpret<T>> {
|
||||
Box::new(HelmChartInterpret { /* your config */ })
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Accessing Topology Capabilities
|
||||
|
||||
Your Interpret accesses infrastructure through Capabilities exposed by the Topology:
|
||||
|
||||
```rust
|
||||
// Via the Topology trait directly
|
||||
let k8s_client = topology.k8s_client().await?;
|
||||
let helm = topology.get_helm_command();
|
||||
|
||||
// Or via Capability traits
|
||||
impl<T: Topology + K8sclient> Interpret<T> for MyInterpret {
|
||||
async fn execute(...) {
|
||||
let client = topology.k8s_client().await?;
|
||||
// use client...
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
- **Keep Scores focused** — one Score per concern (deployment, monitoring, networking)
|
||||
- **Use `..Default::default()`** for optional fields so callers only need to specify what they care about
|
||||
- **Return `Outcome`** — use `Outcome::success`, `Outcome::failure`, or `Outcome::success_with_details` to communicate results clearly
|
||||
- **Handle errors gracefully** — return meaningful `InterpretError` messages that help operators debug issues
|
||||
176
docs/guides/writing-a-topology.md
Normal file
176
docs/guides/writing-a-topology.md
Normal file
@@ -0,0 +1,176 @@
|
||||
# Writing a Topology
|
||||
|
||||
A `Topology` models your infrastructure environment and exposes `Capability` traits that Scores use to interact with it. Where a Score declares _what_ you want, a Topology exposes _what_ it can do.
|
||||
|
||||
## The Minimum Implementation
|
||||
|
||||
At minimum, a Topology needs:
|
||||
|
||||
```rust
|
||||
use async_trait::async_trait;
|
||||
use harmony::{
|
||||
topology::{PreparationError, PreparationOutcome, Topology},
|
||||
};
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct MyTopology {
|
||||
pub name: String,
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl Topology for MyTopology {
|
||||
fn name(&self) -> &str {
|
||||
"MyTopology"
|
||||
}
|
||||
|
||||
async fn ensure_ready(&self) -> Result<PreparationOutcome, PreparationError> {
|
||||
// Verify the infrastructure is accessible and ready
|
||||
Ok(PreparationOutcome::Success { details: "ready".to_string() })
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Implementing Capabilities
|
||||
|
||||
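Scores express dependencies on Capabilities through trait bounds. For example, if your Topology should support Scores that deploy Helm charts, implement `HelmCommand`. The snippets below assume `MyTopology` also has a `kubeconfig: Option<String>` field in addition to `name`: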
|
||||
|
||||
```rust
|
||||
use std::process::Command;
|
||||
use harmony::topology::HelmCommand;
|
||||
|
||||
impl HelmCommand for MyTopology {
|
||||
fn get_helm_command(&self) -> Command {
|
||||
let mut cmd = Command::new("helm");
|
||||
if let Some(kubeconfig) = &self.kubeconfig {
|
||||
cmd.arg("--kubeconfig").arg(kubeconfig);
|
||||
}
|
||||
cmd
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
For Scores that need a Kubernetes client, implement `K8sclient`:
|
||||
|
||||
```rust
|
||||
use std::sync::Arc;
|
||||
use harmony_k8s::K8sClient;
|
||||
use harmony::topology::K8sclient;
|
||||
|
||||
#[async_trait]
|
||||
impl K8sclient for MyTopology {
|
||||
async fn k8s_client(&self) -> Result<Arc<K8sClient>, String> {
|
||||
let client = if let Some(kubeconfig) = &self.kubeconfig {
|
||||
K8sClient::from_kubeconfig(kubeconfig).await?
|
||||
} else {
|
||||
K8sClient::try_default().await?
|
||||
};
|
||||
Ok(Arc::new(client))
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Loading Topology from Environment
|
||||
|
||||
For flexibility, implement `from_env()` to read configuration from environment variables:
|
||||
|
||||
```rust
|
||||
impl MyTopology {
|
||||
pub fn from_env() -> Self {
|
||||
Self {
|
||||
name: std::env::var("MY_TOPOLOGY_NAME")
|
||||
.unwrap_or_else(|_| "default".to_string()),
|
||||
kubeconfig: std::env::var("KUBECONFIG").ok(),
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
This pattern lets operators switch between environments without recompiling:
|
||||
|
||||
```bash
|
||||
export KUBECONFIG=/path/to/prod-cluster.kubeconfig
|
||||
cargo run --example my_example
|
||||
```
|
||||
|
||||
## Complete Example: K8sAnywhereTopology
|
||||
|
||||
The `K8sAnywhereTopology` is the most commonly used Topology and handles both local (K3D) and remote Kubernetes clusters:
|
||||
|
||||
```rust
|
||||
pub struct K8sAnywhereTopology {
|
||||
pub k8s_state: Arc<OnceCell<K8sState>>,
|
||||
pub tenant_manager: Arc<OnceCell<TenantManager>>,
|
||||
pub config: Arc<K8sAnywhereConfig>,
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl Topology for K8sAnywhereTopology {
|
||||
fn name(&self) -> &str {
|
||||
"K8sAnywhereTopology"
|
||||
}
|
||||
|
||||
async fn ensure_ready(&self) -> Result<PreparationOutcome, PreparationError> {
|
||||
// 1. If autoinstall is enabled and no cluster exists, provision K3D
|
||||
// 2. Verify kubectl connectivity
|
||||
// 3. Optionally wait for cluster operators to be ready
|
||||
Ok(PreparationOutcome::Success { details: "cluster ready".to_string() })
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Key Patterns
|
||||
|
||||
### Lazy Initialization
|
||||
|
||||
Use `OnceCell` for expensive resources like Kubernetes clients:
|
||||
|
||||
```rust
|
||||
pub struct K8sAnywhereTopology {
|
||||
k8s_state: Arc<OnceCell<K8sState>>,
|
||||
}
|
||||
```
|
||||
|
||||
### Multi-Target Topologies
|
||||
|
||||
For Scores that span multiple clusters (like NATS supercluster), implement `MultiTargetTopology`:
|
||||
|
||||
```rust
|
||||
pub trait MultiTargetTopology: Topology {
|
||||
fn current_target(&self) -> &str;
|
||||
fn set_target(&mut self, target: &str);
|
||||
}
|
||||
```
|
||||
|
||||
### Composing Topologies
|
||||
|
||||
Complex topologies combine multiple infrastructure components:
|
||||
|
||||
```rust
|
||||
pub struct HAClusterTopology {
|
||||
pub router: Arc<dyn Router>,
|
||||
pub load_balancer: Arc<dyn LoadBalancer>,
|
||||
pub firewall: Arc<dyn Firewall>,
|
||||
pub dhcp_server: Arc<dyn DhcpServer>,
|
||||
pub dns_server: Arc<dyn DnsServer>,
|
||||
pub kubeconfig: Option<String>,
|
||||
// ...
|
||||
}
|
||||
```
|
||||
|
||||
## Testing Your Topology
|
||||
|
||||
Test Topologies in isolation by implementing them against mock infrastructure:
|
||||
|
||||
```rust
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_topology_ensure_ready() {
|
||||
let topo = MyTopology::from_env();
|
||||
let result = topo.ensure_ready().await;
|
||||
assert!(result.is_ok());
|
||||
}
|
||||
}
|
||||
```
|
||||
16
docs/one_liners.md
Normal file
16
docs/one_liners.md
Normal file
@@ -0,0 +1,16 @@
|
||||
# Handy one liners for infrastructure management
|
||||
|
||||
### Delete all evicted pods from a cluster
|
||||
|
||||
```sh
|
||||
kubectl get po -A | grep Evic | awk '{ print "-n " $1 " " $2 }' | xargs -L 1 kubectl delete po
|
||||
```
|
||||
> Pods are evicted when the node they are running on lacks the resources to keep them going. The most common case is when ephemeral storage becomes too full because of something like a log file getting too big.
|
||||
>
|
||||
> It could also happen because of memory or cpu pressure due to unpredictable workloads.
|
||||
>
|
||||
> This means it is generally ok to delete them.
|
||||
>
|
||||
> However, in a perfectly configured deployment and cluster, pods should rarely, if ever, get evicted. For example, a log file getting too big should be reconfigured not to use too much space, or the deployment should be configured to reserve the correct amount of ephemeral storage space.
|
||||
>
|
||||
> Note that deleting evicted pods does not solve the underlying issue. Make sure to understand why the pod was evicted in the first place and put the proper solution in place.
|
||||
17
docs/use-cases/README.md
Normal file
17
docs/use-cases/README.md
Normal file
@@ -0,0 +1,17 @@
|
||||
# Use Cases
|
||||
|
||||
Real-world scenarios demonstrating Harmony in action.
|
||||
|
||||
## Available Use Cases
|
||||
|
||||
### [PostgreSQL on Local K3D](./postgresql-on-local-k3d.md)
|
||||
|
||||
Deploy a fully functional PostgreSQL cluster on a local K3D cluster in under 10 minutes. The quickest way to see Harmony in action.
|
||||
|
||||
### [OKD on Bare Metal](./okd-on-bare-metal.md)
|
||||
|
||||
A complete walkthrough of bootstrapping a high-availability OKD cluster from physical hardware. Covers inventory discovery, bootstrap, control plane, and worker provisioning.
|
||||
|
||||
---
|
||||
|
||||
_These use cases are community-tested scenarios. For questions or contributions, open an issue on the [Harmony repository](https://git.nationtech.io/NationTech/harmony/issues)._
|
||||
159
docs/use-cases/okd-on-bare-metal.md
Normal file
159
docs/use-cases/okd-on-bare-metal.md
Normal file
@@ -0,0 +1,159 @@
|
||||
# Use Case: OKD on Bare Metal
|
||||
|
||||
Provision a production-grade OKD (OpenShift Kubernetes Distribution) cluster from physical hardware using Harmony. This use case covers the full lifecycle: hardware discovery, bootstrap, control plane, workers, and post-install validation.
|
||||
|
||||
## What you'll have at the end
|
||||
|
||||
A highly-available OKD cluster with:
|
||||
- 3 control plane nodes
|
||||
- 2+ worker nodes
|
||||
- Network bonding configured on nodes and switches
|
||||
- Load balancer routing API and ingress traffic
|
||||
- DNS and DHCP services for the cluster
|
||||
- Post-install health validation
|
||||
|
||||
## Target hardware model
|
||||
|
||||
This setup assumes a typical lab environment:
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────┐
|
||||
│ Network 192.168.x.0/24 (flat, DHCP + PXE capable) │
|
||||
│ │
|
||||
│ ┌──────────┐ ┌──────────┐ ┌──────────┐ │
|
||||
│ │ cp0 │ │ cp1 │ │ cp2 │ (control) │
|
||||
│ └──────────┘ └──────────┘ └──────────┘ │
|
||||
│ ┌──────────┐ ┌──────────┐ │
|
||||
│ │ wk0 │ │ wk1 │ ... (workers) │
|
||||
│ └──────────┘ └──────────┘ │
|
||||
│ ┌──────────┐ │
|
||||
│ │ bootstrap│ (temporary, can be repurposed) │
|
||||
│ └──────────┘ │
|
||||
│ │
|
||||
│ ┌──────────┐ ┌──────────┐ │
|
||||
│ │ firewall │ │ switch │ (OPNsense + Brocade) │
|
||||
│ └──────────┘ └──────────┘ │
|
||||
└─────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## Required infrastructure
|
||||
|
||||
Harmony models this as an `HAClusterTopology`, which requires these capabilities:
|
||||
|
||||
| Capability | Implementation |
|
||||
|------------|---------------|
|
||||
| **Router** | OPNsense firewall |
|
||||
| **Load Balancer** | OPNsense HAProxy |
|
||||
| **Firewall** | OPNsense |
|
||||
| **DHCP Server** | OPNsense |
|
||||
| **TFTP Server** | OPNsense |
|
||||
| **HTTP Server** | OPNsense |
|
||||
| **DNS Server** | OPNsense |
|
||||
| **Node Exporter** | Prometheus node_exporter on OPNsense |
|
||||
| **Switch Client** | Brocade SNMP |
|
||||
|
||||
See `examples/okd_installation/` for a reference topology implementation.
|
||||
|
||||
## The Provisioning Pipeline
|
||||
|
||||
Harmony orchestrates OKD installation in ordered stages:
|
||||
|
||||
### Stage 1: Inventory Discovery (`OKDSetup01InventoryScore`)
|
||||
|
||||
Harmony boots all nodes via PXE into a CentOS Stream live environment, runs an inventory agent on each, and collects:
|
||||
- MAC addresses and NIC details
|
||||
- IP addresses assigned by DHCP
|
||||
- Hardware profile (CPU, RAM, storage)
|
||||
|
||||
This is the "discovery-first" approach: no pre-configuration required on nodes.
|
||||
|
||||
### Stage 2: Bootstrap Node (`OKDSetup02BootstrapScore`)
|
||||
|
||||
The user selects one discovered node to serve as the bootstrap node. Harmony:
|
||||
- Renders per-MAC iPXE boot configuration with OKD 4.19 SCOS live assets + ignition
|
||||
- Reboots the bootstrap node via SSH
|
||||
- Waits for the bootstrap process to complete (API server becomes available)
|
||||
|
||||
### Stage 3: Control Plane (`OKDSetup03ControlPlaneScore`)
|
||||
|
||||
With bootstrap complete, Harmony provisions the control plane nodes:
|
||||
- Renders per-MAC iPXE for each control plane node
|
||||
- Reboots each node via SSH and waits for it to join the cluster
|
||||
- Applies network bond configuration via NMState MachineConfig where relevant
|
||||
|
||||
### Stage 4: Network Bonding (`OKDSetupPersistNetworkBondScore`)
|
||||
|
||||
Configures LACP bonds on nodes and corresponding port-channels on the switch stack for high availability.
|
||||
|
||||
### Stage 5: Worker Nodes (`OKDSetup04WorkersScore`)
|
||||
|
||||
Provisions worker nodes similarly to control plane, joining them to the cluster.
|
||||
|
||||
### Stage 6: Sanity Check (`OKDSetup05SanityCheckScore`)
|
||||
|
||||
Validates:
|
||||
- API server is reachable
|
||||
- Ingress controller is operational
|
||||
- Cluster operators are healthy
|
||||
- SDN (software-defined networking) is functional
|
||||
|
||||
### Stage 7: Installation Report (`OKDSetup06InstallationReportScore`)
|
||||
|
||||
Produces a machine-readable JSON report and human-readable summary of the installation.
|
||||
|
||||
## Network notes
|
||||
|
||||
**During discovery:** Ports must be in access mode (no LACP). DHCP succeeds; iPXE loads CentOS Stream live with Kickstart and starts the inventory endpoint.
|
||||
|
||||
**During provisioning:** After SCOS is on disk and Ignition/MachineConfig can be applied, bonds are set persistently. This avoids the PXE/DHCP recovery race condition that occurs if bonding is configured too early.
|
||||
|
||||
**PXE limitation:** The generic discovery path cannot use bonded networks for PXE boot because the DHCP recovery process conflicts with bond formation.
|
||||
|
||||
## Configuration knobs
|
||||
|
||||
When using `OKDInstallationPipeline`, configure these domains:
|
||||
|
||||
| Parameter | Example | Description |
|
||||
|-----------|---------|-------------|
|
||||
| `public_domain` | `apps.example.com` | Wildcard domain for application ingress |
|
||||
| `internal_domain` | `cluster.local` | Internal cluster DNS domain |
|
||||
|
||||
## Running the example
|
||||
|
||||
See `examples/okd_installation/` for a complete reference. The topology must be configured with your infrastructure details:
|
||||
|
||||
```bash
|
||||
# Configure the example with your hardware/network specifics
|
||||
# See examples/okd_installation/src/topology.rs
|
||||
|
||||
cargo run -p example-okd_installation
|
||||
```
|
||||
|
||||
This example requires:
|
||||
- Physical hardware configured as described above
|
||||
- OPNsense firewall with SSH access
|
||||
- Brocade switch with SNMP access
|
||||
- All nodes connected to the same Layer 2 network
|
||||
|
||||
## Post-install
|
||||
|
||||
After the cluster is bootstrapped, `~/.kube/config` is updated with the cluster credentials. Verify:
|
||||
|
||||
```bash
|
||||
kubectl get nodes
|
||||
kubectl get pods -n openshift-monitoring
|
||||
oc get routes -n openshift-console
|
||||
```
|
||||
|
||||
## Next steps
|
||||
|
||||
- Enable monitoring with `PrometheusAlertScore` or `OpenshiftClusterAlertScore`
|
||||
- Configure TLS certificates with `CertManagerHelmScore`
|
||||
- Add storage with Rook Ceph
|
||||
- Scale workers with `OKDSetup04WorkersScore`
|
||||
|
||||
## Further reading
|
||||
|
||||
- [OKD Installation Module](../../harmony/src/modules/okd/installation.rs) — source of truth for pipeline stages
|
||||
- [HAClusterTopology](../../harmony/src/domain/topology/ha_cluster.rs) — infrastructure capability model
|
||||
- [Scores Catalog](../catalogs/scores.md) — all available Scores including OKD-specific ones
|
||||
115
docs/use-cases/postgresql-on-local-k3d.md
Normal file
115
docs/use-cases/postgresql-on-local-k3d.md
Normal file
@@ -0,0 +1,115 @@
|
||||
# Use Case: PostgreSQL on Local K3D
|
||||
|
||||
Deploy a production-grade PostgreSQL cluster on a local Kubernetes cluster (K3D) using Harmony. This is the fastest way to get started with Harmony and requires no external infrastructure.
|
||||
|
||||
## What you'll have at the end
|
||||
|
||||
A fully operational PostgreSQL cluster with:
|
||||
- 1 primary instance with 1 GiB of storage
|
||||
- CloudNativePG operator managing the cluster lifecycle
|
||||
- Automatic failover support (foundation for high-availability)
|
||||
- Exposed as a Kubernetes Service for easy connection
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- Rust 2024 edition
|
||||
- Docker running locally
|
||||
- ~5 minutes
|
||||
|
||||
## The Score
|
||||
|
||||
The entire deployment is expressed in ~20 lines of Rust:
|
||||
|
||||
```rust
|
||||
use harmony::{
|
||||
inventory::Inventory,
|
||||
modules::postgresql::{PostgreSQLScore, capability::PostgreSQLConfig},
|
||||
topology::K8sAnywhereTopology,
|
||||
};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() {
|
||||
let postgres = PostgreSQLScore {
|
||||
config: PostgreSQLConfig {
|
||||
cluster_name: "harmony-postgres-example".to_string(),
|
||||
namespace: "harmony-postgres-example".to_string(),
|
||||
..Default::default()
|
||||
},
|
||||
};
|
||||
|
||||
harmony_cli::run(
|
||||
Inventory::autoload(),
|
||||
K8sAnywhereTopology::from_env(),
|
||||
vec![Box::new(postgres)],
|
||||
None,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
```
|
||||
|
||||
## What Harmony does
|
||||
|
||||
When you run this, Harmony:
|
||||
|
||||
1. **Connects to K8sAnywhereTopology** — this auto-provisions a K3D cluster if none exists
|
||||
2. **Installs the CloudNativePG operator** — one-time setup that enables PostgreSQL cluster management in Kubernetes
|
||||
3. **Creates a PostgreSQL cluster** — Harmony translates the Score into a `Cluster` CRD and applies it
|
||||
4. **Exposes the database** — creates a Kubernetes Service for the PostgreSQL primary
|
||||
|
||||
## Running it
|
||||
|
||||
```bash
|
||||
cargo run -p example-postgresql
|
||||
```
|
||||
|
||||
## Verifying the deployment
|
||||
|
||||
```bash
|
||||
# Check pods
|
||||
kubectl get pods -n harmony-postgres-example
|
||||
|
||||
# Get the password
|
||||
PASSWORD=$(kubectl get secret -n harmony-postgres-example \
|
||||
harmony-postgres-example-db-user \
|
||||
-o jsonpath='{.data.password}' | base64 -d)
|
||||
|
||||
# Connect via port-forward
|
||||
kubectl port-forward -n harmony-postgres-example svc/harmony-postgres-example-rw 5432:5432
|
||||
PGPASSWORD="$PASSWORD" psql -h localhost -p 5432 -U postgres
|
||||
```
|
||||
|
||||
## Customizing the deployment
|
||||
|
||||
The `PostgreSQLConfig` struct supports:
|
||||
|
||||
| Field | Default | Description |
|
||||
|-------|---------|-------------|
|
||||
| `cluster_name` | — | Name of the PostgreSQL cluster |
|
||||
| `namespace` | — | Kubernetes namespace to deploy to |
|
||||
| `instances` | `1` | Number of instances |
|
||||
| `storage_size` | `1Gi` | Persistent storage size per instance |
|
||||
|
||||
Example with custom settings:
|
||||
|
||||
```rust
|
||||
let postgres = PostgreSQLScore {
|
||||
config: PostgreSQLConfig {
|
||||
cluster_name: "my-prod-db".to_string(),
|
||||
namespace: "database".to_string(),
|
||||
instances: 3,
|
||||
storage_size: "10Gi".to_string().into(),
|
||||
..Default::default()
|
||||
},
|
||||
};
|
||||
```
|
||||
|
||||
## Extending the pattern
|
||||
|
||||
This pattern extends to any Kubernetes-native workload:
|
||||
|
||||
- Add **monitoring** by including a `Monitoring` feature alongside your Score
|
||||
- Add **TLS certificates** by including a `CertificateScore`
|
||||
- Add **tenant isolation** by wrapping in a `TenantScore`
|
||||
|
||||
See [Scores Catalog](../catalogs/scores.md) for the full list.
|
||||
127
examples/README.md
Normal file
127
examples/README.md
Normal file
@@ -0,0 +1,127 @@
|
||||
# Examples
|
||||
|
||||
This directory contains runnable examples demonstrating Harmony's capabilities. Each example is a self-contained program that can be run with `cargo run -p example-<name>`.
|
||||
|
||||
## Quick Reference
|
||||
|
||||
| Example | Description | Local K3D | Existing Cluster | Hardware Needed |
|
||||
|---------|-------------|:---------:|:----------------:|:---------------:|
|
||||
| `postgresql` | Deploy a PostgreSQL cluster | ✅ | ✅ | — |
|
||||
| `ntfy` | Deploy ntfy notification server | ✅ | ✅ | — |
|
||||
| `tenant` | Create a multi-tenant namespace | ✅ | ✅ | — |
|
||||
| `cert_manager` | Provision TLS certificates | ✅ | ✅ | — |
|
||||
| `node_health` | Check Kubernetes node health | ✅ | ✅ | — |
|
||||
| `monitoring` | Deploy Prometheus alerting | ✅ | ✅ | — |
|
||||
| `monitoring_with_tenant` | Monitoring + tenant isolation | ✅ | ✅ | — |
|
||||
| `operatorhub_catalog` | Install OperatorHub catalog | ✅ | ✅ | — |
|
||||
| `validate_ceph_cluster_health` | Verify Ceph cluster health | — | ✅ | Rook/Ceph |
|
||||
| `remove_rook_osd` | Remove a Rook OSD | — | ✅ | Rook/Ceph |
|
||||
| `brocade_snmp_server` | Configure Brocade switch SNMP | — | ✅ | Brocade switch |
|
||||
| `opnsense_node_exporter` | Node exporter on OPNsense | — | ✅ | OPNsense firewall |
|
||||
| `okd_pxe` | PXE boot configuration for OKD | — | — | ✅ |
|
||||
| `okd_installation` | Full OKD bare-metal install | — | — | ✅ |
|
||||
| `okd_cluster_alerts` | OKD cluster monitoring alerts | — | ✅ | OKD cluster |
|
||||
| `multisite_postgres` | Multi-site PostgreSQL failover | — | ✅ | Multi-cluster |
|
||||
| `nats` | Deploy NATS messaging | — | ✅ | Multi-cluster |
|
||||
| `nats-supercluster` | NATS supercluster across sites | — | ✅ | Multi-cluster |
|
||||
| `lamp` | LAMP stack deployment | ✅ | ✅ | — |
|
||||
| `openbao` | Deploy OpenBao vault | ✅ | ✅ | — |
|
||||
| `zitadel` | Deploy Zitadel identity provider | ✅ | ✅ | — |
|
||||
| `try_rust_webapp` | Rust webapp with packaging | ✅ | ✅ | Submodule |
|
||||
| `rust` | Rust webapp with full monitoring | ✅ | ✅ | — |
|
||||
| `rhob_application_monitoring` | RHOB monitoring setup | ✅ | ✅ | — |
|
||||
| `sttest` | Full OKD stack test | — | — | ✅ |
|
||||
| `application_monitoring_with_tenant` | App monitoring + tenant | — | ✅ | OKD cluster |
|
||||
| `kube-rs` | Direct kube-rs client usage | ✅ | ✅ | — |
|
||||
| `k8s_drain_node` | Drain a Kubernetes node | ✅ | ✅ | — |
|
||||
| `k8s_write_file_on_node` | Write files to K8s nodes | ✅ | ✅ | — |
|
||||
| `harmony_inventory_builder` | Discover hosts via subnet scan | ✅ | — | — |
|
||||
| `cli` | CLI tool with inventory discovery | ✅ | — | — |
|
||||
| `tui` | Terminal UI demonstration | ✅ | — | — |
|
||||
|
||||
## Status Legend
|
||||
|
||||
| Symbol | Meaning |
|
||||
|--------|---------|
|
||||
| ✅ | Works out-of-the-box (in the "Hardware Needed" column: dedicated hardware is required) |
|
||||
| — | Not applicable or requires specific setup |
|
||||
|
||||
## By Category
|
||||
|
||||
### Data Services
|
||||
- **`postgresql`** — Deploy a PostgreSQL cluster via CloudNativePG
|
||||
- **`multisite_postgres`** — Multi-site PostgreSQL with failover
|
||||
- **`public_postgres`** — Public-facing PostgreSQL (⚠️ uses NationTech DNS)
|
||||
|
||||
### Kubernetes Utilities
|
||||
- **`node_health`** — Check node health in a cluster
|
||||
- **`k8s_drain_node`** — Drain and reboot a node
|
||||
- **`k8s_write_file_on_node`** — Write files to nodes
|
||||
- **`validate_ceph_cluster_health`** — Verify Ceph/Rook cluster health
|
||||
- **`remove_rook_osd`** — Remove an OSD from Rook/Ceph
|
||||
- **`kube-rs`** — Direct Kubernetes client usage demo
|
||||
|
||||
### Monitoring & Alerting
|
||||
- **`monitoring`** — Deploy Prometheus alerting with Discord webhooks
|
||||
- **`monitoring_with_tenant`** — Monitoring with tenant isolation
|
||||
- **`ntfy`** — Deploy ntfy notification server
|
||||
- **`okd_cluster_alerts`** — OKD-specific cluster alerts
|
||||
|
||||
### Application Deployment
|
||||
- **`try_rust_webapp`** — Deploy a Rust webapp with packaging (⚠️ requires `tryrust.org` submodule)
|
||||
- **`rust`** — Rust webapp with full monitoring features
|
||||
- **`rhob_application_monitoring`** — Red Hat Observability Stack monitoring
|
||||
- **`lamp`** — LAMP stack deployment (⚠️ uses NationTech DNS)
|
||||
- **`application_monitoring_with_tenant`** — App monitoring with tenant isolation
|
||||
|
||||
### Infrastructure & Bare Metal
|
||||
- **`okd_installation`** — Full OKD cluster from scratch
|
||||
- **`okd_pxe`** — PXE boot configuration for OKD
|
||||
- **`sttest`** — Full OKD stack test with specific hardware
|
||||
- **`brocade_snmp_server`** — Configure SNMP on a Brocade switch
|
||||
- **`opnsense_node_exporter`** — Node exporter on OPNsense firewall
|
||||
|
||||
### Multi-Cluster
|
||||
- **`nats`** — NATS deployment on a cluster
|
||||
- **`nats-supercluster`** — NATS supercluster across multiple sites
|
||||
- **`multisite_postgres`** — PostgreSQL with multi-site failover
|
||||
|
||||
### Identity & Secrets
|
||||
- **`openbao`** — Deploy OpenBao vault (⚠️ uses NationTech DNS)
|
||||
- **`zitadel`** — Deploy Zitadel identity provider (⚠️ uses NationTech DNS)
|
||||
|
||||
### Cluster Services
|
||||
- **`cert_manager`** — Provision TLS certificates
|
||||
- **`tenant`** — Create a multi-tenant namespace
|
||||
- **`operatorhub_catalog`** — Install OperatorHub catalog sources
|
||||
|
||||
### Development & Testing
|
||||
- **`cli`** — CLI tool with inventory discovery
|
||||
- **`tui`** — Terminal UI demonstration
|
||||
- **`harmony_inventory_builder`** — Host discovery via subnet scan
|
||||
|
||||
## Running Examples
|
||||
|
||||
```bash
|
||||
# Build first
|
||||
cargo build --release
|
||||
|
||||
# Run any example
|
||||
cargo run -p example-postgresql
|
||||
cargo run -p example-ntfy
|
||||
cargo run -p example-tenant
|
||||
```
|
||||
|
||||
For examples that need an existing Kubernetes cluster:
|
||||
|
||||
```bash
|
||||
export KUBECONFIG=/path/to/your/kubeconfig
|
||||
export HARMONY_USE_LOCAL_K3D=false
|
||||
export HARMONY_AUTOINSTALL=false
|
||||
|
||||
cargo run -p example-monitoring
|
||||
```
|
||||
|
||||
## Notes on Private Infrastructure
|
||||
|
||||
Some examples use NationTech-hosted infrastructure by default (DNS domains like `*.nationtech.io`, `*.harmony.mcd`). These are not suitable for public use without modification. See the [Getting Started Guide](../docs/guides/getting-started.md) for the recommended public examples.
|
||||
@@ -1,6 +1,6 @@
|
||||
use std::str::FromStr;
|
||||
|
||||
use brocade::{BrocadeOptions, PortOperatingMode};
|
||||
use brocade::{BrocadeOptions, InterfaceConfig, InterfaceType, PortOperatingMode, SwitchInterface, VlanList};
|
||||
use harmony::{
|
||||
infra::brocade::BrocadeSwitchConfig,
|
||||
inventory::Inventory,
|
||||
@@ -9,6 +9,13 @@ use harmony::{
|
||||
use harmony_macros::ip;
|
||||
use harmony_types::{id::Id, switch::PortLocation};
|
||||
|
||||
fn tengig(stack: u8, slot: u8, port: u8) -> SwitchInterface {
|
||||
SwitchInterface::Ethernet(
|
||||
InterfaceType::TenGigabitEthernet,
|
||||
PortLocation(stack, slot, port),
|
||||
)
|
||||
}
|
||||
|
||||
fn get_switch_config() -> BrocadeSwitchConfig {
|
||||
let mut options = BrocadeOptions::default();
|
||||
options.ssh.port = 2222;
|
||||
@@ -33,9 +40,27 @@ async fn main() {
|
||||
Id::from_str("18").unwrap(),
|
||||
],
|
||||
ports_to_configure: vec![
|
||||
(PortLocation(2, 0, 17), PortOperatingMode::Trunk),
|
||||
(PortLocation(2, 0, 19), PortOperatingMode::Trunk),
|
||||
(PortLocation(1, 0, 18), PortOperatingMode::Trunk),
|
||||
InterfaceConfig {
|
||||
interface: tengig(2, 0, 17),
|
||||
mode: PortOperatingMode::Trunk,
|
||||
access_vlan: None,
|
||||
trunk_vlans: Some(VlanList::All),
|
||||
speed: None,
|
||||
},
|
||||
InterfaceConfig {
|
||||
interface: tengig(2, 0, 19),
|
||||
mode: PortOperatingMode::Trunk,
|
||||
access_vlan: None,
|
||||
trunk_vlans: Some(VlanList::All),
|
||||
speed: None,
|
||||
},
|
||||
InterfaceConfig {
|
||||
interface: tengig(1, 0, 18),
|
||||
mode: PortOperatingMode::Trunk,
|
||||
access_vlan: None,
|
||||
trunk_vlans: Some(VlanList::All),
|
||||
speed: None,
|
||||
},
|
||||
],
|
||||
};
|
||||
|
||||
|
||||
18
examples/brocade_switch_configuration/Cargo.toml
Normal file
18
examples/brocade_switch_configuration/Cargo.toml
Normal file
@@ -0,0 +1,18 @@
|
||||
[package]
|
||||
name = "brocade-switch-configuration"
|
||||
edition = "2024"
|
||||
version.workspace = true
|
||||
readme.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
harmony = { path = "../../harmony" }
|
||||
harmony_cli = { path = "../../harmony_cli" }
|
||||
harmony_macros = { path = "../../harmony_macros" }
|
||||
harmony_types = { path = "../../harmony_types" }
|
||||
tokio.workspace = true
|
||||
async-trait.workspace = true
|
||||
serde.workspace = true
|
||||
log.workspace = true
|
||||
env_logger.workspace = true
|
||||
brocade = { path = "../../brocade" }
|
||||
4
examples/brocade_switch_configuration/env.sh
Normal file
4
examples/brocade_switch_configuration/env.sh
Normal file
@@ -0,0 +1,4 @@
|
||||
export HARMONY_SECRET_NAMESPACE=brocade-example
|
||||
export HARMONY_SECRET_STORE=file
|
||||
export HARMONY_DATABASE_URL=sqlite://harmony_brocade_example.sqlite
|
||||
export RUST_LOG=info
|
||||
143
examples/brocade_switch_configuration/src/main.rs
Normal file
143
examples/brocade_switch_configuration/src/main.rs
Normal file
@@ -0,0 +1,143 @@
|
||||
use brocade::{
|
||||
BrocadeOptions, InterfaceConfig, InterfaceSpeed, InterfaceType, PortChannelConfig,
|
||||
PortOperatingMode, SwitchInterface, Vlan, VlanList,
|
||||
};
|
||||
use harmony::{
|
||||
infra::brocade::BrocadeSwitchConfig,
|
||||
inventory::Inventory,
|
||||
modules::brocade::{BrocadeSwitchAuth, BrocadeSwitchConfigurationScore, SwitchTopology},
|
||||
};
|
||||
use harmony_macros::ip;
|
||||
use harmony_types::switch::PortLocation;
|
||||
|
||||
fn tengig(stack: u8, slot: u8, port: u8) -> SwitchInterface {
|
||||
SwitchInterface::Ethernet(
|
||||
InterfaceType::TenGigabitEthernet,
|
||||
PortLocation(stack, slot, port),
|
||||
)
|
||||
}
|
||||
|
||||
fn get_switch_config() -> BrocadeSwitchConfig {
|
||||
let auth = BrocadeSwitchAuth {
|
||||
username: "admin".to_string(),
|
||||
password: "password".to_string(),
|
||||
};
|
||||
|
||||
BrocadeSwitchConfig {
|
||||
ips: vec![ip!("192.168.12.147"), ip!("192.168.12.109")],
|
||||
auth,
|
||||
options: BrocadeOptions {
|
||||
dry_run: false,
|
||||
ssh: brocade::ssh::SshOptions {
|
||||
port: 22,
|
||||
..Default::default()
|
||||
},
|
||||
..Default::default()
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() {
|
||||
harmony_cli::cli_logger::init();
|
||||
|
||||
// ===================================================
|
||||
// Step 1: Define VLANs once, use them everywhere
|
||||
// ===================================================
|
||||
let mgmt = Vlan {
|
||||
id: 100,
|
||||
name: "MGMT".to_string(),
|
||||
};
|
||||
let data = Vlan {
|
||||
id: 200,
|
||||
name: "DATA".to_string(),
|
||||
};
|
||||
let storage = Vlan {
|
||||
id: 300,
|
||||
name: "STORAGE".to_string(),
|
||||
};
|
||||
let backup = Vlan {
|
||||
id: 400,
|
||||
name: "BACKUP".to_string(),
|
||||
};
|
||||
|
||||
// ===================================================
|
||||
// Step 2: Build the score
|
||||
// ===================================================
|
||||
let score = BrocadeSwitchConfigurationScore {
|
||||
// All VLANs that need to exist on the switch
|
||||
vlans: vec![mgmt.clone(), data.clone(), storage.clone(), backup.clone()],
|
||||
|
||||
// Standalone interfaces (not part of any port-channel)
|
||||
interfaces: vec![
|
||||
// Trunk port with ALL VLANs, forced to 10Gbps
|
||||
InterfaceConfig {
|
||||
interface: tengig(1, 0, 1),
|
||||
mode: PortOperatingMode::Trunk,
|
||||
access_vlan: None,
|
||||
trunk_vlans: Some(VlanList::All),
|
||||
speed: Some(InterfaceSpeed::Gbps10),
|
||||
},
|
||||
// Trunk port with specific VLANs (MGMT + DATA only)
|
||||
InterfaceConfig {
|
||||
interface: tengig(1, 0, 2),
|
||||
mode: PortOperatingMode::Trunk,
|
||||
access_vlan: None,
|
||||
trunk_vlans: Some(VlanList::Specific(vec![mgmt.clone(), data.clone()])),
|
||||
speed: None,
|
||||
},
|
||||
// Access port on the MGMT VLAN
|
||||
InterfaceConfig {
|
||||
interface: tengig(1, 0, 3),
|
||||
mode: PortOperatingMode::Access,
|
||||
access_vlan: Some(mgmt.id),
|
||||
trunk_vlans: None,
|
||||
speed: None,
|
||||
},
|
||||
// Access port on the STORAGE VLAN
|
||||
InterfaceConfig {
|
||||
interface: tengig(1, 0, 4),
|
||||
mode: PortOperatingMode::Access,
|
||||
access_vlan: Some(storage.id),
|
||||
trunk_vlans: None,
|
||||
speed: None,
|
||||
},
|
||||
],
|
||||
|
||||
// Port-channels: member ports are bundled, L2 config goes on the port-channel
|
||||
port_channels: vec![
|
||||
// Port-channel 1: trunk with DATA + STORAGE VLANs, forced to 1Gbps
|
||||
PortChannelConfig {
|
||||
id: 1,
|
||||
name: "SERVER_BOND".to_string(),
|
||||
ports: vec![PortLocation(1, 0, 5), PortLocation(1, 0, 6)],
|
||||
mode: PortOperatingMode::Trunk,
|
||||
access_vlan: None,
|
||||
trunk_vlans: Some(VlanList::Specific(vec![data.clone(), storage.clone()])),
|
||||
speed: Some(InterfaceSpeed::Gbps1),
|
||||
},
|
||||
// Port-channel 2: trunk with all VLANs, default speed
|
||||
PortChannelConfig {
|
||||
id: 2,
|
||||
name: "BACKUP_BOND".to_string(),
|
||||
ports: vec![PortLocation(1, 0, 7), PortLocation(1, 0, 8)],
|
||||
mode: PortOperatingMode::Trunk,
|
||||
access_vlan: None,
|
||||
trunk_vlans: Some(VlanList::All),
|
||||
speed: None,
|
||||
},
|
||||
],
|
||||
};
|
||||
|
||||
// ===================================================
|
||||
// Step 3: Run
|
||||
// ===================================================
|
||||
harmony_cli::run(
|
||||
Inventory::autoload(),
|
||||
SwitchTopology::new(get_switch_config()).await,
|
||||
vec![Box::new(score)],
|
||||
None,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
@@ -1,8 +1,8 @@
|
||||
use harmony::{
|
||||
inventory::Inventory,
|
||||
modules::cert_manager::{
|
||||
capability::CertificateManagementConfig, score_cert_management::CertificateManagementScore,
|
||||
score_certificate::CertificateScore, score_issuer::CertificateIssuerScore,
|
||||
capability::CertificateManagementConfig, score_certificate::CertificateScore,
|
||||
score_issuer::CertificateIssuerScore,
|
||||
},
|
||||
topology::K8sAnywhereTopology,
|
||||
};
|
||||
|
||||
15
examples/example_linux_vm/Cargo.toml
Normal file
15
examples/example_linux_vm/Cargo.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
[package]
|
||||
name = "example_linux_vm"
|
||||
version.workspace = true
|
||||
edition = "2024"
|
||||
license.workspace = true
|
||||
|
||||
[[bin]]
|
||||
name = "example_linux_vm"
|
||||
path = "src/main.rs"
|
||||
|
||||
[dependencies]
|
||||
harmony = { path = "../../harmony" }
|
||||
tokio.workspace = true
|
||||
log.workspace = true
|
||||
env_logger.workspace = true
|
||||
43
examples/example_linux_vm/README.md
Normal file
43
examples/example_linux_vm/README.md
Normal file
@@ -0,0 +1,43 @@
|
||||
# Example: Linux VM from ISO
|
||||
|
||||
This example deploys a simple Linux virtual machine from an ISO URL.
|
||||
|
||||
## What it creates
|
||||
|
||||
- One isolated virtual network (`linuxvm-net`, 192.168.101.0/24)
|
||||
- One Ubuntu Server VM with the ISO attached as a CD-ROM
|
||||
- The VM is configured to boot from the CD-ROM first, allowing installation
|
||||
- After installation, the VM can be rebooted to boot from disk
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- A running KVM hypervisor (local or remote)
|
||||
- `HARMONY_KVM_URI` environment variable pointing to the hypervisor (defaults to `qemu:///system`)
|
||||
- `HARMONY_KVM_IMAGE_DIR` environment variable for storing VM images (defaults to harmony data dir)
|
||||
|
||||
## Usage
|
||||
|
||||
```bash
|
||||
cargo run -p example_linux_vm
|
||||
```
|
||||
|
||||
## After deployment
|
||||
|
||||
Once the VM is running, you can connect to its console:
|
||||
|
||||
```bash
|
||||
virsh -c qemu:///system console linux-vm
|
||||
```
|
||||
|
||||
To access the VM via SSH after installation, you'll need to configure a bridged network or port forwarding.
|
||||
|
||||
## Clean up
|
||||
|
||||
To remove the VM and network:
|
||||
|
||||
```bash
|
||||
virsh -c qemu:///system destroy linux-vm
|
||||
virsh -c qemu:///system undefine linux-vm
|
||||
virsh -c qemu:///system net-destroy linuxvm-net
|
||||
virsh -c qemu:///system net-undefine linuxvm-net
|
||||
```
|
||||
63
examples/example_linux_vm/src/main.rs
Normal file
63
examples/example_linux_vm/src/main.rs
Normal file
@@ -0,0 +1,63 @@
|
||||
use harmony::modules::kvm::config::init_executor;
|
||||
use harmony::modules::kvm::{BootDevice, NetworkConfig, NetworkRef, VmConfig};
|
||||
use log::info;
|
||||
|
||||
const NETWORK_NAME: &str = "linuxvm-net";
|
||||
const NETWORK_GATEWAY: &str = "192.168.101.1";
|
||||
const NETWORK_PREFIX: u8 = 24;
|
||||
|
||||
const UBUNTU_ISO_URL: &str =
|
||||
"https://releases.ubuntu.com/24.04/ubuntu-24.04.3-live-server-amd64.iso";
|
||||
|
||||
pub async fn deploy_linux_vm() -> Result<(), String> {
|
||||
let executor = init_executor().map_err(|e| format!("KVM initialization failed: {e}"))?;
|
||||
|
||||
let network = NetworkConfig::builder(NETWORK_NAME)
|
||||
.bridge("virbr101")
|
||||
.subnet(NETWORK_GATEWAY, NETWORK_PREFIX)
|
||||
.build();
|
||||
|
||||
info!("Ensuring network '{NETWORK_NAME}' ({NETWORK_GATEWAY}/{NETWORK_PREFIX}) exists");
|
||||
executor
|
||||
.ensure_network(network)
|
||||
.await
|
||||
.map_err(|e| format!("Network setup failed: {e}"))?;
|
||||
|
||||
let vm = linux_vm();
|
||||
info!("Defining Linux VM '{}'", vm.name);
|
||||
executor
|
||||
.ensure_vm(vm.clone())
|
||||
.await
|
||||
.map_err(|e| format!("Linux VM setup failed: {e}"))?;
|
||||
|
||||
info!("Starting VM '{}'", vm.name);
|
||||
executor
|
||||
.start_vm(&vm.name)
|
||||
.await
|
||||
.map_err(|e| format!("Failed to start VM: {e}"))?;
|
||||
|
||||
info!(
|
||||
"Linux VM '{}' is running. \
|
||||
Connect to the console using: virsh -c qemu:///system console {}",
|
||||
vm.name, vm.name
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn linux_vm() -> VmConfig {
|
||||
VmConfig::builder("linux-vm")
|
||||
.vcpus(2)
|
||||
.memory_gb(4)
|
||||
.disk(20)
|
||||
.network(NetworkRef::named(NETWORK_NAME))
|
||||
.cdrom(UBUNTU_ISO_URL)
|
||||
.boot_order([BootDevice::Cdrom, BootDevice::Disk])
|
||||
.build()
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<(), String> {
|
||||
env_logger::init();
|
||||
deploy_linux_vm().await
|
||||
}
|
||||
25
examples/harmony_sso/Cargo.toml
Normal file
25
examples/harmony_sso/Cargo.toml
Normal file
@@ -0,0 +1,25 @@
|
||||
[package]
|
||||
name = "example-harmony-sso"
|
||||
edition = "2024"
|
||||
version.workspace = true
|
||||
readme.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
harmony = { path = "../../harmony" }
|
||||
harmony_cli = { path = "../../harmony_cli" }
|
||||
harmony_config = { path = "../../harmony_config" }
|
||||
harmony_macros = { path = "../../harmony_macros" }
|
||||
harmony_secret = { path = "../../harmony_secret" }
|
||||
harmony_types = { path = "../../harmony_types" }
|
||||
k3d-rs = { path = "../../k3d" }
|
||||
kube.workspace = true
|
||||
tokio.workspace = true
|
||||
url.workspace = true
|
||||
log.workspace = true
|
||||
env_logger.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
anyhow.workspace = true
|
||||
reqwest.workspace = true
|
||||
directories = "6.0.0"
|
||||
395
examples/harmony_sso/src/main.rs
Normal file
395
examples/harmony_sso/src/main.rs
Normal file
@@ -0,0 +1,395 @@
|
||||
use anyhow::Context;
|
||||
use harmony::inventory::Inventory;
|
||||
use harmony::modules::openbao::OpenbaoScore;
|
||||
use harmony::score::Score;
|
||||
use harmony::topology::Topology;
|
||||
use k3d_rs::{K3d, PortMapping};
|
||||
use log::info;
|
||||
use serde::Deserialize;
|
||||
use serde::Serialize;
|
||||
use std::path::PathBuf;
|
||||
use std::process::Command;
|
||||
|
||||
const CLUSTER_NAME: &str = "harmony-example";
|
||||
const ZITADEL_HOST: &str = "sso.harmony.local";
|
||||
const OPENBAO_HOST: &str = "bao.harmony.local";
|
||||
|
||||
const ZITADEL_PORT: u32 = 8080;
|
||||
const OPENBAO_PORT: u32 = 8200;
|
||||
|
||||
fn get_k3d_binary_path() -> PathBuf {
|
||||
directories::BaseDirs::new()
|
||||
.map(|dirs| dirs.data_dir().join("harmony").join("k3d"))
|
||||
.unwrap_or_else(|| PathBuf::from("/tmp/harmony-k3d"))
|
||||
}
|
||||
|
||||
fn get_openbao_data_path() -> PathBuf {
|
||||
directories::BaseDirs::new()
|
||||
.map(|dirs| dirs.data_dir().join("harmony").join("openbao"))
|
||||
.unwrap_or_else(|| PathBuf::from("/tmp/harmony-openbao"))
|
||||
}
|
||||
|
||||
async fn ensure_k3d_cluster() -> anyhow::Result<()> {
|
||||
let base_dir = get_k3d_binary_path();
|
||||
std::fs::create_dir_all(&base_dir).context("Failed to create k3d data directory")?;
|
||||
|
||||
info!(
|
||||
"Ensuring k3d cluster '{}' is running with port mappings",
|
||||
CLUSTER_NAME
|
||||
);
|
||||
|
||||
let k3d = K3d::new(base_dir.clone(), Some(CLUSTER_NAME.to_string())).with_port_mappings(vec![
|
||||
PortMapping::new(ZITADEL_PORT, 80),
|
||||
PortMapping::new(OPENBAO_PORT, 8200),
|
||||
]);
|
||||
|
||||
k3d.ensure_installed()
|
||||
.await
|
||||
.map_err(|e| anyhow::anyhow!("Failed to ensure k3d installed: {}", e))?;
|
||||
|
||||
info!("k3d cluster '{}' is ready", CLUSTER_NAME);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Builds a `K8sAnywhereTopology` from the environment after overriding three
/// `HARMONY_*` variables so that it targets the `k3d-harmony-example` context
/// with local-k3d handling and auto-install switched off.
fn create_topology() -> harmony::topology::K8sAnywhereTopology {
    let overrides = [
        ("HARMONY_USE_LOCAL_K3D", "false"),
        ("HARMONY_AUTOINSTALL", "false"),
        ("HARMONY_K8S_CONTEXT", "k3d-harmony-example"),
    ];

    for (key, value) in overrides {
        // SAFETY: NOTE(review) — `set_var` is only sound while no other thread
        // reads or writes the process environment. This function is called
        // from the `#[tokio::main]` entry point; confirm nothing else touches
        // the environment concurrently at that point.
        unsafe {
            std::env::set_var(key, value);
        }
    }

    harmony::topology::K8sAnywhereTopology::from_env()
}
|
||||
|
||||
async fn cleanup_openbao_webhook() -> anyhow::Result<()> {
|
||||
let output = Command::new("kubectl")
|
||||
.args([
|
||||
"--context",
|
||||
"k3d-harmony-example",
|
||||
"get",
|
||||
"mutatingwebhookconfigurations",
|
||||
])
|
||||
.output()
|
||||
.context("Failed to check webhooks")?;
|
||||
|
||||
if String::from_utf8_lossy(&output.stdout).contains("openbao-agent-injector-cfg") {
|
||||
info!("Deleting conflicting OpenBao webhook...");
|
||||
let _ = Command::new("kubectl")
|
||||
.args([
|
||||
"--context",
|
||||
"k3d-harmony-example",
|
||||
"delete",
|
||||
"mutatingwebhookconfiguration",
|
||||
"openbao-agent-injector-cfg",
|
||||
"--ignore-not-found=true",
|
||||
])
|
||||
.output();
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn deploy_openbao(topology: &harmony::topology::K8sAnywhereTopology) -> anyhow::Result<()> {
|
||||
info!("Deploying OpenBao...");
|
||||
|
||||
let openbao = OpenbaoScore {
|
||||
host: OPENBAO_HOST.to_string(),
|
||||
openshift: false,
|
||||
};
|
||||
|
||||
let inventory = Inventory::autoload();
|
||||
openbao
|
||||
.interpret(&inventory, topology)
|
||||
.await
|
||||
.context("OpenBao deployment failed")?;
|
||||
|
||||
info!("OpenBao deployed successfully");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn wait_for_openbao_running() -> anyhow::Result<()> {
|
||||
info!("Waiting for OpenBao pods to be running...");
|
||||
|
||||
let output = Command::new("kubectl")
|
||||
.args([
|
||||
"--context",
|
||||
"k3d-harmony-example",
|
||||
"wait",
|
||||
"-n",
|
||||
"openbao",
|
||||
"--for=condition=podinitialized",
|
||||
"pod/openbao-0",
|
||||
"--timeout=120s",
|
||||
])
|
||||
.output()
|
||||
.context("Failed to wait for OpenBao pod")?;
|
||||
|
||||
if !output.status.success() {
|
||||
let stderr = String::from_utf8_lossy(&output.stderr);
|
||||
info!(
|
||||
"Pod initialized wait failed, trying alternative approach: {}",
|
||||
stderr
|
||||
);
|
||||
}
|
||||
|
||||
tokio::time::sleep(tokio::time::Duration::from_secs(5)).await;
|
||||
|
||||
info!("OpenBao pod is running (may be sealed)");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
struct OpenBaoInitOutput {
|
||||
#[serde(rename = "unseal_keys_b64")]
|
||||
keys: Vec<String>,
|
||||
#[serde(rename = "root_token")]
|
||||
root_token: String,
|
||||
}
|
||||
|
||||
async fn init_openbao() -> anyhow::Result<String> {
|
||||
let data_path = get_openbao_data_path();
|
||||
std::fs::create_dir_all(&data_path).context("Failed to create openbao data directory")?;
|
||||
|
||||
let keys_file = data_path.join("unseal-keys.json");
|
||||
|
||||
if keys_file.exists() {
|
||||
info!("OpenBao already initialized, loading existing keys");
|
||||
let content = std::fs::read_to_string(&keys_file)?;
|
||||
let init_output: OpenBaoInitOutput = serde_json::from_str(&content)?;
|
||||
return Ok(init_output.root_token);
|
||||
}
|
||||
|
||||
info!("Initializing OpenBao...");
|
||||
|
||||
let output = Command::new("kubectl")
|
||||
.args([
|
||||
"--context",
|
||||
"k3d-harmony-example",
|
||||
"exec",
|
||||
"-n",
|
||||
"openbao",
|
||||
"openbao-0",
|
||||
"--",
|
||||
"bao",
|
||||
"operator",
|
||||
"init",
|
||||
"-format=json",
|
||||
])
|
||||
.output()
|
||||
.context("Failed to initialize OpenBao")?;
|
||||
|
||||
let stderr = String::from_utf8_lossy(&output.stderr);
|
||||
let stdout = String::from_utf8_lossy(&output.stdout);
|
||||
|
||||
if stderr.contains("already initialized") {
|
||||
info!("OpenBao is already initialized");
|
||||
return Err(anyhow::anyhow!(
|
||||
"OpenBao is already initialized but no keys file found. \
|
||||
Please delete the cluster and try again: k3d cluster delete harmony-example"
|
||||
));
|
||||
}
|
||||
|
||||
if !output.status.success() {
|
||||
return Err(anyhow::anyhow!(
|
||||
"OpenBao init failed with status {}: {}",
|
||||
output.status,
|
||||
stderr
|
||||
));
|
||||
}
|
||||
|
||||
if stdout.trim().is_empty() {
|
||||
return Err(anyhow::anyhow!(
|
||||
"OpenBao init returned empty output. stderr: {}",
|
||||
stderr
|
||||
));
|
||||
}
|
||||
|
||||
let init_output: OpenBaoInitOutput = serde_json::from_str(&stdout)?;
|
||||
|
||||
std::fs::write(&keys_file, serde_json::to_string_pretty(&init_output)?)?;
|
||||
|
||||
info!("OpenBao initialized successfully");
|
||||
info!("Unseal keys saved to {:?}", keys_file);
|
||||
|
||||
Ok(init_output.root_token)
|
||||
}
|
||||
|
||||
async fn unseal_openbao(root_token: &str) -> anyhow::Result<()> {
|
||||
info!("Unsealing OpenBao...");
|
||||
|
||||
let status_output = Command::new("kubectl")
|
||||
.args([
|
||||
"--context",
|
||||
"k3d-harmony-example",
|
||||
"exec",
|
||||
"-n",
|
||||
"openbao",
|
||||
"openbao-0",
|
||||
"--",
|
||||
"bao",
|
||||
"status",
|
||||
"-format=json",
|
||||
])
|
||||
.output()
|
||||
.context("Failed to get OpenBao status")?;
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct StatusOutput {
|
||||
sealed: bool,
|
||||
}
|
||||
|
||||
if status_output.status.success() {
|
||||
if let Ok(status) =
|
||||
serde_json::from_str::<StatusOutput>(&String::from_utf8_lossy(&status_output.stdout))
|
||||
{
|
||||
if !status.sealed {
|
||||
info!("OpenBao is already unsealed");
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let data_path = get_openbao_data_path();
|
||||
let keys_file = data_path.join("unseal-keys.json");
|
||||
|
||||
let content = std::fs::read_to_string(&keys_file)?;
|
||||
let init_output: OpenBaoInitOutput = serde_json::from_str(&content)?;
|
||||
|
||||
for key in &init_output.keys[0..3] {
|
||||
let output = Command::new("kubectl")
|
||||
.args([
|
||||
"--context",
|
||||
"k3d-harmony-example",
|
||||
"exec",
|
||||
"-n",
|
||||
"openbao",
|
||||
"openbao-0",
|
||||
"--",
|
||||
"bao",
|
||||
"operator",
|
||||
"unseal",
|
||||
key,
|
||||
])
|
||||
.output()
|
||||
.context("Failed to unseal OpenBao")?;
|
||||
|
||||
if !output.status.success() {
|
||||
return Err(anyhow::anyhow!(
|
||||
"Unseal failed: {}",
|
||||
String::from_utf8_lossy(&output.stderr)
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
info!("OpenBao unsealed successfully");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn run_bao_command(root_token: &str, args: &[&str]) -> anyhow::Result<String> {
|
||||
let command = args.join(" ");
|
||||
let shell_command = format!("VAULT_TOKEN={} {}", root_token, command);
|
||||
|
||||
let output = Command::new("kubectl")
|
||||
.args([
|
||||
"--context",
|
||||
"k3d-harmony-example",
|
||||
"exec",
|
||||
"-n",
|
||||
"openbao",
|
||||
"openbao-0",
|
||||
"--",
|
||||
"sh",
|
||||
"-c",
|
||||
&shell_command,
|
||||
])
|
||||
.output()
|
||||
.context("Failed to run bao command")?;
|
||||
|
||||
let stdout = String::from_utf8_lossy(&output.stdout);
|
||||
let stderr = String::from_utf8_lossy(&output.stderr);
|
||||
|
||||
if !output.status.success() {
|
||||
return Err(anyhow::anyhow!("bao command failed: {}", stderr));
|
||||
}
|
||||
|
||||
Ok(stdout.to_string())
|
||||
}
|
||||
|
||||
async fn configure_openbao_admin_user(root_token: &str) -> anyhow::Result<()> {
|
||||
info!("Configuring OpenBao with userpass auth...");
|
||||
|
||||
let _ = run_bao_command(root_token, &["bao", "auth", "enable", "userpass"]).await;
|
||||
let _ = run_bao_command(
|
||||
root_token,
|
||||
&["bao", "secrets", "enable", "-path=secret", "kv-v2"],
|
||||
)
|
||||
.await;
|
||||
|
||||
run_bao_command(
|
||||
root_token,
|
||||
&[
|
||||
"bao",
|
||||
"write",
|
||||
"auth/userpass/users/harmony",
|
||||
"password=harmony-dev-password",
|
||||
"policies=default",
|
||||
],
|
||||
)
|
||||
.await?;
|
||||
|
||||
info!("OpenBao configured with userpass auth");
|
||||
info!(" Username: harmony");
|
||||
info!(" Password: harmony-dev-password");
|
||||
info!(" Root token: {}", root_token);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info")).init();
|
||||
|
||||
info!("===========================================");
|
||||
info!("Harmony SSO Example");
|
||||
info!("Deploys Zitadel + OpenBao on k3d");
|
||||
info!("===========================================");
|
||||
|
||||
ensure_k3d_cluster().await?;
|
||||
|
||||
info!("===========================================");
|
||||
info!("Cluster '{}' is ready", CLUSTER_NAME);
|
||||
info!(
|
||||
"Zitadel will be available at: http://{}:{}",
|
||||
ZITADEL_HOST, ZITADEL_PORT
|
||||
);
|
||||
info!(
|
||||
"OpenBao will be available at: http://{}:{}",
|
||||
OPENBAO_HOST, OPENBAO_PORT
|
||||
);
|
||||
info!("===========================================");
|
||||
|
||||
let topology = create_topology();
|
||||
topology
|
||||
.ensure_ready()
|
||||
.await
|
||||
.context("Failed to initialize topology")?;
|
||||
|
||||
cleanup_openbao_webhook().await?;
|
||||
deploy_openbao(&topology).await?;
|
||||
wait_for_openbao_running().await?;
|
||||
|
||||
let root_token = init_openbao().await?;
|
||||
unseal_openbao(&root_token).await?;
|
||||
configure_openbao_admin_user(&root_token).await?;
|
||||
|
||||
info!("===========================================");
|
||||
info!("OpenBao initialized and configured!");
|
||||
info!("===========================================");
|
||||
info!("Zitadel: http://{}:{}", ZITADEL_HOST, ZITADEL_PORT);
|
||||
info!("OpenBao: http://{}:{}", OPENBAO_HOST, OPENBAO_PORT);
|
||||
info!("===========================================");
|
||||
info!("OpenBao credentials:");
|
||||
info!(" Username: harmony");
|
||||
info!(" Password: harmony-dev-password");
|
||||
info!("===========================================");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -10,9 +10,10 @@ publish = false
|
||||
harmony = { path = "../../harmony" }
|
||||
harmony_cli = { path = "../../harmony_cli" }
|
||||
harmony_types = { path = "../../harmony_types" }
|
||||
harmony_macros = { path = "../../harmony_macros" }
|
||||
harmony-k8s = { path = "../../harmony-k8s" }
|
||||
cidr.workspace = true
|
||||
tokio.workspace = true
|
||||
harmony_macros = { path = "../../harmony_macros" }
|
||||
log.workspace = true
|
||||
env_logger.workspace = true
|
||||
url.workspace = true
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use std::time::Duration;
|
||||
|
||||
use harmony::topology::k8s::{DrainOptions, K8sClient};
|
||||
use harmony_k8s::{DrainOptions, K8sClient};
|
||||
use log::{info, trace};
|
||||
|
||||
#[tokio::main]
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user