feat/harmony_agent #220

Merged
wjro merged 21 commits from feat/harmony_agent into master 2026-02-04 21:05:35 +00:00
40 changed files with 4744 additions and 125 deletions

View File

@@ -1,2 +1,6 @@
target/ target/
Dockerfile Dockerfile
.git
data
target
demos

2
.gitignore vendored
View File

@@ -24,3 +24,5 @@ Cargo.lock
# MSVC Windows builds of rustc generate these, which store debugging information # MSVC Windows builds of rustc generate these, which store debugging information
*.pdb *.pdb
.harmony_generated

218
Cargo.lock generated
View File

@@ -243,7 +243,7 @@ checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75"
dependencies = [ dependencies = [
"cfg-if", "cfg-if",
"const-random", "const-random",
"getrandom 0.3.3", "getrandom 0.3.4",
"once_cell", "once_cell",
"version_check", "version_check",
"zerocopy", "zerocopy",
@@ -450,6 +450,43 @@ dependencies = [
"pin-project-lite", "pin-project-lite",
] ]
[[package]]
name = "async-nats"
version = "0.45.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "86dde77d8a733a9dbaf865a9eb65c72e09c88f3d14d3dd0d2aecf511920ee4fe"
dependencies = [
"base64 0.22.1",
"bytes",
"futures-util",
"memchr",
"nkeys",
"nuid",
"once_cell",
"pin-project",
"portable-atomic",
"rand 0.8.5",
"regex",
"ring",
"rustls-native-certs 0.7.3",
"rustls-pemfile 2.2.0",
"rustls-webpki 0.102.8",
"serde",
"serde_json",
"serde_nanos",
"serde_repr",
"thiserror 1.0.69",
"time",
"tokio",
"tokio-rustls 0.26.2",
"tokio-stream",
"tokio-util",
"tokio-websockets",
"tracing",
"tryhard",
"url",
]
[[package]] [[package]]
name = "async-stream" name = "async-stream"
version = "0.3.6" version = "0.3.6"
@@ -775,6 +812,9 @@ name = "bytes"
version = "1.10.1" version = "1.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a" checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a"
dependencies = [
"serde",
]
[[package]] [[package]]
name = "bytestring" name = "bytestring"
@@ -1583,6 +1623,7 @@ dependencies = [
"rand_core 0.6.4", "rand_core 0.6.4",
"serde", "serde",
"sha2", "sha2",
"signature",
"subtle", "subtle",
"zeroize", "zeroize",
] ]
@@ -2456,21 +2497,21 @@ dependencies = [
"cfg-if", "cfg-if",
"js-sys", "js-sys",
"libc", "libc",
"wasi 0.11.1+wasi-snapshot-preview1", "wasi",
"wasm-bindgen", "wasm-bindgen",
] ]
[[package]] [[package]]
name = "getrandom" name = "getrandom"
version = "0.3.3" version = "0.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4" checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd"
dependencies = [ dependencies = [
"cfg-if", "cfg-if",
"js-sys", "js-sys",
"libc", "libc",
"r-efi", "r-efi",
"wasi 0.14.3+wasi-0.2.4", "wasip2",
"wasm-bindgen", "wasm-bindgen",
] ]
@@ -2572,6 +2613,7 @@ dependencies = [
"env_logger", "env_logger",
"fqdn", "fqdn",
"futures-util", "futures-util",
"harmony_execution",
"harmony_inventory_agent", "harmony_inventory_agent",
"harmony_macros", "harmony_macros",
"harmony_secret", "harmony_secret",
@@ -2619,6 +2661,43 @@ dependencies = [
"walkdir", "walkdir",
] ]
[[package]]
name = "harmony_agent"
version = "0.1.0"
dependencies = [
"async-nats",
"async-trait",
"cidr",
"env_logger",
"getrandom 0.3.4",
"harmony",
"harmony_macros",
"harmony_types",
"log",
"pretty_assertions",
"serde",
"serde_json",
"thiserror 2.0.16",
"tokio",
]
[[package]]
name = "harmony_agent_deploy"
version = "0.1.0"
dependencies = [
"cidr",
"env_logger",
"harmony",
"harmony_cli",
"harmony_macros",
"harmony_types",
"log",
"serde",
"serde_json",
"tokio",
"url",
]
[[package]] [[package]]
name = "harmony_cli" name = "harmony_cli"
version = "0.1.0" version = "0.1.0"
@@ -2659,6 +2738,16 @@ dependencies = [
"tokio", "tokio",
] ]
[[package]]
name = "harmony_execution"
version = "0.1.0"
dependencies = [
"directories",
"lazy_static",
"log",
"thiserror 2.0.16",
]
[[package]] [[package]]
name = "harmony_inventory_agent" name = "harmony_inventory_agent"
version = "0.1.0" version = "0.1.0"
@@ -3523,7 +3612,7 @@ version = "0.1.34"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33" checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33"
dependencies = [ dependencies = [
"getrandom 0.3.3", "getrandom 0.3.4",
"libc", "libc",
] ]
@@ -3963,7 +4052,7 @@ checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c"
dependencies = [ dependencies = [
"libc", "libc",
"log", "log",
"wasi 0.11.1+wasi-snapshot-preview1", "wasi",
"windows-sys 0.48.0", "windows-sys 0.48.0",
] ]
@@ -3975,7 +4064,7 @@ checksum = "78bed444cc8a2160f01cbcf811ef18cac863ad68ae8ca62092e8db51d51c761c"
dependencies = [ dependencies = [
"libc", "libc",
"log", "log",
"wasi 0.11.1+wasi-snapshot-preview1", "wasi",
"windows-sys 0.59.0", "windows-sys 0.59.0",
] ]
@@ -4022,6 +4111,21 @@ dependencies = [
"unicode-segmentation", "unicode-segmentation",
] ]
[[package]]
name = "nkeys"
version = "0.4.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "879011babc47a1c7fdf5a935ae3cfe94f34645ca0cac1c7f6424b36fc743d1bf"
dependencies = [
"data-encoding",
"ed25519",
"ed25519-dalek",
"getrandom 0.2.16",
"log",
"rand 0.8.5",
"signatory",
]
[[package]] [[package]]
name = "non-blank-string-rs" name = "non-blank-string-rs"
version = "1.0.4" version = "1.0.4"
@@ -4040,6 +4144,15 @@ dependencies = [
"winapi 0.3.9", "winapi 0.3.9",
] ]
[[package]]
name = "nuid"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fc895af95856f929163a0aa20c26a78d26bfdc839f51b9d5aa7a5b79e52b7e83"
dependencies = [
"rand 0.8.5",
]
[[package]] [[package]]
name = "num-bigint" name = "num-bigint"
version = "0.4.6" version = "0.4.6"
@@ -4660,7 +4773,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f1906b49b0c3bc04b5fe5d86a77925ae6524a19b816ae38ce1e426255f1d8a31" checksum = "f1906b49b0c3bc04b5fe5d86a77925ae6524a19b816ae38ce1e426255f1d8a31"
dependencies = [ dependencies = [
"bytes", "bytes",
"getrandom 0.3.3", "getrandom 0.3.4",
"lru-slab", "lru-slab",
"rand 0.9.2", "rand 0.9.2",
"ring", "ring",
@@ -4765,7 +4878,7 @@ version = "0.9.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38" checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38"
dependencies = [ dependencies = [
"getrandom 0.3.3", "getrandom 0.3.4",
] ]
[[package]] [[package]]
@@ -5301,6 +5414,16 @@ dependencies = [
"untrusted", "untrusted",
] ]
[[package]]
name = "rustls-webpki"
version = "0.102.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "64ca1bc8749bd4cf37b5ce386cc146580777b4e8572c7b97baf22c83f444bee9"
dependencies = [
"rustls-pki-types",
"untrusted",
]
[[package]] [[package]]
name = "rustls-webpki" name = "rustls-webpki"
version = "0.103.4" version = "0.103.4"
@@ -5564,6 +5687,15 @@ dependencies = [
"serde", "serde",
] ]
[[package]]
name = "serde_nanos"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a93142f0367a4cc53ae0fead1bcda39e85beccfad3dcd717656cacab94b12985"
dependencies = [
"serde",
]
[[package]] [[package]]
name = "serde_path_to_error" name = "serde_path_to_error"
version = "0.1.17" version = "0.1.17"
@@ -5731,6 +5863,18 @@ dependencies = [
"libc", "libc",
] ]
[[package]]
name = "signatory"
version = "0.27.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c1e303f8205714074f6068773f0e29527e0453937fe837c9717d066635b65f31"
dependencies = [
"pkcs8",
"rand_core 0.6.4",
"signature",
"zeroize",
]
[[package]] [[package]]
name = "signature" name = "signature"
version = "2.2.0" version = "2.2.0"
@@ -6314,7 +6458,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "15b61f8f20e3a6f7e0649d825294eaf317edce30f82cf6026e7e4cb9222a7d1e" checksum = "15b61f8f20e3a6f7e0649d825294eaf317edce30f82cf6026e7e4cb9222a7d1e"
dependencies = [ dependencies = [
"fastrand", "fastrand",
"getrandom 0.3.3", "getrandom 0.3.4",
"once_cell", "once_cell",
"rustix 1.0.8", "rustix 1.0.8",
"windows-sys 0.60.2", "windows-sys 0.60.2",
@@ -6538,6 +6682,27 @@ dependencies = [
"tokio", "tokio",
] ]
[[package]]
name = "tokio-websockets"
version = "0.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f591660438b3038dd04d16c938271c79e7e06260ad2ea2885a4861bfb238605d"
dependencies = [
"base64 0.22.1",
"bytes",
"futures-core",
"futures-sink",
"http 1.3.1",
"httparse",
"rand 0.8.5",
"ring",
"rustls-pki-types",
"tokio",
"tokio-rustls 0.26.2",
"tokio-util",
"webpki-roots 0.26.11",
]
[[package]] [[package]]
name = "toml" name = "toml"
version = "0.8.23" version = "0.8.23"
@@ -6689,6 +6854,16 @@ version = "0.2.5"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b"
[[package]]
name = "tryhard"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9fe58ebd5edd976e0fe0f8a14d2a04b7c81ef153ea9a54eebc42e67c2c23b4e5"
dependencies = [
"pin-project-lite",
"tokio",
]
[[package]] [[package]]
name = "tui-logger" name = "tui-logger"
version = "0.14.5" version = "0.14.5"
@@ -6865,7 +7040,7 @@ version = "1.18.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2f87b8aa10b915a06587d0dec516c282ff295b475d94abf425d62b57710070a2" checksum = "2f87b8aa10b915a06587d0dec516c282ff295b475d94abf425d62b57710070a2"
dependencies = [ dependencies = [
"getrandom 0.3.3", "getrandom 0.3.4",
"js-sys", "js-sys",
"rand 0.9.2", "rand 0.9.2",
"uuid-macro-internal", "uuid-macro-internal",
@@ -6936,10 +7111,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b"
[[package]] [[package]]
name = "wasi" name = "wasip2"
version = "0.14.3+wasi-0.2.4" version = "1.0.2+wasi-0.2.9"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6a51ae83037bdd272a9e28ce236db8c07016dd0d50c27038b3f407533c030c95" checksum = "9517f9239f02c069db75e65f174b3da828fe5f5b945c4dd26bd25d89c03ebcf5"
dependencies = [ dependencies = [
"wit-bindgen", "wit-bindgen",
] ]
@@ -7061,6 +7236,15 @@ version = "0.25.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5f20c57d8d7db6d3b86154206ae5d8fba62dd39573114de97c2cb0578251f8e1" checksum = "5f20c57d8d7db6d3b86154206ae5d8fba62dd39573114de97c2cb0578251f8e1"
[[package]]
name = "webpki-roots"
version = "0.26.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "521bc38abb08001b01866da9f51eb7c5d647a19260e00054a8c7fd5f9e57f7a9"
dependencies = [
"webpki-roots 1.0.2",
]
[[package]] [[package]]
name = "webpki-roots" name = "webpki-roots"
version = "1.0.2" version = "1.0.2"
@@ -7438,9 +7622,9 @@ dependencies = [
[[package]] [[package]]
name = "wit-bindgen" name = "wit-bindgen"
version = "0.45.0" version = "0.51.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "052283831dbae3d879dc7f51f3d92703a316ca49f91540417d38591826127814" checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5"
[[package]] [[package]]
name = "writeable" name = "writeable"

View File

@@ -7,6 +7,7 @@ members = [
"harmony_types", "harmony_types",
"harmony_macros", "harmony_macros",
"harmony_tui", "harmony_tui",
"harmony_execution",
"opnsense-config", "opnsense-config",
"opnsense-config-xml", "opnsense-config-xml",
"harmony_cli", "harmony_cli",
@@ -17,6 +18,8 @@ members = [
"harmony_secret", "harmony_secret",
"adr/agent_discovery/mdns", "adr/agent_discovery/mdns",
"brocade", "brocade",
"harmony_agent",
"harmony_agent/deploy",
] ]
[workspace.package] [workspace.package]

View File

@@ -1,5 +1,7 @@
# Harmony : Open-source infrastructure orchestration that treats your platform like first-class code # Harmony : Open-source infrastructure orchestration that treats your platform like first-class code
In other words, Harmony is a **next-generation platform engineering framework**.
_By [NationTech](https://nationtech.io)_ _By [NationTech](https://nationtech.io)_
[![Build](https://git.nationtech.io/NationTech/harmony/actions/workflows/check.yml/badge.svg)](https://git.nationtech.io/nationtech/harmony) [![Build](https://git.nationtech.io/NationTech/harmony/actions/workflows/check.yml/badge.svg)](https://git.nationtech.io/nationtech/harmony)

View File

@@ -0,0 +1,141 @@
# Architecture Decision Record: Template Hydration for Kubernetes Manifest Generation
Initial Author: Jean-Gabriel Gill-Couture & Sylvain Tremblay
Initial Date: 2025-01-23
Last Updated Date: 2025-01-23
## Status
Implemented
## Context
Harmony's philosophy is built on three guiding principles: Infrastructure as Resilient Code, Prove It Works — Before You Deploy, and One Unified Model. Our goal is to shift validation and verification as left as possible—ideally to compile time—rather than discovering errors at deploy time.
After investigating a few approaches such as compile-checked Askama templates to generate Kubernetes manifests for Helm charts, we found again that this approach suffered from several fundamental limitations:
* **Late Validation:** Typos in template syntax or field names are only discovered at deployment time, not during compilation. A mistyped `metadata.name` won't surface until Helm attempts to render the template.
* **Brittle Maintenance:** Templates are string-based with limited IDE support. Refactoring requires grep-and-replace across YAML-like template files, risking subtle breakage.
* **Hard-to-Test Logic:** Testing template output requires mocking the template engine and comparing serialized strings rather than asserting against typed data structures.
* **No Type Safety:** There is no guarantee that the generated YAML will be valid Kubernetes resources without runtime validation.
We also faced a strategic choice around Helm: use it as both *templating engine* and *packaging mechanism*, or decouple these concerns. While Helm's ecosystem integration (Harbor, ArgoCD, OCI registry support) is valuable, the Jinja-like templating is at odds with Harmony's "code-first" ethos.
## Decision
We will adopt the **Template Hydration Pattern**—constructing Kubernetes manifests programmatically using strongly-typed `kube-rs` objects, then serializing them to YAML files for packaging into Helm charts.
Specifically:
* **Write strongly typed `k8s_openapi` Structs:** All Kubernetes resources (Deployment, Service, ConfigMap, etc.) will be constructed using the typed structs generated by `k8s_openapi`.
* **Direct Serialization to YAML:** Rather than rendering templates, we use `serde_yaml::to_string()` to serialize typed objects directly into YAML manifests. This way, YAML is only used as a data-transfer format and not a templating/programming language - which it is not.
* **Helm as Packaging-Only:** Helm's role is reduced to packaging pre-rendered templates into a tarball and pushing to OCI registries. No template rendering logic resides within Helm.
* **Ecosystem Preservation:** The generated Helm charts remain fully compatible with Harbor, ArgoCD, and any Helm-compatible tool—the only difference is that the `templates/` directory contains static YAML files.
The implementation in `backend_app.rs` demonstrates this pattern:
```rust
let deployment = Deployment {
metadata: ObjectMeta {
name: Some(self.name.clone()),
labels: Some([("app.kubernetes.io/name".to_string(), self.name.clone())].into()),
..Default::default()
},
spec: Some(DeploymentSpec { /* ... */ }),
..Default::default()
};
let deployment_yaml = serde_yaml::to_string(&deployment)?;
fs::write(templates_dir.join("deployment.yaml"), deployment_yaml)?;
```
## Rationale
**Aligns with "Infrastructure as Resilient Code"**
Harmony's first principle states that infrastructure should be treated like application code. By expressing Kubernetes manifests as Rust structs, we gain:
* **Refactorability:** Rename a label and the compiler catches all usages.
* **IDE Support:** Autocomplete for all Kubernetes API fields; documentation inline.
* **Code Navigation:** Jump to definition shows exactly where a value comes from.
**Achieves "Prove It Works — Before You Deploy"**
The compiler now validates that:
* All required fields are populated (Rust's `Option` type prevents missing fields).
* Field types match expectations (ports are integers, not strings).
* Enums contain valid values (e.g., `ServiceType::ClusterIP`).
This moves what was runtime validation into compile-time checks, fulfilling the "shift left" promise.
**Enables True Unit Testing**
Developers can now write unit tests that assert directly against typed objects:
```rust
let deployment = create_deployment(&app);
assert_eq!(deployment.spec.unwrap().replicas.unwrap(), 3);
assert_eq!(deployment.metadata.name.unwrap(), "my-app");
```
No string parsing, no YAML serialization, no fragile assertions against rendered output.
**Preserves Ecosystem Benefits**
By generating standard Helm chart structures, Harmony retains compatibility with:
* **OCI Registries (Harbor, GHCR):** `helm push` works exactly as before.
* **ArgoCD:** Syncs and manages releases using the generated charts.
* **Existing Workflows:** Teams already consuming Helm charts see no change.
The Helm tarball becomes a "dumb pipe" for transport, which is arguably its ideal role.
## Consequences
### Positive
* **Compile-Time Safety:** A broad class of errors (typos, missing fields, type mismatches) is now caught at build time.
* **Better Developer Experience:** IDE autocomplete, inline documentation, and refactor support significantly reduce the learning curve for Kubernetes manifests.
* **Testability:** Unit tests can validate manifest structure without integration or runtime checks.
* **Auditability:** The source-of-truth for manifests is now pure Rust—easier to review in pull requests than template logic scattered across files.
* **Future-Extensibility:** CustomResources (CRDs) can be supported via `kopium`-generated Rust types, maintaining the same strong typing.
### Negative
* **API Schema Drift:** Kubernetes API changes require regenerating `k8s_openapi` types and updating code. A change in a struct field will cause the build to fail—intentionally, but still requiring the pipeline to be updated.
* **Verbosity:** Typed construction is more verbose than the equivalent template. Builder patterns or helper functions will be needed to keep code readable.
* **Learning Curve:** Contributors must understand both the Kubernetes resource spec *and* the Rust type system, rather than just YAML.
* **Debugging Shift:** When debugging generated YAML, you now trace through Rust code rather than template files—more precise but different mental model.
## Alternatives Considered
### 1. Enhance Askama with Compile-Time Validation
*Pros:* Stay within familiar templating paradigm; minimal code changes.
*Cons:* Rust's type system cannot fully express Kubernetes schema validation without significant macro boilerplate. Errors would still surface at template evaluation time, not compilation.
### 2. Use Helm SDK Programmatically (Go)
*Pros:* Direct access to Helm's template engine; no YAML serialization step.
*Cons:* Would introduce a second language (Go) into a Rust codebase, increasing cognitive load and compilation complexity. No improvement in compile-time safety.
### 3. Raw YAML String Templating (Manual)
*Pros:* Maximum control; no external dependencies.
*Cons:* Even more error-prone than Askama; no structure validation; string concatenation errors abound.
### 4. Use Kustomize for All Manifests
*Pros:* Declarative overlays; standard tool.
*Cons:* Kustomize is itself a layer over YAML templates with its own DSL. It does not provide compile-time type safety and would require externalizing manifest management outside Harmony's codebase.
__Note that this template hydration architecture still allows to override templates with tools like kustomize when required__
## Additional Notes
**Scalability to Future Topologies**
The Template Hydration pattern enables future Harmony architectures to generate manifests dynamically based on topology context. For example, a `CostTopology` might adjust resource requests based on cluster pricing, manipulating the typed `Deployment::spec` directly before serialization.
**Implementation Status**
As of this writing, the pattern is implemented for `BackendApp` deployments (`backend_app.rs`). The next phase is to extend this pattern across all application modules (`webapp.rs`, etc.) and to standardize on this approach for any new implementations.

View File

@@ -1,7 +1,7 @@
use std::net::{IpAddr, Ipv4Addr}; use std::net::{IpAddr, Ipv4Addr};
use brocade::{BrocadeOptions, ssh}; use brocade::{BrocadeOptions, ssh};
use harmony_secret::{Secret, SecretManager}; use harmony_secret::Secret;
use harmony_types::switch::PortLocation; use harmony_types::switch::PortLocation;
use schemars::JsonSchema; use schemars::JsonSchema;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};

View File

@@ -56,6 +56,8 @@ async fn main() {
)), )),
}; };
// TODO exec pod commands to initialize secret store if not already done
harmony_cli::run( harmony_cli::run(
Inventory::autoload(), Inventory::autoload(),
K8sAnywhereTopology::from_env(), K8sAnywhereTopology::from_env(),

View File

@@ -30,6 +30,7 @@ opnsense-config = { path = "../opnsense-config" }
opnsense-config-xml = { path = "../opnsense-config-xml" } opnsense-config-xml = { path = "../opnsense-config-xml" }
harmony_macros = { path = "../harmony_macros" } harmony_macros = { path = "../harmony_macros" }
harmony_types = { path = "../harmony_types" } harmony_types = { path = "../harmony_types" }
harmony_execution = { path = "../harmony_execution" }
uuid.workspace = true uuid.workspace = true
url.workspace = true url.workspace = true
kube = { workspace = true, features = ["derive"] } kube = { workspace = true, features = ["derive"] }

View File

@@ -0,0 +1,801 @@
use async_trait::async_trait;
use log::{debug, info, trace};
use serde::Serialize;
use std::path::PathBuf;
use crate::{
config::{REGISTRY_PROJECT, REGISTRY_URL},
modules::application::{
Application, HelmPackage, OCICompliant,
config::ApplicationNetworkPort,
helm::{self, DeploymentBuilder, HelmChart, HelmResourceKind},
},
};
use harmony_execution::{RunnerOptions, run_command};
#[derive(Debug, Clone, Serialize)]
pub struct BuildCommand {
pub program: String,
pub args: Vec<String>,
}
impl BuildCommand {
pub fn new(program: impl Into<String>, args: Vec<impl Into<String>>) -> Self {
Self {
program: program.into(),
args: args.into_iter().map(|s| s.into()).collect(),
}
}
pub fn to_std_command(&self) -> std::process::Command {
let mut cmd = std::process::Command::new(&self.program);
cmd.args(&self.args);
cmd
}
}
#[derive(Debug, Clone, Serialize)]
pub struct BackendApp {
pub name: String,
pub project_root: std::path::PathBuf,
pub network_ports: Vec<ApplicationNetworkPort>,
pub env_vars: Vec<(String, String)>,
pub build_cmd: BuildCommand,
pub dockerfile: Option<PathBuf>,
}
impl BackendApp {
fn get_dockerfile(&self) -> Result<PathBuf, String> {
debug!(
"Looking for dockerfile, currently set to {:?}",
self.dockerfile
);
if let Some(dockerfile) = &self.dockerfile {
return match dockerfile.exists() {
true => {
info!(
"Found dockerfile as intended at {}",
dockerfile.to_string_lossy()
);
Ok(dockerfile.clone())
}
false => Err(format!(
"Dockerfile explicitely set to {dockerfile} does not exist",
dockerfile = dockerfile.to_string_lossy()
)),
};
}
let existing_dockerfile = self.project_root.join("Dockerfile");
debug!("project_root = {:?}", self.project_root);
debug!("checking = {:?}", existing_dockerfile);
if existing_dockerfile.exists() {
debug!(
"Checking path {:#?} for existing Dockerfile",
self.project_root.clone()
);
return Ok(existing_dockerfile);
}
Err(format!(
"Could not find a dockerfile in {project_root} folder. Tried {existing_dockerfile}",
project_root = self.project_root.to_string_lossy(),
existing_dockerfile = existing_dockerfile.to_string_lossy(),
))
}
}
impl Application for BackendApp {
fn name(&self) -> String {
self.name.clone()
}
}
#[async_trait]
impl OCICompliant for BackendApp {
async fn build_push_oci_image(&self) -> Result<String, String> {
let dockerfile = self.get_dockerfile()?;
let image_tag = self.image_name();
// Run docker build command, streaming output to console and capturing it
let output = run_command(
std::process::Command::new("docker").args([
"build",
"-t",
&image_tag,
"-f",
&dockerfile.to_string_lossy(),
&self.project_root.to_string_lossy(),
]),
RunnerOptions::print_to_console(),
)
.map_err(|e| format!("Failed to spawn docker build process: {}", e))?;
if output.is_success() {
info!("Docker image build succeeded");
Ok(image_tag)
} else {
Err(format!(
"Docker image build FAILED:\n{}",
output.format_output()
))
}
}
fn local_image_name(&self) -> String {
self.name.clone()
}
fn image_name(&self) -> String {
format!(
"{}/{}/{}",
*REGISTRY_URL,
*REGISTRY_PROJECT,
&self.local_image_name()
)
}
}
#[async_trait]
impl HelmPackage for BackendApp {
fn project_root(&self) -> PathBuf {
self.project_root.clone()
}
fn chart_name(&self) -> String {
self.name.clone()
}
async fn build_push_helm_package(&self, image_url: &str) -> Result<String, String> {
let mut helm_chart = HelmChart::new(self.name.clone(), "1.0.0".to_string());
// Build the typed Deployment object using the builder with initial options
helm_chart.add_resource(HelmResourceKind::Deployment(
DeploymentBuilder::with_options(
&self.name,
image_url,
Some(self.network_ports.clone()),
Some(self.env_vars.clone()),
None,
)
.build(),
));
// Build the typed Service object using the helper function
if let Some(service) =
helm::create_service_from_ports(self.name.clone(), &self.network_ports)
{
helm_chart.add_resource(HelmResourceKind::Service(service));
}
// Write the Helm chart metadata to the project root
let chart_dir = helm_chart
.write_to(&self.project_root.join(".harmony_generated/helm/"))
.map_err(|e| format!("Failed to write Helm chart: {}", e))?;
info!("Helm chart for '{}' written to: {:?}", self.name, chart_dir);
Ok(chart_dir.to_string_lossy().to_string())
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::modules::application::config::ApplicationNetworkPort;
use crate::modules::application::config::NetworkProtocol;
use k8s_openapi::api::apps::v1::Deployment;
use k8s_openapi::api::core::v1::{Container, EnvVar, Service as K8sService, ServicePort};
use k8s_openapi::apimachinery::pkg::util::intstr::IntOrString;
use serde_yaml::from_str;
use std::fs;
use std::path::Path;
use tempfile::tempdir;
// Test Helpers
fn read_service_yaml(project_root: &Path, chart_name: &str) -> K8sService {
let path = project_root.join(format!(
".harmony_generated/helm/{chart_name}/templates/service.yaml"
));
let content = fs::read_to_string(&path)
.unwrap_or_else(|e| panic!("Failed to read service.yaml at {:?}: {}", path, e));
from_str(&content)
.unwrap_or_else(|e| panic!("Failed to parse service.yaml as K8s Service: {}", e))
}
fn read_deployment_yaml(project_root: &Path, chart_name: &str) -> Deployment {
let path = project_root.join(format!(
".harmony_generated/helm/{chart_name}/templates/deployment.yaml"
));
let content = fs::read_to_string(&path)
.unwrap_or_else(|e| panic!("Failed to read deployment.yaml at {:?}: {}", path, e));
from_str(&content)
.unwrap_or_else(|e| panic!("Failed to parse deployment.yaml as K8s Deployment: {}", e))
}
fn service_yaml_exists(project_root: &Path, chart_name: &str) -> bool {
let path = project_root.join(format!(
".harmony_generated/helm/{chart_name}/templates/service.yaml"
));
path.exists()
}
// Service Assertions
fn assert_service_metadata(service: &K8sService, expected_name: &str) {
assert_eq!(
service.metadata.name.as_deref(),
Some(expected_name),
"Service name should be '{expected_name}'"
);
}
fn assert_service_type(service: &K8sService, expected_type: &str) {
assert_eq!(
service.spec.as_ref().and_then(|s| s.type_.as_deref()),
Some(expected_type),
"Service type should be '{expected_type}'"
);
}
fn assert_service_port_count(service: &K8sService, expected_count: usize) {
let ports = service
.spec
.as_ref()
.and_then(|s| s.ports.as_ref())
.unwrap_or_else(|| panic!("Service should have ports"));
assert_eq!(
ports.len(),
expected_count,
"Service should have {expected_count} ports"
);
}
fn assert_service_port(
port: &ServicePort,
expected_name: &str,
expected_protocol: &str,
expected_number: i32,
) {
assert_eq!(
port.name.as_deref(),
Some(expected_name),
"Port name should be '{expected_name}'"
);
assert_eq!(
port.protocol.as_deref(),
Some(expected_protocol),
"Port '{expected_name}' protocol should be '{expected_protocol}'"
);
assert_eq!(
port.port, expected_number,
"Port '{expected_name}' number should be {expected_number}"
);
}
fn assert_target_port_matches_service_port(port: &ServicePort) {
match &port.target_port {
Some(IntOrString::Int(target)) => {
assert_eq!(
*target,
port.port,
"Target port should match service port for '{}'",
port.name.as_deref().unwrap_or("unknown")
);
}
_ => panic!(
"Target port should be Int for '{}'",
port.name.as_deref().unwrap_or("unknown")
),
}
}
// Deployment Assertions
fn assert_deployment_metadata(deployment: &Deployment, expected_name: &str) {
assert_eq!(
deployment.metadata.name.as_deref(),
Some(expected_name),
"Deployment name should be '{expected_name}'"
);
}
fn assert_deployment_replicas(deployment: &Deployment, expected_replicas: i32) {
let spec = deployment
.spec
.as_ref()
.unwrap_or_else(|| panic!("Deployment should have spec"));
assert_eq!(
spec.replicas,
Some(expected_replicas),
"Deployment should have {expected_replicas} replicas"
);
}
fn assert_selector_match_label(deployment: &Deployment, expected_label_value: &str) {
let spec = deployment
.spec
.as_ref()
.unwrap_or_else(|| panic!("Deployment should have spec"));
assert_eq!(
spec.selector
.match_labels
.as_ref()
.and_then(|m| m.get("app.kubernetes.io/name")),
Some(&expected_label_value.to_string()),
"Selector should match app name '{expected_label_value}'"
);
}
fn assert_pod_labels(deployment: &Deployment, expected_name: &str) {
let spec = deployment
.spec
.as_ref()
.unwrap_or_else(|| panic!("Deployment should have spec"));
let metadata = spec
.template
.metadata
.as_ref()
.unwrap_or_else(|| panic!("Pod template should have metadata"));
let labels = metadata
.labels
.as_ref()
.unwrap_or_else(|| panic!("Pod should have labels"));
assert_eq!(
labels.get("app.kubernetes.io/name"),
Some(&expected_name.to_string()),
"Pod label app.kubernetes.io/name should be '{expected_name}'"
);
assert_eq!(
labels.get("app.kubernetes.io/instance"),
Some(&expected_name.to_string()),
"Pod label app.kubernetes.io/instance should be '{expected_name}'"
);
}
// Container Assertions
fn assert_container_metadata(
container: &Container,
expected_name: &str,
expected_image: &str,
expected_pull_policy: &str,
) {
assert_eq!(
container.name, expected_name,
"Container name should be '{expected_name}'"
);
assert_eq!(
container.image.as_deref(),
Some(expected_image),
"Container image should be '{expected_image}'"
);
assert_eq!(
container.image_pull_policy.as_deref(),
Some(expected_pull_policy),
"Image pull policy should be '{expected_pull_policy}'"
);
}
fn assert_container_ports_count(container: &Container, expected_count: usize) {
let ports = container
.ports
.as_ref()
.unwrap_or_else(|| panic!("Container should have ports"));
assert_eq!(
ports.len(),
expected_count,
"Container should have {expected_count} ports"
);
}
fn assert_container_port(
port: &k8s_openapi::api::core::v1::ContainerPort,
expected_name: &str,
expected_protocol: &str,
expected_number: i32,
) {
assert_eq!(
port.name.as_deref(),
Some(expected_name),
"Container port name should be '{expected_name}'"
);
assert_eq!(
port.protocol.as_deref(),
Some(expected_protocol),
"Container port '{expected_name}' protocol should be '{expected_protocol}'"
);
assert_eq!(
port.container_port, expected_number,
"Container port '{expected_name}' number should be {expected_number}"
);
}
fn assert_container_env_vars_count(container: &Container, expected_count: usize) {
let env_vars = container
.env
.as_ref()
.unwrap_or_else(|| panic!("Container should have env vars"));
assert_eq!(
env_vars.len(),
expected_count,
"Container should have {expected_count} env vars"
);
}
fn assert_container_env_var(env_var: &EnvVar, expected_name: &str, expected_value: &str) {
assert_eq!(
env_var.name, expected_name,
"Env var name should be '{expected_name}'"
);
assert_eq!(
env_var.value.as_deref(),
Some(expected_value),
"Env var '{expected_name}' value should be '{expected_value}'"
);
}
fn get_container(deployment: &Deployment) -> Container {
let spec = deployment
.spec
.as_ref()
.unwrap_or_else(|| panic!("Deployment should have spec"));
let pod_spec = spec
.template
.spec
.as_ref()
.unwrap_or_else(|| panic!("Pod template should have spec"));
pod_spec
.containers
.first()
.unwrap_or_else(|| panic!("Should have exactly one container"))
.clone()
}
// Test Fixtures
fn standard_test_ports() -> Vec<ApplicationNetworkPort> {
vec![
ApplicationNetworkPort {
number: 8080,
protocol: NetworkProtocol::TCP,
name: "http".to_string(),
},
ApplicationNetworkPort {
number: 9000,
protocol: NetworkProtocol::TCP,
name: "metrics".to_string(),
},
ApplicationNetworkPort {
number: 50051,
protocol: NetworkProtocol::TCP,
name: "grpc".to_string(),
},
]
}
fn standard_test_env_vars() -> Vec<(String, String)> {
vec![
("ENV_VAR_1".to_string(), "value1".to_string()),
("ENV_VAR_2".to_string(), "value2".to_string()),
]
}
fn udp_test_ports() -> Vec<ApplicationNetworkPort> {
vec![
ApplicationNetworkPort {
number: 53,
protocol: NetworkProtocol::UDP,
name: "dns".to_string(),
},
ApplicationNetworkPort {
number: 8080,
protocol: NetworkProtocol::TCP,
name: "http".to_string(),
},
]
}
// Test Builder
struct BackendAppTestBuilder {
name: Option<String>,
network_ports: Option<Vec<ApplicationNetworkPort>>,
env_vars: Option<Vec<(String, String)>>,
}
impl BackendAppTestBuilder {
fn new() -> Self {
Self {
name: None,
network_ports: None,
env_vars: None,
}
}
fn with_name(mut self, name: impl Into<String>) -> Self {
self.name = Some(name.into());
self
}
fn with_standard_ports(mut self) -> Self {
self.network_ports = Some(standard_test_ports());
self
}
fn with_udp_ports(mut self) -> Self {
self.network_ports = Some(udp_test_ports());
self
}
fn with_standard_env_vars(mut self) -> Self {
self.env_vars = Some(standard_test_env_vars());
self
}
fn with_no_ports(mut self) -> Self {
self.network_ports = Some(vec![]);
self
}
fn build(self, project_root: PathBuf) -> BackendApp {
BackendApp {
name: self.name.unwrap_or_else(|| "test-app".to_string()),
project_root,
network_ports: self.network_ports.unwrap_or_default(),
env_vars: self.env_vars.unwrap_or_default(),
build_cmd: BuildCommand::new("cargo", vec!["build"]),
dockerfile: None,
}
}
}
impl Default for BackendAppTestBuilder {
fn default() -> Self {
Self::new()
}
}
// Helper function for test setup
async fn build_helm_chart_for_test(app: &BackendApp, image_url: &str) {
let result = app.build_push_helm_package(image_url).await;
assert!(
result.is_ok(),
"build_push_helm_package should succeed: {:?}",
result
);
}
// ===== SERVICE TESTS =====
#[tokio::test]
async fn service_is_created_with_application_name() {
let temp_dir = tempdir().expect("Failed to create temp directory");
let app = BackendAppTestBuilder::new()
.with_name("test-app")
.with_standard_ports()
.build(temp_dir.path().to_path_buf());
build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;
let service = read_service_yaml(&app.project_root, "test-app");
assert_service_metadata(&service, "test-app");
}
#[tokio::test]
async fn service_has_default_clusterip_type() {
let temp_dir = tempdir().expect("Failed to create temp directory");
let app = BackendAppTestBuilder::new()
.with_name("test-app")
.with_standard_ports()
.build(temp_dir.path().to_path_buf());
build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;
let service = read_service_yaml(&app.project_root, "test-app");
assert_service_type(&service, "ClusterIP");
}
#[tokio::test]
async fn service_exposes_all_network_ports() {
let temp_dir = tempdir().expect("Failed to create temp directory");
let app = BackendAppTestBuilder::new()
.with_name("test-app")
.with_standard_ports()
.build(temp_dir.path().to_path_buf());
build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;
let service = read_service_yaml(&app.project_root, "test-app");
assert_service_port_count(&service, 3);
let ports = service.spec.unwrap().ports.unwrap();
assert_service_port(&ports[0], "http", "TCP", 8080);
assert_service_port(&ports[1], "metrics", "TCP", 9000);
assert_service_port(&ports[2], "grpc", "TCP", 50051);
}
#[tokio::test]
async fn service_target_ports_match_service_ports() {
let temp_dir = tempdir().expect("Failed to create temp directory");
let app = BackendAppTestBuilder::new()
.with_name("test-app")
.with_standard_ports()
.build(temp_dir.path().to_path_buf());
build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;
let service = read_service_yaml(&app.project_root, "test-app");
let ports = service.spec.unwrap().ports.unwrap();
for port in &ports {
assert_target_port_matches_service_port(port);
}
}
#[tokio::test]
async fn service_not_created_when_application_has_no_ports() {
let temp_dir = tempdir().expect("Failed to create temp directory");
let app = BackendAppTestBuilder::new()
.with_name("test-app-no-ports")
.with_no_ports()
.build(temp_dir.path().to_path_buf());
build_helm_chart_for_test(&app, "registry.example.com/test/test-app-no-ports:1.0.0").await;
assert!(
!service_yaml_exists(&app.project_root, "test-app-no-ports"),
"service.yaml should not exist when there are no network ports"
);
}
#[tokio::test]
async fn service_respects_port_protocol_type() {
let temp_dir = tempdir().expect("Failed to create temp directory");
let app = BackendAppTestBuilder::new()
.with_name("udp-app")
.with_udp_ports()
.build(temp_dir.path().to_path_buf());
build_helm_chart_for_test(&app, "registry.example.com/test/udp-app:1.0.0").await;
let service = read_service_yaml(&app.project_root, "udp-app");
let ports = service.spec.unwrap().ports.unwrap();
assert_service_port(&ports[0], "dns", "UDP", 53);
assert_service_port(&ports[1], "http", "TCP", 8080);
}
// ===== DEPLOYMENT METADATA TESTS =====
#[tokio::test]
async fn deployment_has_application_name() {
let temp_dir = tempdir().expect("Failed to create temp directory");
let app = BackendAppTestBuilder::new()
.with_name("test-app")
.with_standard_ports()
.build(temp_dir.path().to_path_buf());
build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;
let deployment = read_deployment_yaml(&app.project_root, "test-app");
assert_deployment_metadata(&deployment, "test-app");
}
#[tokio::test]
async fn deployment_has_single_replica_by_default() {
let temp_dir = tempdir().expect("Failed to create temp directory");
let app = BackendAppTestBuilder::new()
.with_name("test-app")
.with_standard_ports()
.build(temp_dir.path().to_path_buf());
build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;
let deployment = read_deployment_yaml(&app.project_root, "test-app");
assert_deployment_replicas(&deployment, 1);
}
#[tokio::test]
async fn deployment_selector_matches_application_name() {
let temp_dir = tempdir().expect("Failed to create temp directory");
let app = BackendAppTestBuilder::new()
.with_name("test-app")
.with_standard_ports()
.build(temp_dir.path().to_path_buf());
build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;
let deployment = read_deployment_yaml(&app.project_root, "test-app");
assert_selector_match_label(&deployment, "test-app");
}
#[tokio::test]
async fn pod_has_standard_kubernetes_labels() {
let temp_dir = tempdir().expect("Failed to create temp directory");
let app = BackendAppTestBuilder::new()
.with_name("test-app")
.with_standard_ports()
.build(temp_dir.path().to_path_buf());
build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;
let deployment = read_deployment_yaml(&app.project_root, "test-app");
assert_pod_labels(&deployment, "test-app");
}
// ===== CONTAINER CONFIGURATION TESTS =====
#[tokio::test]
async fn container_has_correct_name_and_image() {
let temp_dir = tempdir().expect("Failed to create temp directory");
let app = BackendAppTestBuilder::new()
.with_name("test-app")
.with_standard_ports()
.build(temp_dir.path().to_path_buf());
let image_url = "registry.example.com/test/test-app:1.0.0";
build_helm_chart_for_test(&app, image_url).await;
let deployment = read_deployment_yaml(&app.project_root, "test-app");
let container = get_container(&deployment);
assert_container_metadata(&container, "test-app", image_url, "IfNotPresent");
}
#[tokio::test]
async fn container_exposes_all_application_ports() {
let temp_dir = tempdir().expect("Failed to create temp directory");
let app = BackendAppTestBuilder::new()
.with_name("test-app")
.with_standard_ports()
.build(temp_dir.path().to_path_buf());
build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;
let deployment = read_deployment_yaml(&app.project_root, "test-app");
let container = get_container(&deployment);
assert_container_ports_count(&container, 3);
let ports = container.ports.unwrap();
assert_container_port(&ports[0], "http", "TCP", 8080);
assert_container_port(&ports[1], "metrics", "TCP", 9000);
assert_container_port(&ports[2], "grpc", "TCP", 50051);
}
#[tokio::test]
async fn container_has_all_environment_variables() {
let temp_dir = tempdir().expect("Failed to create temp directory");
let app = BackendAppTestBuilder::new()
.with_name("test-app")
.with_standard_ports()
.with_standard_env_vars()
.build(temp_dir.path().to_path_buf());
build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;
let deployment = read_deployment_yaml(&app.project_root, "test-app");
let container = get_container(&deployment);
assert_container_env_vars_count(&container, 2);
let env_vars = container.env.unwrap();
assert_container_env_var(&env_vars[0], "ENV_VAR_1", "value1");
assert_container_env_var(&env_vars[1], "ENV_VAR_2", "value2");
}
// ===== BUILD COMMAND UNIT TESTS =====
#[test]
fn build_command_creation_sets_program_and_args() {
let cmd = BuildCommand::new("docker", vec!["build", "-t", "myimage"]);
assert_eq!(cmd.program, "docker");
assert_eq!(cmd.args, vec!["build", "-t", "myimage"]);
}
#[test]
fn build_command_clone_copies_all_fields() {
let cmd1 = BuildCommand::new("cargo", vec!["build", "--release"]);
let cmd2 = cmd1.clone();
assert_eq!(cmd1.program, cmd2.program);
assert_eq!(cmd1.args, cmd2.args);
}
}

View File

@@ -0,0 +1,29 @@
use serde::Serialize;
#[derive(Debug, Clone, Serialize)]
pub enum NetworkProtocol {
TCP,
UDP,
}
impl NetworkProtocol {
pub fn as_str(&self) -> &str {
match self {
NetworkProtocol::TCP => "TCP",
NetworkProtocol::UDP => "UDP",
}
}
}
impl std::fmt::Display for NetworkProtocol {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.write_str(self.as_str())
}
}
#[derive(Debug, Clone, Serialize)]
pub struct ApplicationNetworkPort {
pub number: u16,
pub protocol: NetworkProtocol,
pub name: String,
}

View File

@@ -48,11 +48,11 @@ use crate::{
/// - ArgoCD to install/upgrade/rollback/inspect k8s resources /// - ArgoCD to install/upgrade/rollback/inspect k8s resources
/// - Kubernetes for runtime orchestration /// - Kubernetes for runtime orchestration
#[derive(Debug, Default, Clone)] #[derive(Debug, Default, Clone)]
pub struct PackagingDeployment<A: OCICompliant + HelmPackage + Webapp> { pub struct PackagingDeployment<A: OCICompliant + HelmPackage> {
pub application: Arc<A>, pub application: Arc<A>,
} }
impl<A: OCICompliant + HelmPackage + Webapp> PackagingDeployment<A> { impl<A: OCICompliant + HelmPackage> PackagingDeployment<A> {
async fn deploy_to_local_k3d( async fn deploy_to_local_k3d(
&self, &self,
app_name: String, app_name: String,
@@ -138,7 +138,7 @@ impl<A: OCICompliant + HelmPackage + Webapp> PackagingDeployment<A> {
#[async_trait] #[async_trait]
impl< impl<
A: OCICompliant + HelmPackage + Webapp + Clone + 'static, A: OCICompliant + HelmPackage + Clone + 'static,
T: Topology + HelmCommand + MultiTargetTopology + K8sclient + Ingress + 'static, T: Topology + HelmCommand + MultiTargetTopology + K8sclient + Ingress + 'static,
> ApplicationFeature<T> for PackagingDeployment<A> > ApplicationFeature<T> for PackagingDeployment<A>
{ {
@@ -148,24 +148,12 @@ impl<
) -> Result<InstallationOutcome, InstallationError> { ) -> Result<InstallationOutcome, InstallationError> {
let image = self.application.image_name(); let image = self.application.image_name();
let domain = if topology.current_target() == DeploymentTarget::Production {
self.application.dns()
} else {
topology
.get_domain(&self.application.name())
.await
.map_err(|e| e.to_string())?
};
// TODO Write CI/CD workflow files // TODO Write CI/CD workflow files
// we can autotedect the CI type using the remote url (default to github action for github // we can autotedect the CI type using the remote url (default to github action for github
// url, etc..) // url, etc..)
// Or ask for it when unknown // Or ask for it when unknown
let helm_chart = self let helm_chart = self.application.build_push_helm_package(&image).await?;
.application
.build_push_helm_package(&image, &domain)
.await?;
// TODO: Make building image configurable/skippable if image already exists (prompt)") // TODO: Make building image configurable/skippable if image already exists (prompt)")
// https://git.nationtech.io/NationTech/harmony/issues/104 // https://git.nationtech.io/NationTech/harmony/issues/104
@@ -215,12 +203,12 @@ impl<
}; };
Ok(InstallationOutcome::success_with_details(vec![format!( Ok(InstallationOutcome::success_with_details(vec![format!(
"{}: http://{domain}", "{}",
self.application.name() self.application.name()
)])) )]))
} }
fn name(&self) -> String { fn name(&self) -> String {
"ContinuousDelivery".to_string() "PackagingDeployment".to_string()
} }
} }

View File

@@ -0,0 +1,446 @@
// Re-export common Kubernetes types for convenience
pub use k8s_openapi::api::{
apps::v1::{Deployment, DeploymentSpec},
core::v1::{
Container, ContainerPort, EnvVar, PodSpec, PodTemplateSpec, Service as K8sService,
ServicePort, ServiceSpec,
},
};
use k8s_openapi::apimachinery::pkg::util::intstr::IntOrString;
use kube::core::ObjectMeta;
// Import domain types for the deployment builder
use crate::modules::application::config::{ApplicationNetworkPort, NetworkProtocol};
use std::fs;
use std::path::{Path, PathBuf};
/// Enum representing all supported Kubernetes resource types for Helm charts.
/// Supports built-in typed resources and custom CRDs via YAML strings.
pub enum HelmResourceKind {
/// Built-in typed Service resource
Service(K8sService),
/// Built-in typed Deployment resource
Deployment(Deployment),
/// Custom resource as pre-serialized YAML (e.g., CRDs, custom types)
CustomYaml { filename: String, content: String },
// Can add more typed variants as needed: ConfigMap, Secret, Ingress, etc.
}
impl HelmResourceKind {
pub fn filename(&self) -> String {
match self {
HelmResourceKind::Service(_) => "service.yaml".to_string(),
HelmResourceKind::Deployment(_) => "deployment.yaml".to_string(),
HelmResourceKind::CustomYaml { filename, .. } => filename.clone(),
}
}
pub fn serialize_to_yaml(&self) -> Result<String, serde_yaml::Error> {
match self {
HelmResourceKind::Service(s) => serde_yaml::to_string(s),
HelmResourceKind::Deployment(d) => serde_yaml::to_string(d),
HelmResourceKind::CustomYaml { content, .. } => Ok(content.clone()),
}
}
pub fn as_service(&self) -> Option<&K8sService> {
match self {
HelmResourceKind::Service(s) => Some(s),
_ => None,
}
}
pub fn as_deployment(&self) -> Option<&Deployment> {
match self {
HelmResourceKind::Deployment(d) => Some(d),
_ => None,
}
}
/// Add a custom resource from any serializable type (e.g., CRDs, custom types)
pub fn from_yaml(filename: impl Into<String>, content: impl Into<String>) -> Self {
HelmResourceKind::CustomYaml {
filename: filename.into(),
content: content.into(),
}
}
/// Add a custom resource from any type that implements Serialize
pub fn from_serializable<T: serde::Serialize>(
filename: impl Into<String>,
resource: &T,
) -> Result<Self, serde_yaml::Error> {
Ok(HelmResourceKind::CustomYaml {
filename: filename.into(),
content: serde_yaml::to_string(resource)?,
})
}
}
/// The main orchestrator for building a Helm chart.
pub struct HelmChart {
pub name: String,
pub version: String,
pub app_version: String,
pub description: String,
pub resources: Vec<HelmResourceKind>,
pub values: Vec<String>,
}
impl HelmChart {
pub fn new(name: String, app_version: String) -> Self {
Self {
name: name.clone(),
version: "0.1.0".to_string(),
app_version,
description: format!("A Helm chart for {}", name),
resources: Vec::new(),
values: Vec::new(),
}
}
pub fn add_resource(&mut self, resource: HelmResourceKind) {
self.resources.push(resource);
}
pub fn add_value(&mut self, key: &str, value: &str) {
self.values.push(format!("{}: {}", key, value));
}
pub fn write_to(&self, base_path: &Path) -> Result<PathBuf, Box<dyn std::error::Error>> {
let chart_dir = base_path.join(&self.name);
let templates_dir = chart_dir.join("templates");
fs::create_dir_all(&templates_dir)?;
// 1. Render and write Chart.yaml
let chart_yaml = ChartYaml {
name: &self.name,
description: &self.description,
version: &self.version,
app_version: &self.app_version,
};
fs::write(chart_dir.join("Chart.yaml"), chart_yaml.render()?)?;
// 2. Write values.yaml (Constructed dynamically)
let values_content = self.values.join("\n");
fs::write(chart_dir.join("values.yaml"), values_content)?;
// 3. Serialize and write all added resources (Deployment, Service, etc.)
for resource in &self.resources {
let filename = resource.filename();
let content = resource
.serialize_to_yaml()
.map_err(|e| format!("Failed to serialize resource {}: {}", filename, e))?;
fs::write(templates_dir.join(filename), content)?;
}
Ok(chart_dir)
}
}
use askama::Template;
#[derive(Template)]
#[template(path = "helm/Chart.yaml.j2")]
struct ChartYaml<'a> {
name: &'a str,
description: &'a str,
version: &'a str,
app_version: &'a str,
}
/// Builder for creating a Kubernetes Service with proper labels and selectors.
pub struct ServiceBuilder {
name: String,
service_type: String,
ports: Vec<ServicePort>,
selector_label: String,
}
impl ServiceBuilder {
pub fn new(name: impl Into<String>) -> Self {
Self {
name: name.into(),
service_type: "ClusterIP".to_string(),
ports: Vec::new(),
selector_label: String::new(),
}
}
pub fn service_type(mut self, service_type: impl Into<String>) -> Self {
self.service_type = service_type.into();
self
}
pub fn with_port(
mut self,
name: impl Into<String>,
port: i32,
protocol: impl Into<String>,
) -> Self {
use k8s_openapi::apimachinery::pkg::util::intstr::IntOrString;
self.ports.push(ServicePort {
name: Some(name.into()),
protocol: Some(protocol.into()),
port,
target_port: Some(IntOrString::Int(port)),
..Default::default()
});
self
}
pub fn selector_label(mut self, label: impl Into<String>) -> Self {
self.selector_label = label.into();
self
}
pub fn build(self) -> K8sService {
K8sService {
metadata: ObjectMeta {
name: Some(self.name.clone()),
labels: Some(
[
("app.kubernetes.io/name".to_string(), self.name.clone()),
(
"app.kubernetes.io/component".to_string(),
"service".to_string(),
),
(
"app.kubernetes.io/managed-by".to_string(),
"harmony".to_string(),
),
]
.into(),
),
..Default::default()
},
spec: Some(ServiceSpec {
type_: Some(self.service_type),
selector: Some(
[("app.kubernetes.io/name".to_string(), self.selector_label)].into(),
),
ports: if self.ports.is_empty() {
None
} else {
Some(self.ports)
},
..Default::default()
}),
..Default::default()
}
}
}
/// Builder for creating a Kubernetes Deployment with pod template and container spec.
pub struct DeploymentBuilder {
name: String,
image: String,
replicas: i32,
container_ports: Vec<ContainerPort>,
env_vars: Vec<EnvVar>,
image_pull_policy: Option<String>,
}
impl DeploymentBuilder {
/// Create a new DeploymentBuilder with minimal required fields.
pub fn new(name: impl Into<String>, image: impl Into<String>) -> Self {
Self::with_options(name, image, None, None, None)
}
/// Create a new DeploymentBuilder with optional initial configuration.
///
/// Arguments:
/// - `name`: The deployment name
/// - `image`: The container image to use
/// - `ports`: Optional vector of initial application network ports
/// - `env_vars`: Optional vector of initial environment variable key-value pairs
/// - `replicas`: Optional number of replicas (defaults to 1)
pub fn with_options(
name: impl Into<String>,
image: impl Into<String>,
ports: Option<Vec<ApplicationNetworkPort>>,
env_vars: Option<Vec<(String, String)>>,
replicas: Option<i32>,
) -> Self {
let container_ports: Vec<ContainerPort> = ports
.unwrap_or_default()
.into_iter()
.map(|port| ContainerPort {
container_port: port.number as i32,
name: Some(port.name),
protocol: Some(port.protocol.to_string()),
..Default::default()
})
.collect();
let k8s_env_vars: Vec<EnvVar> = env_vars
.unwrap_or_default()
.into_iter()
.map(|(key, value)| EnvVar {
name: key,
value: Some(value),
..Default::default()
})
.collect();
Self {
name: name.into(),
image: image.into(),
replicas: replicas.unwrap_or(1),
container_ports,
env_vars: k8s_env_vars,
image_pull_policy: Some("IfNotPresent".to_string()),
}
}
pub fn replicas(mut self, replicas: i32) -> Self {
self.replicas = replicas;
self
}
pub fn with_container_port(
mut self,
number: i32,
name: impl Into<String>,
protocol: impl Into<String>,
) -> Self {
self.container_ports.push(ContainerPort {
container_port: number,
name: Some(name.into()),
protocol: Some(protocol.into()),
..Default::default()
});
self
}
pub fn with_env_var(mut self, name: impl Into<String>, value: impl Into<String>) -> Self {
self.env_vars.push(EnvVar {
name: name.into(),
value: Some(value.into()),
..Default::default()
});
self
}
pub fn image_pull_policy(mut self, policy: impl Into<String>) -> Self {
self.image_pull_policy = Some(policy.into());
self
}
pub fn build(self) -> Deployment {
let name = self.name.clone();
Deployment {
metadata: ObjectMeta {
name: Some(name.clone()),
labels: Some(
[
("app.kubernetes.io/name".to_string(), name.clone()),
(
"app.kubernetes.io/component".to_string(),
"deployment".to_string(),
),
(
"app.kubernetes.io/managed-by".to_string(),
"harmony".to_string(),
),
("app.kubernetes.io/version".to_string(), "1.0.0".to_string()),
]
.into(),
),
..Default::default()
},
spec: Some(DeploymentSpec {
replicas: Some(self.replicas),
selector: k8s_openapi::apimachinery::pkg::apis::meta::v1::LabelSelector {
match_labels: Some(
[("app.kubernetes.io/name".to_string(), name.clone())].into(),
),
..Default::default()
},
template: PodTemplateSpec {
metadata: Some(ObjectMeta {
labels: Some(
[
("app.kubernetes.io/name".to_string(), name.clone()),
("app.kubernetes.io/instance".to_string(), name.clone()),
]
.into(),
),
..Default::default()
}),
spec: Some(PodSpec {
containers: vec![Container {
name: name.clone(),
image: Some(self.image),
image_pull_policy: self.image_pull_policy,
ports: if self.container_ports.is_empty() {
None
} else {
Some(self.container_ports)
},
env: if self.env_vars.is_empty() {
None
} else {
Some(self.env_vars)
},
..Default::default()
}],
..Default::default()
}),
},
..Default::default()
}),
..Default::default()
}
}
}
/// Helper function to create a Service from network port configuration.
/// Returns `None` if no ports are provided.
pub fn create_service_from_ports(
name: String,
network_ports: &[ApplicationNetworkPort],
) -> Option<K8sService> {
if network_ports.is_empty() {
return None;
}
let ports: Vec<ServicePort> = network_ports
.into_iter()
.map(|port| ServicePort {
name: Some(port.name.clone()),
protocol: Some(port.protocol.to_string()),
port: port.number as i32,
target_port: Some(IntOrString::Int(port.number as i32)),
..Default::default()
})
.collect();
Some(K8sService {
metadata: ObjectMeta {
name: Some(name.clone()),
labels: Some(
[
("app.kubernetes.io/name".to_string(), name.clone()),
(
"app.kubernetes.io/component".to_string(),
"service".to_string(),
),
(
"app.kubernetes.io/managed-by".to_string(),
"harmony".to_string(),
),
]
.into(),
),
..Default::default()
},
spec: Some(ServiceSpec {
type_: Some("ClusterIP".to_string()),
selector: Some([("app.kubernetes.io/name".to_string(), name)].into()),
ports: Some(ports),
..Default::default()
}),
..Default::default()
})
}
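// Usage sketch (assuming `ports: Vec<ApplicationNetworkPort>` was built elsewhere):
//
// let service = create_service_from_ports("my-app".to_string(), &ports);
// // -> None when `ports` is empty, Some(K8sService) otherwise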

View File

@@ -1,5 +1,8 @@
pub mod backend_app;
pub mod config;
mod feature;
pub mod features;
pub mod helm;
pub mod oci;
mod rust;
mod webapp;
@@ -124,3 +127,15 @@ impl Serialize for dyn Application {
todo!()
}
}
/// Checks the output of a process command for success.
fn check_output(
output: &std::process::Output,
msg: &str,
) -> Result<(), Box<dyn std::error::Error>> {
if !output.status.success() {
let error_message = format!("{}: {}", msg, String::from_utf8_lossy(&output.stderr));
return Err(error_message.into());
}
Ok(())
}

View File

@@ -1,5 +1,13 @@
use std::path::{Path, PathBuf};
use crate::{
config::{REGISTRY_PROJECT, REGISTRY_URL},
modules::application::check_output,
};
use super::Application;
use async_trait::async_trait;
use log::debug;
#[async_trait]
pub trait OCICompliant: Application {
@@ -17,9 +25,74 @@ pub trait HelmPackage: Application {
/// # Arguments
/// * `image_url` - The full URL of the OCI container image to be used in the Deployment.
/// * `domain` - The domain where the application is hosted.
async fn build_push_helm_package(&self, image_url: &str) -> Result<String, String>;
fn project_root(&self) -> PathBuf;
fn chart_name(&self) -> String;
/// Packages a Helm chart directory into a .tgz file.
fn package_helm_chart(&self, chart_dir: &Path) -> Result<PathBuf, Box<dyn std::error::Error>> {
let chart_dirname = chart_dir.file_name().expect("Should find a chart dirname");
debug!(
"Launching `helm package {}` cli with CWD {}",
chart_dirname.to_string_lossy(),
&self
.project_root()
.join(".harmony_generated")
.join("helm")
.to_string_lossy()
);
let output = std::process::Command::new("helm")
.args(["package", chart_dirname.to_str().unwrap()])
.current_dir(self.project_root().join(".harmony_generated").join("helm")) // Run package from the parent dir
.output()?;
check_output(&output, "Failed to package Helm chart")?;
// Helm prints the path of the created chart to stdout.
let tgz_name = String::from_utf8(output.stdout)?
.split_whitespace()
.last()
.unwrap_or_default()
.to_string();
if tgz_name.is_empty() {
return Err("Could not determine packaged chart filename.".into());
}
// The output from helm is relative, so we join it with the execution directory.
Ok(self
.project_root()
.join(".harmony_generated")
.join("helm")
.join(tgz_name))
}
/// Pushes a packaged Helm chart to an OCI registry.
fn push_helm_chart(
&self,
packaged_chart_path: &Path,
) -> Result<String, Box<dyn std::error::Error>> {
// The chart name is the file stem of the .tgz file
let chart_file_name = packaged_chart_path.file_stem().unwrap().to_str().unwrap();
let oci_push_url = format!("oci://{}/{}", *REGISTRY_URL, *REGISTRY_PROJECT);
let oci_pull_url = format!("{oci_push_url}/{}-chart", self.chart_name());
debug!(
"Pushing Helm chart {} to {}",
packaged_chart_path.to_string_lossy(),
oci_push_url
);
let output = std::process::Command::new("helm")
.args(["push", packaged_chart_path.to_str().unwrap(), &oci_push_url])
.output()?;
check_output(&output, "Pushing Helm chart failed")?;
// The final URL includes the version tag, which is part of the file name
let version = chart_file_name.rsplit_once('-').unwrap().1;
debug!("pull url {oci_pull_url}");
debug!("push url {oci_push_url}");
Ok(format!("{}:{}", oci_pull_url, version))
}
} }

View File

@@ -81,16 +81,21 @@ impl Webapp for RustWebapp {
#[async_trait]
impl HelmPackage for RustWebapp {
fn project_root(&self) -> PathBuf {
self.project_root.clone()
}
fn chart_name(&self) -> String {
self.name.clone()
}
async fn build_push_helm_package(&self, image_url: &str) -> Result<String, String> {
let domain = self.dns();
info!("Starting Helm chart build and push for '{}'", self.name);
// 1. Create the Helm chart files on disk.
let chart_dir = self
.create_helm_chart_files(image_url, &domain)
.await
.map_err(|e| format!("Failed to create Helm chart files: {}", e))?;
info!("Successfully created Helm chart files in {:?}", chart_dir);
@@ -327,19 +332,6 @@ impl RustWebapp {
Ok(image_tag.to_string())
}
/// Checks the output of a process command for success.
fn check_output(
&self,
output: &process::Output,
msg: &str,
) -> Result<(), Box<dyn std::error::Error>> {
if !output.status.success() {
let error_message = format!("{}: {}", msg, String::from_utf8_lossy(&output.stderr));
return Err(error_message.into());
}
Ok(())
}
fn build_builder_image(&self, dockerfile: &mut Dockerfile) {
match self.framework {
Some(RustWebFramework::Leptos) => {
@@ -640,71 +632,6 @@ spec:
Ok(chart_dir)
}
/// Packages a Helm chart directory into a .tgz file.
fn package_helm_chart(&self, chart_dir: &Path) -> Result<PathBuf, Box<dyn std::error::Error>> {
let chart_dirname = chart_dir.file_name().expect("Should find a chart dirname");
debug!(
"Launching `helm package {}` cli with CWD {}",
chart_dirname.to_string_lossy(),
&self
.project_root
.join(".harmony_generated")
.join("helm")
.to_string_lossy()
);
let output = process::Command::new("helm")
.args(["package", chart_dirname.to_str().unwrap()])
.current_dir(self.project_root.join(".harmony_generated").join("helm")) // Run package from the parent dir
.output()?;
self.check_output(&output, "Failed to package Helm chart")?;
// Helm prints the path of the created chart to stdout.
let tgz_name = String::from_utf8(output.stdout)?
.split_whitespace()
.last()
.unwrap_or_default()
.to_string();
if tgz_name.is_empty() {
return Err("Could not determine packaged chart filename.".into());
}
// The output from helm is relative, so we join it with the execution directory.
Ok(self
.project_root
.join(".harmony_generated")
.join("helm")
.join(tgz_name))
}
/// Pushes a packaged Helm chart to an OCI registry.
fn push_helm_chart(
&self,
packaged_chart_path: &Path,
) -> Result<String, Box<dyn std::error::Error>> {
// The chart name is the file stem of the .tgz file
let chart_file_name = packaged_chart_path.file_stem().unwrap().to_str().unwrap();
let oci_push_url = format!("oci://{}/{}", *REGISTRY_URL, *REGISTRY_PROJECT);
let oci_pull_url = format!("{oci_push_url}/{}-chart", self.name);
debug!(
"Pushing Helm chart {} to {}",
packaged_chart_path.to_string_lossy(),
oci_push_url
);
let output = process::Command::new("helm")
.args(["push", packaged_chart_path.to_str().unwrap(), &oci_push_url])
.output()?;
self.check_output(&output, "Pushing Helm chart failed")?;
// The final URL includes the version tag, which is part of the file name
let version = chart_file_name.rsplit_once('-').unwrap().1;
debug!("pull url {oci_pull_url}");
debug!("push url {oci_push_url}");
Ok(format!("{}:{}", oci_pull_url, version))
}
fn get_or_build_dockerfile(&self) -> Result<PathBuf, Box<dyn std::error::Error>> {
let existing_dockerfile = self.project_root.join("Dockerfile");

View File

@@ -0,0 +1,6 @@
apiVersion: v2
name: {{ name }}
description: {{ description }}
type: application
version: {{ version }}
appVersion: "{{ app_version }}"

View File

@@ -0,0 +1,4 @@
.git
data
target
demos

26
harmony_agent/Cargo.toml Normal file
View File

@@ -0,0 +1,26 @@
[package]
name = "harmony_agent"
edition = "2024"
version.workspace = true
readme.workspace = true
license.workspace = true
[dependencies]
harmony = { path = "../harmony" }
# harmony_cli = { path = "../harmony_cli" }
harmony_types = { path = "../harmony_types" }
harmony_macros = { path = "../harmony_macros" }
cidr = { workspace = true }
tokio = { workspace = true }
log = { workspace = true }
env_logger = { workspace = true }
async-nats = "0.45.0"
async-trait = "0.1"
# url = { workspace = true }
serde.workspace = true
serde_json.workspace = true
getrandom = "0.3.4"
thiserror.workspace = true
pretty_assertions.workspace = true

44
harmony_agent/Dockerfile Normal file
View File

@@ -0,0 +1,44 @@
# Build stage
FROM rust:slim AS builder
# Install build dependencies
RUN apt-get update && apt-get install -y pkg-config && rm -rf /var/lib/apt/lists/*
WORKDIR /app
# Copy all required packages
COPY . .
RUN ls -la1
# Build the application in release mode
RUN cargo build --release -p harmony_agent
# Runtime stage
FROM debian:bookworm-slim
# Install runtime dependencies
RUN apt-get update && apt-get install -y ca-certificates && rm -rf /var/lib/apt/lists/*
WORKDIR /app
# Copy the binary from the builder stage
COPY --from=builder /app/target/release/harmony_agent ./harmony_agent
# Declare environment variables used by the Harmony Agent
# These will be set from build-time environment variables if present
# NATS_URL: URL of the NATS server (default: nats://localhost:4222)
ARG NATS_URL=nats://localhost:4222
ENV NATS_URL=${NATS_URL}
# NATS_CREDS_PATH: Optional path to NATS credentials file
ARG NATS_CREDS_PATH
ENV NATS_CREDS_PATH=${NATS_CREDS_PATH}
# MY_CLUSTER_ID: This cluster's unique identifier (required)
ARG MY_CLUSTER_ID
ENV MY_CLUSTER_ID=${MY_CLUSTER_ID}
# DESIRED_PRIMARY: The ID of the desired primary cluster (required)
ARG DESIRED_PRIMARY
ENV DESIRED_PRIMARY=${DESIRED_PRIMARY}
# Run the application
ENTRYPOINT ["./harmony_agent"]
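# Example build invocation (illustrative values only):
#   docker build \
#     --build-arg MY_CLUSTER_ID=site-1 \
#     --build-arg DESIRED_PRIMARY=site-1 \
#     --build-arg NATS_URL=nats://nats:4222 \
#     -t harmony_agent .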

248
harmony_agent/README.md Normal file
View File

@@ -0,0 +1,248 @@
TODO
DONE:
1. ✅ store trait subscribe definition missing callback - Fixed with SubscriptionCallback type
2. ✅ BUG: data integrity issue: nats store now using jetstream metadata (entry.created, entry.revision)
3. ✅ fix replica workflow not transitioning to "failed" when failure_threshold is exceeded
4. ✅ fix replica workflow to hold copy of cluster state - cluster_state field added to HarmonyAgent
5. ✅ heartbeat metadata now passed to workflow via on_heartbeat_stored() callback
6. ✅ failover_timeout added to AgentConfig
7. ✅ NATS store properly detects SequenceMismatch and returns SequenceMismatch error
8. ✅ startup reconciliation implemented via on_startup() method
REMAINING:
- review all code and list implementation issues
- review both workflows for each state transition
- Complete replica workflow staleness detection (needs implementation in Watching state)
- Implement state recovery from Failed state for both workflows
- Implement subscribe in NATS store with watch() API
- Implement config validation for failover_timeout constraints
TODO
1. store trait subscribe definition missing callback
2. BUG, data integrity issue: nats store not actually using jetstream metadata
3. review all code and list implementation issues
4. review both workflows for each state transition
5. fix replica workflow not transitioning to "failed" when failure_threshold is exceeded
6. fix replica workflow to also hold a copy of the cluster state (the agent itself
should probably hold it; every agent should be subscribed to the cluster_state object and
keep it in memory so workflows can process against it efficiently)
## CRITICAL - Data Integrity Issues
1. **NATS Store `set_strict` doesn't enforce CAS** (`store/nats.rs`)
- Currently uses `put()` which overwrites unconditionally
- Must use `update()` with revision parameter for proper compare-and-set
- Without this, concurrent promotion attempts can cause split brain
2. **NATS Store uses local clock instead of JetStream metadata** (`store/nats.rs`)
- Lines 55, 68: Using `SystemTime::now()` violates ADR-017-3
- NATS Entry has `.revision` and `.created` fields that must be used
   - This defeats the entire purpose of store-provided timestamps (a sketch addressing issues 1 and 2 follows this list)
3. **Heartbeat metadata not passed to ReplicaWorkflow** (`agent_loop.rs::run_heartbeat_loop`)
- Line ~156: TODO comment confirms missing metadata passing
- Replica cannot calculate staleness without metadata.timestamp
- Failover logic is broken
4. **No actual cluster state watching exists**
- Replica workflow declares `ClusterState` but never updates it
- No subscription to primary heartbeat or cluster_state key
- Replica cannot detect primary liveness
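A minimal sketch of what fixes for issues 1 and 2 above could look like, assuming the `async-nats` JetStream KV API (`Store::update` for compare-and-set, entry `revision`/`created` for store metadata) and the `KvMetadata` type this PR defines in `store/mod.rs`; `to_kv_metadata` and `set_strict_cas` are illustrative names, not existing code:

```rust
use async_nats::jetstream::kv::{Entry, Store};
use bytes::Bytes;

use crate::store::KvMetadata;

// Derive KvMetadata from JetStream's own entry metadata
// (entry.created / entry.revision) instead of SystemTime::now().
fn to_kv_metadata(entry: &Entry) -> KvMetadata {
    KvMetadata {
        // `created` is the server-side write time; convert to unix millis
        timestamp: (entry.created.unix_timestamp_nanos() / 1_000_000) as u64,
        sequence: entry.revision,
    }
}

// A real compare-and-set: `update` fails if the key's current revision
// differs from `expected_revision`, unlike `put` which overwrites blindly.
async fn set_strict_cas(
    store: &Store,
    key: &str,
    value: Bytes,
    expected_revision: u64,
) -> Result<u64, Box<dyn std::error::Error>> {
    Ok(store.update(key, value, expected_revision).await?)
}
```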
## HIGH - Missing Core Functionality
5. **Replica Workflow incomplete** - All key logic is TODO:
- Watching primary staleness (line 114)
- Promotion attempt (line 118)
- Original primary recovery detection (line 127)
- Demotion/handshake (line 131)
6. **Missing replica "Failed" state**
- `ReplicaState` enum has no `Failed` variant
- User's TODO #5 correctly identifies this gap
- What happens if replica's own heartbeats fail repeatedly?
7. **Primary Workflow incomplete** - Key logic missing:
- No NATS check before recovering from `Fenced` state (line 95)
- No NATS check in `Yielding` state for demotion handshake (line 101)
- No actual fencing failure handling
8. **Store `subscribe` not implemented** (`store/mod.rs`)
- Returns `todo!()` in NATS implementation
- No callback mechanism defined in trait
- Without this, agents cannot react to state changes
9. **Cluster state not tracked centrally**
- User's TODO #6 correctly identifies this
- Each agent should maintain a local copy of cluster_state
- No subscription mechanism to update this local copy
10. **No validation of configuration constraints**
- Should validate: `failover_timeout > heartbeat_timeout * failure_threshold + safety_margin`
    - Invalid config could cause split brain (a validation sketch follows this list)
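A hedged sketch of that constraint check, using the field names from `AgentConfig` in this PR; `safety_margin` is an assumed knob that does not exist in the config yet:

```rust
use std::time::Duration;

// Illustrative validation: failover_timeout must exceed the worst-case time
// needed to declare the primary failed, plus a safety margin.
fn validate_failover_config(
    heartbeat_interval: Duration,
    failure_threshold: u32, // AgentConfig stores usize; cast for Duration math
    failover_timeout: Duration,
    safety_margin: Duration, // assumed parameter, not yet in AgentConfig
) -> Result<(), String> {
    let worst_case_detection = heartbeat_interval * failure_threshold + safety_margin;
    if failover_timeout <= worst_case_detection {
        return Err(format!(
            "failover_timeout ({failover_timeout:?}) must be greater than \
             heartbeat_interval * failure_threshold + safety_margin ({worst_case_detection:?})"
        ));
    }
    Ok(())
}
```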
## MEDIUM - Incorrect State Transitions
11. **Primary immediately transitions `Failed -> Fenced`** (`workflow/primary.rs:120-121`)
- Two state transitions happen in one heartbeat cycle
- Should stay in `Failed` until fencing actually completes
- What if fencing fails? State machine won't reflect it
12. **No fencing failure handling**
- If `on_failover()` fails, node thinks it's fenced but DB is still accepting writes
- ADR mentions escalating to radical measures, but no callback for failure
13. **Replica `Watching` state does nothing**
- Line 115: Just logs, checks nothing
- Should be checking staleness of primary heartbeat
14. **Demotion handshake not implemented**
- ADR section 4 details this but code doesn't implement it
- How does original primary know it should yield?
## LOW - Observability & Reliability
15. **No graceful shutdown mechanism**
- `run_heartbeat_loop` runs forever
- No signal handling (SIGTERM, SIGINT)
16. **Async task errors silently ignored**
- `tokio::spawn` at lines 74, 83, 123
- No `JoinHandle` retention or error handling
17. **No metrics/observability**
- Only log output
- No Prometheus metrics for state transitions, failure counts, etc.
18. **Hardcoded main() function** (`agent_loop.rs::main`)
- Not production-ready entry point
- Should load config from environment or file
19. **Store factory pattern missing**
- TODO comment at line 54 confirms this
- Can't switch between stores via config
20. **No backoff/retry logic for NATS operations**
- Transient failures could trigger unnecessary fencing
21. **`AgentInfo` status is hardcoded to "HEALTHY"**
- Line 137 in `store_heartbeat`
    - Should reflect actual workflow state
22. **Unused fields in structs**
- `HeartbeatState.last_seq` set but never read
- `ClusterState.current_primary` set but never read
## ADR-017-3 Compliance Issues
23. **ADR violation: Clock skew not avoided**
- While ADR says use store metadata, code uses local time
24. **Failover timeout not configurable**
- Defined in ADR but not in `AgentConfig`
- Needed for replica staleness calculation
25. **Safety margin concept exists in ADR but not in code**
- Configuration should include this margin
26. **No handling of Case 3 (Replica Network Lag)**
- ADR describes NATS rejection prevention
- But `set_strict` implementation accepts any write
## Code Quality Issues
27. **Inconsistent error handling**
- Some paths return `Err`, others `todo!()`, others ignore
28. **Unnecessary `Clone` bounds**
- `DeploymentConfig.clone()` used frequently
- Could be optimized with `Arc`
29. **Missing lifetime annotations**
- `KvStore::get` returns `String` key in error - inefficient
30. **No integration points mentioned**
- PostgreSQL lifecycle control implementation missing
- Fencing via CNPG not connected
## Production Readiness Checklist Summary
For battle testing preparation, you need:
**Immediate (blockers):**
- Fix NATS store metadata usage (issues #1, #2)
- Implement strict set_strict with actual CAS (#1)
- Implement replica primary watching (#4, #5)
- Add failover_timeout config + staleness logic (#3, #24)
- Implement subscribe mechanism with callbacks (#8)
**High priority:**
- Complete all workflow transitions (#5, #7, #11-14)
- Add cluster state tracking (#6, #9)
- Add configuration validation (#10)
- Add Replica Failed state (#6)
**Before deployment:**
- Implement graceful shutdown (#15)
- Add error handling for spawned tasks (#16)
- Remove hardcoded main function (#18)
- Implement store factory (#19)
- Add Prometheus metrics (#17)
**Documentation:**
- Document all configuration parameters and their trade-offs
- Add runbooks for each failure mode
- Document battle test scenarios to cover
### Addendum: Missing Critical Issues
#### 1. CRITICAL: Heartbeat "Lying" (Data Integrity)
* **Location:** `agent_loop.rs` line 137 inside `store_heartbeat`.
* **The Bug:** `status: "HEALTHY".to_string()` is hardcoded.
* **The Impact:** The agent loop runs regardless of the workflow state. If the Primary transitions to `Fenced` or `Failed`, it continues to write a heartbeat saying "I am HEALTHY".
* **The Fix:** The `store_heartbeat` function must accept the current status from the `workflow` (e.g., `self.workflow.status()`) to serialize into the JSON. A fenced agent must broadcast "FENCED" or stop writing entirely.
#### 2. CRITICAL: Async Task Race Conditions (State Machine Corruption)
* **Location:** `workflow/primary.rs` lines 74, 83, 123 (`tokio::spawn`).
* **The Bug:** The callbacks (`on_active`, `on_failover`) are spawned as fire-and-forget background tasks.
* **Scenario:**
1. Primary fails -> transitions to `Fenced` -> spawns `on_failover` (takes 5s).
2. Network recovers immediately -> transitions to `Healthy` -> spawns `on_active` (takes 1s).
3. `on_active` finishes *before* `on_failover`.
4. `on_failover` finishes last, killing the DB *after* the agent decided it was healthy.
* **The Fix:** You need a `JoinHandle` or a cancellation token. When transitioning states, any pending conflicting background tasks must be aborted before starting the new one.
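A minimal sketch of the cancellation idea using tokio's `JoinHandle::abort`; the `CallbackRunner` type is illustrative, not part of this PR:

```rust
use tokio::task::JoinHandle;

// At most one lifecycle callback (on_active / on_failover) may run at a time.
struct CallbackRunner {
    pending_task: Option<JoinHandle<()>>,
}

impl CallbackRunner {
    // Abort any in-flight callback before spawning the next one, so a slow
    // on_failover can never complete after a newer on_active.
    fn spawn_exclusive<F>(&mut self, fut: F)
    where
        F: std::future::Future<Output = ()> + Send + 'static,
    {
        if let Some(handle) = self.pending_task.take() {
            handle.abort();
        }
        self.pending_task = Some(tokio::spawn(fut));
    }
}
```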
#### 3. CRITICAL: Zombie Leader Prevention (Split Brain Risk)
* **Location:** `agent_loop.rs` loop logic.
* **The Bug:** There is no "Stop the World" gate.
* **Scenario:** If `store_heartbeat` fails (NATS unreachable), the code returns `Err`, triggers `handle_heartbeat_failure`, and the loop *continues*.
* **The Risk:** If the NATS write fails because of a CAS error (meaning a Replica has already promoted), this Primary is now a Zombie. It *must* immediately cease all operations. The current loop just sleeps and tries again.
* **The Fix:** If `store_heartbeat` returns a `SequenceMismatch` error, the agent must treat this as a fatal demotion event, immediately fencing itself, rather than just incrementing a failure counter.
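A sketch of that gate, reusing the `KvStoreError::WrongLastRevision` variant this PR defines; the fencing action itself is left abstract:

```rust
use crate::store::KvStoreError;

/// Illustrative check: a CAS failure on our own heartbeat key means another
/// writer advanced it -- treat it as "I have been replaced", not a transient error.
fn is_fatal_demotion(err: &KvStoreError) -> bool {
    matches!(err, KvStoreError::WrongLastRevision)
}

// In run_heartbeat_loop (sketch):
// if let Err(e) = agent.store_heartbeat().await {
//     if is_fatal_demotion(&e) {
//         // fence immediately instead of incrementing the failure counter
//     }
// }
```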
#### 4. HIGH: NATS Bucket Name Collision
* **Location:** `agent_loop.rs` (Config) vs `store/nats.rs`.
* **The Bug:** `FailoverCNPGConfig` has `cnpg_cluster_name`, and `AgentConfig` has `cluster_id`.
* **The Impact:** If you run two different Harmony clusters on the same NATS server, and they use the same bucket name logic (or hardcoded names), they will overwrite each other's state.
* **The Fix:** The NATS KV bucket name must be namespaced dynamically, e.g., `format!("harmony_{}", config.cluster_id)`.
#### 5. HIGH: Startup State Reconciliation
* **Location:** `HarmonyAgent::new`.
* **The Bug:** Agents always start in `Initializing`.
* **Scenario:** The process crashes while it is the `Leader`. It restarts. It enters `Initializing`. It doesn't know it *should* be the leader.
* **The Impact:** The cluster might be leaderless until the `failover_timeout` expires, causing unnecessary downtime.
* **The Fix:** On startup, the agent must fetch the `ClusterState` from NATS. If `current_primary == my_id`, it should jump directly to `Healthy`/`Leader` state (possibly after a sanity check).
### Summary of Tasks to Add
Please add these to your master list before starting implementation:
28. **Dynamic Heartbeat Status:** Pass workflow state to `store_heartbeat` to prevent Fenced nodes from reporting "HEALTHY".
29. **Async Task Cancellation:** Implement `AbortHandle` for `on_active`/`on_failover` tasks to prevent race conditions during rapid state flapping.
30. **Fatal CAS Handling:** Treat `SequenceMismatch` in `store_heartbeat` as an immediate "I have been replaced" signal (Zombie detection).
31. **NATS Namespace Isolation:** Ensure KV bucket names include `cluster_id`.
32. **Startup Reconciliation:** Check NATS on boot to restore previous state if valid.
* **Think about vacuum / stop-the-world operations**

View File

@@ -0,0 +1,20 @@
[package]
name = "harmony_agent_deploy"
edition = "2024"
version.workspace = true
readme.workspace = true
license.workspace = true
[dependencies]
harmony = { path = "../../harmony" }
harmony_cli = { path = "../../harmony_cli" }
harmony_types = { path = "../../harmony_types" }
harmony_macros = { path = "../../harmony_macros" }
cidr = { workspace = true }
tokio = { workspace = true }
log = { workspace = true }
env_logger = { workspace = true }
url = { workspace = true }
serde.workspace = true
serde_json.workspace = true

View File

@@ -0,0 +1,63 @@
use harmony::{
inventory::Inventory,
modules::{
application::{
ApplicationScore,
backend_app::{BackendApp, BuildCommand},
features::{Monitoring, PackagingDeployment},
},
monitoring::alert_channel::discord_alert_channel::DiscordWebhook,
},
topology::K8sAnywhereTopology,
};
use harmony_macros::hurl;
use harmony_types::k8s_name::K8sName;
use std::{path::PathBuf, sync::Arc};
#[tokio::main]
async fn main() {
let application = Arc::new(BackendApp {
name: "harmony-agent".to_string(),
// Since harmony_agent is part of the harmony workspace, the actual "project root"
// is not harmony_agent folder but the workspace root.
//
// So using ../ here means we MUST run this deployment script from the harmony_agent
// folder
project_root: PathBuf::from("../"),
network_ports: vec![],
env_vars: vec![
("NATS_URL".to_string(), "nats://nats".to_string()),
("DESIRED_PRIMARY".to_string(), "site-1".to_string()),
("MY_CLUSTER_ID".to_string(), "site-1".to_string()),
("NATS_CREDS_PATH".to_string(), "".to_string()),
],
build_cmd: BuildCommand::new("cargo", vec!["build", "--release", "-p", "harmony_agent"]),
dockerfile: Some(PathBuf::from("Dockerfile")),
});
let app = ApplicationScore {
features: vec![
Box::new(PackagingDeployment {
application: application.clone(),
}),
Box::new(Monitoring {
application: application.clone(),
alert_receiver: vec![Box::new(DiscordWebhook {
name: K8sName("test-discord".to_string()),
url: hurl!("https://discord.doesnt.exist.com"),
selectors: vec![],
})],
}),
],
application,
};
harmony_cli::run(
Inventory::autoload(),
K8sAnywhereTopology::from_env(), // <== Deploy to local automatically provisioned k3d by default or connect to any kubernetes cluster
vec![Box::new(app)],
None,
)
.await
.unwrap();
}

View File

@@ -0,0 +1,79 @@
use std::time::Duration;
use harmony_types::id::Id;
use log::info;
use super::heartbeat::HeartbeatFailure;
use super::role::AgentRole;
#[derive(Debug, Clone)]
pub struct AgentConfig {
/// Number of consecutive successful heartbeats required before the service transitions from
/// failed to healthy.
pub success_threshold: usize,
/// Number of consecutive failed heartbeats required before the service transitions from
/// healthy to failed.
pub failure_threshold: usize,
/// Time between each heartbeat. If a heartbeat takes longer than this, it will be
/// considered failed.
pub heartbeat_interval: Duration,
/// Time since last observed primary heartbeat before replica considers primary stale.
/// This must be configured such that failover_timeout > heartbeat_interval * failure_threshold + safety_margin
/// to avoid split brain during network partitions.
pub failover_timeout: Duration,
/// **UNSTABLE FIELD**
///
/// For now, an agent instance only serves one deployment. This is probably fine as an agent's
/// footprint is low, but managing multiple deployments in a single instance would be a
/// significant resource usage reduction.
///
/// Decoupling the deployment of the agent with the application's deployment could make things
/// more complicated though, where we would have to be careful about version compatibility
/// between all components managed by the agent instance. So for now it is a 1-1 map.
///
/// But I have a feeling this could change so I am marking this field unstable to warn you, the
/// reader.
pub deployment_config_unstable: DeploymentConfig,
pub nats_url: String,
pub nats_creds_path: Option<String>,
pub agent_id: Id,
pub cluster_id: Id,
pub desired_primary_id: Id,
/// The role this agent plays (Primary or Replica)
pub role: AgentRole,
}
#[derive(Debug, Clone)]
pub enum DeploymentConfig {
FailoverPostgreSQL(FailoverCNPGConfig),
}
#[derive(Debug, Clone)]
pub struct FailoverCNPGConfig {
pub cnpg_cluster_name: String,
}
impl DeploymentConfig {
/// The actual "work" of the heartbeat (e.g., write to NATS, check Postgres)
pub async fn perform_heartbeat(&self) -> Result<(), HeartbeatFailure> {
match self {
DeploymentConfig::FailoverPostgreSQL(cfg) => {
info!("Checking PG Cluster: {}", cfg.cnpg_cluster_name);
// TODO: Implement actual PG check / NATS write here
Ok(())
}
}
}
/// Callback: Transitioned from Unhealthy -> Healthy
pub async fn on_active(&self) {
info!("Service is now ACTIVE (Healthy)");
// e.g., Remove fencing lock
}
/// Callback: Transitioned from Healthy -> Unhealthy
pub async fn on_failover(&self) {
info!("Service is now FAILED (Unhealthy)");
// e.g., Initiate self-fencing, stop accepting traffic
}
}

View File

@@ -0,0 +1,35 @@
use harmony_types::id::Id;
use serde::{Deserialize, Serialize};
use crate::store::KvMetadata;
/// Agent-provided heartbeat information (no timestamps - those come from the store)
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct AgentInfo {
pub agent_id: Id,
pub cluster_id: Id,
pub status: String,
}
/// Complete heartbeat with both agent data and store metadata
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct AgentHeartbeat {
pub agent_info: AgentInfo,
pub metadata: Option<KvMetadata>,
}
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
pub struct ClusterStateData {
pub cluster_info: ClusterState,
pub metadata: Option<KvMetadata>,
}
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
pub struct ClusterState {
pub cluster_id: Id,
pub current_primary: Option<Id>,
pub desired_primary: Id,
}
#[derive(Debug)]
pub struct HeartbeatFailure {}

View File

@@ -0,0 +1,507 @@
use std::time::{SystemTime, UNIX_EPOCH};
use std::{str::FromStr, sync::Arc, time::Duration};
use harmony_types::id::Id;
use log::{debug, error, info, trace, warn};
use tokio::sync::RwLock;
use tokio::time::{Instant, sleep};
use crate::agent::heartbeat::ClusterState;
use crate::store::{KvMetadata, KvStore, KvStoreError};
use crate::workflow::HeartbeatWorkflow;
use crate::workflow::primary::PrimaryWorkflow;
use crate::workflow::replica::ReplicaWorkflow;
// Submodules
mod config;
pub mod heartbeat;
mod role;
// Re-exports for backwards compatibility
pub use config::{AgentConfig, DeploymentConfig, FailoverCNPGConfig};
pub use heartbeat::{AgentHeartbeat, AgentInfo, ClusterStateData, HeartbeatFailure};
pub use role::AgentRole;
pub async fn launch_agent<S>(
role: AgentRole,
health_kv: Arc<S>,
cluster_kv: Arc<S>,
heartbeat_interval: Duration,
failover_timeout: Duration,
) -> Result<(), Box<dyn std::error::Error>>
where
S: KvStore + Send + Sync + 'static,
{
// Cheap-ass fix: when we boot two agents at the same time and the store does not exist,
// delay one so they don't crash because of the race
match role {
AgentRole::Primary => {}
AgentRole::Replica => {
sleep(Duration::from_millis(100)).await;
}
}
let my_agent_name = format!("agent-{}", role);
let my_agent_id = Id::from_str(&my_agent_name).unwrap();
let config = AgentConfig {
role,
success_threshold: 2,
failure_threshold: 2,
heartbeat_interval,
failover_timeout,
deployment_config_unstable: DeploymentConfig::FailoverPostgreSQL(FailoverCNPGConfig {
cnpg_cluster_name: String::from("cnpg_cluster_name"),
}),
nats_url: String::new(),
nats_creds_path: None,
agent_id: my_agent_id,
cluster_id: "cluster_test_id".into(),
desired_primary_id: "primary_id".into(),
};
log::info!("Harmony Agent Initialized");
log::info!("Initializing Harmony Agent Id : {}", config.agent_id);
log::info!("Full config : {:?}", config);
// TODO load store based on config, default to nats
// probably a good use case for a factory pattern
let mut agent = HarmonyAgent::new(config, health_kv, cluster_kv);
agent.reconcile_startup().await?;
// Run the heartbeat loop
agent.run_heartbeat_loop().await;
Ok(())
}
pub struct HarmonyAgent<S: KvStore> {
pub config: AgentConfig,
workflow: Box<dyn HeartbeatWorkflow>,
health_kv: Arc<S>,
cluster_kv: Arc<S>,
/// Last successful heartbeat, used to track sequence number for next write
/// This avoids doing a GET before every SET, reducing network round-trips
last_heartbeat: Arc<RwLock<Option<AgentHeartbeat>>>,
/// Local copy of cluster state, updated via subscription
/// This allows workflows to make decisions without querying NATS each time
cluster_state: Arc<RwLock<Option<ClusterStateData>>>,
}
impl<S: KvStore + Send + Sync + 'static> HarmonyAgent<S> {
pub fn new(config: AgentConfig, health_kv: Arc<S>, cluster_kv: Arc<S>) -> Self {
let workflow: Box<dyn HeartbeatWorkflow> = match config.role {
AgentRole::Primary => {
info!("Initializing agent as PRIMARY");
Box::new(PrimaryWorkflow::new(
config.success_threshold,
config.failure_threshold,
config.deployment_config_unstable.clone(),
))
}
AgentRole::Replica => {
info!("Initializing agent as REPLICA");
Box::new(ReplicaWorkflow::new(
config.success_threshold,
config.failure_threshold,
config.cluster_id.clone(),
config.desired_primary_id.clone(),
config.agent_id.clone(),
config.failover_timeout,
))
}
};
Self {
config,
workflow,
health_kv,
cluster_kv,
last_heartbeat: Arc::new(RwLock::new(None)),
cluster_state: Arc::new(RwLock::new(None)),
}
}
/// Generic helper to fetch and deserialize data from KV store
/// Returns Ok(Some(data)) if key exists and deserializes successfully
/// Returns Ok(None) if key doesn't exist
/// Returns Err if deserialization fails or other errors occur
async fn fetch_from_store<D>(
&self,
store: &Arc<S>,
key: &str,
) -> Result<Option<(D, KvMetadata)>, KvStoreError>
where
D: serde::de::DeserializeOwned,
{
debug!("Fetching data from key: {}", key);
let result = store.get(key).await;
debug!("Got result from store: {:#?}", result);
match result {
Ok(kv_result) => {
if let Some(value) = kv_result.value {
match serde_json::from_value::<D>(value.clone()) {
Ok(data) => Ok(Some((data, kv_result.metadata))),
Err(e) => {
log::warn!("Failed to deserialize data from key {}: {}", key, e);
Err(KvStoreError::DeserializationFailed {
deserialization_error: format!(
"Key exists but deserialization failed for {key}: {e}"
),
value: value.to_string(),
})
}
}
} else {
Err(KvStoreError::Unknown(format!(
"Key exists but value is empty for {key}, this should not happen"
)))
}
}
Err(KvStoreError::KeyNotAvailable(_)) => {
debug!("Key {} not found in store", key);
Ok(None)
}
Err(e) => {
log::warn!("Failed to fetch data from key {}: {}", key, e);
Err(e)
}
}
}
/// Reconcile startup state by fetching cluster state and heartbeat from the store
/// This allows the workflow to determine if it should resume as Primary/Replica
/// based on the persisted cluster state
pub async fn reconcile_startup(&mut self) -> Result<(), KvStoreError> {
let cluster_key = format!("cluster.{}", self.config.cluster_id);
debug!(
"Fetching cluster state for startup reconciliation from key: {}",
cluster_key
);
let cluster_state_option = match self
.fetch_from_store::<ClusterState>(&self.cluster_kv, &cluster_key)
.await?
{
Some((data, metadata)) => Some(ClusterStateData {
cluster_info: data,
metadata: Some(metadata),
}),
None => {
debug!(
"Cluster state key not found, this is a fresh cluster, initializing cluster state"
);
Some(self.store_cluster_state(None).await?)
}
};
debug!("Found cluster state {cluster_state_option:#?}");
self.workflow
.on_startup(cluster_state_option.as_ref(), &self.config)
.await;
// Cache the cluster state locally
*self.cluster_state.write().await = cluster_state_option;
// Fetch last heartbeat if it exists to avoid sequence conflicts
let heartbeat_key = format!("heartbeat.{}", self.config.agent_id);
debug!("Fetching last heartbeat from key: {}", heartbeat_key);
let last_heartbeat_option = self.health_kv.get(&heartbeat_key).await;
let last_heartbeat = match last_heartbeat_option {
Ok(kv_result) => {
let value = kv_result
.value
.expect("When key exist it should always contain data");
Some(AgentHeartbeat {
agent_info: serde_json::from_value::<AgentInfo>(value.clone()).map_err(
|e| KvStoreError::DeserializationFailed {
deserialization_error: e.to_string(),
value: value.to_string(),
},
)?,
metadata: Some(kv_result.metadata),
})
}
Err(e) => match e {
KvStoreError::KeyNotAvailable(_) => None,
_ => return Err(e),
},
};
if let Some(heartbeat) = &last_heartbeat {
debug!(
"Found existing heartbeat with sequence: {}",
heartbeat.metadata.as_ref().map(|m| m.sequence).unwrap_or(0)
);
} else {
debug!("No existing heartbeat found, starting fresh");
}
// Cache the last heartbeat for sequence tracking
*self.last_heartbeat.write().await = last_heartbeat;
Ok(())
}
async fn store_cluster_state(
&self,
cluster_data: Option<ClusterStateData>,
) -> Result<ClusterStateData, KvStoreError> {
let key = format!("cluster.{}", self.config.cluster_id);
match cluster_data {
Some(cluster_data) => {
debug!("found some cluster state {:#?}", cluster_data);
let value = serde_json::to_value(&cluster_data.cluster_info).map_err(|e| {
KvStoreError::DeserializationFailed {
deserialization_error: e.to_string(),
value: format!("{:?}", cluster_data),
}
})?;
let expected_sequence = {
let last = self.cluster_state.read().await;
last.as_ref()
.and_then(|hb| hb.metadata.as_ref())
.map(|m| m.sequence)
.unwrap_or(0)
};
debug!("expected sequence {:#?}", expected_sequence);
let new_seq = self
.cluster_kv
.set_strict(&key, value, expected_sequence)
.await?;
let cluster_kv_result = self.cluster_kv.get_revision(&key, new_seq).await?;
debug!("cluster kv {:#?}", cluster_kv_result);
let cluster_data_new = ClusterStateData {
cluster_info: cluster_data.cluster_info.clone(),
metadata: Some(cluster_kv_result.metadata),
};
*self.cluster_state.write().await = Some(cluster_data_new.clone());
Ok(cluster_data_new)
}
None => {
let cluster_info = ClusterState {
cluster_id: self.config.cluster_id.clone(),
current_primary: None,
desired_primary: self.config.desired_primary_id.clone(),
};
let value = serde_json::to_value(&cluster_info).map_err(|e| {
KvStoreError::DeserializationFailed {
deserialization_error: e.to_string(),
value: format!("{:?}", cluster_info),
}
})?;
let cluster_data = ClusterStateData {
cluster_info,
metadata: None,
};
let new_seq = self.cluster_kv.set_strict(&key, value, 0).await?;
let cluster_kv_result = self.cluster_kv.get_revision(&key, new_seq).await?;
debug!("cluster kv {:#?}", cluster_kv_result);
let cluster_data_new = ClusterStateData {
cluster_info: cluster_data.cluster_info.clone(),
metadata: Some(cluster_kv_result.metadata),
};
*self.cluster_state.write().await = Some(cluster_data_new.clone());
Ok(cluster_data_new)
}
}
}
/// Sends agent heartbeat to the KV store
///
/// Note: We only send AgentInfo. The store will add HeartbeatMetadata (timestamp, sequence)
/// to avoid clock skew issues. This follows the ADR-017-3 principle that all timestamp
/// comparisons use the store's clock, not agent clocks.
///
/// This method uses the last successful heartbeat's sequence number to avoid an extra
/// GET call before each SET, reducing network round-trips and latency exposure.
async fn store_heartbeat(&self) -> Result<AgentHeartbeat, KvStoreError> {
let key = format!("heartbeat.{}", self.config.agent_id);
// Create agent info WITHOUT timestamp - the store will add metadata
// Use workflow state to report actual status (e.g. Primary:Fenced, Replica:Watching)
let agent_info = AgentInfo {
agent_id: self.config.agent_id.clone(),
cluster_id: self.config.cluster_id.clone(),
status: self.workflow.state_name().to_string(),
};
debug!("Storing heartbeat for agent: {}", self.config.agent_id);
let value =
serde_json::to_value(&agent_info).map_err(|e| KvStoreError::DeserializationFailed {
deserialization_error: e.to_string(),
value: format!("{:?}", agent_info),
})?;
let expected_sequence = {
let last = self.last_heartbeat.read().await;
last.as_ref()
.and_then(|hb| hb.metadata.as_ref())
.map(|m| m.sequence)
.unwrap_or(0)
};
trace!("Writing new heartbeat {key} (#{expected_sequence}), value: {value:?}");
let new_seq = self
.health_kv
.set_strict(&key, value, expected_sequence)
.await?;
trace!("Got new sequence {new_seq}");
let kv_result = self.health_kv.get_revision(&key, new_seq).await?;
debug!("Heartbeat stored succsssfully with sequence: {}", new_seq);
// Construct complete heartbeat with metadata from store
let heartbeat = AgentHeartbeat {
agent_info,
metadata: Some(kv_result.metadata),
};
// Cache this successful heartbeat for next iteration
*self.last_heartbeat.write().await = Some(heartbeat.clone());
Ok(heartbeat)
}
pub async fn run_heartbeat_loop(&mut self) {
let mut next_heartbeat_start;
loop {
let this_heartbeat_start = Instant::now();
next_heartbeat_start = this_heartbeat_start + self.config.heartbeat_interval;
// Perform the check via the config/strategy with a timeout
//
// FIXME There is too much stuff happening inside the timeout. There are some things like a
// promotion, that we don't want to cancel within a single heartbeat interval timeout
// I think that the timeout should only apply to the store_heartbeat().await call.
// Logic happening after should not be affected in the exact same manner. There can be
// other timeouts or other stuff to consider here.
// However, the system does rely on heartbeats happening regularly, so we do not want
// to delay the next heartbeat either. This is tricky.
// An idea right now is to keep the heartbeat running but, when a processing event
// occurs, set a flag on the local agent that there is a process running (promotion,
// demotion, etc) and take no other decision until this process is done. There is
// one exception we can think of right now:
// - a healthy primary starts running a process such as "calling mom"
// - the primary keeps sending its heartbeat to prove to the rest of the cluster that
// it is still healthy
// - then the primary heartbeat fails up to failure_threshold
// - at this moment the "calling mom" process must not prevent the primary from fencing itself. Otherwise the replica that promotes itself when it realises that the primary is dead will cause a split brain.
// - Another solution would be to register the running process "calling mom" in the primary
// heartbeat store, and prevent the replica from promoting when there is a running
// task on the primary.
let result = tokio::time::timeout(self.config.heartbeat_interval, async {
// Store heartbeat and perform deployment-specific health check
match &self.store_heartbeat().await {
Ok(heartbeat) => {
// Heartbeat stored successfully, already cached by store_heartbeat
debug!(
"Heartbeat stored: seq={}",
heartbeat.metadata.as_ref().map(|m| m.sequence).unwrap_or(0)
);
}
Err(KvStoreError::WrongLastRevision) => {
todo!("fetch and update correct last sequence number")
// CAS failure could indicate:
// 1. Network latency: our previous timeout heartbeat actually succeeded
// 2. Agent ID conflict: another agent with same ID exists
// 3. Clock/bucket corruption (unlikely)
// log::warn!(
// "CAS mismatch for agent {}: expected sequence {}, got {}. Possible causes: network latency, agent ID conflict, or clock issue. Updating local sequence to {}",
// self.config.agent_id, expected, current, current
// );
// // Update cached heartbeat sequence to prevent repeated failures
// if let Some(hb) = self.last_heartbeat.write().await.as_mut() {
// if let Some(metadata) = hb.metadata.as_mut() {
// metadata.sequence = *current;
// }
// }
}
Err(e) => {
// Actual storage failure - treat as heartbeat failure
log::error!("Heartbeat storage error: {}", e);
return Err(HeartbeatFailure {});
}
}
self.config
.deployment_config_unstable
.perform_heartbeat()
.await?;
// TODO: Pass the heartbeat with metadata to the workflow for staleness checks
// The workflow needs access to metadata.timestamp for failover timeout calculations
Ok::<(), HeartbeatFailure>(())
})
.await;
// Update Counters & Handle State Transitions
// Timeout is also treated as a failure
let heartbeat_result = match result {
Ok(inner_result) => inner_result,
Err(_) => Err(HeartbeatFailure {}),
};
trace!("Got heartbeat_result : {heartbeat_result:?}");
match heartbeat_result {
Ok(_) => {
let new_state = self
.workflow
.handle_heartbeat_success(
self.cluster_state.read().await.as_ref(),
&self.config,
)
.await;
if let Some(new_state) = new_state {
warn!("Got new cluster state : {new_state:#?}");
self.store_cluster_state(Some(new_state))
.await
.expect("cluster state could not be stored");
}
}
Err(_) => {
self.workflow
.handle_heartbeat_failure(self.cluster_state.read().await.as_ref())
.await;
}
}
info!(
"Heartbeat : success={heartbeat_emoji} state={state}, successes={consecutive_successes}/{success_threshold}, fails={consecutive_failures}/{failure_threshold} took={heartbeat_duration}ms",
success_threshold = self.config.success_threshold,
failure_threshold = self.config.failure_threshold,
state = self.workflow.state_name(),
consecutive_successes = self.workflow.consecutive_successes(),
consecutive_failures = self.workflow.consecutive_failures(),
heartbeat_emoji = if heartbeat_result.is_ok() {
""
} else {
""
},
heartbeat_duration = (Instant::now() - this_heartbeat_start).as_millis(),
);
debug!(
"Sleeping for {} ms before next heartbeat",
(next_heartbeat_start - Instant::now()).as_millis()
);
tokio::time::sleep_until(next_heartbeat_start).await;
}
}
}

View File

@@ -0,0 +1,17 @@
use std::fmt;
/// The role of this agent instance
#[derive(Debug, Clone, PartialEq)]
pub enum AgentRole {
Primary,
Replica,
}
impl fmt::Display for AgentRole {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
AgentRole::Primary => write!(f, "primary"),
AgentRole::Replica => write!(f, "replica"),
}
}
}

View File

@@ -0,0 +1,90 @@
use harmony_types::id::Id;
use log::debug;
use std::env;
use std::path::Path;
use std::time::Duration;
/// Configuration for the Harmony Agent
#[derive(Debug, Clone)]
pub struct AgentConfig {
pub nats_url: String,
pub nats_creds_path: Option<String>,
pub my_cluster_id: Id,
pub desired_primary: Id,
pub heartbeat_interval: Duration,
}
pub const NATS_URL: &str = "NATS_URL";
pub const DESIRED_PRIMARY: &str = "DESIRED_PRIMARY";
pub const MY_CLUSTER_ID: &str = "MY_CLUSTER_ID";
pub const NATS_CREDS_PATH: &str = "NATS_CREDS_PATH";
impl AgentConfig {
pub fn load_from_env() -> Result<Self, String> {
let nats_url = env::var(NATS_URL).unwrap_or_else(|_| "nats://localhost:4222".to_string());
// Validate NATS URL is not empty
if nats_url.is_empty() {
return Err(format!("{NATS_URL} cannot be empty"));
}
// Validate NATS URL format
if !nats_url.starts_with("nats://") && !nats_url.starts_with("tls://") {
return Err(format!(
"Invalid NATS URL format: {}. Must start with 'nats://' or 'tls://'",
nats_url
));
}
let nats_creds_path = env::var(NATS_CREDS_PATH)
.ok()
.filter(|creds_path| !creds_path.is_empty());
// Validate NATS creds path if provided
if let Some(creds_path) = &nats_creds_path {
debug!("Validating nats creds path from env var {NATS_CREDS_PATH} : {nats_creds_path:?}");
let path = Path::new(creds_path);
if !path.exists() {
return Err(format!(
"NATS credentials file does not exist: {}",
creds_path
));
}
if !path.is_file() {
return Err(format!(
"NATS credentials path is not a file: {}",
creds_path
));
}
// Check if file is readable by attempting to read metadata
if std::fs::metadata(path).is_err() {
return Err(format!(
"NATS credentials file is not readable: {}",
creds_path
));
}
}
let my_cluster_id_str = env::var(MY_CLUSTER_ID)
.map_err(|_| "Environment variable {MY_CLUSTER_ID} is required".to_string())?;
if my_cluster_id_str.is_empty() {
return Err(format!("{MY_CLUSTER_ID} cannot be empty"));
}
let desired_primary_str = env::var(DESIRED_PRIMARY)
.map_err(|_| "Environment variable {DESIRED_PRIMARY} is required".to_string())?;
if desired_primary_str.is_empty() {
return Err(format!("{DESIRED_PRIMARY} cannot be empty"));
}
Ok(Self {
nats_url,
nats_creds_path,
my_cluster_id: my_cluster_id_str.into(),
desired_primary: desired_primary_str.into(),
heartbeat_interval: Duration::from_millis(1000),
})
}
}

82
harmony_agent/src/main.rs Normal file
View File

@@ -0,0 +1,82 @@
use std::{sync::Arc, time::Duration};
use crate::{
agent::AgentRole,
store::{ChaosKvStore, InMemoryKvStore, NatsKvStore},
};
// mod agent_loop;
mod agent;
pub mod store;
mod workflow;
#[tokio::main]
async fn main() {
env_logger::init();
let heartbeat_interval = Duration::from_millis(2000);
let failover_timeout = Duration::from_secs(10);
// let (health_kv, cluster_kv) = get_chaos_store(&heartbeat_interval, &failover_timeout);
let nats_store = get_local_nats_store().await;
let health_kv = nats_store.clone();
let cluster_kv = nats_store.clone();
let _ = tokio::join!(
agent::launch_agent(
AgentRole::Primary,
health_kv.clone(),
cluster_kv.clone(),
heartbeat_interval,
failover_timeout
),
agent::launch_agent(
AgentRole::Replica,
health_kv,
cluster_kv,
heartbeat_interval,
failover_timeout
),
);
}
fn get_chaos_store(
heartbeat_interval: &Duration,
failover_timeout: &Duration,
) -> (
Arc<ChaosKvStore<InMemoryKvStore>>,
Arc<ChaosKvStore<InMemoryKvStore>>,
) {
let health_kv = Arc::new(ChaosKvStore::new(
InMemoryKvStore::new(),
10,
10,
heartbeat_interval.as_millis().try_into().unwrap(),
));
let cluster_kv = Arc::new(ChaosKvStore::new(
InMemoryKvStore::new(),
5,
5,
failover_timeout.as_millis().try_into().unwrap(),
));
(health_kv, cluster_kv)
}
async fn get_local_nats_store() -> Arc<NatsKvStore> {
let client = async_nats::connect("localhost").await.unwrap();
let jetstream = async_nats::jetstream::new(client);
let kv = jetstream
.create_key_value(async_nats::jetstream::kv::Config {
bucket: "kv".to_string(),
history: 10,
..Default::default()
})
.await
.unwrap();
let status = kv.status().await.unwrap();
println!("status: {:?}", status);
Arc::new(NatsKvStore::new(kv))
}

View File

@@ -0,0 +1,142 @@
use async_trait::async_trait;
use log::{debug, trace, warn};
use serde_json::Value;
use std::sync::Arc;
use tokio::time::Duration;
use crate::store::SubscriptionCallback;
use super::{KvStore, KvStoreError};
/// A chaos testing KV store that randomly times out or fails
/// Wraps another KvStore implementation and adds random failures
#[derive(Clone)]
pub struct ChaosKvStore<T: KvStore> {
inner: Arc<T>,
timeout_probability_percent: u32,
failure_probability_percent: u32,
max_delay_ms: u64,
}
impl<T: KvStore> ChaosKvStore<T> {
pub fn new(
inner: T,
timeout_probability_percent: u32,
failure_probability_percent: u32,
max_delay_ms: u64,
) -> Self {
Self {
inner: Arc::new(inner),
timeout_probability_percent,
failure_probability_percent,
max_delay_ms,
}
}
async fn maybe_chaos(&self) -> Result<(), KvStoreError> {
trace!("Calculating chaos");
// Random delay
let delay = getrandom::u64().unwrap() % self.max_delay_ms;
let delay = Duration::from_millis(delay);
trace!("Sleeping until chaos maybe happens {delay:?}");
tokio::time::sleep(delay).await;
// Random failure
let failure_random = getrandom::u32().unwrap() % 100;
if failure_random < self.failure_probability_percent {
warn!(
"Chaos causes an error : {failure_random} < {}",
self.failure_probability_percent
);
return Err(KvStoreError::Unknown(format!(
"Randomly failed thanks to chaos store with {}% chances, got {}",
self.failure_probability_percent, failure_random
)));
}
// Random timeout (simulated as a very long delay)
let timeout_random = getrandom::u32().unwrap() % 100;
if timeout_random < self.timeout_probability_percent {
warn!(
"Chaos caused a timeout : {timeout_random} < {}",
self.timeout_probability_percent
);
tokio::time::sleep(Duration::from_secs(189754678456784560)).await;
}
Ok(())
}
}
#[async_trait]
impl<T: KvStore + Send + Sync> KvStore for ChaosKvStore<T> {
async fn get(&self, key: &str) -> Result<super::KvResult, KvStoreError> {
self.maybe_chaos().await?;
self.inner.get(key).await
}
async fn get_revision(
&self,
key: &str,
expected_seq: u64,
) -> Result<super::KvResult, KvStoreError> {
self.maybe_chaos().await?;
self.inner.get_revision(key, expected_seq).await
}
async fn set_strict(
&self,
key: &str,
value: Value,
expected_sequence: u64,
) -> Result<u64, KvStoreError> {
self.maybe_chaos().await?;
self.inner.set_strict(key, value, expected_sequence).await
}
async fn subscribe(
&self,
key: &str,
callback: SubscriptionCallback,
) -> Result<(), KvStoreError> {
self.maybe_chaos().await?;
self.inner.subscribe(key, callback).await
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::store::InMemoryKvStore;
use serde_json::json;
#[tokio::test]
async fn test_chaos_store_with_no_chaos() {
let inner = InMemoryKvStore::new();
let chaos = ChaosKvStore::new(inner, 0, 0, 1);
let value = json!({"test": "value"});
let result = chaos.set_strict("key", value.clone(), 0).await.unwrap();
assert_eq!(result, 1);
let retrieved = chaos.get("key").await.unwrap();
assert_eq!(retrieved.value, Some(value));
}
#[tokio::test]
async fn test_chaos_store_with_delay() {
let inner = InMemoryKvStore::new();
let chaos = ChaosKvStore::new(inner, 0, 0, 100);
let start = tokio::time::Instant::now();
let value = json!({"test": "value"});
chaos.set_strict("key", value, 0).await.unwrap();
let elapsed = start.elapsed();
// The random delay is bounded by max_delay_ms (100 ms here), so the call
// should still complete within a reasonable time
assert!(
elapsed.as_millis() < 150,
"Should complete within reasonable time"
);
}
}

View File

@@ -0,0 +1,196 @@
use async_trait::async_trait;
use log::{debug, trace};
use serde_json::Value;
use std::collections::HashMap;
use std::sync::Arc;
use std::time::{SystemTime, UNIX_EPOCH};
use tokio::sync::RwLock;
use crate::store::SubscriptionCallback;
use super::{KvMetadata, KvResult, KvStore, KvStoreError};
/// An in-memory KV store that guarantees ordering like NATS JetStream
/// Each key maintains a full history of all writes, where the sequence number
/// is the length of the history (1-indexed)
#[derive(Clone)]
pub struct InMemoryKvStore {
data: Arc<RwLock<HashMap<String, Vec<(Value, u64)>>>>,
}
impl InMemoryKvStore {
pub fn new() -> Self {
Self {
data: Arc::new(RwLock::new(HashMap::new())),
}
}
/// Get the latest sequence number for a key (length of history)
pub async fn get_seq(&self, key: &str) -> Option<u64> {
self.data.read().await.get(key).map(|vec| vec.len() as u64)
}
/// Get the value at a specific revision for a key
pub async fn get_revision(&self, key: &str, seq: u64) -> Result<KvResult, KvStoreError> {
let data = self.data.read().await;
let entries = data
.get(key)
.ok_or_else(|| KvStoreError::KeyNotAvailable(key.to_string()))?;
// Sequence numbers are 1-indexed, so seq must be >= 1 and <= len()
if seq == 0 || seq > entries.len() as u64 {
return Err(KvStoreError::KeyNotAvailable(key.to_string()));
}
let (value, timestamp) = entries[seq as usize - 1].clone();
Ok(KvResult {
value: Some(value.clone()),
metadata: KvMetadata {
timestamp,
sequence: seq,
},
})
}
}
impl Default for InMemoryKvStore {
fn default() -> Self {
Self::new()
}
}
#[async_trait]
impl KvStore for InMemoryKvStore {
async fn get_revision(&self, key: &str, expected_seq: u64) -> Result<KvResult, KvStoreError> {
self.get_revision(key, expected_seq).await
}
async fn get(&self, key: &str) -> Result<KvResult, KvStoreError> {
let data = self.data.read().await;
let entries = data
.get(key)
.ok_or_else(|| KvStoreError::KeyNotAvailable(key.to_string()))?;
let (value, timestamp) = entries.last().unwrap();
Ok(KvResult {
value: Some(value.clone()),
metadata: KvMetadata {
timestamp: *timestamp,
sequence: entries.len() as u64,
},
})
}
async fn set_strict(
&self,
key: &str,
value: Value,
expected_sequence: u64,
) -> Result<u64, KvStoreError> {
// Check current sequence (length of history for this key)
let data = self.data.read().await;
// NOTE: this implementation does not match NATS semantics exactly: NATS
// advances one sequence counter per bucket, while this impl keeps a counter per key
let current_sequence = data.get(key).map(|vec| vec.len() as u64).unwrap_or(0);
drop(data);
// Verify expected sequence matches
if current_sequence != expected_sequence {
trace!("{current_sequence} != {expected_sequence}");
return Err(KvStoreError::WrongLastRevision);
}
// Get current timestamp
let timestamp = SystemTime::now()
.duration_since(UNIX_EPOCH)
.expect("Time went backwards")
.as_millis() as u64;
// Append to the history
let mut data = self.data.write().await;
data.entry(key.to_string())
.or_insert_with(Vec::new)
.push((value.clone(), timestamp));
let new_seq = data.get(key).map(|vec| vec.len() as u64).unwrap_or(0);
debug!(
"Successfully inserted {key}(rev#{new_seq}) : {value}",
value = value.to_string()
);
Ok(new_seq)
}
async fn subscribe(
&self,
key: &str,
callback: SubscriptionCallback,
) -> Result<(), KvStoreError> {
// For now, subscribe just returns the current value
// In a real implementation, this would return a stream of updates
let _ = self.get(key).await;
todo!() // register callback and call it when key is set ?
}
}
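// Sketch of one possible subscribe implementation (not built here): keep a
// `HashMap<String, Vec<SubscriptionCallback>>` behind the same RwLock, register
// the callback in `subscribe`, and have `set_strict` invoke every callback
// registered for the key after a successful append, passing the new value and
// its KvMetadata. A watch-channel variant would map better onto the NATS watch() API.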
#[cfg(test)]
mod tests {
use super::*;
use serde_json::json;
#[tokio::test]
async fn test_memory_store_basic() {
let store = InMemoryKvStore::new();
// Set a value
let value = json!({"status": "healthy"});
let result = store
.set_strict("test_key", value.clone(), 0)
.await
.unwrap();
assert_eq!(result, 1);
// Get the value
let retrieved = store.get("test_key").await.unwrap();
assert_eq!(retrieved.value, Some(value));
assert_eq!(retrieved.metadata.sequence, 1);
}
#[tokio::test]
async fn test_memory_store_sequence_numbers() {
let store = InMemoryKvStore::new();
let seq1 = store.set_strict("key1", json!("value1"), 0).await.unwrap();
let seq2 = store.set_strict("key1", json!("value2"), 1).await.unwrap();
assert!(seq2 > seq1, "Sequence numbers should increment");
}
#[tokio::test]
async fn test_memory_store_key_not_found() {
let store = InMemoryKvStore::new();
let result = store.get("nonexistent").await;
assert!(matches!(result, Err(KvStoreError::KeyNotAvailable(_))));
}
#[tokio::test]
async fn test_memory_store_strict_ordering() {
let store = InMemoryKvStore::new();
// First write with sequence 0
let result1 = store.set_strict("key", json!("value1"), 0).await.unwrap();
assert_eq!(result1, 1);
// Second write with correct sequence
let result2 = store.set_strict("key", json!("value2"), 1).await.unwrap();
assert_eq!(result2, 2);
// Third write with wrong sequence should fail
let result3 = store.set_strict("key", json!("value3"), 1).await;
assert!(matches!(result3, Err(KvStoreError::WrongLastRevision)));
}
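    // Illustrative sketch (not part of the original PR): with an atomic
    // compare-and-set, two racing writers that both observed sequence 0
    // must not both succeed; exactly one should win.
    #[tokio::test]
    async fn test_memory_store_concurrent_cas_single_winner() {
        let store = std::sync::Arc::new(InMemoryKvStore::new());
        let (a, b) = (store.clone(), store.clone());
        let r1 = tokio::spawn(async move { a.set_strict("race", json!("a"), 0).await });
        let r2 = tokio::spawn(async move { b.set_strict("race", json!("b"), 0).await });
        let results = [r1.await.unwrap(), r2.await.unwrap()];
        let winners = results.iter().filter(|r| r.is_ok()).count();
        assert_eq!(winners, 1, "exactly one CAS write should win");
    }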
}

View File

@@ -0,0 +1,120 @@
use async_trait::async_trait;
use serde::{Deserialize, Serialize};
use serde_json::Value;
use thiserror::Error;
/// Handle for managing active subscriptions
#[derive(Debug, Clone)]
pub struct SubscriptionHandle {
id: usize,
_phantom: std::marker::PhantomData<()>,
}
/// Metadata returned by the KV store for all operations
/// Contains timing and ordering information set by the store
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
pub struct KvMetadata {
/// Timestamp set by the store (milliseconds since UNIX epoch)
pub timestamp: u64,
/// Sequence number for strict ordering guarantees
pub sequence: u64,
}
/// Result returned by KV store operations
/// Contains both the value (if any) and store metadata
#[derive(Debug, Clone)]
pub struct KvResult {
/// The value from the store (None if key doesn't exist)
pub value: Option<Value>,
/// Store-provided metadata (timestamp, sequence)
pub metadata: KvMetadata,
}
/// Callback type for subscription updates
/// Callback receives: key, new value (None if deleted), and metadata
pub type SubscriptionCallback = Box<dyn Fn(String, Option<Value>, KvMetadata) + Send + Sync>;
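// A minimal illustration (not part of the original PR) of constructing a
// SubscriptionCallback; any `Fn(String, Option<Value>, KvMetadata)` closure works:
//
// let cb: SubscriptionCallback =
//     Box::new(|key, value, meta| println!("{key} -> {value:?} (seq {})", meta.sequence));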
#[derive(Error, Debug)]
pub enum KvStoreError {
#[error("data store disconnected")]
Disconnect(#[from] std::io::Error),
#[error("invalid key")]
InvalidKey,
#[error("operation timed out")]
Timeout,
#[error("the data for key `{0}` is not available")]
KeyNotAvailable(String),
#[error("Failed to deserialize value to json. Error {0} , value: {1}", .deserialization_error, .value)]
DeserializationFailed {
deserialization_error: String,
value: String,
},
#[error("Strict ordering violation, wrong last sequence number")]
WrongLastRevision,
#[error("unknown data store error {0}")]
Unknown(String),
}
#[async_trait]
pub trait KvStore {
/// Get a value from the store
///
/// # Returns
/// - `Ok(KvResult)`: Contains the value and metadata (timestamp, sequence)
/// - `Err(KeyNotAvailable)`: If the key doesn't exist
async fn get(&self, key: &str) -> Result<KvResult, KvStoreError>;
    /// Get the value stored at a specific revision (sequence number) for a key
    async fn get_revision(&self, key: &str, expected_seq: u64) -> Result<KvResult, KvStoreError>;
/// Strict set operation with compare-and-set semantics
///
/// Sets the value only if the current sequence number matches `expected_sequence`.
/// This provides strict ordering guarantees needed for the failover algorithm.
///
/// # Parameters
/// - `key`: The key to set
/// - `value`: The value to store
/// - `expected_sequence`: The sequence number we expect the key to currently have.
/// Use 0 for the first write to a new key.
///
/// # Returns
/// - `Ok(u64)`: Returns the new sequence number
/// - `Err(KvStoreError)`: If another write happened (current != expected)
///
/// # Example Use Case
/// For NATS JetStream, this maps to the conditional update operation that ensures
/// only one agent can successfully promote to primary.
async fn set_strict(
&self,
key: &str,
value: Value,
expected_sequence: u64,
) -> Result<u64, KvStoreError>;
/// Subscribe to updates for a key
///
/// # Parameters
/// - `key`: The key to subscribe to
/// - `callback`: Function to call on each update with key, value, and metadata
///
/// # Returns
/// - `Ok(())`: Subscription established successfully
/// - `Err(KvStoreError)`: Subscription failed
///
/// Note: For JetStream, this should use watch() API. Updates will invoke the callback
/// asynchronously in the background.
async fn subscribe(
&self,
key: &str,
        callback: SubscriptionCallback, // TODO: return an iterator instead of taking a callback
) -> Result<(), KvStoreError>;
}
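// Illustrative sketch (not part of the original PR): how a caller can use
// `set_strict` as a compare-and-set primitive. `try_claim` and the "primary"
// key are hypothetical names for illustration only: read the current
// revision, then write conditionally; `WrongLastRevision` means another
// writer won the race.
#[allow(dead_code)]
async fn try_claim(store: &dyn KvStore, agent_id: &str) -> Result<bool, KvStoreError> {
    let current = match store.get("primary").await {
        Ok(result) => result.metadata.sequence,
        Err(KvStoreError::KeyNotAvailable(_)) => 0, // first write to a new key uses 0
        Err(e) => return Err(e),
    };
    match store
        .set_strict("primary", serde_json::json!(agent_id), current)
        .await
    {
        Ok(_) => Ok(true),                                 // we won the CAS
        Err(KvStoreError::WrongLastRevision) => Ok(false), // someone else won
        Err(e) => Err(e),
    }
}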
mod chaos;
mod memory;
mod nats;
pub use chaos::ChaosKvStore;
pub use memory::InMemoryKvStore;
pub use nats::NatsKvStore;

View File

@@ -0,0 +1,179 @@
use async_nats::jetstream::kv::{Store, UpdateError};
use async_trait::async_trait;
use log::{debug, error, trace};
use serde_json::Value;
use crate::store::SubscriptionCallback;
use super::{KvMetadata, KvResult, KvStore, KvStoreError};
/// NATS JetStream-backed KV store
pub struct NatsKvStore {
store: Store,
}
impl NatsKvStore {
pub fn new(store: Store) -> Self {
Self { store }
}
pub async fn create(
client: async_nats::Client,
bucket_name: &str,
history_size: i64,
) -> Result<Self, Box<dyn std::error::Error>> {
let jetstream = async_nats::jetstream::new(client);
debug!("Creating NATS KV bucket: {}", bucket_name);
let store = jetstream
.create_key_value(async_nats::jetstream::kv::Config {
bucket: bucket_name.to_string(),
history: history_size,
..Default::default()
})
.await
.map_err(|e| {
error!(
"Failed to initialize NATS KV bucket '{}': {}",
bucket_name, e
);
e
})?;
Ok(Self::new(store))
}
}
#[async_trait]
impl KvStore for NatsKvStore {
async fn get_revision(&self, key: &str, expected_seq: u64) -> Result<KvResult, KvStoreError> {
let entry = self
.store
.entry_for_revision(key, expected_seq)
.await
.map_err(|e| {
error!("NATS get failed for key '{}': {}", key, e);
KvStoreError::Disconnect(std::io::Error::new(
std::io::ErrorKind::Other,
e.to_string(),
))
})?;
        let Some(entry) = entry else {
            return Err(KvStoreError::KeyNotAvailable(key.to_string()));
        };
let value: Value = serde_json::from_slice(&entry.value).map_err(|e| {
KvStoreError::DeserializationFailed {
deserialization_error: e.to_string(),
value: String::from_utf8_lossy(&entry.value).to_string(),
}
})?;
// Extract metadata from NATS entry
// Using unix_timestamp_nanos / 1_000_000 to get milliseconds from OffsetDateTime
let timestamp = (entry.created.unix_timestamp_nanos() / 1_000_000) as u64;
let metadata = KvMetadata {
timestamp,
sequence: entry.revision,
};
Ok(KvResult {
value: Some(value),
metadata,
})
}
async fn get(&self, key: &str) -> Result<KvResult, KvStoreError> {
let entry = self.store.entry(key).await.map_err(|e| {
error!("NATS get failed for key '{}': {}", key, e);
KvStoreError::Disconnect(std::io::Error::new(
std::io::ErrorKind::Other,
e.to_string(),
))
})?;
        let Some(entry) = entry else {
            return Err(KvStoreError::KeyNotAvailable(key.to_string()));
        };
let value: Value = serde_json::from_slice(&entry.value).map_err(|e| {
KvStoreError::DeserializationFailed {
deserialization_error: e.to_string(),
value: String::from_utf8_lossy(&entry.value).to_string(),
}
})?;
// Extract metadata from NATS entry
// Using unix_timestamp_nanos / 1_000_000 to get milliseconds from OffsetDateTime
let timestamp = (entry.created.unix_timestamp_nanos() / 1_000_000) as u64;
let metadata = KvMetadata {
timestamp,
sequence: entry.revision,
};
Ok(KvResult {
value: Some(value),
metadata,
})
}
async fn set_strict(
&self,
key: &str,
value: Value,
expected_sequence: u64,
) -> Result<u64, KvStoreError> {
trace!(
"Nats set strict {key} (#{expected_sequence}) : {}",
value.to_string()
);
let bytes =
serde_json::to_vec(&value).map_err(|e| KvStoreError::DeserializationFailed {
deserialization_error: e.to_string(),
value: value.to_string(),
})?;
// Use update() for CAS semantics (Compare-And-Set)
// This ensures we only write if the revision matches expected_sequence
let revision = self
.store
            .update(key, bytes.into(), expected_sequence)
.await
.map_err(|e| {
// FIXME this is ugly, we should have a clean KvStoreError containing
// proper information from nats instead
error!("NATS update failed for key '{}': {}", key, e);
e
})?;
Ok(revision)
}
    async fn subscribe(
        &self,
        _key: &str,
        _callback: SubscriptionCallback, // TODO: return an iterator instead of taking a callback
    ) -> Result<(), KvStoreError> {
        todo!("implement with the JetStream watch() API")
    }
}
impl From<UpdateError> for KvStoreError {
fn from(value: UpdateError) -> Self {
match value.kind() {
async_nats::jetstream::kv::UpdateErrorKind::InvalidKey => KvStoreError::InvalidKey,
async_nats::jetstream::kv::UpdateErrorKind::TimedOut => KvStoreError::Timeout,
async_nats::jetstream::kv::UpdateErrorKind::WrongLastRevision => {
KvStoreError::WrongLastRevision
}
async_nats::jetstream::kv::UpdateErrorKind::Other => KvStoreError::Disconnect(
std::io::Error::new(std::io::ErrorKind::Other, "NATS update error"),
),
}
}
}

View File

@@ -0,0 +1,39 @@
use crate::agent::AgentConfig;
use async_trait::async_trait;
pub mod primary;
pub mod replica;
/// Trait that defines how a workflow (Primary or Replica) handles heartbeat events
#[async_trait]
pub trait HeartbeatWorkflow: Send + Sync {
/// Handle a successful heartbeat
async fn handle_heartbeat_success(
&mut self,
cluster_state: Option<&crate::agent::ClusterStateData>,
agent_config: &AgentConfig,
) -> Option<crate::agent::ClusterStateData>;
/// Handle a failed heartbeat
async fn handle_heartbeat_failure(
&mut self,
cluster_state: Option<&crate::agent::ClusterStateData>,
);
async fn on_startup(
&self,
        cluster_state: Option<&crate::agent::ClusterStateData>,
agent_config: &AgentConfig,
);
/// Get the current state name for logging (also used for heartbeat status)
fn state_name(&self) -> &'static str;
/// Get current consecutive successes
fn consecutive_successes(&self) -> usize;
/// Get current consecutive failures
fn consecutive_failures(&self) -> usize;
}
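// Illustrative sketch (not part of the original PR): how an agent loop might
// drive a HeartbeatWorkflow. `send_heartbeat` is hypothetical; the real
// wiring lives in the agent module. `heartbeat_interval` comes from AgentConfig.
//
// async fn run_workflow<W: HeartbeatWorkflow>(mut workflow: W, config: AgentConfig) {
//     workflow.on_startup(None, &config).await;
//     loop {
//         if send_heartbeat().await {
//             if let Some(new_state) = workflow.handle_heartbeat_success(None, &config).await {
//                 // persist new_state with a strict (CAS) KV write
//             }
//         } else {
//             workflow.handle_heartbeat_failure(None).await;
//         }
//         log::trace!("workflow state: {}", workflow.state_name());
//         tokio::time::sleep(config.heartbeat_interval).await;
//     }
// }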

View File

@@ -0,0 +1,330 @@
use async_trait::async_trait;
use log::{debug, info, trace, warn};
use crate::{
agent::{AgentConfig, DeploymentConfig},
workflow::HeartbeatWorkflow,
};
#[derive(Debug, Clone, PartialEq)]
pub enum PrimaryState {
Initializing,
Healthy,
Failed,
Fenced,
Yielding,
}
impl PrimaryState {
pub fn name(&self) -> &'static str {
match self {
PrimaryState::Initializing => "Primary:Initializing",
PrimaryState::Healthy => "Primary:Healthy",
PrimaryState::Failed => "Primary:Failed",
PrimaryState::Fenced => "Primary:Fenced",
PrimaryState::Yielding => "Primary:Yielding",
}
}
}
pub struct PrimaryWorkflow {
state: PrimaryState,
consecutive_successes: usize,
consecutive_failures: usize,
// TODO these thresholds should not be copied into the workflow struct. They are configuration
// level and should always be read from the context passed to the workflow functions
success_threshold: usize,
failure_threshold: usize,
    // TODO: not sure whether this should be known by the workflow, passed in the
    // context to function calls, or handled entirely by the agent
deployment_config: DeploymentConfig,
}
impl PrimaryWorkflow {
pub fn new(
success_threshold: usize,
failure_threshold: usize,
deployment_config: DeploymentConfig,
) -> Self {
Self {
state: PrimaryState::Initializing,
consecutive_successes: 0,
consecutive_failures: 0,
success_threshold,
failure_threshold,
deployment_config,
}
}
fn transition_to(&mut self, new_state: PrimaryState) {
if self.state != new_state {
info!(
"State transition: {} -> {}",
self.state.name(),
new_state.name()
);
self.state = new_state;
}
}
}
#[async_trait]
impl HeartbeatWorkflow for PrimaryWorkflow {
async fn on_startup(
&self,
cluster_state: Option<&crate::agent::ClusterStateData>,
        _agent_config: &AgentConfig,
) {
if let Some(state) = cluster_state {
info!(
"Startup reconciliation: current primary is {:?}, desired primary is {:?}",
state.cluster_info.current_primary, state.cluster_info.desired_primary
);
// No automatic fast-tracking - agent must earn healthy status
// through successful heartbeats. This prevents duplicate agents
// or crashloop agents from incorrectly claiming primary.
} else {
debug!("No cluster state on startup, starting from Initializing");
}
}
async fn handle_heartbeat_success(
&mut self,
cluster_state: Option<&crate::agent::ClusterStateData>,
agent_config: &AgentConfig,
) -> Option<crate::agent::ClusterStateData> {
trace!(
"Handling heartbeat success, current counters success {} failures {}",
self.consecutive_successes, self.consecutive_failures
);
self.consecutive_successes += 1;
self.consecutive_failures = 0;
match self.state {
PrimaryState::Initializing => {
if self.consecutive_successes >= self.success_threshold {
self.transition_to(PrimaryState::Healthy);
// Trigger on_active callback
let config = self.deployment_config.clone();
tokio::spawn(async move {
config.on_active().await;
});
if let Some(state) = cluster_state
&& state.cluster_info.desired_primary == agent_config.desired_primary_id
{
debug!("state {:#?}", state);
let mut new_state = state.clone();
new_state.cluster_info.current_primary =
Some(agent_config.agent_id.clone());
return Some(new_state);
                    } else {
                        todo!(
                            "cluster_state should not be an Option; return an error when a primary workflow is running but this agent is not the desired primary in the cluster state"
                        );
                    }
}
None
}
PrimaryState::Failed => {
if self.consecutive_successes >= self.success_threshold {
self.transition_to(PrimaryState::Healthy);
let config = self.deployment_config.clone();
tokio::spawn(async move {
config.on_active().await;
});
}
todo!()
}
PrimaryState::Healthy => {
// Stay healthy
debug!("Primary staying healthy");
None
}
PrimaryState::Fenced => {
// Recovery from fenced state
if self.consecutive_successes >= self.success_threshold {
// TODO: Check NATS for current_primary status before recovering
info!("Recovered from fenced state, transitioning to yielding");
self.transition_to(PrimaryState::Yielding);
}
todo!()
}
PrimaryState::Yielding => {
// TODO: Check NATS to see if we can resume as primary
trace!("Yielding, waiting for demotion handshake");
todo!()
}
}
}
async fn handle_heartbeat_failure(
&mut self,
        _cluster_state: Option<&crate::agent::ClusterStateData>,
) {
self.consecutive_failures += 1;
self.consecutive_successes = 0;
match self.state {
PrimaryState::Healthy => {
if self.consecutive_failures >= self.failure_threshold {
warn!(
"Failure threshold reached ({}/{}), transitioning to Failed",
self.consecutive_failures, self.failure_threshold
);
self.transition_to(PrimaryState::Failed);
// Immediately fence
self.transition_to(PrimaryState::Fenced);
let config = self.deployment_config.clone();
tokio::spawn(async move {
config.on_failover().await;
});
}
}
PrimaryState::Initializing => {
// Stay in initializing, just accumulate failures
trace!("Heartbeat failed during initialization");
}
PrimaryState::Failed | PrimaryState::Fenced | PrimaryState::Yielding => {
// Already in a degraded state
trace!("Heartbeat failed in degraded state: {}", self.state.name());
}
}
}
fn state_name(&self) -> &'static str {
self.state.name()
}
fn consecutive_successes(&self) -> usize {
self.consecutive_successes
}
fn consecutive_failures(&self) -> usize {
self.consecutive_failures
}
}
#[cfg(test)]
mod test {
use harmony_types::id::Id;
use std::time::Duration;
use crate::agent::{AgentRole, FailoverCNPGConfig};
use pretty_assertions::assert_eq;
use super::*;
#[tokio::test]
async fn primary_does_nothing_when_on_heartbeat_success_below_threshold() {
let (mut primary, cluster_state, agent_config) = default_test_state(2, 2);
assert!(
primary
.handle_heartbeat_success(Some(&cluster_state), &agent_config)
.await
.is_none()
);
}
#[tokio::test]
async fn primary_transitions_cluster_state_when_consecutive_success_threshold_reached() {
let (mut primary, cluster_state, agent_config) = default_test_state(2, 2);
let mut expected_state = cluster_state.clone();
expected_state.cluster_info.current_primary = Some(Id::empty());
assert_eq!(
primary
.handle_heartbeat_success(Some(&cluster_state), &agent_config)
.await,
None
);
assert_eq!(
primary
.handle_heartbeat_success(Some(&cluster_state), &agent_config)
.await,
Some(expected_state)
);
}
#[tokio::test]
async fn primary_stays_healthy_below_failure_threshold() {
let (mut primary, cluster_state, agent_config) = default_test_state(1, 2);
// Reach healthy
let _ = primary
.handle_heartbeat_success(Some(&cluster_state), &agent_config)
.await;
assert_eq!(primary.state, PrimaryState::Healthy);
// One failure below threshold
primary.handle_heartbeat_failure(Some(&cluster_state)).await;
assert_eq!(primary.state, PrimaryState::Healthy);
assert_eq!(primary.consecutive_failures(), 1);
assert_eq!(primary.consecutive_successes(), 0);
}
#[tokio::test]
async fn primary_transitions_to_failed_at_failure_threshold() {
let (mut primary, cluster_state, agent_config) = default_test_state(1, 2);
// Reach healthy
let _ = primary
.handle_heartbeat_success(Some(&cluster_state), &agent_config)
.await;
assert_eq!(primary.state, PrimaryState::Healthy);
// First failure, still healthy
primary.handle_heartbeat_failure(Some(&cluster_state)).await;
assert_eq!(primary.state, PrimaryState::Healthy);
assert_eq!(primary.consecutive_failures(), 1);
        // Second failure reaches the threshold: transitions through Failed straight to Fenced
primary.handle_heartbeat_failure(Some(&cluster_state)).await;
assert_eq!(primary.state, PrimaryState::Fenced);
assert_eq!(primary.consecutive_failures(), 2);
assert_eq!(primary.consecutive_successes(), 0);
}
fn default_test_state(
success_threshold: usize,
failure_threshold: usize,
) -> (PrimaryWorkflow, crate::agent::ClusterStateData, AgentConfig) {
let cluster_state = crate::agent::ClusterStateData {
cluster_info: crate::agent::heartbeat::ClusterState {
cluster_id: Id::empty(),
current_primary: None,
desired_primary: Id::empty(),
},
metadata: None,
};
let agent_config = AgentConfig {
success_threshold,
failure_threshold,
heartbeat_interval: Duration::from_nanos(0),
failover_timeout: Duration::from_nanos(0),
deployment_config_unstable: DeploymentConfig::FailoverPostgreSQL(FailoverCNPGConfig {
cnpg_cluster_name: "test".to_string(),
}),
nats_url: String::new(),
nats_creds_path: None,
agent_id: Id::empty(),
cluster_id: Id::empty(),
desired_primary_id: Id::empty(),
role: AgentRole::Primary,
};
let primary = PrimaryWorkflow::new(
agent_config.success_threshold,
agent_config.failure_threshold,
agent_config.deployment_config_unstable.clone(),
);
(primary, cluster_state, agent_config)
}
}

View File

@@ -0,0 +1,279 @@
use async_trait::async_trait;
use harmony_types::id::Id;
use log::{debug, info, trace};
use std::time::Duration;
use tokio::sync::RwLock;
use crate::agent::{AgentConfig, AgentHeartbeat};
use crate::workflow::HeartbeatWorkflow;
#[derive(Debug, Clone)]
pub struct HeartbeatState {
pub agent_id: Id,
pub last_seq: Option<u64>,
}
impl HeartbeatState {
pub fn watch(agent_id: Id) -> Self {
Self {
agent_id,
last_seq: None,
}
}
}
#[derive(Debug, Clone)]
pub struct ClusterState {
pub cluster_id: Id,
pub current_primary: Option<Id>,
}
impl ClusterState {
pub fn watch(cluster_id: Id) -> Self {
Self {
cluster_id,
current_primary: None,
}
}
}
#[derive(Debug, Clone, PartialEq)]
pub enum ReplicaState {
Initializing,
Watching,
Promoting,
PromotionFailed,
Leader,
Demoting,
Failed,
}
impl ReplicaState {
pub fn name(&self) -> &'static str {
match self {
ReplicaState::Initializing => "Replica:Initializing",
ReplicaState::Watching => "Replica:Watching",
ReplicaState::Promoting => "Replica:Promoting",
ReplicaState::PromotionFailed => "Replica:PromotionFailed",
ReplicaState::Leader => "Replica:Leader",
ReplicaState::Demoting => "Replica:Demoting",
ReplicaState::Failed => "Replica:Failed",
}
}
}
pub struct ReplicaWorkflow {
state: ReplicaState,
heartbeat_state: HeartbeatState,
primary_state: HeartbeatState,
cluster_state: ClusterState,
consecutive_successes: usize,
consecutive_failures: usize,
success_threshold: usize,
failure_threshold: usize,
failover_timeout: Duration,
/// Our own last heartbeat (for timestamp comparison against primary)
last_my_heartbeat: Option<AgentHeartbeat>,
/// Last observed primary heartbeat (metadata only, for staleness detection)
last_primary_heartbeat: Option<RwLock<AgentHeartbeat>>,
}
impl ReplicaWorkflow {
pub fn new(
success_threshold: usize,
failure_threshold: usize,
cluster_id: Id,
primary_id: Id,
my_id: Id,
failover_timeout: Duration,
) -> Self {
Self {
state: ReplicaState::Initializing,
consecutive_successes: 0,
consecutive_failures: 0,
success_threshold,
failure_threshold,
failover_timeout,
cluster_state: ClusterState::watch(cluster_id),
primary_state: HeartbeatState::watch(primary_id),
heartbeat_state: HeartbeatState::watch(my_id),
last_my_heartbeat: None,
last_primary_heartbeat: None,
}
}
fn transition_to(&mut self, new_state: ReplicaState) {
if self.state != new_state {
info!(
"State transition: {} -> {}",
self.state.name(),
new_state.name()
);
self.state = new_state;
}
}
/// Check if the primary heartbeat is stale compared to our own
/// Per ADR-017-3: primary is stale if (replica_timestamp - primary_timestamp) > failover_timeout
async fn is_primary_stale(&mut self) -> bool {
if let Some(my_hb) = &self.last_my_heartbeat {
if let Some(my_metadata) = &my_hb.metadata {
if let Some(primary_hb_ref) = self.last_primary_heartbeat.as_ref() {
let primary_hb = primary_hb_ref.read().await;
if let Some(primary_metadata) = &primary_hb.metadata {
// Calculate time difference: replica_timestamp - primary_timestamp
let time_diff_ms = my_metadata
.timestamp
.saturating_sub(primary_metadata.timestamp);
let failover_timeout_ms = self.failover_timeout.as_millis() as u64;
trace!(
"Staleness check: my_ts={}, primary_ts={}, diff={}ms, timeout={}ms",
my_metadata.timestamp,
primary_metadata.timestamp,
time_diff_ms,
failover_timeout_ms
);
if time_diff_ms > failover_timeout_ms {
info!(
"Primary heartbeat stale ({}ms > {}ms), attempting promotion",
time_diff_ms, failover_timeout_ms
);
return true;
}
}
}
}
}
false
}
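    // Worked example of the staleness rule above (illustrative numbers):
    // with failover_timeout = 5s, my_ts = 12_000ms and primary_ts = 6_000ms
    // give diff = 6_000ms > 5_000ms, so the primary is considered stale and
    // the replica attempts promotion.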
}
#[async_trait]
impl HeartbeatWorkflow for ReplicaWorkflow {
    async fn on_startup(
        &self,
        _cluster_state: Option<&crate::agent::ClusterStateData>,
        _agent_config: &AgentConfig,
    ) {
        // TODO: decide whether the replica should do anything on startup
    }
    async fn handle_heartbeat_success(
        &mut self,
        _cluster_state: Option<&crate::agent::ClusterStateData>,
        _agent_config: &AgentConfig,
    ) -> Option<crate::agent::ClusterStateData> {
trace!(
"Handling heartbeat success, current counters success {} failures {}",
self.consecutive_successes, self.consecutive_failures
);
self.consecutive_successes += 1;
self.consecutive_failures = 0;
match self.state {
ReplicaState::Initializing => {
if self.consecutive_successes >= self.success_threshold {
self.transition_to(ReplicaState::Watching);
}
None
}
ReplicaState::Watching => {
// TODO: Check primary staleness from NATS
trace!("Replica watching primary");
                if self.is_primary_stale().await {
                    todo!("stale primary detected, launch promotion");
                }
                // TODO: perform the remaining replica watch actions:
                // - if a primary exists in the cluster (cluster_state.current_primary == expected_primary)
                //   - check the last primary heartbeat kv timestamp
                //   - compare it with our latest kv heartbeat
                //   - if the gap exceeds the failover timeout, launch promotion
                //     (we assume the primary has already fenced itself)
                // - launching promotion will change the status of the replica
                debug!("replica watch actions beyond staleness detection are not implemented yet");
None
}
ReplicaState::Promoting => {
// TODO: Complete promotion attempt
trace!("Replica promotion in progress");
                todo!(
                    "While promoting, a heartbeat success is a no-op, and a heartbeat failure only matters once failure_threshold is reached"
                );
}
ReplicaState::PromotionFailed => {
if self.consecutive_successes >= self.success_threshold {
self.transition_to(ReplicaState::Watching);
}
todo!()
}
ReplicaState::Leader => {
// TODO: Check for original primary recovery
trace!("Replica acting as leader");
todo!()
}
ReplicaState::Failed => {
if self.consecutive_successes >= self.success_threshold {
info!("Replica recovered from Failed state, transitioning to Watching");
self.transition_to(ReplicaState::Watching);
}
todo!()
}
ReplicaState::Demoting => {
// TODO: Complete demotion back to watching
trace!("Replica demotion in progress");
todo!()
}
}
}
async fn handle_heartbeat_failure(
&mut self,
        _cluster_state: Option<&crate::agent::ClusterStateData>,
) {
self.consecutive_failures += 1;
self.consecutive_successes = 0;
// TODO revisit this. I think we should handle the agent healthiness (checking
// consecutive_failures against failure_threshold) separately from handling the cluster
// state.
//
// That said, there might be funny stuff we have to do when the agent reaches the failure
// threshold, especially in promoting and demoting statuses.
match self.state {
ReplicaState::Watching | ReplicaState::Initializing => {
if self.consecutive_failures >= self.failure_threshold {
info!(
"Replica exceeded failure threshold ({}/{}), transitioning to Failed",
self.consecutive_failures, self.failure_threshold
);
self.transition_to(ReplicaState::Failed);
} else {
trace!("Replica heartbeat failed, but below threshold");
}
}
ReplicaState::Promoting
| ReplicaState::PromotionFailed
| ReplicaState::Leader
| ReplicaState::Demoting
| ReplicaState::Failed => {
trace!("Replica heartbeat failed in state: {}", self.state.name());
}
}
}
fn state_name(&self) -> &'static str {
self.state.name()
}
fn consecutive_successes(&self) -> usize {
self.consecutive_successes
}
fn consecutive_failures(&self) -> usize {
self.consecutive_failures
}
}
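// Illustrative test sketch (not part of the original PR), mirroring the
// primary workflow tests; the constructor arguments follow the definitions above.
#[cfg(test)]
mod test {
    use super::*;
    use harmony_types::id::Id;
    use std::time::Duration;
    #[tokio::test]
    async fn replica_transitions_to_failed_at_failure_threshold() {
        let mut replica = ReplicaWorkflow::new(
            1,                      // success_threshold
            2,                      // failure_threshold
            Id::empty(),            // cluster_id
            Id::empty(),            // primary_id
            Id::empty(),            // my_id
            Duration::from_secs(5), // failover_timeout
        );
        replica.handle_heartbeat_failure(None).await;
        assert_eq!(replica.consecutive_failures(), 1);
        replica.handle_heartbeat_failure(None).await;
        assert_eq!(replica.state_name(), "Replica:Failed");
    }
}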

View File

@@ -0,0 +1,12 @@
[package]
name = "harmony_execution"
edition = "2024"
version.workspace = true
readme.workspace = true
license.workspace = true
[dependencies]
thiserror.workspace = true
lazy_static.workspace = true
directories.workspace = true
log.workspace = true

View File

@@ -0,0 +1,470 @@
use std::io::{BufRead, BufReader};
use std::process::{Command, Stdio};
use std::sync::Arc;
use std::thread;
/// Captured output from a command execution
#[derive(Debug, Clone)]
pub struct CommandOutput {
/// Captured stdout content
pub stdout: String,
/// Captured stderr content
pub stderr: String,
/// Exit status of the command
pub status: CommandStatus,
}
impl CommandOutput {
/// Returns true if the command succeeded
pub fn is_success(&self) -> bool {
self.status.is_success()
}
/// Formats the complete output for display
pub fn format_output(&self) -> String {
format!(
"Stdout:\n{}\n\nStderr:\n{}",
if self.stdout.is_empty() {
"<empty>"
} else {
&self.stdout
},
if self.stderr.is_empty() {
"<empty>"
} else {
&self.stderr
}
)
}
}
/// Result status of a command execution
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CommandStatus {
/// Command executed successfully (exit code 0)
Success,
/// Command failed with an exit code
Failed(i32),
/// Command was terminated by a signal
Terminated(i32),
/// Command execution could not be started
Error(String),
}
impl CommandStatus {
pub fn is_success(&self) -> bool {
matches!(self, CommandStatus::Success)
}
}
impl From<std::process::ExitStatus> for CommandStatus {
    fn from(status: std::process::ExitStatus) -> Self {
        if status.success() {
            CommandStatus::Success
        } else if let Some(code) = status.code() {
            CommandStatus::Failed(code)
        } else {
            // No exit code means the process was terminated by a signal.
            // Report the signal number on Unix; other platforms have no
            // portable equivalent, so fall back to 0.
            #[cfg(unix)]
            let signal = {
                use std::os::unix::process::ExitStatusExt;
                status.signal().unwrap_or(0)
            };
            #[cfg(not(unix))]
            let signal = 0;
            CommandStatus::Terminated(signal)
        }
    }
}
type Callback = Arc<dyn Fn(&str) + Send + Sync>;
/// Options for configuring command execution
#[derive(Clone)]
pub struct RunnerOptions {
/// Whether to print stdout to console in real-time
pub print_stdout: bool,
/// Whether to print stderr to console in real-time
pub print_stderr: bool,
/// Optional callback for each stdout line
pub stdout_callback: Callback,
/// Optional callback for each stderr line
pub stderr_callback: Callback,
}
impl RunnerOptions {
fn empty_callback() -> Callback {
Arc::new(|_| {})
}
/// Create default options with real-time printing enabled
pub fn print_to_console() -> Self {
Self {
print_stdout: true,
print_stderr: true,
..Default::default()
}
}
/// Create options that capture output silently
pub fn silent() -> Self {
Self {
print_stdout: false,
print_stderr: false,
..Default::default()
}
}
/// Set custom callbacks for stdout and stderr lines
pub fn with_callbacks<F1, F2>(mut self, stdout_callback: F1, stderr_callback: F2) -> Self
where
F1: Fn(&str) + Send + Sync + 'static,
F2: Fn(&str) + Send + Sync + 'static,
{
self.stdout_callback = Arc::new(stdout_callback);
self.stderr_callback = Arc::new(stderr_callback);
self
}
}
impl Default for RunnerOptions {
fn default() -> Self {
Self {
print_stdout: true,
print_stderr: true,
stdout_callback: Self::empty_callback(),
stderr_callback: Self::empty_callback(),
}
}
}
/// Error type for command execution failures
#[derive(Debug)]
pub struct CommandError {
/// Human-readable error description
pub message: String,
/// Captured output if execution started
pub output: Option<CommandOutput>,
}
impl std::fmt::Display for CommandError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.message)?;
if let Some(output) = &self.output {
write!(f, "\n{}", output.format_output())?;
}
Ok(())
}
}
impl std::error::Error for CommandError {}
/// Runs a command and captures its output while streaming to console
///
/// # Example
///
/// ```
/// use harmony_execution::command::{run_command, RunnerOptions};
/// use std::process::Command;
///
/// let output = run_command(
/// Command::new("echo").arg("hello"),
/// RunnerOptions::print_to_console()
/// ).unwrap();
/// assert!(output.is_success());
/// assert_eq!(output.stdout, "hello\n");
/// ```
pub fn run_command(
command: &mut Command,
options: RunnerOptions,
) -> Result<CommandOutput, CommandError> {
let mut child = command
.stdout(Stdio::piped())
.stderr(Stdio::piped())
.spawn()
.map_err(|e| CommandError {
message: format!("Failed to spawn command: {}", e),
output: None,
})?;
let stdout = child.stdout.take().ok_or_else(|| CommandError {
message: "Failed to capture stdout".to_string(),
output: None,
})?;
let stderr = child.stderr.take().ok_or_else(|| CommandError {
message: "Failed to capture stderr".to_string(),
output: None,
})?;
let stdout_reader = BufReader::new(stdout);
let stderr_reader = BufReader::new(stderr);
let (stdout_sender, stdout_receiver) = std::sync::mpsc::channel();
let (stderr_sender, stderr_receiver) = std::sync::mpsc::channel();
// Spawn thread to handle stdout
let stdout_handle = thread::spawn(move || {
let mut output = String::new();
for line in stdout_reader.lines() {
match line {
Ok(line_content) => {
if options.print_stdout {
println!("{}", line_content);
}
(options.stdout_callback)(&line_content);
output.push_str(&line_content);
output.push('\n');
}
Err(e) => {
// Silently handle read errors - corrupted data at end is common
log::trace!("Error reading stdout line: {}", e);
}
}
}
let _ = stdout_sender.send(output);
});
// Spawn thread to handle stderr
let stderr_handle = thread::spawn(move || {
let mut output = String::new();
for line in stderr_reader.lines() {
match line {
Ok(line_content) => {
if options.print_stderr {
eprintln!("{}", line_content);
}
(options.stderr_callback)(&line_content);
output.push_str(&line_content);
output.push('\n');
}
Err(e) => {
log::trace!("Error reading stderr line: {}", e);
}
}
}
let _ = stderr_sender.send(output);
});
let status = child.wait().map_err(|e| CommandError {
message: format!("Failed to wait for command process: {}", e),
output: None,
})?;
let stdout_lines = stdout_handle
.join()
.map_err(|e| CommandError {
message: format!("Stdout thread panicked: {:?}", e),
output: None,
})
.and_then(|_| {
stdout_receiver.recv().map_err(|e| CommandError {
message: format!("Failed to receive stdout: {}", e),
output: None,
})
})?;
let stderr_lines = stderr_handle
.join()
.map_err(|e| CommandError {
message: format!("Stderr thread panicked: {:?}", e),
output: None,
})
.and_then(|_| {
stderr_receiver.recv().map_err(|e| CommandError {
message: format!("Failed to receive stderr: {}", e),
output: None,
})
})?;
Ok(CommandOutput {
stdout: stdout_lines,
stderr: stderr_lines,
status: status.into(),
})
}
/// Convenience function to run a command with default options (print to console)
pub fn run(command: &mut Command) -> Result<CommandOutput, CommandError> {
run_command(command, RunnerOptions::print_to_console())
}
/// Convenience function to run a command silently (capture output only)
pub fn run_silent(command: &mut Command) -> Result<CommandOutput, CommandError> {
run_command(command, RunnerOptions::silent())
}
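// Illustrative usage (not part of the original PR): stream a long-running
// command's output into the `log` crate while still capturing it for later
// inspection. The command shown is arbitrary.
//
// let opts = RunnerOptions::silent()
//     .with_callbacks(|l| log::info!("out: {l}"), |l| log::warn!("err: {l}"));
// let output = run_command(Command::new("ls").arg("-la"), opts)?;
// assert!(output.is_success());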
#[cfg(test)]
mod tests {
use super::*;
use std::process::Command;
#[test]
fn test_simple_echo_command() {
let output = run_silent(Command::new("echo").arg("hello world")).unwrap();
assert!(output.is_success());
assert_eq!(output.stdout.trim(), "hello world");
assert!(output.stderr.is_empty());
}
#[test]
fn test_command_failure() {
let output = run_silent(Command::new("sh").args(["-c", "exit 42"])).unwrap();
assert!(!output.is_success());
assert_eq!(output.status, CommandStatus::Failed(42));
}
#[test]
fn test_command_output_format() {
let output = run_silent(Command::new("echo").arg("test")).unwrap();
let formatted = output.format_output();
assert!(formatted.contains("Stdout:"));
assert!(formatted.contains("test"));
}
#[test]
fn test_runner_options() {
let opts = RunnerOptions::print_to_console();
assert!(opts.print_stdout);
assert!(opts.print_stderr);
let opts = RunnerOptions::silent();
assert!(!opts.print_stdout);
assert!(!opts.print_stderr);
}
#[test]
fn test_command_status_from_exit_status() {
let output = run_silent(&mut Command::new("true")).unwrap();
assert_eq!(output.status, CommandStatus::Success);
let output = run_silent(&mut Command::new("false")).unwrap();
assert_eq!(output.status, CommandStatus::Failed(1));
}
#[test]
fn test_stdout_callback_receives_lines() {
use std::sync::{Arc, Mutex};
let captured = Arc::new(Mutex::new(Vec::new()));
let captured_clone = Arc::clone(&captured);
let opts = RunnerOptions::silent().with_callbacks(
move |line| captured_clone.lock().unwrap().push(line.to_string()),
|_| {},
);
run_command(Command::new("echo").arg("hello world"), opts).unwrap();
let lines = captured.lock().unwrap();
assert_eq!(lines.len(), 1);
assert_eq!(lines[0], "hello world");
}
#[test]
fn test_stderr_callback_receives_lines() {
use std::sync::{Arc, Mutex};
let captured = Arc::new(Mutex::new(Vec::new()));
let captured_clone = Arc::clone(&captured);
let opts = RunnerOptions::silent().with_callbacks(
|_| {},
move |line| captured_clone.lock().unwrap().push(line.to_string()),
);
run_command(Command::new("sh").args(["-c", "echo error >&2"]), opts).unwrap();
let lines = captured.lock().unwrap();
assert_eq!(lines.len(), 1);
assert_eq!(lines[0], "error");
}
#[test]
fn test_callback_and_capture_both_work() {
use std::sync::{Arc, Mutex};
let callback_lines = Arc::new(Mutex::new(Vec::new()));
let callback_clone = Arc::clone(&callback_lines);
let opts = RunnerOptions::silent().with_callbacks(
move |line| callback_clone.lock().unwrap().push(line.to_string()),
|_| {},
);
let output =
run_command(Command::new("printf").args(["line1\nline2\nline3\n"]), opts).unwrap();
// Verify captured output
assert_eq!(output.stdout, "line1\nline2\nline3\n");
// Verify callback received all lines
let lines = callback_lines.lock().unwrap();
assert_eq!(lines.len(), 3);
assert_eq!(lines[0], "line1");
assert_eq!(lines[1], "line2");
assert_eq!(lines[2], "line3");
}
#[test]
fn test_multiline_output_capture() {
let output = run_silent(Command::new("printf").args(["line1\nline2\nline3\n"])).unwrap();
assert_eq!(output.stdout, "line1\nline2\nline3\n");
assert!(output.stderr.trim().is_empty());
}
#[test]
fn test_mixed_stdout_stderr_capture() {
let output = run_silent(Command::new("sh").args([
"-c",
"echo stdout1 && echo stderr1 >&2 && echo stdout2 && echo stderr2 >&2",
]))
.unwrap();
assert!(output.stdout.contains("stdout1"));
assert!(output.stdout.contains("stdout2"));
assert!(output.stderr.contains("stderr1"));
assert!(output.stderr.contains("stderr2"));
}
#[test]
fn test_empty_output_command() {
let output = run_silent(&mut Command::new("true")).unwrap();
assert!(output.stdout.is_empty());
assert!(output.stderr.is_empty());
assert!(output.is_success());
}
#[test]
fn test_command_output_format_with_empty_streams() {
let output = run_silent(&mut Command::new("true")).unwrap();
let formatted = output.format_output();
assert!(formatted.contains("Stdout:"));
assert!(formatted.contains("<empty>"));
assert!(formatted.contains("Stderr:"));
}
#[test]
fn test_error_contains_message_and_output() {
let error = CommandError {
message: "Test error".to_string(),
output: Some(CommandOutput {
stdout: "captured stdout".to_string(),
stderr: "captured stderr".to_string(),
status: CommandStatus::Success,
}),
};
let display = format!("{}", error);
assert!(display.contains("Test error"));
assert!(display.contains("captured stdout"));
assert!(display.contains("captured stderr"));
}
#[test]
fn test_error_without_output() {
let error = CommandError {
message: "Spawn failed".to_string(),
output: None,
};
let display = format!("{}", error);
assert!(display.contains("Spawn failed"));
assert!(!display.contains("Stdout:"));
assert!(!display.contains("Stderr:"));
}
}

View File

@@ -0,0 +1,5 @@
pub mod command;
pub use command::{
CommandError, CommandOutput, CommandStatus, RunnerOptions, run, run_command, run_silent,
};

View File

@@ -32,6 +32,14 @@ impl Id {
} }
} }
impl From<&str> for Id {
    fn from(value: &str) -> Id {
        Id {
            value: value.to_string(),
        }
    }
}
impl FromStr for Id { impl FromStr for Id {
type Err = (); type Err = ();