feat/harmony_agent #220
@@ -1,2 +1,6 @@
|
||||
target/
|
||||
Dockerfile
|
||||
.git
|
||||
data
|
||||
target
|
||||
demos
|
||||
|
||||
2
.gitignore
vendored
2
.gitignore
vendored
@@ -24,3 +24,5 @@ Cargo.lock
|
||||
|
||||
# MSVC Windows builds of rustc generate these, which store debugging information
|
||||
*.pdb
|
||||
|
||||
.harmony_generated
|
||||
|
||||
218
Cargo.lock
generated
218
Cargo.lock
generated
@@ -243,7 +243,7 @@ checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"const-random",
|
||||
"getrandom 0.3.3",
|
||||
"getrandom 0.3.4",
|
||||
"once_cell",
|
||||
"version_check",
|
||||
"zerocopy",
|
||||
@@ -450,6 +450,43 @@ dependencies = [
|
||||
"pin-project-lite",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "async-nats"
|
||||
version = "0.45.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "86dde77d8a733a9dbaf865a9eb65c72e09c88f3d14d3dd0d2aecf511920ee4fe"
|
||||
dependencies = [
|
||||
"base64 0.22.1",
|
||||
"bytes",
|
||||
"futures-util",
|
||||
"memchr",
|
||||
"nkeys",
|
||||
"nuid",
|
||||
"once_cell",
|
||||
"pin-project",
|
||||
"portable-atomic",
|
||||
"rand 0.8.5",
|
||||
"regex",
|
||||
"ring",
|
||||
"rustls-native-certs 0.7.3",
|
||||
"rustls-pemfile 2.2.0",
|
||||
"rustls-webpki 0.102.8",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"serde_nanos",
|
||||
"serde_repr",
|
||||
"thiserror 1.0.69",
|
||||
"time",
|
||||
"tokio",
|
||||
"tokio-rustls 0.26.2",
|
||||
"tokio-stream",
|
||||
"tokio-util",
|
||||
"tokio-websockets",
|
||||
"tracing",
|
||||
"tryhard",
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "async-stream"
|
||||
version = "0.3.6"
|
||||
@@ -775,6 +812,9 @@ name = "bytes"
|
||||
version = "1.10.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a"
|
||||
dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bytestring"
|
||||
@@ -1583,6 +1623,7 @@ dependencies = [
|
||||
"rand_core 0.6.4",
|
||||
"serde",
|
||||
"sha2",
|
||||
"signature",
|
||||
"subtle",
|
||||
"zeroize",
|
||||
]
|
||||
@@ -2456,21 +2497,21 @@ dependencies = [
|
||||
"cfg-if",
|
||||
"js-sys",
|
||||
"libc",
|
||||
"wasi 0.11.1+wasi-snapshot-preview1",
|
||||
"wasi",
|
||||
"wasm-bindgen",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "getrandom"
|
||||
version = "0.3.3"
|
||||
version = "0.3.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4"
|
||||
checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"js-sys",
|
||||
"libc",
|
||||
"r-efi",
|
||||
"wasi 0.14.3+wasi-0.2.4",
|
||||
"wasip2",
|
||||
"wasm-bindgen",
|
||||
]
|
||||
|
||||
@@ -2572,6 +2613,7 @@ dependencies = [
|
||||
"env_logger",
|
||||
"fqdn",
|
||||
"futures-util",
|
||||
"harmony_execution",
|
||||
"harmony_inventory_agent",
|
||||
"harmony_macros",
|
||||
"harmony_secret",
|
||||
@@ -2619,6 +2661,43 @@ dependencies = [
|
||||
"walkdir",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "harmony_agent"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"async-nats",
|
||||
"async-trait",
|
||||
"cidr",
|
||||
"env_logger",
|
||||
"getrandom 0.3.4",
|
||||
"harmony",
|
||||
"harmony_macros",
|
||||
"harmony_types",
|
||||
"log",
|
||||
"pretty_assertions",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"thiserror 2.0.16",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "harmony_agent_deploy"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"cidr",
|
||||
"env_logger",
|
||||
"harmony",
|
||||
"harmony_cli",
|
||||
"harmony_macros",
|
||||
"harmony_types",
|
||||
"log",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"tokio",
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "harmony_cli"
|
||||
version = "0.1.0"
|
||||
@@ -2659,6 +2738,16 @@ dependencies = [
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "harmony_execution"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"directories",
|
||||
"lazy_static",
|
||||
"log",
|
||||
"thiserror 2.0.16",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "harmony_inventory_agent"
|
||||
version = "0.1.0"
|
||||
@@ -3523,7 +3612,7 @@ version = "0.1.34"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33"
|
||||
dependencies = [
|
||||
"getrandom 0.3.3",
|
||||
"getrandom 0.3.4",
|
||||
"libc",
|
||||
]
|
||||
|
||||
@@ -3963,7 +4052,7 @@ checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"log",
|
||||
"wasi 0.11.1+wasi-snapshot-preview1",
|
||||
"wasi",
|
||||
"windows-sys 0.48.0",
|
||||
]
|
||||
|
||||
@@ -3975,7 +4064,7 @@ checksum = "78bed444cc8a2160f01cbcf811ef18cac863ad68ae8ca62092e8db51d51c761c"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"log",
|
||||
"wasi 0.11.1+wasi-snapshot-preview1",
|
||||
"wasi",
|
||||
"windows-sys 0.59.0",
|
||||
]
|
||||
|
||||
@@ -4022,6 +4111,21 @@ dependencies = [
|
||||
"unicode-segmentation",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "nkeys"
|
||||
version = "0.4.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "879011babc47a1c7fdf5a935ae3cfe94f34645ca0cac1c7f6424b36fc743d1bf"
|
||||
dependencies = [
|
||||
"data-encoding",
|
||||
"ed25519",
|
||||
"ed25519-dalek",
|
||||
"getrandom 0.2.16",
|
||||
"log",
|
||||
"rand 0.8.5",
|
||||
"signatory",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "non-blank-string-rs"
|
||||
version = "1.0.4"
|
||||
@@ -4040,6 +4144,15 @@ dependencies = [
|
||||
"winapi 0.3.9",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "nuid"
|
||||
version = "0.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fc895af95856f929163a0aa20c26a78d26bfdc839f51b9d5aa7a5b79e52b7e83"
|
||||
dependencies = [
|
||||
"rand 0.8.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num-bigint"
|
||||
version = "0.4.6"
|
||||
@@ -4660,7 +4773,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f1906b49b0c3bc04b5fe5d86a77925ae6524a19b816ae38ce1e426255f1d8a31"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"getrandom 0.3.3",
|
||||
"getrandom 0.3.4",
|
||||
"lru-slab",
|
||||
"rand 0.9.2",
|
||||
"ring",
|
||||
@@ -4765,7 +4878,7 @@ version = "0.9.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38"
|
||||
dependencies = [
|
||||
"getrandom 0.3.3",
|
||||
"getrandom 0.3.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -5301,6 +5414,16 @@ dependencies = [
|
||||
"untrusted",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustls-webpki"
|
||||
version = "0.102.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "64ca1bc8749bd4cf37b5ce386cc146580777b4e8572c7b97baf22c83f444bee9"
|
||||
dependencies = [
|
||||
"rustls-pki-types",
|
||||
"untrusted",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustls-webpki"
|
||||
version = "0.103.4"
|
||||
@@ -5564,6 +5687,15 @@ dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_nanos"
|
||||
version = "0.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a93142f0367a4cc53ae0fead1bcda39e85beccfad3dcd717656cacab94b12985"
|
||||
dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_path_to_error"
|
||||
version = "0.1.17"
|
||||
@@ -5731,6 +5863,18 @@ dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "signatory"
|
||||
version = "0.27.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c1e303f8205714074f6068773f0e29527e0453937fe837c9717d066635b65f31"
|
||||
dependencies = [
|
||||
"pkcs8",
|
||||
"rand_core 0.6.4",
|
||||
"signature",
|
||||
"zeroize",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "signature"
|
||||
version = "2.2.0"
|
||||
@@ -6314,7 +6458,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "15b61f8f20e3a6f7e0649d825294eaf317edce30f82cf6026e7e4cb9222a7d1e"
|
||||
dependencies = [
|
||||
"fastrand",
|
||||
"getrandom 0.3.3",
|
||||
"getrandom 0.3.4",
|
||||
"once_cell",
|
||||
"rustix 1.0.8",
|
||||
"windows-sys 0.60.2",
|
||||
@@ -6538,6 +6682,27 @@ dependencies = [
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-websockets"
|
||||
version = "0.10.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f591660438b3038dd04d16c938271c79e7e06260ad2ea2885a4861bfb238605d"
|
||||
dependencies = [
|
||||
"base64 0.22.1",
|
||||
"bytes",
|
||||
"futures-core",
|
||||
"futures-sink",
|
||||
"http 1.3.1",
|
||||
"httparse",
|
||||
"rand 0.8.5",
|
||||
"ring",
|
||||
"rustls-pki-types",
|
||||
"tokio",
|
||||
"tokio-rustls 0.26.2",
|
||||
"tokio-util",
|
||||
"webpki-roots 0.26.11",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "toml"
|
||||
version = "0.8.23"
|
||||
@@ -6689,6 +6854,16 @@ version = "0.2.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b"
|
||||
|
||||
[[package]]
|
||||
name = "tryhard"
|
||||
version = "0.5.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9fe58ebd5edd976e0fe0f8a14d2a04b7c81ef153ea9a54eebc42e67c2c23b4e5"
|
||||
dependencies = [
|
||||
"pin-project-lite",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tui-logger"
|
||||
version = "0.14.5"
|
||||
@@ -6865,7 +7040,7 @@ version = "1.18.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2f87b8aa10b915a06587d0dec516c282ff295b475d94abf425d62b57710070a2"
|
||||
dependencies = [
|
||||
"getrandom 0.3.3",
|
||||
"getrandom 0.3.4",
|
||||
"js-sys",
|
||||
"rand 0.9.2",
|
||||
"uuid-macro-internal",
|
||||
@@ -6936,10 +7111,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b"
|
||||
|
||||
[[package]]
|
||||
name = "wasi"
|
||||
version = "0.14.3+wasi-0.2.4"
|
||||
name = "wasip2"
|
||||
version = "1.0.2+wasi-0.2.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6a51ae83037bdd272a9e28ce236db8c07016dd0d50c27038b3f407533c030c95"
|
||||
checksum = "9517f9239f02c069db75e65f174b3da828fe5f5b945c4dd26bd25d89c03ebcf5"
|
||||
dependencies = [
|
||||
"wit-bindgen",
|
||||
]
|
||||
@@ -7061,6 +7236,15 @@ version = "0.25.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5f20c57d8d7db6d3b86154206ae5d8fba62dd39573114de97c2cb0578251f8e1"
|
||||
|
||||
[[package]]
|
||||
name = "webpki-roots"
|
||||
version = "0.26.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "521bc38abb08001b01866da9f51eb7c5d647a19260e00054a8c7fd5f9e57f7a9"
|
||||
dependencies = [
|
||||
"webpki-roots 1.0.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "webpki-roots"
|
||||
version = "1.0.2"
|
||||
@@ -7438,9 +7622,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "wit-bindgen"
|
||||
version = "0.45.0"
|
||||
version = "0.51.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "052283831dbae3d879dc7f51f3d92703a316ca49f91540417d38591826127814"
|
||||
checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5"
|
||||
|
||||
[[package]]
|
||||
name = "writeable"
|
||||
|
||||
@@ -7,6 +7,7 @@ members = [
|
||||
"harmony_types",
|
||||
"harmony_macros",
|
||||
"harmony_tui",
|
||||
"harmony_execution",
|
||||
"opnsense-config",
|
||||
"opnsense-config-xml",
|
||||
"harmony_cli",
|
||||
@@ -17,6 +18,8 @@ members = [
|
||||
"harmony_secret",
|
||||
"adr/agent_discovery/mdns",
|
||||
"brocade",
|
||||
"harmony_agent",
|
||||
"harmony_agent/deploy",
|
||||
]
|
||||
|
||||
[workspace.package]
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
# Harmony : Open-source infrastructure orchestration that treats your platform like first-class code
|
||||
|
||||
In other words, Harmony is a **next-generation platform engineering framework**.
|
||||
|
||||
_By [NationTech](https://nationtech.io)_
|
||||
|
||||
[](https://git.nationtech.io/nationtech/harmony)
|
||||
|
||||
141
adr/018-Template-Hydration-For-Workload-Deployment.md
Normal file
141
adr/018-Template-Hydration-For-Workload-Deployment.md
Normal file
@@ -0,0 +1,141 @@
|
||||
# Architecture Decision Record: Template Hydration for Kubernetes Manifest Generation
|
||||
|
||||
Initial Author: Jean-Gabriel Gill-Couture & Sylvain Tremblay
|
||||
|
||||
Initial Date: 2025-01-23
|
||||
|
||||
Last Updated Date: 2025-01-23
|
||||
|
||||
## Status
|
||||
|
||||
Implemented
|
||||
|
||||
## Context
|
||||
|
||||
Harmony's philosophy is built on three guiding principles: Infrastructure as Resilient Code, Prove It Works — Before You Deploy, and One Unified Model. Our goal is to shift validation and verification as left as possible—ideally to compile time—rather than discovering errors at deploy time.
|
||||
|
||||
After investigating a few approaches, such as compile-checked Askama templates to generate Kubernetes manifests for Helm charts, we found that this approach also suffered from several fundamental limitations:
|
||||
|
||||
* **Late Validation:** Typos in template syntax or field names are only discovered at deployment time, not during compilation. A mistyped `metadata.name` won't surface until Helm attempts to render the template.
|
||||
* **Brittle Maintenance:** Templates are string-based with limited IDE support. Refactoring requires grep-and-replace across YAML-like template files, risking subtle breakage.
|
||||
* **Hard-to-Test Logic:** Testing template output requires mocking the template engine and comparing serialized strings rather than asserting against typed data structures.
|
||||
* **No Type Safety:** There is no guarantee that the generated YAML will be valid Kubernetes resources without runtime validation.
|
||||
|
||||
We also faced a strategic choice around Helm: use it as both *templating engine* and *packaging mechanism*, or decouple these concerns. While Helm's ecosystem integration (Harbor, ArgoCD, OCI registry support) is valuable, the Jinja-like templating is at odds with Harmony's "code-first" ethos.
|
||||
|
||||
## Decision
|
||||
|
||||
We will adopt the **Template Hydration Pattern**—constructing Kubernetes manifests programmatically using strongly-typed `kube-rs` objects, then serializing them to YAML files for packaging into Helm charts.
|
||||
|
||||
Specifically:
|
||||
|
||||
* **Write strongly typed `k8s_openapi` Structs:** All Kubernetes resources (Deployment, Service, ConfigMap, etc.) will be constructed using the typed structs generated by `k8s_openapi`.
|
||||
* **Direct Serialization to YAML:** Rather than rendering templates, we use `serde_yaml::to_string()` to serialize typed objects directly into YAML manifests. This way, YAML is only used as a data-transfer format and not a templating/programming language - which it is not.
|
||||
* **Helm as Packaging-Only:** Helm's role is reduced to packaging pre-rendered templates into a tarball and pushing to OCI registries. No template rendering logic resides within Helm.
|
||||
* **Ecosystem Preservation:** The generated Helm charts remain fully compatible with Harbor, ArgoCD, and any Helm-compatible tool—the only difference is that the `templates/` directory contains static YAML files.
|
||||
|
||||
The implementation in `backend_app.rs` demonstrates this pattern:
|
||||
|
||||
```rust
|
||||
let deployment = Deployment {
|
||||
metadata: ObjectMeta {
|
||||
name: Some(self.name.clone()),
|
||||
labels: Some([("app.kubernetes.io/name".to_string(), self.name.clone())].into()),
|
||||
..Default::default()
|
||||
},
|
||||
spec: Some(DeploymentSpec { /* ... */ }),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let deployment_yaml = serde_yaml::to_string(&deployment)?;
|
||||
fs::write(templates_dir.join("deployment.yaml"), deployment_yaml)?;
|
||||
```
|
||||
|
||||
## Rationale
|
||||
|
||||
**Aligns with "Infrastructure as Resilient Code"**
|
||||
|
||||
Harmony's first principle states that infrastructure should be treated like application code. By expressing Kubernetes manifests as Rust structs, we gain:
|
||||
|
||||
* **Refactorability:** Rename a label and the compiler catches all usages.
|
||||
* **IDE Support:** Autocomplete for all Kubernetes API fields; documentation inline.
|
||||
* **Code Navigation:** Jump to definition shows exactly where a value comes from.
|
||||
|
||||
**Achieves "Prove It Works — Before You Deploy"**
|
||||
|
||||
The compiler now validates that:
|
||||
|
||||
* All required fields are populated (Rust's `Option` type prevents missing fields).
|
||||
* Field types match expectations (ports are integers, not strings).
|
||||
* Enums contain valid values (e.g., `ServiceType::ClusterIP`).
|
||||
|
||||
This moves what was runtime validation into compile-time checks, fulfilling the "shift left" promise.
|
||||
|
||||
**Enables True Unit Testing**
|
||||
|
||||
Developers can now write unit tests that assert directly against typed objects:
|
||||
|
||||
```rust
|
||||
let deployment = create_deployment(&app);
|
||||
assert_eq!(deployment.spec.unwrap().replicas.unwrap(), 3);
|
||||
assert_eq!(deployment.metadata.name.unwrap(), "my-app");
|
||||
```
|
||||
|
||||
No string parsing, no YAML serialization, no fragile assertions against rendered output.
|
||||
|
||||
**Preserves Ecosystem Benefits**
|
||||
|
||||
By generating standard Helm chart structures, Harmony retains compatibility with:
|
||||
|
||||
* **OCI Registries (Harbor, GHCR):** `helm push` works exactly as before.
|
||||
* **ArgoCD:** Syncs and manages releases using the generated charts.
|
||||
* **Existing Workflows:** Teams already consuming Helm charts see no change.
|
||||
|
||||
The Helm tarball becomes a "dumb pipe" for transport, which is arguably its ideal role.
|
||||
|
||||
## Consequences
|
||||
|
||||
### Positive
|
||||
|
||||
* **Compile-Time Safety:** A broad class of errors (typos, missing fields, type mismatches) is now caught at build time.
|
||||
* **Better Developer Experience:** IDE autocomplete, inline documentation, and refactor support significantly reduce the learning curve for Kubernetes manifests.
|
||||
* **Testability:** Unit tests can validate manifest structure without integration or runtime checks.
|
||||
* **Auditability:** The source-of-truth for manifests is now pure Rust—easier to review in pull requests than template logic scattered across files.
|
||||
* **Future-Extensibility:** CustomResources (CRDs) can be supported via `kopium`-generated Rust types, maintaining the same strong typing.
|
||||
|
||||
### Negative
|
||||
|
||||
* **API Schema Drift:** Kubernetes API changes require regenerating `k8s_openapi` types and updating code. A change in a struct field will cause the build to fail—intentionally, but still requiring the pipeline to be updated.
|
||||
* **Verbosity:** Typed construction is more verbose than the equivalent template. Builder patterns or helper functions will be needed to keep code readable.
|
||||
* **Learning Curve:** Contributors must understand both the Kubernetes resource spec *and* the Rust type system, rather than just YAML.
|
||||
* **Debugging Shift:** When debugging generated YAML, you now trace through Rust code rather than template files—more precise but different mental model.
|
||||
|
||||
## Alternatives Considered
|
||||
|
||||
### 1. Enhance Askama with Compile-Time Validation
|
||||
*Pros:* Stay within familiar templating paradigm; minimal code changes.
|
||||
*Cons:* Rust's type system cannot fully express Kubernetes schema validation without significant macro boilerplate. Errors would still surface at template evaluation time, not compilation.
|
||||
|
||||
### 2. Use Helm SDK Programmatically (Go)
|
||||
*Pros:* Direct access to Helm's template engine; no YAML serialization step.
|
||||
*Cons:* Would introduce a second language (Go) into a Rust codebase, increasing cognitive load and compilation complexity. No improvement in compile-time safety.
|
||||
|
||||
### 3. Raw YAML String Templating (Manual)
|
||||
*Pros:* Maximum control; no external dependencies.
|
||||
*Cons:* Even more error-prone than Askama; no structure validation; string concatenation errors abound.
|
||||
|
||||
### 4. Use Kustomize for All Manifests
|
||||
*Pros:* Declarative overlays; standard tool.
|
||||
*Cons:* Kustomize is itself a layer over YAML templates with its own DSL. It does not provide compile-time type safety and would require externalizing manifest management outside Harmony's codebase.
|
||||
|
||||
__Note that this template hydration architecture still allows templates to be overridden with tools like Kustomize when required__
|
||||
|
||||
## Additional Notes
|
||||
|
||||
**Scalability to Future Topologies**
|
||||
|
||||
The Template Hydration pattern enables future Harmony architectures to generate manifests dynamically based on topology context. For example, a `CostTopology` might adjust resource requests based on cluster pricing, manipulating the typed `Deployment::spec` directly before serialization.
|
||||
|
||||
**Implementation Status**
|
||||
|
||||
As of this writing, the pattern is implemented for `BackendApp` deployments (`backend_app.rs`). The next phase is to extend this pattern across all application modules (`webapp.rs`, etc.) and to standardize on this approach for any new implementations.
|
||||
@@ -1,7 +1,7 @@
|
||||
use std::net::{IpAddr, Ipv4Addr};
|
||||
|
||||
use brocade::{BrocadeOptions, ssh};
|
||||
use harmony_secret::{Secret, SecretManager};
|
||||
use harmony_secret::Secret;
|
||||
use harmony_types::switch::PortLocation;
|
||||
use schemars::JsonSchema;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
@@ -56,6 +56,8 @@ async fn main() {
|
||||
)),
|
||||
};
|
||||
|
||||
// TODO exec pod commands to initialize secret store if not already done
|
||||
|
||||
harmony_cli::run(
|
||||
Inventory::autoload(),
|
||||
K8sAnywhereTopology::from_env(),
|
||||
|
||||
@@ -30,6 +30,7 @@ opnsense-config = { path = "../opnsense-config" }
|
||||
opnsense-config-xml = { path = "../opnsense-config-xml" }
|
||||
harmony_macros = { path = "../harmony_macros" }
|
||||
harmony_types = { path = "../harmony_types" }
|
||||
harmony_execution = { path = "../harmony_execution" }
|
||||
uuid.workspace = true
|
||||
url.workspace = true
|
||||
kube = { workspace = true, features = ["derive"] }
|
||||
|
||||
801
harmony/src/modules/application/backend_app.rs
Normal file
801
harmony/src/modules/application/backend_app.rs
Normal file
@@ -0,0 +1,801 @@
|
||||
use async_trait::async_trait;
|
||||
use log::{debug, info, trace};
|
||||
use serde::Serialize;
|
||||
use std::path::PathBuf;
|
||||
|
||||
use crate::{
|
||||
config::{REGISTRY_PROJECT, REGISTRY_URL},
|
||||
modules::application::{
|
||||
Application, HelmPackage, OCICompliant,
|
||||
config::ApplicationNetworkPort,
|
||||
helm::{self, DeploymentBuilder, HelmChart, HelmResourceKind},
|
||||
},
|
||||
};
|
||||
use harmony_execution::{RunnerOptions, run_command};
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub struct BuildCommand {
|
||||
pub program: String,
|
||||
pub args: Vec<String>,
|
||||
}
|
||||
|
||||
impl BuildCommand {
|
||||
pub fn new(program: impl Into<String>, args: Vec<impl Into<String>>) -> Self {
|
||||
Self {
|
||||
program: program.into(),
|
||||
args: args.into_iter().map(|s| s.into()).collect(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn to_std_command(&self) -> std::process::Command {
|
||||
let mut cmd = std::process::Command::new(&self.program);
|
||||
cmd.args(&self.args);
|
||||
cmd
|
||||
}
|
||||
}
|
||||
|
||||
/// A backend application that Harmony can build into an OCI image and
/// package as a Helm chart.
#[derive(Debug, Clone, Serialize)]
pub struct BackendApp {
    // Application name; reused as the local image name, the Helm chart name,
    // and the Kubernetes `app.kubernetes.io/name` label value.
    pub name: String,
    // Root directory of the application's source tree; also the docker build
    // context and the parent of the generated `.harmony_generated/` output.
    pub project_root: std::path::PathBuf,
    // Network ports the application exposes; drives the generated
    // Deployment container ports and Service ports.
    pub network_ports: Vec<ApplicationNetworkPort>,
    // Environment variables injected into the container, as (key, value) pairs.
    pub env_vars: Vec<(String, String)>,
    // Command used to build the application artifact.
    pub build_cmd: BuildCommand,
    // Optional explicit Dockerfile path. When `None`,
    // `<project_root>/Dockerfile` is used if it exists (see `get_dockerfile`).
    pub dockerfile: Option<PathBuf>,
}
|
||||
|
||||
impl BackendApp {
|
||||
fn get_dockerfile(&self) -> Result<PathBuf, String> {
|
||||
debug!(
|
||||
"Looking for dockerfile, currently set to {:?}",
|
||||
self.dockerfile
|
||||
);
|
||||
if let Some(dockerfile) = &self.dockerfile {
|
||||
return match dockerfile.exists() {
|
||||
true => {
|
||||
info!(
|
||||
"Found dockerfile as intended at {}",
|
||||
dockerfile.to_string_lossy()
|
||||
);
|
||||
Ok(dockerfile.clone())
|
||||
}
|
||||
false => Err(format!(
|
||||
"Dockerfile explicitely set to {dockerfile} does not exist",
|
||||
dockerfile = dockerfile.to_string_lossy()
|
||||
)),
|
||||
};
|
||||
}
|
||||
|
||||
let existing_dockerfile = self.project_root.join("Dockerfile");
|
||||
|
||||
debug!("project_root = {:?}", self.project_root);
|
||||
|
||||
debug!("checking = {:?}", existing_dockerfile);
|
||||
if existing_dockerfile.exists() {
|
||||
debug!(
|
||||
"Checking path {:#?} for existing Dockerfile",
|
||||
self.project_root.clone()
|
||||
);
|
||||
return Ok(existing_dockerfile);
|
||||
}
|
||||
Err(format!(
|
||||
"Could not find a dockerfile in {project_root} folder. Tried {existing_dockerfile}",
|
||||
project_root = self.project_root.to_string_lossy(),
|
||||
existing_dockerfile = existing_dockerfile.to_string_lossy(),
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
impl Application for BackendApp {
    /// The application's display name (owned copy for the caller).
    fn name(&self) -> String {
        self.name.to_owned()
    }
}
|
||||
|
||||
#[async_trait]
|
||||
impl OCICompliant for BackendApp {
|
||||
async fn build_push_oci_image(&self) -> Result<String, String> {
|
||||
let dockerfile = self.get_dockerfile()?;
|
||||
let image_tag = self.image_name();
|
||||
|
||||
// Run docker build command, streaming output to console and capturing it
|
||||
let output = run_command(
|
||||
std::process::Command::new("docker").args([
|
||||
"build",
|
||||
"-t",
|
||||
&image_tag,
|
||||
"-f",
|
||||
&dockerfile.to_string_lossy(),
|
||||
&self.project_root.to_string_lossy(),
|
||||
]),
|
||||
RunnerOptions::print_to_console(),
|
||||
)
|
||||
.map_err(|e| format!("Failed to spawn docker build process: {}", e))?;
|
||||
|
||||
if output.is_success() {
|
||||
info!("Docker image build succeeded");
|
||||
Ok(image_tag)
|
||||
} else {
|
||||
Err(format!(
|
||||
"Docker image build FAILED:\n{}",
|
||||
output.format_output()
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
fn local_image_name(&self) -> String {
|
||||
self.name.clone()
|
||||
}
|
||||
|
||||
fn image_name(&self) -> String {
|
||||
format!(
|
||||
"{}/{}/{}",
|
||||
*REGISTRY_URL,
|
||||
*REGISTRY_PROJECT,
|
||||
&self.local_image_name()
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl HelmPackage for BackendApp {
|
||||
fn project_root(&self) -> PathBuf {
|
||||
self.project_root.clone()
|
||||
}
|
||||
|
||||
fn chart_name(&self) -> String {
|
||||
self.name.clone()
|
||||
}
|
||||
|
||||
async fn build_push_helm_package(&self, image_url: &str) -> Result<String, String> {
|
||||
let mut helm_chart = HelmChart::new(self.name.clone(), "1.0.0".to_string());
|
||||
|
||||
// Build the typed Deployment object using the builder with initial options
|
||||
helm_chart.add_resource(HelmResourceKind::Deployment(
|
||||
DeploymentBuilder::with_options(
|
||||
&self.name,
|
||||
image_url,
|
||||
Some(self.network_ports.clone()),
|
||||
Some(self.env_vars.clone()),
|
||||
None,
|
||||
)
|
||||
.build(),
|
||||
));
|
||||
|
||||
// Build the typed Service object using the helper function
|
||||
if let Some(service) =
|
||||
helm::create_service_from_ports(self.name.clone(), &self.network_ports)
|
||||
{
|
||||
helm_chart.add_resource(HelmResourceKind::Service(service));
|
||||
}
|
||||
|
||||
// Write the Helm chart metadata to the project root
|
||||
let chart_dir = helm_chart
|
||||
.write_to(&self.project_root.join(".harmony_generated/helm/"))
|
||||
.map_err(|e| format!("Failed to write Helm chart: {}", e))?;
|
||||
|
||||
info!("Helm chart for '{}' written to: {:?}", self.name, chart_dir);
|
||||
|
||||
Ok(chart_dir.to_string_lossy().to_string())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::modules::application::config::ApplicationNetworkPort;
|
||||
use crate::modules::application::config::NetworkProtocol;
|
||||
use k8s_openapi::api::apps::v1::Deployment;
|
||||
use k8s_openapi::api::core::v1::{Container, EnvVar, Service as K8sService, ServicePort};
|
||||
use k8s_openapi::apimachinery::pkg::util::intstr::IntOrString;
|
||||
use serde_yaml::from_str;
|
||||
use std::fs;
|
||||
use std::path::Path;
|
||||
use tempfile::tempdir;
|
||||
|
||||
// Test Helpers
|
||||
/// Reads the generated `service.yaml` for `chart_name` under `project_root`
/// and parses it into a typed Kubernetes `Service`; panics with a
/// descriptive message if the file is missing or malformed.
fn read_service_yaml(project_root: &Path, chart_name: &str) -> K8sService {
    let path = project_root.join(format!(
        ".harmony_generated/helm/{chart_name}/templates/service.yaml"
    ));
    let content = fs::read_to_string(&path)
        .unwrap_or_else(|e| panic!("Failed to read service.yaml at {:?}: {}", path, e));
    from_str(&content)
        .unwrap_or_else(|e| panic!("Failed to parse service.yaml as K8s Service: {}", e))
}

/// Reads the generated `deployment.yaml` for `chart_name` under
/// `project_root` and parses it into a typed Kubernetes `Deployment`;
/// panics with a descriptive message if the file is missing or malformed.
fn read_deployment_yaml(project_root: &Path, chart_name: &str) -> Deployment {
    let path = project_root.join(format!(
        ".harmony_generated/helm/{chart_name}/templates/deployment.yaml"
    ));
    let content = fs::read_to_string(&path)
        .unwrap_or_else(|e| panic!("Failed to read deployment.yaml at {:?}: {}", path, e));
    from_str(&content)
        .unwrap_or_else(|e| panic!("Failed to parse deployment.yaml as K8s Deployment: {}", e))
}

/// Returns true if a `service.yaml` was generated for `chart_name`
/// (used to assert that port-less apps produce no Service).
fn service_yaml_exists(project_root: &Path, chart_name: &str) -> bool {
    let path = project_root.join(format!(
        ".harmony_generated/helm/{chart_name}/templates/service.yaml"
    ));
    path.exists()
}
|
||||
|
||||
// Service Assertions
|
||||
/// Asserts that the Service's metadata name equals `expected_name`.
fn assert_service_metadata(service: &K8sService, expected_name: &str) {
    assert_eq!(
        service.metadata.name.as_deref(),
        Some(expected_name),
        "Service name should be '{expected_name}'"
    );
}

/// Asserts that the Service's `spec.type` equals `expected_type`
/// (e.g. "ClusterIP").
fn assert_service_type(service: &K8sService, expected_type: &str) {
    assert_eq!(
        service.spec.as_ref().and_then(|s| s.type_.as_deref()),
        Some(expected_type),
        "Service type should be '{expected_type}'"
    );
}

/// Asserts that the Service declares exactly `expected_count` ports.
/// Panics if the Service has no spec or no ports list at all.
fn assert_service_port_count(service: &K8sService, expected_count: usize) {
    let ports = service
        .spec
        .as_ref()
        .and_then(|s| s.ports.as_ref())
        .unwrap_or_else(|| panic!("Service should have ports"));
    assert_eq!(
        ports.len(),
        expected_count,
        "Service should have {expected_count} ports"
    );
}

/// Asserts a single `ServicePort`'s name, protocol, and port number.
fn assert_service_port(
    port: &ServicePort,
    expected_name: &str,
    expected_protocol: &str,
    expected_number: i32,
) {
    assert_eq!(
        port.name.as_deref(),
        Some(expected_name),
        "Port name should be '{expected_name}'"
    );
    assert_eq!(
        port.protocol.as_deref(),
        Some(expected_protocol),
        "Port '{expected_name}' protocol should be '{expected_protocol}'"
    );
    assert_eq!(
        port.port, expected_number,
        "Port '{expected_name}' number should be {expected_number}"
    );
}

/// Asserts that the port's target port is an integer equal to the service
/// port itself (i.e. no named target ports and no port remapping).
fn assert_target_port_matches_service_port(port: &ServicePort) {
    match &port.target_port {
        Some(IntOrString::Int(target)) => {
            assert_eq!(
                *target,
                port.port,
                "Target port should match service port for '{}'",
                port.name.as_deref().unwrap_or("unknown")
            );
        }
        _ => panic!(
            "Target port should be Int for '{}'",
            port.name.as_deref().unwrap_or("unknown")
        ),
    }
}
|
||||
|
||||
// Deployment Assertions
|
||||
fn assert_deployment_metadata(deployment: &Deployment, expected_name: &str) {
|
||||
assert_eq!(
|
||||
deployment.metadata.name.as_deref(),
|
||||
Some(expected_name),
|
||||
"Deployment name should be '{expected_name}'"
|
||||
);
|
||||
}
|
||||
|
||||
fn assert_deployment_replicas(deployment: &Deployment, expected_replicas: i32) {
|
||||
let spec = deployment
|
||||
.spec
|
||||
.as_ref()
|
||||
.unwrap_or_else(|| panic!("Deployment should have spec"));
|
||||
assert_eq!(
|
||||
spec.replicas,
|
||||
Some(expected_replicas),
|
||||
"Deployment should have {expected_replicas} replicas"
|
||||
);
|
||||
}
|
||||
|
||||
fn assert_selector_match_label(deployment: &Deployment, expected_label_value: &str) {
|
||||
let spec = deployment
|
||||
.spec
|
||||
.as_ref()
|
||||
.unwrap_or_else(|| panic!("Deployment should have spec"));
|
||||
assert_eq!(
|
||||
spec.selector
|
||||
.match_labels
|
||||
.as_ref()
|
||||
.and_then(|m| m.get("app.kubernetes.io/name")),
|
||||
Some(&expected_label_value.to_string()),
|
||||
"Selector should match app name '{expected_label_value}'"
|
||||
);
|
||||
}
|
||||
|
||||
fn assert_pod_labels(deployment: &Deployment, expected_name: &str) {
|
||||
let spec = deployment
|
||||
.spec
|
||||
.as_ref()
|
||||
.unwrap_or_else(|| panic!("Deployment should have spec"));
|
||||
let metadata = spec
|
||||
.template
|
||||
.metadata
|
||||
.as_ref()
|
||||
.unwrap_or_else(|| panic!("Pod template should have metadata"));
|
||||
let labels = metadata
|
||||
.labels
|
||||
.as_ref()
|
||||
.unwrap_or_else(|| panic!("Pod should have labels"));
|
||||
|
||||
assert_eq!(
|
||||
labels.get("app.kubernetes.io/name"),
|
||||
Some(&expected_name.to_string()),
|
||||
"Pod label app.kubernetes.io/name should be '{expected_name}'"
|
||||
);
|
||||
assert_eq!(
|
||||
labels.get("app.kubernetes.io/instance"),
|
||||
Some(&expected_name.to_string()),
|
||||
"Pod label app.kubernetes.io/instance should be '{expected_name}'"
|
||||
);
|
||||
}
|
||||
|
||||
// Container Assertions
|
||||
fn assert_container_metadata(
|
||||
container: &Container,
|
||||
expected_name: &str,
|
||||
expected_image: &str,
|
||||
expected_pull_policy: &str,
|
||||
) {
|
||||
assert_eq!(
|
||||
container.name, expected_name,
|
||||
"Container name should be '{expected_name}'"
|
||||
);
|
||||
assert_eq!(
|
||||
container.image.as_deref(),
|
||||
Some(expected_image),
|
||||
"Container image should be '{expected_image}'"
|
||||
);
|
||||
assert_eq!(
|
||||
container.image_pull_policy.as_deref(),
|
||||
Some(expected_pull_policy),
|
||||
"Image pull policy should be '{expected_pull_policy}'"
|
||||
);
|
||||
}
|
||||
|
||||
fn assert_container_ports_count(container: &Container, expected_count: usize) {
|
||||
let ports = container
|
||||
.ports
|
||||
.as_ref()
|
||||
.unwrap_or_else(|| panic!("Container should have ports"));
|
||||
assert_eq!(
|
||||
ports.len(),
|
||||
expected_count,
|
||||
"Container should have {expected_count} ports"
|
||||
);
|
||||
}
|
||||
|
||||
fn assert_container_port(
|
||||
port: &k8s_openapi::api::core::v1::ContainerPort,
|
||||
expected_name: &str,
|
||||
expected_protocol: &str,
|
||||
expected_number: i32,
|
||||
) {
|
||||
assert_eq!(
|
||||
port.name.as_deref(),
|
||||
Some(expected_name),
|
||||
"Container port name should be '{expected_name}'"
|
||||
);
|
||||
assert_eq!(
|
||||
port.protocol.as_deref(),
|
||||
Some(expected_protocol),
|
||||
"Container port '{expected_name}' protocol should be '{expected_protocol}'"
|
||||
);
|
||||
assert_eq!(
|
||||
port.container_port, expected_number,
|
||||
"Container port '{expected_name}' number should be {expected_number}"
|
||||
);
|
||||
}
|
||||
|
||||
fn assert_container_env_vars_count(container: &Container, expected_count: usize) {
|
||||
let env_vars = container
|
||||
.env
|
||||
.as_ref()
|
||||
.unwrap_or_else(|| panic!("Container should have env vars"));
|
||||
assert_eq!(
|
||||
env_vars.len(),
|
||||
expected_count,
|
||||
"Container should have {expected_count} env vars"
|
||||
);
|
||||
}
|
||||
|
||||
fn assert_container_env_var(env_var: &EnvVar, expected_name: &str, expected_value: &str) {
|
||||
assert_eq!(
|
||||
env_var.name, expected_name,
|
||||
"Env var name should be '{expected_name}'"
|
||||
);
|
||||
assert_eq!(
|
||||
env_var.value.as_deref(),
|
||||
Some(expected_value),
|
||||
"Env var '{expected_name}' value should be '{expected_value}'"
|
||||
);
|
||||
}
|
||||
|
||||
/// Returns a clone of the first (and only expected) container in the
/// Deployment's pod template. Panics when the spec / template / containers
/// chain is missing or empty.
fn get_container(deployment: &Deployment) -> Container {
    // Single chained expression; `.expect` replaces the former
    // `unwrap_or_else(|| panic!(...))` closures with identical messages.
    deployment
        .spec
        .as_ref()
        .expect("Deployment should have spec")
        .template
        .spec
        .as_ref()
        .expect("Pod template should have spec")
        .containers
        .first()
        .expect("Should have exactly one container")
        .clone()
}
|
||||
|
||||
// Test Fixtures
|
||||
fn standard_test_ports() -> Vec<ApplicationNetworkPort> {
|
||||
vec![
|
||||
ApplicationNetworkPort {
|
||||
number: 8080,
|
||||
protocol: NetworkProtocol::TCP,
|
||||
name: "http".to_string(),
|
||||
},
|
||||
ApplicationNetworkPort {
|
||||
number: 9000,
|
||||
protocol: NetworkProtocol::TCP,
|
||||
name: "metrics".to_string(),
|
||||
},
|
||||
ApplicationNetworkPort {
|
||||
number: 50051,
|
||||
protocol: NetworkProtocol::TCP,
|
||||
name: "grpc".to_string(),
|
||||
},
|
||||
]
|
||||
}
|
||||
|
||||
fn standard_test_env_vars() -> Vec<(String, String)> {
|
||||
vec![
|
||||
("ENV_VAR_1".to_string(), "value1".to_string()),
|
||||
("ENV_VAR_2".to_string(), "value2".to_string()),
|
||||
]
|
||||
}
|
||||
|
||||
fn udp_test_ports() -> Vec<ApplicationNetworkPort> {
|
||||
vec![
|
||||
ApplicationNetworkPort {
|
||||
number: 53,
|
||||
protocol: NetworkProtocol::UDP,
|
||||
name: "dns".to_string(),
|
||||
},
|
||||
ApplicationNetworkPort {
|
||||
number: 8080,
|
||||
protocol: NetworkProtocol::TCP,
|
||||
name: "http".to_string(),
|
||||
},
|
||||
]
|
||||
}
|
||||
|
||||
// Test Builder

/// Fluent builder for `BackendApp` test instances; any field left unset
/// falls back to a default in `build` ("test-app" name, empty ports/env vars).
struct BackendAppTestBuilder {
    // Application name; `build` defaults this to "test-app".
    name: Option<String>,
    // Ports the app exposes; `build` defaults this to an empty Vec.
    network_ports: Option<Vec<ApplicationNetworkPort>>,
    // Environment variables as (key, value) pairs; `build` defaults to empty.
    env_vars: Option<Vec<(String, String)>>,
}
|
||||
|
||||
impl BackendAppTestBuilder {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
name: None,
|
||||
network_ports: None,
|
||||
env_vars: None,
|
||||
}
|
||||
}
|
||||
|
||||
fn with_name(mut self, name: impl Into<String>) -> Self {
|
||||
self.name = Some(name.into());
|
||||
self
|
||||
}
|
||||
|
||||
fn with_standard_ports(mut self) -> Self {
|
||||
self.network_ports = Some(standard_test_ports());
|
||||
self
|
||||
}
|
||||
|
||||
fn with_udp_ports(mut self) -> Self {
|
||||
self.network_ports = Some(udp_test_ports());
|
||||
self
|
||||
}
|
||||
|
||||
fn with_standard_env_vars(mut self) -> Self {
|
||||
self.env_vars = Some(standard_test_env_vars());
|
||||
self
|
||||
}
|
||||
|
||||
fn with_no_ports(mut self) -> Self {
|
||||
self.network_ports = Some(vec![]);
|
||||
self
|
||||
}
|
||||
|
||||
fn build(self, project_root: PathBuf) -> BackendApp {
|
||||
BackendApp {
|
||||
name: self.name.unwrap_or_else(|| "test-app".to_string()),
|
||||
project_root,
|
||||
network_ports: self.network_ports.unwrap_or_default(),
|
||||
env_vars: self.env_vars.unwrap_or_default(),
|
||||
build_cmd: BuildCommand::new("cargo", vec!["build"]),
|
||||
dockerfile: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for BackendAppTestBuilder {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
// Helper function for test setup

/// Builds and pushes the Helm package for `app` against `image_url`,
/// failing the calling test with the full error if the build fails.
async fn build_helm_chart_for_test(app: &BackendApp, image_url: &str) {
    let result = app.build_push_helm_package(image_url).await;
    assert!(
        result.is_ok(),
        "build_push_helm_package should succeed: {:?}",
        result
    );
}
|
||||
|
||||
// ===== SERVICE TESTS =====

/// The generated Service must be named after the application.
#[tokio::test]
async fn service_is_created_with_application_name() {
    let temp_dir = tempdir().expect("Failed to create temp directory");
    let app = BackendAppTestBuilder::new()
        .with_name("test-app")
        .with_standard_ports()
        .build(temp_dir.path().to_path_buf());

    build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;

    let service = read_service_yaml(&app.project_root, "test-app");
    assert_service_metadata(&service, "test-app");
}

/// With no explicit type, the Service must default to ClusterIP.
#[tokio::test]
async fn service_has_default_clusterip_type() {
    let temp_dir = tempdir().expect("Failed to create temp directory");
    let app = BackendAppTestBuilder::new()
        .with_name("test-app")
        .with_standard_ports()
        .build(temp_dir.path().to_path_buf());

    build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;

    let service = read_service_yaml(&app.project_root, "test-app");
    assert_service_type(&service, "ClusterIP");
}

/// Every configured application port must appear on the Service,
/// in declaration order.
#[tokio::test]
async fn service_exposes_all_network_ports() {
    let temp_dir = tempdir().expect("Failed to create temp directory");
    let app = BackendAppTestBuilder::new()
        .with_name("test-app")
        .with_standard_ports()
        .build(temp_dir.path().to_path_buf());

    build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;

    let service = read_service_yaml(&app.project_root, "test-app");
    assert_service_port_count(&service, 3);

    let ports = service.spec.unwrap().ports.unwrap();
    assert_service_port(&ports[0], "http", "TCP", 8080);
    assert_service_port(&ports[1], "metrics", "TCP", 9000);
    assert_service_port(&ports[2], "grpc", "TCP", 50051);
}

/// Each Service port's targetPort must equal its port number.
#[tokio::test]
async fn service_target_ports_match_service_ports() {
    let temp_dir = tempdir().expect("Failed to create temp directory");
    let app = BackendAppTestBuilder::new()
        .with_name("test-app")
        .with_standard_ports()
        .build(temp_dir.path().to_path_buf());

    build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;

    let service = read_service_yaml(&app.project_root, "test-app");
    let ports = service.spec.unwrap().ports.unwrap();

    for port in &ports {
        assert_target_port_matches_service_port(port);
    }
}

/// An application with zero ports must not generate a service.yaml at all.
#[tokio::test]
async fn service_not_created_when_application_has_no_ports() {
    let temp_dir = tempdir().expect("Failed to create temp directory");
    let app = BackendAppTestBuilder::new()
        .with_name("test-app-no-ports")
        .with_no_ports()
        .build(temp_dir.path().to_path_buf());

    build_helm_chart_for_test(&app, "registry.example.com/test/test-app-no-ports:1.0.0").await;

    assert!(
        !service_yaml_exists(&app.project_root, "test-app-no-ports"),
        "service.yaml should not exist when there are no network ports"
    );
}

/// UDP and TCP ports must each be rendered with their own protocol.
#[tokio::test]
async fn service_respects_port_protocol_type() {
    let temp_dir = tempdir().expect("Failed to create temp directory");
    let app = BackendAppTestBuilder::new()
        .with_name("udp-app")
        .with_udp_ports()
        .build(temp_dir.path().to_path_buf());

    build_helm_chart_for_test(&app, "registry.example.com/test/udp-app:1.0.0").await;

    let service = read_service_yaml(&app.project_root, "udp-app");
    let ports = service.spec.unwrap().ports.unwrap();

    assert_service_port(&ports[0], "dns", "UDP", 53);
    assert_service_port(&ports[1], "http", "TCP", 8080);
}
|
||||
|
||||
// ===== DEPLOYMENT METADATA TESTS =====

/// The generated Deployment must be named after the application.
#[tokio::test]
async fn deployment_has_application_name() {
    let temp_dir = tempdir().expect("Failed to create temp directory")
;
    let app = BackendAppTestBuilder::new()
        .with_name("test-app")
        .with_standard_ports()
        .build(temp_dir.path().to_path_buf());

    build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;

    let deployment = read_deployment_yaml(&app.project_root, "test-app");
    assert_deployment_metadata(&deployment, "test-app");
}

/// Without explicit configuration, the Deployment must request one replica.
#[tokio::test]
async fn deployment_has_single_replica_by_default() {
    let temp_dir = tempdir().expect("Failed to create temp directory");
    let app = BackendAppTestBuilder::new()
        .with_name("test-app")
        .with_standard_ports()
        .build(temp_dir.path().to_path_buf());

    build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;

    let deployment = read_deployment_yaml(&app.project_root, "test-app");
    assert_deployment_replicas(&deployment, 1);
}

/// The Deployment's selector must target pods labeled with the app name.
#[tokio::test]
async fn deployment_selector_matches_application_name() {
    let temp_dir = tempdir().expect("Failed to create temp directory");
    let app = BackendAppTestBuilder::new()
        .with_name("test-app")
        .with_standard_ports()
        .build(temp_dir.path().to_path_buf());

    build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;

    let deployment = read_deployment_yaml(&app.project_root, "test-app");
    assert_selector_match_label(&deployment, "test-app");
}

/// Pods must carry the standard `app.kubernetes.io/name` and
/// `app.kubernetes.io/instance` labels.
#[tokio::test]
async fn pod_has_standard_kubernetes_labels() {
    let temp_dir = tempdir().expect("Failed to create temp directory");
    let app = BackendAppTestBuilder::new()
        .with_name("test-app")
        .with_standard_ports()
        .build(temp_dir.path().to_path_buf());

    build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;

    let deployment = read_deployment_yaml(&app.project_root, "test-app");
    assert_pod_labels(&deployment, "test-app");
}
|
||||
|
||||
// ===== CONTAINER CONFIGURATION TESTS =====

/// The container must be named after the app, use the pushed image,
/// and default to IfNotPresent pull policy.
#[tokio::test]
async fn container_has_correct_name_and_image() {
    let temp_dir = tempdir().expect("Failed to create temp directory");
    let app = BackendAppTestBuilder::new()
        .with_name("test-app")
        .with_standard_ports()
        .build(temp_dir.path().to_path_buf());

    let image_url = "registry.example.com/test/test-app:1.0.0";
    build_helm_chart_for_test(&app, image_url).await;

    let deployment = read_deployment_yaml(&app.project_root, "test-app");
    let container = get_container(&deployment);

    assert_container_metadata(&container, "test-app", image_url, "IfNotPresent");
}

/// Every configured application port must be declared on the container.
#[tokio::test]
async fn container_exposes_all_application_ports() {
    let temp_dir = tempdir().expect("Failed to create temp directory");
    let app = BackendAppTestBuilder::new()
        .with_name("test-app")
        .with_standard_ports()
        .build(temp_dir.path().to_path_buf());

    build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;

    let deployment = read_deployment_yaml(&app.project_root, "test-app");
    let container = get_container(&deployment);

    assert_container_ports_count(&container, 3);

    let ports = container.ports.unwrap();
    assert_container_port(&ports[0], "http", "TCP", 8080);
    assert_container_port(&ports[1], "metrics", "TCP", 9000);
    assert_container_port(&ports[2], "grpc", "TCP", 50051);
}

/// All configured env vars must be present on the container, in order.
#[tokio::test]
async fn container_has_all_environment_variables() {
    let temp_dir = tempdir().expect("Failed to create temp directory");
    let app = BackendAppTestBuilder::new()
        .with_name("test-app")
        .with_standard_ports()
        .with_standard_env_vars()
        .build(temp_dir.path().to_path_buf());

    build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;

    let deployment = read_deployment_yaml(&app.project_root, "test-app");
    let container = get_container(&deployment);

    assert_container_env_vars_count(&container, 2);

    let env_vars = container.env.unwrap();
    assert_container_env_var(&env_vars[0], "ENV_VAR_1", "value1");
    assert_container_env_var(&env_vars[1], "ENV_VAR_2", "value2");
}
|
||||
|
||||
// ===== BUILD COMMAND UNIT TESTS =====

/// `BuildCommand::new` must store the program and its arguments verbatim.
#[test]
fn build_command_creation_sets_program_and_args() {
    let cmd = BuildCommand::new("docker", vec!["build", "-t", "myimage"]);
    assert_eq!(cmd.program, "docker");
    assert_eq!(cmd.args, vec!["build", "-t", "myimage"]);
}

/// Cloning a `BuildCommand` must copy both fields.
#[test]
fn build_command_clone_copies_all_fields() {
    let original = BuildCommand::new("cargo", vec!["build", "--release"]);
    let copy = original.clone();
    assert_eq!(original.program, copy.program);
    assert_eq!(original.args, copy.args);
}
|
||||
}
|
||||
29
harmony/src/modules/application/config.rs
Normal file
29
harmony/src/modules/application/config.rs
Normal file
@@ -0,0 +1,29 @@
|
||||
use serde::Serialize;
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub enum NetworkProtocol {
|
||||
TCP,
|
||||
UDP,
|
||||
}
|
||||
|
||||
impl NetworkProtocol {
|
||||
pub fn as_str(&self) -> &str {
|
||||
match self {
|
||||
NetworkProtocol::TCP => "TCP",
|
||||
NetworkProtocol::UDP => "UDP",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for NetworkProtocol {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.write_str(self.as_str())
|
||||
}
|
||||
}
|
||||
|
||||
/// A single network port exposed by an application; used to generate both
/// container ports and Service ports in the Helm chart.
#[derive(Debug, Clone, Serialize)]
pub struct ApplicationNetworkPort {
    /// Port number.
    pub number: u16,
    /// TCP or UDP.
    pub protocol: NetworkProtocol,
    /// Port name as it appears in the generated manifests (e.g. "http").
    pub name: String,
}
|
||||
@@ -48,11 +48,11 @@ use crate::{
|
||||
/// - ArgoCD to install/upgrade/rollback/inspect k8s resources
|
||||
/// - Kubernetes for runtime orchestration
|
||||
#[derive(Debug, Default, Clone)]
|
||||
pub struct PackagingDeployment<A: OCICompliant + HelmPackage + Webapp> {
|
||||
pub struct PackagingDeployment<A: OCICompliant + HelmPackage> {
|
||||
pub application: Arc<A>,
|
||||
}
|
||||
|
||||
impl<A: OCICompliant + HelmPackage + Webapp> PackagingDeployment<A> {
|
||||
impl<A: OCICompliant + HelmPackage> PackagingDeployment<A> {
|
||||
async fn deploy_to_local_k3d(
|
||||
&self,
|
||||
app_name: String,
|
||||
@@ -138,7 +138,7 @@ impl<A: OCICompliant + HelmPackage + Webapp> PackagingDeployment<A> {
|
||||
|
||||
#[async_trait]
|
||||
impl<
|
||||
A: OCICompliant + HelmPackage + Webapp + Clone + 'static,
|
||||
A: OCICompliant + HelmPackage + Clone + 'static,
|
||||
T: Topology + HelmCommand + MultiTargetTopology + K8sclient + Ingress + 'static,
|
||||
> ApplicationFeature<T> for PackagingDeployment<A>
|
||||
{
|
||||
@@ -148,24 +148,12 @@ impl<
|
||||
) -> Result<InstallationOutcome, InstallationError> {
|
||||
let image = self.application.image_name();
|
||||
|
||||
let domain = if topology.current_target() == DeploymentTarget::Production {
|
||||
self.application.dns()
|
||||
} else {
|
||||
topology
|
||||
.get_domain(&self.application.name())
|
||||
.await
|
||||
.map_err(|e| e.to_string())?
|
||||
};
|
||||
|
||||
// TODO Write CI/CD workflow files
|
||||
// we can autotedect the CI type using the remote url (default to github action for github
|
||||
// url, etc..)
|
||||
// Or ask for it when unknown
|
||||
|
||||
let helm_chart = self
|
||||
.application
|
||||
.build_push_helm_package(&image, &domain)
|
||||
.await?;
|
||||
let helm_chart = self.application.build_push_helm_package(&image).await?;
|
||||
|
||||
// TODO: Make building image configurable/skippable if image already exists (prompt)")
|
||||
// https://git.nationtech.io/NationTech/harmony/issues/104
|
||||
@@ -215,12 +203,12 @@ impl<
|
||||
};
|
||||
|
||||
Ok(InstallationOutcome::success_with_details(vec![format!(
|
||||
"{}: http://{domain}",
|
||||
"{}",
|
||||
self.application.name()
|
||||
)]))
|
||||
}
|
||||
fn name(&self) -> String {
|
||||
"ContinuousDelivery".to_string()
|
||||
"PackagingDeployment".to_string()
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
446
harmony/src/modules/application/helm/mod.rs
Normal file
446
harmony/src/modules/application/helm/mod.rs
Normal file
@@ -0,0 +1,446 @@
|
||||
// Re-export common Kubernetes types for convenience
|
||||
pub use k8s_openapi::api::{
|
||||
apps::v1::{Deployment, DeploymentSpec},
|
||||
core::v1::{
|
||||
Container, ContainerPort, EnvVar, PodSpec, PodTemplateSpec, Service as K8sService,
|
||||
ServicePort, ServiceSpec,
|
||||
},
|
||||
};
|
||||
use k8s_openapi::apimachinery::pkg::util::intstr::IntOrString;
|
||||
use kube::core::ObjectMeta;
|
||||
|
||||
// Import domain types for the deployment builder
|
||||
use crate::modules::application::config::{ApplicationNetworkPort, NetworkProtocol};
|
||||
use std::fs;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
/// Enum representing all supported Kubernetes resource types for Helm charts.
/// Supports built-in typed resources and custom CRDs via YAML strings.
///
/// Typed variants are serialized with serde_yaml at write time;
/// `CustomYaml` content is emitted verbatim.
pub enum HelmResourceKind {
    /// Built-in typed Service resource
    Service(K8sService),
    /// Built-in typed Deployment resource
    Deployment(Deployment),
    /// Custom resource as pre-serialized YAML (e.g., CRDs, custom types)
    CustomYaml { filename: String, content: String },
    // Can add more typed variants as needed: ConfigMap, Secret, Ingress, etc.
}
|
||||
|
||||
impl HelmResourceKind {
|
||||
pub fn filename(&self) -> String {
|
||||
match self {
|
||||
HelmResourceKind::Service(_) => "service.yaml".to_string(),
|
||||
HelmResourceKind::Deployment(_) => "deployment.yaml".to_string(),
|
||||
HelmResourceKind::CustomYaml { filename, .. } => filename.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn serialize_to_yaml(&self) -> Result<String, serde_yaml::Error> {
|
||||
match self {
|
||||
HelmResourceKind::Service(s) => serde_yaml::to_string(s),
|
||||
HelmResourceKind::Deployment(d) => serde_yaml::to_string(d),
|
||||
HelmResourceKind::CustomYaml { content, .. } => Ok(content.clone()),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn as_service(&self) -> Option<&K8sService> {
|
||||
match self {
|
||||
HelmResourceKind::Service(s) => Some(s),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn as_deployment(&self) -> Option<&Deployment> {
|
||||
match self {
|
||||
HelmResourceKind::Deployment(d) => Some(d),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Add a custom resource from any serializable type (e.g., CRDs, custom types)
|
||||
pub fn from_yaml(filename: impl Into<String>, content: impl Into<String>) -> Self {
|
||||
HelmResourceKind::CustomYaml {
|
||||
filename: filename.into(),
|
||||
content: content.into(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Add a custom resource from any type that implements Serialize
|
||||
pub fn from_serializable<T: serde::Serialize>(
|
||||
filename: impl Into<String>,
|
||||
resource: &T,
|
||||
) -> Result<Self, serde_yaml::Error> {
|
||||
Ok(HelmResourceKind::CustomYaml {
|
||||
filename: filename.into(),
|
||||
content: serde_yaml::to_string(resource)?,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// The main orchestrator for building a Helm chart.
///
/// Collects typed/custom resources plus flat `key: value` entries for
/// values.yaml, then materializes the chart directory via `write_to`.
pub struct HelmChart {
    /// Chart name; also used as the output directory name.
    pub name: String,
    /// Chart version written to Chart.yaml (set to "0.1.0" by `new`).
    pub version: String,
    /// Application version written to Chart.yaml.
    pub app_version: String,
    /// Human-readable chart description.
    pub description: String,
    /// Resources rendered into `templates/`.
    pub resources: Vec<HelmResourceKind>,
    /// Raw `key: value` lines, newline-joined into values.yaml.
    pub values: Vec<String>,
}
|
||||
|
||||
impl HelmChart {
|
||||
pub fn new(name: String, app_version: String) -> Self {
|
||||
Self {
|
||||
name: name.clone(),
|
||||
version: "0.1.0".to_string(),
|
||||
app_version,
|
||||
description: format!("A Helm chart for {}", name),
|
||||
resources: Vec::new(),
|
||||
values: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn add_resource(&mut self, resource: HelmResourceKind) {
|
||||
self.resources.push(resource);
|
||||
}
|
||||
|
||||
pub fn add_value(&mut self, key: &str, value: &str) {
|
||||
self.values.push(format!("{}: {}", key, value));
|
||||
}
|
||||
|
||||
pub fn write_to(&self, base_path: &Path) -> Result<PathBuf, Box<dyn std::error::Error>> {
|
||||
let chart_dir = base_path.join(&self.name);
|
||||
let templates_dir = chart_dir.join("templates");
|
||||
fs::create_dir_all(&templates_dir)?;
|
||||
|
||||
// 1. Render and write Chart.yaml
|
||||
let chart_yaml = ChartYaml {
|
||||
name: &self.name,
|
||||
description: &self.description,
|
||||
version: &self.version,
|
||||
app_version: &self.app_version,
|
||||
};
|
||||
fs::write(chart_dir.join("Chart.yaml"), chart_yaml.render()?)?;
|
||||
|
||||
// 2. Write values.yaml (Constructed dynamically)
|
||||
let values_content = self.values.join("\n");
|
||||
fs::write(chart_dir.join("values.yaml"), values_content)?;
|
||||
|
||||
// 3. Serialize and write all added resources (Deployment, Service, etc.)
|
||||
for resource in &self.resources {
|
||||
let filename = resource.filename();
|
||||
let content = resource
|
||||
.serialize_to_yaml()
|
||||
.map_err(|e| format!("Failed to serialize resource {}: {}", filename, e))?;
|
||||
fs::write(templates_dir.join(filename), content)?;
|
||||
}
|
||||
|
||||
Ok(chart_dir)
|
||||
}
|
||||
}
|
||||
|
||||
use askama::Template;

/// Askama template context for rendering Chart.yaml
/// (template file: `helm/Chart.yaml.j2`).
#[derive(Template)]
#[template(path = "helm/Chart.yaml.j2")]
struct ChartYaml<'a> {
    name: &'a str,
    description: &'a str,
    version: &'a str,
    app_version: &'a str,
}
|
||||
|
||||
/// Builder for creating a Kubernetes Service with proper labels and selectors.
pub struct ServiceBuilder {
    // Service name; also used for the app.kubernetes.io/name label.
    name: String,
    // Kubernetes Service type; defaults to "ClusterIP" in `new`.
    service_type: String,
    // Accumulated ports; omitted from the spec entirely when empty.
    ports: Vec<ServicePort>,
    // Value for the app.kubernetes.io/name selector; empty until set.
    selector_label: String,
}
|
||||
|
||||
impl ServiceBuilder {
|
||||
pub fn new(name: impl Into<String>) -> Self {
|
||||
Self {
|
||||
name: name.into(),
|
||||
service_type: "ClusterIP".to_string(),
|
||||
ports: Vec::new(),
|
||||
selector_label: String::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn service_type(mut self, service_type: impl Into<String>) -> Self {
|
||||
self.service_type = service_type.into();
|
||||
self
|
||||
}
|
||||
|
||||
pub fn with_port(
|
||||
mut self,
|
||||
name: impl Into<String>,
|
||||
port: i32,
|
||||
protocol: impl Into<String>,
|
||||
) -> Self {
|
||||
use k8s_openapi::apimachinery::pkg::util::intstr::IntOrString;
|
||||
self.ports.push(ServicePort {
|
||||
name: Some(name.into()),
|
||||
protocol: Some(protocol.into()),
|
||||
port,
|
||||
target_port: Some(IntOrString::Int(port)),
|
||||
..Default::default()
|
||||
});
|
||||
self
|
||||
}
|
||||
|
||||
pub fn selector_label(mut self, label: impl Into<String>) -> Self {
|
||||
self.selector_label = label.into();
|
||||
self
|
||||
}
|
||||
|
||||
pub fn build(self) -> K8sService {
|
||||
K8sService {
|
||||
metadata: ObjectMeta {
|
||||
name: Some(self.name.clone()),
|
||||
labels: Some(
|
||||
[
|
||||
("app.kubernetes.io/name".to_string(), self.name.clone()),
|
||||
(
|
||||
"app.kubernetes.io/component".to_string(),
|
||||
"service".to_string(),
|
||||
),
|
||||
(
|
||||
"app.kubernetes.io/managed-by".to_string(),
|
||||
"harmony".to_string(),
|
||||
),
|
||||
]
|
||||
.into(),
|
||||
),
|
||||
..Default::default()
|
||||
},
|
||||
spec: Some(ServiceSpec {
|
||||
type_: Some(self.service_type),
|
||||
selector: Some(
|
||||
[("app.kubernetes.io/name".to_string(), self.selector_label)].into(),
|
||||
),
|
||||
ports: if self.ports.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(self.ports)
|
||||
},
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Builder for creating a Kubernetes Deployment with pod template and container spec.
pub struct DeploymentBuilder {
    // Deployment/container name; also used for standard labels and selector.
    name: String,
    // Container image reference.
    image: String,
    // Replica count; defaults to 1 via `with_options`.
    replicas: i32,
    // Container ports declared on the single container.
    container_ports: Vec<ContainerPort>,
    // Literal-valued environment variables for the container.
    env_vars: Vec<EnvVar>,
    // Pull policy; `with_options` initializes it to Some("IfNotPresent").
    image_pull_policy: Option<String>,
}
|
||||
|
||||
impl DeploymentBuilder {
    /// Create a new DeploymentBuilder with minimal required fields.
    pub fn new(name: impl Into<String>, image: impl Into<String>) -> Self {
        Self::with_options(name, image, None, None, None)
    }

    /// Create a new DeploymentBuilder with optional initial configuration.
    ///
    /// Arguments:
    /// - `name`: The deployment name
    /// - `image`: The container image to use
    /// - `ports`: Optional vector of initial application network ports
    /// - `env_vars`: Optional vector of initial environment variable key-value pairs
    /// - `replicas`: Optional number of replicas (defaults to 1)
    pub fn with_options(
        name: impl Into<String>,
        image: impl Into<String>,
        ports: Option<Vec<ApplicationNetworkPort>>,
        env_vars: Option<Vec<(String, String)>>,
        replicas: Option<i32>,
    ) -> Self {
        // Map domain ports (u16 + NetworkProtocol) onto k8s ContainerPort
        // (i32 + protocol string); u16 -> i32 cannot overflow.
        let container_ports: Vec<ContainerPort> = ports
            .unwrap_or_default()
            .into_iter()
            .map(|port| ContainerPort {
                container_port: port.number as i32,
                name: Some(port.name),
                protocol: Some(port.protocol.to_string()),
                ..Default::default()
            })
            .collect();

        // Literal-valued env vars only; no valueFrom sources are produced here.
        let k8s_env_vars: Vec<EnvVar> = env_vars
            .unwrap_or_default()
            .into_iter()
            .map(|(key, value)| EnvVar {
                name: key,
                value: Some(value),
                ..Default::default()
            })
            .collect();

        Self {
            name: name.into(),
            image: image.into(),
            replicas: replicas.unwrap_or(1),
            container_ports,
            env_vars: k8s_env_vars,
            image_pull_policy: Some("IfNotPresent".to_string()),
        }
    }

    /// Sets the replica count.
    pub fn replicas(mut self, replicas: i32) -> Self {
        self.replicas = replicas;
        self
    }

    /// Appends one container port.
    pub fn with_container_port(
        mut self,
        number: i32,
        name: impl Into<String>,
        protocol: impl Into<String>,
    ) -> Self {
        self.container_ports.push(ContainerPort {
            container_port: number,
            name: Some(name.into()),
            protocol: Some(protocol.into()),
            ..Default::default()
        });
        self
    }

    /// Appends one literal environment variable.
    pub fn with_env_var(mut self, name: impl Into<String>, value: impl Into<String>) -> Self {
        self.env_vars.push(EnvVar {
            name: name.into(),
            value: Some(value.into()),
            ..Default::default()
        });
        self
    }

    /// Overrides the image pull policy (default "IfNotPresent").
    pub fn image_pull_policy(mut self, policy: impl Into<String>) -> Self {
        self.image_pull_policy = Some(policy.into());
        self
    }

    /// Builds the Deployment: standard harmony labels, a selector on
    /// app.kubernetes.io/name, and a single-container pod template.
    /// Empty port/env lists are omitted from the spec entirely.
    pub fn build(self) -> Deployment {
        let name = self.name.clone();
        Deployment {
            metadata: ObjectMeta {
                name: Some(name.clone()),
                labels: Some(
                    [
                        ("app.kubernetes.io/name".to_string(), name.clone()),
                        (
                            "app.kubernetes.io/component".to_string(),
                            "deployment".to_string(),
                        ),
                        (
                            "app.kubernetes.io/managed-by".to_string(),
                            "harmony".to_string(),
                        ),
                        // NOTE(review): version label is hardcoded; presumably it
                        // should track the chart's app_version — confirm.
                        ("app.kubernetes.io/version".to_string(), "1.0.0".to_string()),
                    ]
                    .into(),
                ),
                ..Default::default()
            },
            spec: Some(DeploymentSpec {
                replicas: Some(self.replicas),
                selector: k8s_openapi::apimachinery::pkg::apis::meta::v1::LabelSelector {
                    match_labels: Some(
                        [("app.kubernetes.io/name".to_string(), name.clone())].into(),
                    ),
                    ..Default::default()
                },
                template: PodTemplateSpec {
                    metadata: Some(ObjectMeta {
                        labels: Some(
                            [
                                ("app.kubernetes.io/name".to_string(), name.clone()),
                                ("app.kubernetes.io/instance".to_string(), name.clone()),
                            ]
                            .into(),
                        ),
                        ..Default::default()
                    }),
                    spec: Some(PodSpec {
                        containers: vec![Container {
                            name: name.clone(),
                            image: Some(self.image),
                            image_pull_policy: self.image_pull_policy,
                            ports: if self.container_ports.is_empty() {
                                None
                            } else {
                                Some(self.container_ports)
                            },
                            env: if self.env_vars.is_empty() {
                                None
                            } else {
                                Some(self.env_vars)
                            },
                            ..Default::default()
                        }],
                        ..Default::default()
                    }),
                },
                ..Default::default()
            }),
            ..Default::default()
        }
    }
}
|
||||
|
||||
/// Helper function to create a Service from network port configuration.
|
||||
/// Returns `None` if no ports are provided.
|
||||
pub fn create_service_from_ports(
|
||||
name: String,
|
||||
network_ports: &[ApplicationNetworkPort],
|
||||
) -> Option<K8sService> {
|
||||
if network_ports.is_empty() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let ports: Vec<ServicePort> = network_ports
|
||||
.into_iter()
|
||||
.map(|port| ServicePort {
|
||||
name: Some(port.name.clone()),
|
||||
protocol: Some(port.protocol.to_string()),
|
||||
port: port.number as i32,
|
||||
target_port: Some(IntOrString::Int(port.number as i32)),
|
||||
..Default::default()
|
||||
})
|
||||
.collect();
|
||||
|
||||
Some(K8sService {
|
||||
metadata: ObjectMeta {
|
||||
name: Some(name.clone()),
|
||||
labels: Some(
|
||||
[
|
||||
("app.kubernetes.io/name".to_string(), name.clone()),
|
||||
(
|
||||
"app.kubernetes.io/component".to_string(),
|
||||
"service".to_string(),
|
||||
),
|
||||
(
|
||||
"app.kubernetes.io/managed-by".to_string(),
|
||||
"harmony".to_string(),
|
||||
),
|
||||
]
|
||||
.into(),
|
||||
),
|
||||
..Default::default()
|
||||
},
|
||||
spec: Some(ServiceSpec {
|
||||
type_: Some("ClusterIP".to_string()),
|
||||
selector: Some([("app.kubernetes.io/name".to_string(), name)].into()),
|
||||
ports: Some(ports),
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
})
|
||||
}
|
||||
@@ -1,5 +1,8 @@
|
||||
pub mod backend_app;
|
||||
pub mod config;
|
||||
mod feature;
|
||||
pub mod features;
|
||||
pub mod helm;
|
||||
pub mod oci;
|
||||
mod rust;
|
||||
mod webapp;
|
||||
@@ -124,3 +127,15 @@ impl Serialize for dyn Application {
|
||||
todo!()
|
||||
}
|
||||
}
|
||||
|
||||
/// Checks the output of a process command for success.
|
||||
fn check_output(
|
||||
output: &std::process::Output,
|
||||
msg: &str,
|
||||
) -> Result<(), Box<dyn std::error::Error>> {
|
||||
if !output.status.success() {
|
||||
let error_message = format!("{}: {}", msg, String::from_utf8_lossy(&output.stderr));
|
||||
return Err(error_message.into());
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1,5 +1,13 @@
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use crate::{
|
||||
config::{REGISTRY_PROJECT, REGISTRY_URL},
|
||||
modules::application::check_output,
|
||||
};
|
||||
|
||||
use super::Application;
|
||||
use async_trait::async_trait;
|
||||
use log::debug;
|
||||
|
||||
#[async_trait]
|
||||
pub trait OCICompliant: Application {
|
||||
@@ -17,9 +25,74 @@ pub trait HelmPackage: Application {
|
||||
/// # Arguments
|
||||
/// * `image_url` - The full URL of the OCI container image to be used in the Deployment.
|
||||
/// * `domain` - The domain where the application is hosted.
|
||||
async fn build_push_helm_package(
|
||||
&self,
|
||||
image_url: &str,
|
||||
domain: &str,
|
||||
) -> Result<String, String>;
|
||||
async fn build_push_helm_package(&self, image_url: &str) -> Result<String, String>;
|
||||
|
||||
fn project_root(&self) -> PathBuf;
|
||||
|
||||
fn chart_name(&self) -> String;
|
||||
|
||||
/// Packages a Helm chart directory into a .tgz file.
|
||||
fn package_helm_chart(&self, chart_dir: &Path) -> Result<PathBuf, Box<dyn std::error::Error>> {
|
||||
let chart_dirname = chart_dir.file_name().expect("Should find a chart dirname");
|
||||
debug!(
|
||||
"Launching `helm package {}` cli with CWD {}",
|
||||
chart_dirname.to_string_lossy(),
|
||||
&self
|
||||
.project_root()
|
||||
.join(".harmony_generated")
|
||||
.join("helm")
|
||||
.to_string_lossy()
|
||||
);
|
||||
let output = std::process::Command::new("helm")
|
||||
.args(["package", chart_dirname.to_str().unwrap()])
|
||||
.current_dir(self.project_root().join(".harmony_generated").join("helm")) // Run package from the parent dir
|
||||
.output()?;
|
||||
|
||||
check_output(&output, "Failed to package Helm chart")?;
|
||||
|
||||
// Helm prints the path of the created chart to stdout.
|
||||
let tgz_name = String::from_utf8(output.stdout)?
|
||||
.split_whitespace()
|
||||
.last()
|
||||
.unwrap_or_default()
|
||||
.to_string();
|
||||
if tgz_name.is_empty() {
|
||||
return Err("Could not determine packaged chart filename.".into());
|
||||
}
|
||||
|
||||
// The output from helm is relative, so we join it with the execution directory.
|
||||
Ok(self
|
||||
.project_root()
|
||||
.join(".harmony_generated")
|
||||
.join("helm")
|
||||
.join(tgz_name))
|
||||
}
|
||||
|
||||
/// Pushes a packaged Helm chart to an OCI registry.
|
||||
fn push_helm_chart(
|
||||
&self,
|
||||
packaged_chart_path: &Path,
|
||||
) -> Result<String, Box<dyn std::error::Error>> {
|
||||
// The chart name is the file stem of the .tgz file
|
||||
let chart_file_name = packaged_chart_path.file_stem().unwrap().to_str().unwrap();
|
||||
let oci_push_url = format!("oci://{}/{}", *REGISTRY_URL, *REGISTRY_PROJECT);
|
||||
let oci_pull_url = format!("{oci_push_url}/{}-chart", self.chart_name());
|
||||
debug!(
|
||||
"Pushing Helm chart {} to {}",
|
||||
packaged_chart_path.to_string_lossy(),
|
||||
oci_push_url
|
||||
);
|
||||
|
||||
let output = std::process::Command::new("helm")
|
||||
.args(["push", packaged_chart_path.to_str().unwrap(), &oci_push_url])
|
||||
.output()?;
|
||||
|
||||
check_output(&output, "Pushing Helm chart failed")?;
|
||||
|
||||
// The final URL includes the version tag, which is part of the file name
|
||||
let version = chart_file_name.rsplit_once('-').unwrap().1;
|
||||
debug!("pull url {oci_pull_url}");
|
||||
debug!("push url {oci_push_url}");
|
||||
Ok(format!("{}:{}", oci_pull_url, version))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -81,16 +81,21 @@ impl Webapp for RustWebapp {
|
||||
|
||||
#[async_trait]
|
||||
impl HelmPackage for RustWebapp {
|
||||
async fn build_push_helm_package(
|
||||
&self,
|
||||
image_url: &str,
|
||||
domain: &str,
|
||||
) -> Result<String, String> {
|
||||
fn project_root(&self) -> PathBuf {
|
||||
self.project_root.clone()
|
||||
}
|
||||
|
||||
fn chart_name(&self) -> String {
|
||||
self.name.clone()
|
||||
}
|
||||
|
||||
async fn build_push_helm_package(&self, image_url: &str) -> Result<String, String> {
|
||||
let domain = self.dns();
|
||||
info!("Starting Helm chart build and push for '{}'", self.name);
|
||||
|
||||
// 1. Create the Helm chart files on disk.
|
||||
let chart_dir = self
|
||||
.create_helm_chart_files(image_url, domain)
|
||||
.create_helm_chart_files(image_url, &domain)
|
||||
.await
|
||||
.map_err(|e| format!("Failed to create Helm chart files: {}", e))?;
|
||||
info!("Successfully created Helm chart files in {:?}", chart_dir);
|
||||
@@ -327,19 +332,6 @@ impl RustWebapp {
|
||||
Ok(image_tag.to_string())
|
||||
}
|
||||
|
||||
/// Checks the output of a process command for success.
|
||||
fn check_output(
|
||||
&self,
|
||||
output: &process::Output,
|
||||
msg: &str,
|
||||
) -> Result<(), Box<dyn std::error::Error>> {
|
||||
if !output.status.success() {
|
||||
let error_message = format!("{}: {}", msg, String::from_utf8_lossy(&output.stderr));
|
||||
return Err(error_message.into());
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn build_builder_image(&self, dockerfile: &mut Dockerfile) {
|
||||
match self.framework {
|
||||
Some(RustWebFramework::Leptos) => {
|
||||
@@ -640,71 +632,6 @@ spec:
|
||||
Ok(chart_dir)
|
||||
}
|
||||
|
||||
/// Packages a Helm chart directory into a .tgz file.
|
||||
fn package_helm_chart(&self, chart_dir: &Path) -> Result<PathBuf, Box<dyn std::error::Error>> {
|
||||
let chart_dirname = chart_dir.file_name().expect("Should find a chart dirname");
|
||||
debug!(
|
||||
"Launching `helm package {}` cli with CWD {}",
|
||||
chart_dirname.to_string_lossy(),
|
||||
&self
|
||||
.project_root
|
||||
.join(".harmony_generated")
|
||||
.join("helm")
|
||||
.to_string_lossy()
|
||||
);
|
||||
let output = process::Command::new("helm")
|
||||
.args(["package", chart_dirname.to_str().unwrap()])
|
||||
.current_dir(self.project_root.join(".harmony_generated").join("helm")) // Run package from the parent dir
|
||||
.output()?;
|
||||
|
||||
self.check_output(&output, "Failed to package Helm chart")?;
|
||||
|
||||
// Helm prints the path of the created chart to stdout.
|
||||
let tgz_name = String::from_utf8(output.stdout)?
|
||||
.split_whitespace()
|
||||
.last()
|
||||
.unwrap_or_default()
|
||||
.to_string();
|
||||
if tgz_name.is_empty() {
|
||||
return Err("Could not determine packaged chart filename.".into());
|
||||
}
|
||||
|
||||
// The output from helm is relative, so we join it with the execution directory.
|
||||
Ok(self
|
||||
.project_root
|
||||
.join(".harmony_generated")
|
||||
.join("helm")
|
||||
.join(tgz_name))
|
||||
}
|
||||
|
||||
/// Pushes a packaged Helm chart to an OCI registry.
|
||||
fn push_helm_chart(
|
||||
&self,
|
||||
packaged_chart_path: &Path,
|
||||
) -> Result<String, Box<dyn std::error::Error>> {
|
||||
// The chart name is the file stem of the .tgz file
|
||||
let chart_file_name = packaged_chart_path.file_stem().unwrap().to_str().unwrap();
|
||||
let oci_push_url = format!("oci://{}/{}", *REGISTRY_URL, *REGISTRY_PROJECT);
|
||||
let oci_pull_url = format!("{oci_push_url}/{}-chart", self.name);
|
||||
debug!(
|
||||
"Pushing Helm chart {} to {}",
|
||||
packaged_chart_path.to_string_lossy(),
|
||||
oci_push_url
|
||||
);
|
||||
|
||||
let output = process::Command::new("helm")
|
||||
.args(["push", packaged_chart_path.to_str().unwrap(), &oci_push_url])
|
||||
.output()?;
|
||||
|
||||
self.check_output(&output, "Pushing Helm chart failed")?;
|
||||
|
||||
// The final URL includes the version tag, which is part of the file name
|
||||
let version = chart_file_name.rsplit_once('-').unwrap().1;
|
||||
debug!("pull url {oci_pull_url}");
|
||||
debug!("push url {oci_push_url}");
|
||||
Ok(format!("{}:{}", oci_pull_url, version))
|
||||
}
|
||||
|
||||
fn get_or_build_dockerfile(&self) -> Result<PathBuf, Box<dyn std::error::Error>> {
|
||||
let existing_dockerfile = self.project_root.join("Dockerfile");
|
||||
|
||||
|
||||
6
harmony/templates/helm/Chart.yaml.j2
Normal file
6
harmony/templates/helm/Chart.yaml.j2
Normal file
@@ -0,0 +1,6 @@
|
||||
apiVersion: v2
|
||||
name: {{ name }}
|
||||
description: {{ description }}
|
||||
type: application
|
||||
version: {{ version }}
|
||||
appVersion: "{{ app_version }}"
|
||||
4
harmony_agent/.dockerignore
Normal file
4
harmony_agent/.dockerignore
Normal file
@@ -0,0 +1,4 @@
|
||||
.git
|
||||
data
|
||||
target
|
||||
demos
|
||||
26
harmony_agent/Cargo.toml
Normal file
26
harmony_agent/Cargo.toml
Normal file
@@ -0,0 +1,26 @@
|
||||
[package]
|
||||
name = "harmony_agent"
|
||||
edition = "2024"
|
||||
version.workspace = true
|
||||
readme.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
harmony = { path = "../harmony" }
|
||||
# harmony_cli = { path = "../harmony_cli" }
|
||||
harmony_types = { path = "../harmony_types" }
|
||||
harmony_macros = { path = "../harmony_macros" }
|
||||
cidr = { workspace = true }
|
||||
tokio = { workspace = true }
|
||||
log = { workspace = true }
|
||||
env_logger = { workspace = true }
|
||||
async-nats = "0.45.0"
|
||||
async-trait = "0.1"
|
||||
# url = { workspace = true }
|
||||
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
getrandom = "0.3.4"
|
||||
|
||||
thiserror.workspace = true
|
||||
pretty_assertions.workspace = true
|
||||
44
harmony_agent/Dockerfile
Normal file
44
harmony_agent/Dockerfile
Normal file
@@ -0,0 +1,44 @@
|
||||
# Build stage
|
||||
FROM rust:slim AS builder
|
||||
|
||||
# Install build dependencies
|
||||
RUN apt-get update && apt-get install -y pkg-config && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# Copy all required packages
|
||||
COPY . .
|
||||
|
||||
RUN ls -la1
|
||||
|
||||
# Build the application in release mode
|
||||
RUN cargo build --release -p harmony_agent
|
||||
|
||||
# Runtime stage
|
||||
FROM debian:bookworm-slim
|
||||
|
||||
# Install runtime dependencies
|
||||
RUN apt-get update && apt-get install -y ca-certificates && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# Copy the binary from the builder stage
|
||||
COPY --from=builder /app/target/release/harmony_agent ./harmony_agent
|
||||
|
||||
# Declare environment variables used by the Harmony Agent
|
||||
# These will be set from build-time environment variables if present
|
||||
# NATS_URL: URL of the NATS server (default: nats://localhost:4222)
|
||||
ARG NATS_URL=nats://localhost:4222
|
||||
ENV NATS_URL=${NATS_URL}
|
||||
# NATS_CREDS_PATH: Optional path to NATS credentials file
|
||||
ARG NATS_CREDS_PATH
|
||||
ENV NATS_CREDS_PATH=${NATS_CREDS_PATH}
|
||||
# MY_CLUSTER_ID: This cluster's unique identifier (required)
|
||||
ARG MY_CLUSTER_ID
|
||||
ENV MY_CLUSTER_ID=${MY_CLUSTER_ID}
|
||||
# DESIRED_PRIMARY: The ID of the desired primary cluster (required)
|
||||
ARG DESIRED_PRIMARY
|
||||
ENV DESIRED_PRIMARY=${DESIRED_PRIMARY}
|
||||
|
||||
# Run the application
|
||||
ENTRYPOINT ["./harmony_agent"]
|
||||
248
harmony_agent/README.md
Normal file
248
harmony_agent/README.md
Normal file
@@ -0,0 +1,248 @@
|
||||
TODO
|
||||
|
||||
DONE:
|
||||
1. ✅ store trait subscribe definition missing callback - Fixed with SubscriptionCallback type
|
||||
2. ✅ BUG: data integrity issue: nats store now using jetstream metadata (entry.created, entry.revision)
|
||||
3. ✅ fix replica workflow not transitioning to "failed" when failure_threshold is exceeded
|
||||
4. ✅ fix replica workflow to hold copy of cluster state - cluster_state field added to HarmonyAgent
|
||||
5. ✅ heartbeat metadata now passed to workflow via on_heartbeat_stored() callback
|
||||
6. ✅ failover_timeout added to AgentConfig
|
||||
7. ✅ NATS store properly detects SequenceMismatch and returns SequenceMismatch error
|
||||
8. ✅ startup reconciliation implemented via on_startup() method
|
||||
|
||||
REMAINING:
|
||||
- review all code and list implementation issues
|
||||
- review both workflow for each state transition
|
||||
- Complete replica workflow staleness detection (needs implementation in Watching state)
|
||||
- Implement state recovery from Failed state for both workflows
|
||||
- Implement subscribe in NATS store with watch() API
|
||||
- Implement config validation for failover_timeout constraints
|
||||
|
||||
TODO
|
||||
|
||||
1. store trait subscribe definition missing callback
|
||||
2. BUG, data integrity issue : nats store not actually using jetstream metadata
|
||||
3. review all code and list implementation issues
|
||||
4. review both workflow for each state transition
|
||||
5. fix replica workflow not transitioning to "failed" when failure_threshold is exceeded
|
||||
6. fix replica workflow to hold also a copy of the cluster state (actually the agent itself
|
||||
should hold it probably, every agent should be subscribed to the cluster_state object and
|
||||
keep it in memory to allow workflows to process against it efficiently)
|
||||
|
||||
## CRITICAL - Data Integrity Issues
|
||||
|
||||
1. **NATS Store `set_strict` doesn't enforce CAS** (`store/nats.rs`)
|
||||
- Currently uses `put()` which overwrites unconditionally
|
||||
- Must use `update()` with revision parameter for proper compare-and-set
|
||||
- Without this, concurrent promotion attempts can cause split brain
|
||||
|
||||
2. **NATS Store uses local clock instead of JetStream metadata** (`store/nats.rs`)
|
||||
- Lines 55, 68: Using `SystemTime::now()` violates ADR-017-3
|
||||
- NATS Entry has `.revision` and `.created` fields that must be used
|
||||
- This defeats the entire purpose of store-provided timestamps
|
||||
|
||||
3. **Heartbeat metadata not passed to ReplicaWorkflow** (`agent_loop.rs::run_heartbeat_loop`)
|
||||
- Line ~156: TODO comment confirms missing metadata passing
|
||||
- Replica cannot calculate staleness without metadata.timestamp
|
||||
- Failover logic is broken
|
||||
|
||||
4. **No actual cluster state watching exists**
|
||||
- Replica workflow declares `ClusterState` but never updates it
|
||||
- No subscription to primary heartbeat or cluster_state key
|
||||
- Replica cannot detect primary liveness
|
||||
|
||||
## HIGH - Missing Core Functionality
|
||||
|
||||
5. **Replica Workflow incomplete** - All key logic is TODO:
|
||||
- Watching primary staleness (line 114)
|
||||
- Promotion attempt (line 118)
|
||||
- Original primary recovery detection (line 127)
|
||||
- Demotion/handshake (line 131)
|
||||
|
||||
6. **Missing replica "Failed" state**
|
||||
- `ReplicaState` enum has no `Failed` variant
|
||||
- User's TODO #5 correctly identifies this gap
|
||||
- What happens if replica's own heartbeats fail repeatedly?
|
||||
|
||||
7. **Primary Workflow incomplete** - Key logic missing:
|
||||
- No NATS check before recovering from `Fenced` state (line 95)
|
||||
- No NATS check in `Yielding` state for demotion handshake (line 101)
|
||||
- No actual fencing failure handling
|
||||
|
||||
8. **Store `subscribe` not implemented** (`store/mod.rs`)
|
||||
- Returns `todo!()` in NATS implementation
|
||||
- No callback mechanism defined in trait
|
||||
- Without this, agents cannot react to state changes
|
||||
|
||||
9. **Cluster state not tracked centrally**
|
||||
- User's TODO #6 correctly identifies this
|
||||
- Each agent should maintain a local copy of cluster_state
|
||||
- No subscription mechanism to update this local copy
|
||||
|
||||
10. **No validation of configuration constraints**
|
||||
- Should validate: `failover_timeout > heartbeat_timeout * failure_threshold + safety_margin`
|
||||
- Invalid config could cause split brain
|
||||
|
||||
## MEDIUM - Incorrect State Transitions
|
||||
|
||||
11. **Primary immediately transitions `Failed -> Fenced`** (`workflow/primary.rs:120-121`)
|
||||
- Two state transitions happen in one heartbeat cycle
|
||||
- Should stay in `Failed` until fencing actually completes
|
||||
- What if fencing fails? State machine won't reflect it
|
||||
|
||||
12. **No fencing failure handling**
|
||||
- If `on_failover()` fails, node thinks it's fenced but DB is still accepting writes
|
||||
- ADR mentions escalating to radical measures, but no callback for failure
|
||||
|
||||
13. **Replica `Watching` state does nothing**
|
||||
- Line 115: Just logs, checks nothing
|
||||
- Should be checking staleness of primary heartbeat
|
||||
|
||||
14. **Demotion handshake not implemented**
|
||||
- ADR section 4 details this but code doesn't implement it
|
||||
- How does original primary know it should yield?
|
||||
|
||||
## LOW - Observability & Reliability
|
||||
|
||||
15. **No graceful shutdown mechanism**
|
||||
- `run_heartbeat_loop` runs forever
|
||||
- No signal handling (SIGTERM, SIGINT)
|
||||
|
||||
16. **Async task errors silently ignored**
|
||||
- `tokio::spawn` at lines 74, 83, 123
|
||||
- No `JoinHandle` retention or error handling
|
||||
|
||||
17. **No metrics/observability**
|
||||
- Only log output
|
||||
- No Prometheus metrics for state transitions, failure counts, etc.
|
||||
|
||||
18. **Hardcoded main() function** (`agent_loop.rs::main`)
|
||||
- Not production-ready entry point
|
||||
- Should load config from environment or file
|
||||
|
||||
19. **Store factory pattern missing**
|
||||
- TODO comment at line 54 confirms this
|
||||
- Can't switch between stores via config
|
||||
|
||||
20. **No backoff/retry logic for NATS operations**
|
||||
- Transient failures could trigger unnecessary fencing
|
||||
|
||||
21. **`AgentInfo` status is hardcoded to "HEALTHY"**
|
||||
- Line 137 in `store_heartbeat`
|
||||
- Should reflect actual workflow state
|
||||
|
||||
22. **Unused fields in structs**
|
||||
- `HeartbeatState.last_seq` set but never read
|
||||
- `ClusterState.current_primary` set but never read
|
||||
|
||||
## ADR-017-3 Compliance Issues
|
||||
|
||||
23. **ADR violation: Clock skew not avoided**
|
||||
- While ADR says use store metadata, code uses local time
|
||||
|
||||
24. **Failover timeout not configurable**
|
||||
- Defined in ADR but not in `AgentConfig`
|
||||
- Needed for replica staleness calculation
|
||||
|
||||
25. **Safety margin concept exists in ADR but not in code**
|
||||
- Configuration should include this margin
|
||||
|
||||
26. **No handling of Case 3 (Replica Network Lag)**
|
||||
- ADR describes NATS rejection prevention
|
||||
- But `set_strict` implementation accepts any write
|
||||
|
||||
## Code Quality Issues
|
||||
|
||||
27. **Inconsistent error handling**
|
||||
- Some paths return `Err`, others `todo!()`, others ignore
|
||||
|
||||
28. **Unnecessary `Clone` bounds**
|
||||
- `DeploymentConfig.clone()` used frequently
|
||||
- Could be optimized with `Arc`
|
||||
|
||||
29. **Missing lifetime annotations**
|
||||
- `KvStore::get` returns `String` key in error - inefficient
|
||||
|
||||
30. **No integration points mentioned**
|
||||
- PostgreSQL lifecycle control implementation missing
|
||||
- Fencing via CNPG not connected
|
||||
|
||||
## Production Readiness Checklist Summary
|
||||
|
||||
For battle testing preparation, you need:
|
||||
|
||||
**Immediate (blockers):**
|
||||
- Fix NATS store metadata usage (issues #1, #2)
|
||||
- Implement strict set_strict with actual CAS (#1)
|
||||
- Implement replica primary watching (#4, #5)
|
||||
- Add failover_timeout config + staleness logic (#3, #24)
|
||||
- Implement subscribe mechanism with callbacks (#8)
|
||||
|
||||
**High priority:**
|
||||
- Complete all workflow transitions (#5, #7, #11-14)
|
||||
- Add cluster state tracking (#6, #9)
|
||||
- Add configuration validation (#10)
|
||||
- Add Replica Failed state (#6)
|
||||
|
||||
**Before deployment:**
|
||||
- Implement graceful shutdown (#15)
|
||||
- Add error handling for spawned tasks (#16)
|
||||
- Remove hardcoded main function (#18)
|
||||
- Implement store factory (#19)
|
||||
- Add Prometheus metrics (#17)
|
||||
|
||||
**Documentation:**
|
||||
- Document all configuration parameters and their trade-offs
|
||||
- Add runbooks for each failure mode
|
||||
- Document battle test scenarios to cover
|
||||
|
||||
### Addendum: Missing Critical Issues
|
||||
|
||||
#### 1. CRITICAL: Heartbeat "Lying" (Data Integrity)
|
||||
* **Location:** `agent_loop.rs` line 137 inside `store_heartbeat`.
|
||||
* **The Bug:** `status: "HEALTHY".to_string()` is hardcoded.
|
||||
* **The Impact:** The agent loop runs regardless of the workflow state. If the Primary transitions to `Fenced` or `Failed`, it continues to write a heartbeat saying "I am HEALTHY".
|
||||
* **The Fix:** The `store_heartbeat` function must accept the current status from the `workflow` (e.g., `self.workflow.status()`) to serialize into the JSON. A fenced agent must broadcast "FENCED" or stop writing entirely.
|
||||
|
||||
#### 2. CRITICAL: Async Task Race Conditions (State Machine Corruption)
|
||||
* **Location:** `workflow/primary.rs` lines 74, 83, 123 (`tokio::spawn`).
|
||||
* **The Bug:** The callbacks (`on_active`, `on_failover`) are spawned as fire-and-forget background tasks.
|
||||
* **Scenario:**
|
||||
1. Primary fails -> transitions to `Fenced` -> spawns `on_failover` (takes 5s).
|
||||
2. Network recovers immediately -> transitions to `Healthy` -> spawns `on_active` (takes 1s).
|
||||
3. `on_active` finishes *before* `on_failover`.
|
||||
4. `on_failover` finishes last, killing the DB *after* the agent decided it was healthy.
|
||||
* **The Fix:** You need a `JoinHandle` or a cancellation token. When transitioning states, any pending conflicting background tasks must be aborted before starting the new one.
|
||||
|
||||
#### 3. CRITICAL: Zombie Leader Prevention (Split Brain Risk)
|
||||
* **Location:** `agent_loop.rs` loop logic.
|
||||
* **The Bug:** There is no "Stop the World" gate.
|
||||
* **Scenario:** If `store_heartbeat` fails (NATS unreachable), the code returns `Err`, triggers `handle_heartbeat_failure`, and the loop *continues*.
|
||||
* **The Risk:** If the NATS write fails because of a CAS error (meaning a Replica has already promoted), this Primary is now a Zombie. It *must* immediately cease all operations. The current loop just sleeps and tries again.
|
||||
* **The Fix:** If `store_heartbeat` returns a `SequenceMismatch` error, the agent must treat this as a fatal demotion event, immediately fencing itself, rather than just incrementing a failure counter.
|
||||
|
||||
#### 4. HIGH: NATS Bucket Name Collision
|
||||
* **Location:** `agent_loop.rs` (Config) vs `store/nats.rs`.
|
||||
* **The Bug:** `FailoverCNPGConfig` has `cnpg_cluster_name`, and `AgentConfig` has `cluster_id`.
|
||||
* **The Impact:** If you run two different Harmony clusters on the same NATS server, and they use the same bucket name logic (or hardcoded names), they will overwrite each other's state.
|
||||
* **The Fix:** The NATS KV bucket name must be namespaced dynamically, e.g., `format!("harmony_{}", config.cluster_id)`.
|
||||
|
||||
#### 5. HIGH: Startup State Reconciliation
|
||||
* **Location:** `HarmonyAgent::new`.
|
||||
* **The Bug:** Agents always start in `Initializing`.
|
||||
* **Scenario:** The process crashes while it is the `Leader`. It restarts. It enters `Initializing`. It doesn't know it *should* be the leader.
|
||||
* **The Impact:** The cluster might be leaderless until the `failover_timeout` expires, causing unnecessary downtime.
|
||||
* **The Fix:** On startup, the agent must fetch the `ClusterState` from NATS. If `current_primary == my_id`, it should jump directly to `Healthy`/`Leader` state (possibly after a sanity check).
|
||||
|
||||
### Summary of Tasks to Add
|
||||
|
||||
Please add these to your master list before starting implementation:
|
||||
|
||||
28. **Dynamic Heartbeat Status:** Pass workflow state to `store_heartbeat` to prevent Fenced nodes from reporting "HEALTHY".
|
||||
29. **Async Task Cancellation:** Implement `AbortHandle` for `on_active`/`on_failover` tasks to prevent race conditions during rapid state flapping.
|
||||
30. **Fatal CAS Handling:** Treat `SequenceMismatch` in `store_heartbeat` as an immediate "I have been replaced" signal (Zombie detection).
|
||||
31. **NATS Namespace Isolation:** Ensure KV bucket names include `cluster_id`.
|
||||
32. **Startup Reconciliation:** Check NATS on boot to restore previous state if valid.
|
||||
|
||||
* **Think about vacuum / stop-the-world operations**
|
||||
|
||||
20
harmony_agent/deploy/Cargo.toml
Normal file
20
harmony_agent/deploy/Cargo.toml
Normal file
@@ -0,0 +1,20 @@
|
||||
[package]
|
||||
name = "harmony_agent_deploy"
|
||||
edition = "2024"
|
||||
version.workspace = true
|
||||
readme.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
harmony = { path = "../../harmony" }
|
||||
harmony_cli = { path = "../../harmony_cli" }
|
||||
harmony_types = { path = "../../harmony_types" }
|
||||
harmony_macros = { path = "../../harmony_macros" }
|
||||
cidr = { workspace = true }
|
||||
tokio = { workspace = true }
|
||||
log = { workspace = true }
|
||||
env_logger = { workspace = true }
|
||||
url = { workspace = true }
|
||||
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
63
harmony_agent/deploy/src/main.rs
Normal file
63
harmony_agent/deploy/src/main.rs
Normal file
@@ -0,0 +1,63 @@
|
||||
use harmony::{
|
||||
inventory::Inventory,
|
||||
modules::{
|
||||
application::{
|
||||
ApplicationScore,
|
||||
backend_app::{BackendApp, BuildCommand},
|
||||
features::{Monitoring, PackagingDeployment},
|
||||
},
|
||||
monitoring::alert_channel::discord_alert_channel::DiscordWebhook,
|
||||
},
|
||||
topology::K8sAnywhereTopology,
|
||||
};
|
||||
use harmony_macros::hurl;
|
||||
use harmony_types::k8s_name::K8sName;
|
||||
use std::{path::PathBuf, sync::Arc};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() {
|
||||
let application = Arc::new(BackendApp {
|
||||
name: "harmony-agent".to_string(),
|
||||
// Since harmony_agent is part of the harmony workspace, the actual "project root"
|
||||
// is not harmony_agent folder but the workspace root.
|
||||
//
|
||||
// So using ../ here means we MUST run this deployment script from the harmony_agent
|
||||
// folder
|
||||
project_root: PathBuf::from("../"),
|
||||
network_ports: vec![],
|
||||
env_vars: vec![
|
||||
("NATS_URL".to_string(), "nats://nats".to_string()),
|
||||
("DESIRED_PRIMARY".to_string(), "site-1".to_string()),
|
||||
("MY_CLUSTER_ID".to_string(), "site-1".to_string()),
|
||||
("NATS_CREDS_PATH".to_string(), "".to_string()),
|
||||
],
|
||||
build_cmd: BuildCommand::new("cargo", vec!["build", "--release", "-p", "harmony_agent"]),
|
||||
dockerfile: Some(PathBuf::from("Dockerfile")),
|
||||
});
|
||||
|
||||
let app = ApplicationScore {
|
||||
features: vec![
|
||||
Box::new(PackagingDeployment {
|
||||
application: application.clone(),
|
||||
}),
|
||||
Box::new(Monitoring {
|
||||
application: application.clone(),
|
||||
alert_receiver: vec![Box::new(DiscordWebhook {
|
||||
name: K8sName("test-discord".to_string()),
|
||||
url: hurl!("https://discord.doesnt.exist.com"),
|
||||
selectors: vec![],
|
||||
})],
|
||||
}),
|
||||
],
|
||||
application,
|
||||
};
|
||||
|
||||
harmony_cli::run(
|
||||
Inventory::autoload(),
|
||||
K8sAnywhereTopology::from_env(), // <== Deploy to local automatically provisioned k3d by default or connect to any kubernetes cluster
|
||||
vec![Box::new(app)],
|
||||
None,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
79
harmony_agent/src/agent/config.rs
Normal file
79
harmony_agent/src/agent/config.rs
Normal file
@@ -0,0 +1,79 @@
|
||||
use std::time::Duration;
|
||||
|
||||
use harmony_types::id::Id;
|
||||
use log::info;
|
||||
|
||||
use super::heartbeat::HeartbeatFailure;
|
||||
use super::role::AgentRole;
|
||||
|
||||
#[derive(Debug, Clone)]
pub struct AgentConfig {
    /// Number of consecutive successful heartbeats required before the service transitions from
    /// failed to healthy.
    pub success_threshold: usize,
    /// Number of consecutive failed heartbeats required before the service transitions from
    /// healthy to failed.
    pub failure_threshold: usize,
    /// Time between each heartbeat. If a heartbeat takes longer than this, it will be
    /// considered failed.
    pub heartbeat_interval: Duration,
    /// Time since last observed primary heartbeat before replica considers primary stale.
    /// This must be configured such that failover_timeout > heartbeat_interval * failure_threshold + safety_margin
    /// to avoid split brain during network partitions.
    pub failover_timeout: Duration,
    /// **UNSTABLE FIELD**
    ///
    /// For now, an agent instance only serves one deployment. This is probably fine as an agent's
    /// footprint is low, but managing multiple deployments in a single instance would be a
    /// significant resource usage reduction.
    ///
    /// Decoupling the deployment of the agent with the application's deployment could make things
    /// more complicated though, where we would have to be careful about version compatibility
    /// between all components managed by the agent instance. So for now it is a 1-1 map.
    ///
    /// But I have a feeling this could change so I am marking this field unstable to warn you, the
    /// reader.
    pub deployment_config_unstable: DeploymentConfig,
    /// URL of the NATS server backing the agent's KV stores.
    pub nats_url: String,
    /// Optional path to a NATS credentials file; `None` means unauthenticated.
    pub nats_creds_path: Option<String>,
    /// Unique id of this agent instance; used as the heartbeat KV key suffix.
    pub agent_id: Id,
    /// Id of the failover cluster this agent participates in; used as the
    /// cluster-state KV key suffix.
    pub cluster_id: Id,
    /// Id of the agent that *should* be primary when the cluster is healthy.
    pub desired_primary_id: Id,
    /// The role this agent plays (Primary or Replica)
    pub role: AgentRole,
}
|
||||
|
||||
/// The kind of deployment this agent supervises; one variant per supported
/// failover strategy (only CNPG/PostgreSQL so far).
#[derive(Debug, Clone)]
pub enum DeploymentConfig {
    /// A CloudNativePG-managed PostgreSQL cluster with agent-driven failover.
    FailoverPostgreSQL(FailoverCNPGConfig),
}
|
||||
|
||||
/// Configuration for supervising a CloudNativePG (CNPG) PostgreSQL cluster.
#[derive(Debug, Clone)]
pub struct FailoverCNPGConfig {
    /// Name of the CNPG `Cluster` resource to health-check and fail over.
    pub cnpg_cluster_name: String,
}
|
||||
|
||||
impl DeploymentConfig {
    /// The actual "work" of the heartbeat (e.g., write to NATS, check Postgres)
    ///
    /// NOTE: currently a stub — the PostgreSQL variant only logs and always
    /// returns `Ok(())`; the real health check is still TODO below.
    pub async fn perform_heartbeat(&self) -> Result<(), HeartbeatFailure> {
        match self {
            DeploymentConfig::FailoverPostgreSQL(cfg) => {
                info!("Checking PG Cluster: {}", cfg.cnpg_cluster_name);
                // TODO: Implement actual PG check / NATS write here
                Ok(())
            }
        }
    }

    /// Callback: Transitioned from Unhealthy -> Healthy
    ///
    /// Stub — only logs for now.
    pub async fn on_active(&self) {
        info!("Service is now ACTIVE (Healthy)");
        // e.g., Remove fencing lock
    }

    /// Callback: Transitioned from Healthy -> Unhealthy
    ///
    /// Stub — only logs for now.
    pub async fn on_failover(&self) {
        info!("Service is now FAILED (Unhealthy)");
        // e.g., Initiate self-fencing, stop accepting traffic
    }
}
|
||||
35
harmony_agent/src/agent/heartbeat.rs
Normal file
35
harmony_agent/src/agent/heartbeat.rs
Normal file
@@ -0,0 +1,35 @@
|
||||
use harmony_types::id::Id;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::store::KvMetadata;
|
||||
|
||||
/// Agent-provided heartbeat information (no timestamps - those come from the store)
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct AgentInfo {
    /// Id of the agent emitting the heartbeat.
    pub agent_id: Id,
    /// Id of the cluster the agent belongs to.
    pub cluster_id: Id,
    /// Free-form status string; in practice the workflow's state name.
    pub status: String,
}
|
||||
|
||||
/// Complete heartbeat with both agent data and store metadata
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct AgentHeartbeat {
    /// The payload the agent wrote.
    pub agent_info: AgentInfo,
    /// Store-assigned metadata (sequence, store-side timestamp); `None` until
    /// the heartbeat has been persisted at least once.
    pub metadata: Option<KvMetadata>,
}
|
||||
|
||||
/// Cluster state paired with the store metadata of the revision it was read
/// from (used as the expected sequence for compare-and-swap updates).
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
pub struct ClusterStateData {
    /// The persisted cluster state payload.
    pub cluster_info: ClusterState,
    /// Store metadata of this revision; `None` before the first persist.
    pub metadata: Option<KvMetadata>,
}
|
||||
|
||||
/// The shared, store-persisted view of who leads the cluster.
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
pub struct ClusterState {
    /// Id of the cluster this record describes.
    pub cluster_id: Id,
    /// Agent currently acting as primary; `None` on a fresh cluster before
    /// any agent has claimed leadership.
    pub current_primary: Option<Id>,
    /// Agent that should be primary when the cluster is healthy.
    pub desired_primary: Id,
}
|
||||
|
||||
/// Marker error: a heartbeat attempt failed (storage error, timeout, or a
/// failed deployment health check). Carries no detail yet.
#[derive(Debug)]
pub struct HeartbeatFailure {}
|
||||
507
harmony_agent/src/agent/mod.rs
Normal file
507
harmony_agent/src/agent/mod.rs
Normal file
@@ -0,0 +1,507 @@
|
||||
use std::time::{SystemTime, UNIX_EPOCH};
|
||||
use std::{str::FromStr, sync::Arc, time::Duration};
|
||||
|
||||
use harmony_types::id::Id;
|
||||
use log::{debug, error, info, trace, warn};
|
||||
use tokio::sync::RwLock;
|
||||
use tokio::time::{Instant, sleep};
|
||||
|
||||
use crate::agent::heartbeat::ClusterState;
|
||||
use crate::store::{KvMetadata, KvStore, KvStoreError};
|
||||
use crate::workflow::HeartbeatWorkflow;
|
||||
use crate::workflow::primary::PrimaryWorkflow;
|
||||
use crate::workflow::replica::ReplicaWorkflow;
|
||||
|
||||
// Submodules
|
||||
mod config;
|
||||
pub mod heartbeat;
|
||||
mod role;
|
||||
|
||||
// Re-exports for backwards compatibility
|
||||
pub use config::{AgentConfig, DeploymentConfig, FailoverCNPGConfig};
|
||||
pub use heartbeat::{AgentHeartbeat, AgentInfo, ClusterStateData, HeartbeatFailure};
|
||||
pub use role::AgentRole;
|
||||
|
||||
/// Boots a single agent of the given `role` against the provided KV stores
/// and runs its heartbeat loop forever (only returns on startup errors).
///
/// NOTE(review): most of `AgentConfig` is hard-coded here (thresholds,
/// deployment config, cluster/primary ids) — presumably a dev harness; the
/// TODO below covers making this configurable.
pub async fn launch_agent<S>(
    role: AgentRole,
    health_kv: Arc<S>,
    cluster_kv: Arc<S>,
    heartbeat_interval: Duration,
    failover_timeout: Duration,
) -> Result<(), Box<dyn std::error::Error>>
where
    S: KvStore + Send + Sync + 'static,
{
    // Cheap ass fix when we boot two agents at the same time and the store does not exist, delay
    // one so they don't crash because of the race: both agents racing to create
    // the initial cluster-state record would collide, so the replica yields.
    match role {
        AgentRole::Primary => {}
        AgentRole::Replica => {
            sleep(Duration::from_millis(100)).await;
        }
    }

    // Agent id is derived from the role ("agent-primary" / "agent-replica").
    let my_agent_name = format!("agent-{}", role);
    let my_agent_id = Id::from_str(&my_agent_name).unwrap();

    let config = AgentConfig {
        role,
        success_threshold: 2,
        failure_threshold: 2,
        heartbeat_interval,
        failover_timeout,
        deployment_config_unstable: DeploymentConfig::FailoverPostgreSQL(FailoverCNPGConfig {
            cnpg_cluster_name: String::from("cnpg_cluster_name"),
        }),
        nats_url: String::new(),
        nats_creds_path: None,
        agent_id: my_agent_id,
        cluster_id: "cluster_test_id".into(),
        desired_primary_id: "primary_id".into(),
    };

    log::info!("Harmony Agent Initialized");
    log::info!("Initializing Harmony Agent Id : {}", config.agent_id);
    log::info!("Full config : {:?}", config);

    // TODO load store based on config, default to nats
    // probably a good use case for a factory pattern

    let mut agent = HarmonyAgent::new(config, health_kv, cluster_kv);

    // Sync local caches (cluster state, last heartbeat sequence) with the
    // store before entering the loop.
    agent.reconcile_startup().await?;

    // Run the heartbeat loop
    agent.run_heartbeat_loop().await;

    Ok(())
}
|
||||
|
||||
/// A running agent: one role-specific workflow plus the KV stores it
/// heartbeats into, with local caches to minimize store round-trips.
pub struct HarmonyAgent<S: KvStore> {
    /// Static configuration this agent was launched with.
    pub config: AgentConfig,
    /// Role-specific state machine (Primary or Replica) driven by
    /// heartbeat successes/failures.
    workflow: Box<dyn HeartbeatWorkflow>,
    /// Store for per-agent heartbeat records (`heartbeat.<agent_id>`).
    health_kv: Arc<S>,
    /// Store for the shared cluster-state record (`cluster.<cluster_id>`).
    cluster_kv: Arc<S>,
    /// Last successful heartbeat, used to track sequence number for next write
    /// This avoids doing a GET before every SET, reducing network round-trips
    last_heartbeat: Arc<RwLock<Option<AgentHeartbeat>>>,
    /// Local copy of cluster state, updated via subscription
    /// This allows workflows to make decisions without querying NATS each time
    cluster_state: Arc<RwLock<Option<ClusterStateData>>>,
}
|
||||
|
||||
impl<S: KvStore + Send + Sync + 'static> HarmonyAgent<S> {
|
||||
pub fn new(config: AgentConfig, health_kv: Arc<S>, cluster_kv: Arc<S>) -> Self {
|
||||
let workflow: Box<dyn HeartbeatWorkflow> = match config.role {
|
||||
AgentRole::Primary => {
|
||||
info!("Initializing agent as PRIMARY");
|
||||
Box::new(PrimaryWorkflow::new(
|
||||
config.success_threshold,
|
||||
config.failure_threshold,
|
||||
config.deployment_config_unstable.clone(),
|
||||
))
|
||||
}
|
||||
AgentRole::Replica => {
|
||||
info!("Initializing agent as REPLICA");
|
||||
Box::new(ReplicaWorkflow::new(
|
||||
config.success_threshold,
|
||||
config.failure_threshold,
|
||||
config.cluster_id.clone(),
|
||||
config.desired_primary_id.clone(),
|
||||
config.agent_id.clone(),
|
||||
config.failover_timeout,
|
||||
))
|
||||
}
|
||||
};
|
||||
|
||||
Self {
|
||||
config,
|
||||
workflow,
|
||||
health_kv,
|
||||
cluster_kv,
|
||||
last_heartbeat: Arc::new(RwLock::new(None)),
|
||||
cluster_state: Arc::new(RwLock::new(None)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Generic helper to fetch and deserialize data from KV store
|
||||
/// Returns Ok(Some(data)) if key exists and deserializes successfully
|
||||
/// Returns Ok(None) if key doesn't exist
|
||||
/// Returns Err if deserialization fails or other errors occur
|
||||
async fn fetch_from_store<D>(
|
||||
&self,
|
||||
store: &Arc<S>,
|
||||
key: &str,
|
||||
) -> Result<Option<(D, KvMetadata)>, KvStoreError>
|
||||
where
|
||||
D: serde::de::DeserializeOwned,
|
||||
{
|
||||
debug!("Fetching data from key: {}", key);
|
||||
|
||||
let result = store.get(key).await;
|
||||
debug!("Got result from store: {:#?}", result);
|
||||
|
||||
match result {
|
||||
Ok(kv_result) => {
|
||||
if let Some(value) = kv_result.value {
|
||||
match serde_json::from_value::<D>(value.clone()) {
|
||||
Ok(data) => Ok(Some((data, kv_result.metadata))),
|
||||
Err(e) => {
|
||||
log::warn!("Failed to deserialize data from key {}: {}", key, e);
|
||||
Err(KvStoreError::DeserializationFailed {
|
||||
deserialization_error: format!(
|
||||
"Key exists but deserialization failed for {key}: {e}"
|
||||
),
|
||||
value: value.to_string(),
|
||||
})
|
||||
}
|
||||
}
|
||||
} else {
|
||||
Err(KvStoreError::Unknown(format!(
|
||||
"Key exists but value is empty for {key}, this should not happen"
|
||||
)))
|
||||
}
|
||||
}
|
||||
Err(KvStoreError::KeyNotAvailable(_)) => {
|
||||
debug!("Key {} not found in store", key);
|
||||
Ok(None)
|
||||
}
|
||||
Err(e) => {
|
||||
log::warn!("Failed to fetch data from key {}: {}", key, e);
|
||||
Err(e)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Reconcile startup state by fetching cluster state and heartbeat from the store
|
||||
/// This allows the workflow to determine if it should resume as Primary/Replica
|
||||
/// based on the persisted cluster state
|
||||
pub async fn reconcile_startup(&mut self) -> Result<(), KvStoreError> {
|
||||
let cluster_key = format!("cluster.{}", self.config.cluster_id);
|
||||
|
||||
debug!(
|
||||
"Fetching cluster state for startup reconciliation from key: {}",
|
||||
cluster_key
|
||||
);
|
||||
|
||||
let cluster_state_option = match self
|
||||
.fetch_from_store::<ClusterState>(&self.cluster_kv, &cluster_key)
|
||||
.await?
|
||||
{
|
||||
Some((data, metadata)) => Some(ClusterStateData {
|
||||
cluster_info: data,
|
||||
metadata: Some(metadata),
|
||||
}),
|
||||
None => {
|
||||
debug!(
|
||||
"Cluster state key not found, this is a fresh cluster, initializing cluster state"
|
||||
);
|
||||
Some(self.store_cluster_state(None).await?)
|
||||
}
|
||||
};
|
||||
|
||||
debug!("Found cluster state {cluster_state_option:#?}");
|
||||
self.workflow
|
||||
.on_startup(cluster_state_option.as_ref(), &self.config)
|
||||
.await;
|
||||
|
||||
// Cache the cluster state locally
|
||||
*self.cluster_state.write().await = cluster_state_option;
|
||||
// Fetch last heartbeat if it exists to avoid sequence conflicts
|
||||
let heartbeat_key = format!("heartbeat.{}", self.config.agent_id);
|
||||
debug!("Fetching last heartbeat from key: {}", heartbeat_key);
|
||||
|
||||
let last_heartbeat_option = self.health_kv.get(&heartbeat_key).await;
|
||||
|
||||
let last_heartbeat = match last_heartbeat_option {
|
||||
Ok(kv_result) => {
|
||||
let value = kv_result
|
||||
.value
|
||||
.expect("When key exist it should always contain data");
|
||||
Some(AgentHeartbeat {
|
||||
agent_info: serde_json::from_value::<AgentInfo>(value.clone()).map_err(
|
||||
|e| KvStoreError::DeserializationFailed {
|
||||
deserialization_error: e.to_string(),
|
||||
value: value.to_string(),
|
||||
},
|
||||
)?,
|
||||
metadata: Some(kv_result.metadata),
|
||||
})
|
||||
}
|
||||
Err(e) => match e {
|
||||
KvStoreError::KeyNotAvailable(_) => None,
|
||||
_ => return Err(e),
|
||||
},
|
||||
};
|
||||
if let Some(heartbeat) = &last_heartbeat {
|
||||
debug!(
|
||||
"Found existing heartbeat with sequence: {}",
|
||||
heartbeat.metadata.as_ref().map(|m| m.sequence).unwrap_or(0)
|
||||
);
|
||||
} else {
|
||||
debug!("No existing heartbeat found, starting fresh");
|
||||
}
|
||||
|
||||
// Cache the last heartbeat for sequence tracking
|
||||
*self.last_heartbeat.write().await = last_heartbeat;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn store_cluster_state(
|
||||
&self,
|
||||
cluster_data: Option<ClusterStateData>,
|
||||
) -> Result<ClusterStateData, KvStoreError> {
|
||||
let key = format!("cluster.{}", self.config.cluster_id);
|
||||
match cluster_data {
|
||||
Some(cluster_data) => {
|
||||
debug!("found some cluster state {:#?}", cluster_data);
|
||||
|
||||
let value = serde_json::to_value(&cluster_data.cluster_info).map_err(|e| {
|
||||
KvStoreError::DeserializationFailed {
|
||||
deserialization_error: e.to_string(),
|
||||
value: format!("{:?}", cluster_data),
|
||||
}
|
||||
})?;
|
||||
|
||||
let expected_sequence = {
|
||||
let last = self.cluster_state.read().await;
|
||||
last.as_ref()
|
||||
.and_then(|hb| hb.metadata.as_ref())
|
||||
.map(|m| m.sequence)
|
||||
.unwrap_or(0)
|
||||
};
|
||||
|
||||
debug!("expected sequence {:#?}", expected_sequence);
|
||||
let new_seq = self
|
||||
.cluster_kv
|
||||
.set_strict(&key, value, expected_sequence)
|
||||
.await?;
|
||||
|
||||
let cluster_kv_result = self.cluster_kv.get_revision(&key, new_seq).await?;
|
||||
debug!("cluster kv {:#?}", cluster_kv_result);
|
||||
|
||||
let cluster_data_new = ClusterStateData {
|
||||
cluster_info: cluster_data.cluster_info.clone(),
|
||||
metadata: Some(cluster_kv_result.metadata),
|
||||
};
|
||||
|
||||
*self.cluster_state.write().await = Some(cluster_data_new.clone());
|
||||
Ok(cluster_data)
|
||||
}
|
||||
None => {
|
||||
let cluster_info = ClusterState {
|
||||
cluster_id: self.config.cluster_id.clone(),
|
||||
current_primary: None,
|
||||
desired_primary: self.config.desired_primary_id.clone(),
|
||||
};
|
||||
|
||||
let value = serde_json::to_value(&cluster_info).map_err(|e| {
|
||||
KvStoreError::DeserializationFailed {
|
||||
deserialization_error: e.to_string(),
|
||||
value: format!("{:?}", cluster_info),
|
||||
}
|
||||
})?;
|
||||
|
||||
let cluster_data = ClusterStateData {
|
||||
cluster_info,
|
||||
metadata: None,
|
||||
};
|
||||
|
||||
let new_seq = self.cluster_kv.set_strict(&key, value, 0).await?;
|
||||
|
||||
let cluster_kv_result = self.cluster_kv.get_revision(&key, new_seq).await?;
|
||||
debug!("cluster kv {:#?}", cluster_kv_result);
|
||||
|
||||
let cluster_data_new = ClusterStateData {
|
||||
cluster_info: cluster_data.cluster_info.clone(),
|
||||
metadata: Some(cluster_kv_result.metadata),
|
||||
};
|
||||
|
||||
*self.cluster_state.write().await = Some(cluster_data_new.clone());
|
||||
Ok(cluster_data_new)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Sends agent heartbeat to the KV store
|
||||
///
|
||||
/// Note: We only send AgentInfo. The store will add HeartbeatMetadata (timestamp, sequence)
|
||||
/// to avoid clock skew issues. This follows the ADR-017-3 principle that all timestamp
|
||||
/// comparisons use the store's clock, not agent clocks.
|
||||
///
|
||||
/// This method uses the last successful heartbeat's sequence number to avoid an extra
|
||||
/// GET call before each SET, reducing network round-trips and latency exposure.
|
||||
async fn store_heartbeat(&self) -> Result<AgentHeartbeat, KvStoreError> {
|
||||
let key = format!("heartbeat.{}", self.config.agent_id);
|
||||
|
||||
// Create agent info WITHOUT timestamp - the store will add metadata
|
||||
// Use workflow state to report actual status (e.g. Primary:Fenced, Replica:Watching)
|
||||
let agent_info = AgentInfo {
|
||||
agent_id: self.config.agent_id.clone(),
|
||||
cluster_id: self.config.cluster_id.clone(),
|
||||
status: self.workflow.state_name().to_string(),
|
||||
};
|
||||
|
||||
debug!("Storing heartbeat for agent: {}", self.config.agent_id);
|
||||
let value =
|
||||
serde_json::to_value(&agent_info).map_err(|e| KvStoreError::DeserializationFailed {
|
||||
deserialization_error: e.to_string(),
|
||||
value: format!("{:?}", agent_info),
|
||||
})?;
|
||||
|
||||
let expected_sequence = {
|
||||
let last = self.last_heartbeat.read().await;
|
||||
last.as_ref()
|
||||
.and_then(|hb| hb.metadata.as_ref())
|
||||
.map(|m| m.sequence)
|
||||
.unwrap_or(0)
|
||||
};
|
||||
|
||||
trace!("Writing new heartbeat {key} (#{expected_sequence}), value: {value:?}");
|
||||
let new_seq = self
|
||||
.health_kv
|
||||
.set_strict(&key, value, expected_sequence)
|
||||
.await?;
|
||||
trace!("Got new sequence {new_seq}");
|
||||
let kv_result = self.health_kv.get_revision(&key, new_seq).await?;
|
||||
|
||||
debug!("Heartbeat stored succsssfully with sequence: {}", new_seq);
|
||||
|
||||
// Construct complete heartbeat with metadata from store
|
||||
let heartbeat = AgentHeartbeat {
|
||||
agent_info,
|
||||
metadata: Some(kv_result.metadata),
|
||||
};
|
||||
|
||||
// Cache this successful heartbeat for next iteration
|
||||
*self.last_heartbeat.write().await = Some(heartbeat.clone());
|
||||
|
||||
Ok(heartbeat)
|
||||
}
|
||||
|
||||
pub async fn run_heartbeat_loop(&mut self) {
|
||||
let mut next_heartbeat_start;
|
||||
loop {
|
||||
let this_heartbeat_start = Instant::now();
|
||||
next_heartbeat_start = this_heartbeat_start + self.config.heartbeat_interval;
|
||||
|
||||
// Perform the check via the config/strategy with a timeout
|
||||
//
|
||||
// FIXME There is too much stuff happening inside the timeout. There are some things like a
|
||||
// promotion, that we don't want to cancel within a single heartbeat interval timeout
|
||||
// I think that the timeout should only apply to the store_heartbeat().await call.
|
||||
// Logic happening after should not be affected in the exact same manner. There can be
|
||||
// other timeouts or other stuff to consider here.
|
||||
// However, the system does rely on heartbeats happening regularly, so we do not want
|
||||
// to delay the next heartbeat either. This is tricky.
|
||||
// An idea right now is to keep the heartbeat running but, when a processing event
|
||||
// occurs, set a flag on the local agent that there is a process running (promotion,
|
||||
// demotion, etc) and take no other decision until this process is not done. There is
|
||||
// one exception we can think of right now :
|
||||
// - a healthy primary starts running a process such as "calling mom"
|
||||
// - the primary keeps sending its heartbeat to prove to the rest of the cluster that
|
||||
// it is still healthy
|
||||
// - then the primary heartbeat fails up to failure_threshold
|
||||
// - at this moment the "calling mom" process must not prevent the primary from fencing itself. Otherwise the replica that promotes itself when it realises that the primary is dead will cause a split brain.
|
||||
// - Another solution would be register the processing: "calling mom" in the primary
|
||||
// heartbeat store, and prevent the replica from promoting when there is a running
|
||||
// task on the primary.
|
||||
let result = tokio::time::timeout(self.config.heartbeat_interval, async {
|
||||
// Store heartbeat and perform deployment-specific health check
|
||||
match &self.store_heartbeat().await {
|
||||
Ok(heartbeat) => {
|
||||
// Heartbeat stored successfully, already cached by store_heartbeat
|
||||
debug!(
|
||||
"Heartbeat stored: seq={}",
|
||||
heartbeat.metadata.as_ref().map(|m| m.sequence).unwrap_or(0)
|
||||
);
|
||||
}
|
||||
Err(KvStoreError::WrongLastRevision) => {
|
||||
todo!("fetch and update correct last sequence number")
|
||||
// CAS failure could indicate:
|
||||
// 1. Network latency: our previous timeout heartbeat actually succeeded
|
||||
// 2. Agent ID conflict: another agent with same ID exists
|
||||
// 3. Clock/bucket corruption (unlikely)
|
||||
|
||||
// log::warn!(
|
||||
// "CAS mismatch for agent {}: expected sequence {}, got {}. Possible causes: network latency, agent ID conflict, or clock issue. Updating local sequence to {}",
|
||||
// self.config.agent_id, expected, current, current
|
||||
// );
|
||||
// // Update cached heartbeat sequence to prevent repeated failures
|
||||
// if let Some(hb) = self.last_heartbeat.write().await.as_mut() {
|
||||
// if let Some(metadata) = hb.metadata.as_mut() {
|
||||
// metadata.sequence = *current;
|
||||
// }
|
||||
// }
|
||||
}
|
||||
Err(e) => {
|
||||
// Actual storage failure - treat as heartbeat failure
|
||||
log::error!("Heartbeat storage error: {}", e);
|
||||
return Err(HeartbeatFailure {});
|
||||
}
|
||||
}
|
||||
self.config
|
||||
.deployment_config_unstable
|
||||
.perform_heartbeat()
|
||||
.await?;
|
||||
|
||||
// TODO: Pass the heartbeat with metadata to the workflow for staleness checks
|
||||
// The workflow needs access to metadata.timestamp for failover timeout calculations
|
||||
Ok::<(), HeartbeatFailure>(())
|
||||
})
|
||||
.await;
|
||||
|
||||
// Update Counters & Handle State Transitions
|
||||
// Timeout is also treated as a failure
|
||||
let heartbeat_result = match result {
|
||||
Ok(inner_result) => inner_result,
|
||||
Err(_) => Err(HeartbeatFailure {}),
|
||||
};
|
||||
|
||||
trace!("Got heartbeat_result : {heartbeat_result:?}");
|
||||
match heartbeat_result {
|
||||
Ok(_) => {
|
||||
let new_state = self
|
||||
.workflow
|
||||
.handle_heartbeat_success(
|
||||
self.cluster_state.read().await.as_ref(),
|
||||
&self.config,
|
||||
)
|
||||
.await;
|
||||
if let Some(new_state) = new_state {
|
||||
warn!("Got new cluster state : {new_state:#?}");
|
||||
self.store_cluster_state(Some(new_state))
|
||||
.await
|
||||
.expect(&format!("cluster state not able to be stored"));
|
||||
}
|
||||
}
|
||||
Err(_) => {
|
||||
self.workflow
|
||||
.handle_heartbeat_failure(self.cluster_state.read().await.as_ref())
|
||||
.await;
|
||||
}
|
||||
}
|
||||
|
||||
info!(
|
||||
"Heartbeat : success={heartbeat_emoji} state={state}, successes={consecutive_successes}/{success_threshold}, fails={consecutive_failures}/{failure_threshold} took={heartbeat_duration}ms",
|
||||
success_threshold = self.config.success_threshold,
|
||||
failure_threshold = self.config.failure_threshold,
|
||||
state = self.workflow.state_name(),
|
||||
consecutive_successes = self.workflow.consecutive_successes(),
|
||||
consecutive_failures = self.workflow.consecutive_failures(),
|
||||
heartbeat_emoji = if heartbeat_result.is_ok() {
|
||||
"✅"
|
||||
} else {
|
||||
"❌"
|
||||
},
|
||||
heartbeat_duration = (Instant::now() - this_heartbeat_start).as_millis(),
|
||||
);
|
||||
debug!(
|
||||
"Sleeping for {} ms before next heartbeat",
|
||||
(next_heartbeat_start - Instant::now()).as_millis()
|
||||
);
|
||||
tokio::time::sleep_until(next_heartbeat_start).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
17
harmony_agent/src/agent/role.rs
Normal file
17
harmony_agent/src/agent/role.rs
Normal file
@@ -0,0 +1,17 @@
|
||||
use std::fmt;
|
||||
|
||||
/// The role of this agent instance
#[derive(Debug, Clone, PartialEq)]
pub enum AgentRole {
    /// The agent expected to lead and serve traffic.
    Primary,
    /// A standby agent watching the primary's heartbeats, ready to promote.
    Replica,
}
|
||||
|
||||
impl fmt::Display for AgentRole {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match self {
|
||||
AgentRole::Primary => write!(f, "primary"),
|
||||
AgentRole::Replica => write!(f, "replica"),
|
||||
}
|
||||
}
|
||||
}
|
||||
90
harmony_agent/src/config.rs
Normal file
90
harmony_agent/src/config.rs
Normal file
@@ -0,0 +1,90 @@
|
||||
use harmony_types::id::Id;
|
||||
use log::debug;
|
||||
use std::env;
|
||||
use std::path::Path;
|
||||
use std::time::Duration;
|
||||
|
||||
/// Configuration for the Harmony Agent
#[derive(Debug, Clone)]
pub struct AgentConfig {
    /// NATS server URL (from `NATS_URL`; defaults to nats://localhost:4222).
    pub nats_url: String,
    /// Optional path to a NATS credentials file (from `NATS_CREDS_PATH`).
    pub nats_creds_path: Option<String>,
    /// Id of the cluster this agent belongs to (from `MY_CLUSTER_ID`).
    pub my_cluster_id: Id,
    /// Id of the agent that should be primary (from `DESIRED_PRIMARY`).
    pub desired_primary: Id,
    /// Time between heartbeats. Currently hard-coded to 1000ms in
    /// `load_from_env` — not yet configurable via environment.
    pub heartbeat_interval: Duration,
}
|
||||
|
||||
/// Env var: NATS server URL. Optional; defaults to nats://localhost:4222.
pub const NATS_URL: &str = "NATS_URL";
/// Env var: id of the agent that should be primary. Required, non-empty.
pub const DESIRED_PRIMARY: &str = "DESIRED_PRIMARY";
/// Env var: id of the cluster this agent belongs to. Required, non-empty.
pub const MY_CLUSTER_ID: &str = "MY_CLUSTER_ID";
/// Env var: path to a NATS credentials file. Optional.
pub const NATS_CREDS_PATH: &str = "NATS_CREDS_PATH";
|
||||
|
||||
impl AgentConfig {
|
||||
pub fn load_from_env() -> Result<Self, String> {
|
||||
let nats_url = env::var(NATS_URL).unwrap_or_else(|_| "nats://localhost:4222".to_string());
|
||||
|
||||
// Validate NATS URL is not empty
|
||||
if nats_url.is_empty() {
|
||||
return Err(format!("{NATS_URL} cannot be empty"));
|
||||
}
|
||||
|
||||
// Validate NATS URL format
|
||||
if !nats_url.starts_with("nats://") && !nats_url.starts_with("tls://") {
|
||||
return Err(format!(
|
||||
"Invalid NATS URL format: {}. Must start with 'nats://' or 'tls://'",
|
||||
nats_url
|
||||
));
|
||||
}
|
||||
|
||||
let nats_creds_path = env::var(NATS_CREDS_PATH)
|
||||
.ok()
|
||||
.filter(|creds_path| !creds_path.is_empty());
|
||||
|
||||
// Validate NATS creds path if provided
|
||||
if let Some(creds_path) = &nats_creds_path {
|
||||
debug!("Validating nats creds path from env var {NATS_CREDS_PATH} : {nats_creds_path:?}");
|
||||
let path = Path::new(creds_path);
|
||||
if !path.exists() {
|
||||
return Err(format!(
|
||||
"NATS credentials file does not exist: {}",
|
||||
creds_path
|
||||
));
|
||||
}
|
||||
if !path.is_file() {
|
||||
return Err(format!(
|
||||
"NATS credentials path is not a file: {}",
|
||||
creds_path
|
||||
));
|
||||
}
|
||||
// Check if file is readable by attempting to read metadata
|
||||
if std::fs::metadata(path).is_err() {
|
||||
return Err(format!(
|
||||
"NATS credentials file is not readable: {}",
|
||||
creds_path
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
let my_cluster_id_str = env::var(MY_CLUSTER_ID)
|
||||
.map_err(|_| "Environment variable {MY_CLUSTER_ID} is required".to_string())?;
|
||||
|
||||
if my_cluster_id_str.is_empty() {
|
||||
return Err(format!("{MY_CLUSTER_ID} cannot be empty"));
|
||||
}
|
||||
|
||||
let desired_primary_str = env::var(DESIRED_PRIMARY)
|
||||
.map_err(|_| "Environment variable {DESIRED_PRIMARY} is required".to_string())?;
|
||||
|
||||
if desired_primary_str.is_empty() {
|
||||
return Err(format!("{DESIRED_PRIMARY} cannot be empty"));
|
||||
}
|
||||
|
||||
Ok(Self {
|
||||
nats_url,
|
||||
nats_creds_path,
|
||||
my_cluster_id: my_cluster_id_str.into(),
|
||||
desired_primary: desired_primary_str.into(),
|
||||
heartbeat_interval: Duration::from_millis(1000),
|
||||
})
|
||||
}
|
||||
}
|
||||
82
harmony_agent/src/main.rs
Normal file
82
harmony_agent/src/main.rs
Normal file
@@ -0,0 +1,82 @@
|
||||
use std::{sync::Arc, time::Duration};
|
||||
|
||||
use crate::{
|
||||
agent::AgentRole,
|
||||
store::{ChaosKvStore, InMemoryKvStore, NatsKvStore},
|
||||
};
|
||||
|
||||
// mod agent_loop;
|
||||
mod agent;
|
||||
pub mod store;
|
||||
mod workflow;
|
||||
|
||||
/// Dev harness entry point: runs one primary and one replica agent in the
/// same process, sharing a single NATS-backed KV store, so the failover
/// protocol can be exercised locally.
#[tokio::main]
async fn main() {
    env_logger::init();

    // Heartbeat cadence and the staleness window after which a replica
    // considers the primary dead. failover_timeout must comfortably exceed
    // heartbeat_interval * failure_threshold (see AgentConfig docs).
    let heartbeat_interval = Duration::from_millis(2000);
    let failover_timeout = Duration::from_secs(10);

    // let (health_kv, cluster_kv) = get_chaos_store(&heartbeat_interval, &failover_timeout);

    // Both agents share the same bucket so the replica can observe the
    // primary's heartbeats and the shared cluster state.
    let nats_store = get_local_nats_store().await;
    let health_kv = nats_store.clone();
    let cluster_kv = nats_store.clone();

    // join! drives both agent loops concurrently; they only return on
    // startup errors, whose results are deliberately ignored here.
    let _ = tokio::join!(
        agent::launch_agent(
            AgentRole::Primary,
            health_kv.clone(),
            cluster_kv.clone(),
            heartbeat_interval,
            failover_timeout
        ),
        agent::launch_agent(
            AgentRole::Replica,
            health_kv,
            cluster_kv,
            heartbeat_interval,
            failover_timeout
        ),
    );
}
|
||||
|
||||
/// Builds a pair of chaos-wrapped in-memory stores for failure-injection
/// testing. Currently unused — see the commented-out call in `main`.
///
/// The health store gets 10% timeout / 10% failure odds with random delays
/// bounded by the heartbeat interval; the cluster store gets 5% / 5% with
/// delays bounded by the failover timeout.
fn get_chaos_store(
    heartbeat_interval: &Duration,
    failover_timeout: &Duration,
) -> (
    Arc<ChaosKvStore<InMemoryKvStore>>,
    Arc<ChaosKvStore<InMemoryKvStore>>,
) {
    let health_kv = Arc::new(ChaosKvStore::new(
        InMemoryKvStore::new(),
        10,
        10,
        heartbeat_interval.as_millis().try_into().unwrap(),
    ));
    let cluster_kv = Arc::new(ChaosKvStore::new(
        InMemoryKvStore::new(),
        5,
        5,
        failover_timeout.as_millis().try_into().unwrap(),
    ));

    (health_kv, cluster_kv)
}
|
||||
|
||||
async fn get_local_nats_store() -> Arc<NatsKvStore> {
|
||||
let client = async_nats::connect("localhost").await.unwrap();
|
||||
let jetstream = async_nats::jetstream::new(client);
|
||||
let kv = jetstream
|
||||
.create_key_value(async_nats::jetstream::kv::Config {
|
||||
bucket: "kv".to_string(),
|
||||
history: 10,
|
||||
..Default::default()
|
||||
})
|
||||
.await
|
||||
.unwrap();
|
||||
let status = kv.status().await.unwrap();
|
||||
println!("status: {:?}", status);
|
||||
|
||||
Arc::new(NatsKvStore::new(kv))
|
||||
}
|
||||
142
harmony_agent/src/store/chaos.rs
Normal file
142
harmony_agent/src/store/chaos.rs
Normal file
@@ -0,0 +1,142 @@
|
||||
use async_trait::async_trait;
|
||||
use log::{debug, trace, warn};
|
||||
use serde_json::Value;
|
||||
use std::sync::Arc;
|
||||
use tokio::time::Duration;
|
||||
|
||||
use crate::store::SubscriptionCallback;
|
||||
|
||||
use super::{KvStore, KvStoreError};
|
||||
|
||||
/// A chaos testing KV store that randomly times out or fails
/// Wraps another KvStore implementation and adds random failures
#[derive(Clone)]
pub struct ChaosKvStore<T: KvStore> {
    // The real store every call delegates to after chaos injection.
    inner: Arc<T>,
    // Chance (0-100) that a call hangs for an effectively infinite duration.
    timeout_probability_percent: u32,
    // Chance (0-100) that a call returns a KvStoreError::Unknown.
    failure_probability_percent: u32,
    // Upper bound (exclusive) of the random per-call delay, in milliseconds.
    max_delay_ms: u64,
}
|
||||
|
||||
impl<T: KvStore> ChaosKvStore<T> {
|
||||
pub fn new(
|
||||
inner: T,
|
||||
timeout_probability_percent: u32,
|
||||
failure_probability_percent: u32,
|
||||
max_delay_ms: u64,
|
||||
) -> Self {
|
||||
Self {
|
||||
inner: Arc::new(inner),
|
||||
timeout_probability_percent,
|
||||
failure_probability_percent,
|
||||
max_delay_ms,
|
||||
}
|
||||
}
|
||||
|
||||
async fn maybe_chaos(&self) -> Result<(), KvStoreError> {
|
||||
trace!("Calculating chaos");
|
||||
// Random delay
|
||||
let delay = getrandom::u64().unwrap() % self.max_delay_ms;
|
||||
let delay = Duration::from_millis(delay);
|
||||
trace!("Sleeping until chaos maybe happens {delay:?}");
|
||||
tokio::time::sleep(delay).await;
|
||||
|
||||
// Random failure
|
||||
let failure_random = getrandom::u32().unwrap() % 100;
|
||||
if failure_random < self.failure_probability_percent {
|
||||
warn!(
|
||||
"Chaos causes an error : {failure_random} < {}",
|
||||
self.failure_probability_percent
|
||||
);
|
||||
return Err(KvStoreError::Unknown(format!(
|
||||
"Randomly failed thanks to chaos store with {}% chances, got {}",
|
||||
self.failure_probability_percent, failure_random
|
||||
)));
|
||||
}
|
||||
|
||||
// Random timeout (simulated as a very long delay)
|
||||
let failure_random = getrandom::u32().unwrap() % 100;
|
||||
if failure_random < self.timeout_probability_percent {
|
||||
warn!(
|
||||
"Chaos caused a timeout : {failure_random} < {}",
|
||||
self.failure_probability_percent
|
||||
);
|
||||
tokio::time::sleep(Duration::from_secs(189754678456784560)).await;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
impl<T: KvStore + Send + Sync> KvStore for ChaosKvStore<T> {
    // Every method first runs maybe_chaos() — which may delay, error out, or
    // hang — and only then delegates to the wrapped store unchanged.
    async fn get(&self, key: &str) -> Result<super::KvResult, KvStoreError> {
        self.maybe_chaos().await?;
        self.inner.get(key).await
    }

    async fn get_revision(
        &self,
        key: &str,
        expected_seq: u64,
    ) -> Result<super::KvResult, KvStoreError> {
        self.maybe_chaos().await?;
        self.inner.get_revision(key, expected_seq).await
    }

    async fn set_strict(
        &self,
        key: &str,
        value: Value,
        expected_sequence: u64,
    ) -> Result<u64, KvStoreError> {
        self.maybe_chaos().await?;
        self.inner.set_strict(key, value, expected_sequence).await
    }

    async fn subscribe(
        &self,
        key: &str,
        callback: SubscriptionCallback,
    ) -> Result<(), KvStoreError> {
        // Note: chaos applies to establishing the subscription, not to the
        // events the inner store later delivers through `callback`.
        self.maybe_chaos().await?;
        self.inner.subscribe(key, callback).await
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::store::InMemoryKvStore;
    use serde_json::json;

    /// With all chaos probabilities at 0 and a 1ms delay bound, the wrapper
    /// must behave exactly like the inner store: the first write gets
    /// sequence 1 and the value round-trips through get().
    #[tokio::test]
    async fn test_chaos_store_with_no_chaos() {
        let inner = InMemoryKvStore::new();
        let chaos = ChaosKvStore::new(inner, 0, 0, 1);

        let value = json!({"test": "value"});
        let result = chaos.set_strict("key", value.clone(), 0).await.unwrap();
        assert_eq!(result, 1);

        let retrieved = chaos.get("key").await.unwrap();
        assert_eq!(retrieved.value, Some(value));
    }

    /// A max delay of 100ms must keep the call under ~150ms total.
    #[tokio::test]
    async fn test_chaos_store_with_delay() {
        let inner = InMemoryKvStore::new();
        let chaos = ChaosKvStore::new(inner, 0, 0, 100);

        let start = tokio::time::Instant::now();
        let value = json!({"test": "value"});
        chaos.set_strict("key", value, 0).await.unwrap();
        let elapsed = start.elapsed();

        // NOTE(review): this asserts an *upper bound* on total time; it does
        // not prove a delay occurred (the original "Should have some delay"
        // comment was misleading — the random delay may well be 0).
        assert!(
            elapsed.as_millis() < 150,
            "Should complete within reasonable time"
        );
    }
}
|
||||
196
harmony_agent/src/store/memory.rs
Normal file
196
harmony_agent/src/store/memory.rs
Normal file
@@ -0,0 +1,196 @@
|
||||
use async_trait::async_trait;
|
||||
use log::{debug, trace};
|
||||
use serde_json::Value;
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
use std::time::{SystemTime, UNIX_EPOCH};
|
||||
use tokio::sync::RwLock;
|
||||
|
||||
use crate::store::SubscriptionCallback;
|
||||
|
||||
use super::{KvMetadata, KvResult, KvStore, KvStoreError};
|
||||
|
||||
/// An in-memory KV store that guarantees ordering like NATS JetStream
/// Each key maintains a full history of all writes, where the sequence number
/// is the length of the history (1-indexed)
#[derive(Clone)]
pub struct InMemoryKvStore {
    // key -> ordered write history of (value, timestamp-in-ms) pairs.
    // An entry's 1-indexed position in the Vec is its sequence number.
    // Cloned stores share the same map through the Arc, so clones see
    // each other's writes.
    data: Arc<RwLock<HashMap<String, Vec<(Value, u64)>>>>,
}
|
||||
|
||||
impl InMemoryKvStore {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
data: Arc::new(RwLock::new(HashMap::new())),
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the latest sequence number for a key (length of history)
|
||||
pub async fn get_seq(&self, key: &str) -> Option<u64> {
|
||||
self.data.read().await.get(key).map(|vec| vec.len() as u64)
|
||||
}
|
||||
|
||||
/// Get the value at a specific revision for a key
|
||||
pub async fn get_revision(&self, key: &str, seq: u64) -> Result<KvResult, KvStoreError> {
|
||||
let data = self.data.read().await;
|
||||
let entries = data
|
||||
.get(key)
|
||||
.ok_or_else(|| KvStoreError::KeyNotAvailable(key.to_string()))?;
|
||||
|
||||
// Sequence numbers are 1-indexed, so seq must be >= 1 and <= len()
|
||||
if seq == 0 || seq > entries.len() as u64 {
|
||||
return Err(KvStoreError::KeyNotAvailable(key.to_string()));
|
||||
}
|
||||
|
||||
let (value, timestamp) = entries[seq as usize - 1].clone();
|
||||
|
||||
Ok(KvResult {
|
||||
value: Some(value.clone()),
|
||||
metadata: KvMetadata {
|
||||
timestamp,
|
||||
sequence: seq,
|
||||
},
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for InMemoryKvStore {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl KvStore for InMemoryKvStore {
|
||||
async fn get_revision(&self, key: &str, expected_seq: u64) -> Result<KvResult, KvStoreError> {
|
||||
self.get_revision(key, expected_seq).await
|
||||
}
|
||||
|
||||
async fn get(&self, key: &str) -> Result<KvResult, KvStoreError> {
|
||||
let data = self.data.read().await;
|
||||
let entries = data
|
||||
.get(key)
|
||||
.ok_or_else(|| KvStoreError::KeyNotAvailable(key.to_string()))?;
|
||||
|
||||
let (value, timestamp) = entries.last().unwrap();
|
||||
|
||||
Ok(KvResult {
|
||||
value: Some(value.clone()),
|
||||
metadata: KvMetadata {
|
||||
timestamp: *timestamp,
|
||||
sequence: entries.len() as u64,
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
async fn set_strict(
|
||||
&self,
|
||||
key: &str,
|
||||
value: Value,
|
||||
expected_sequence: u64,
|
||||
) -> Result<u64, KvStoreError> {
|
||||
// Check current sequence (length of history for this key)
|
||||
let data = self.data.read().await;
|
||||
// This implemenetation does not seem to match the NATS sequence. In nats the
|
||||
// sequence updates one counter per bucket. This impl creates a counter per key
|
||||
let current_sequence = data.get(key).map(|vec| vec.len() as u64).unwrap_or(0);
|
||||
drop(data);
|
||||
|
||||
// Verify expected sequence matches
|
||||
if current_sequence != expected_sequence {
|
||||
trace!("{current_sequence} != {expected_sequence}");
|
||||
return Err(KvStoreError::WrongLastRevision);
|
||||
}
|
||||
|
||||
// Get current timestamp
|
||||
let timestamp = SystemTime::now()
|
||||
.duration_since(UNIX_EPOCH)
|
||||
.expect("Time went backwards")
|
||||
.as_millis() as u64;
|
||||
|
||||
// Append to the history
|
||||
let mut data = self.data.write().await;
|
||||
data.entry(key.to_string())
|
||||
.or_insert_with(Vec::new)
|
||||
.push((value.clone(), timestamp));
|
||||
|
||||
let new_seq = data.get(key).map(|vec| vec.len() as u64).unwrap_or(0);
|
||||
|
||||
debug!(
|
||||
"Successfully inserted {key}(rev#{new_seq}) : {value}",
|
||||
value = value.to_string()
|
||||
);
|
||||
|
||||
Ok(new_seq)
|
||||
}
|
||||
|
||||
async fn subscribe(
|
||||
&self,
|
||||
key: &str,
|
||||
callback: SubscriptionCallback,
|
||||
) -> Result<(), KvStoreError> {
|
||||
// For now, subscribe just returns the current value
|
||||
// In a real implementation, this would return a stream of updates
|
||||
self.get(key).await;
|
||||
todo!() // register callback and call it when key is set ?
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Round-trip a single value and check the first revision is 1.
    #[tokio::test]
    async fn test_memory_store_basic() {
        let store = InMemoryKvStore::new();

        let payload = json!({"status": "healthy"});
        let seq = store
            .set_strict("test_key", payload.clone(), 0)
            .await
            .unwrap();
        assert_eq!(seq, 1);

        let fetched = store.get("test_key").await.unwrap();
        assert_eq!(fetched.value, Some(payload));
        assert_eq!(fetched.metadata.sequence, 1);
    }

    /// Consecutive writes to the same key must yield increasing sequences.
    #[tokio::test]
    async fn test_memory_store_sequence_numbers() {
        let store = InMemoryKvStore::new();

        let first = store.set_strict("key1", json!("value1"), 0).await.unwrap();
        let second = store.set_strict("key1", json!("value2"), 1).await.unwrap();

        assert!(second > first, "Sequence numbers should increment");
    }

    /// Reading a key that was never written reports KeyNotAvailable.
    #[tokio::test]
    async fn test_memory_store_key_not_found() {
        let store = InMemoryKvStore::new();
        assert!(matches!(
            store.get("nonexistent").await,
            Err(KvStoreError::KeyNotAvailable(_))
        ));
    }

    /// Compare-and-set: a stale expected sequence must be rejected.
    #[tokio::test]
    async fn test_memory_store_strict_ordering() {
        let store = InMemoryKvStore::new();

        // Two writes with the correct expected sequences succeed.
        assert_eq!(store.set_strict("key", json!("value1"), 0).await.unwrap(), 1);
        assert_eq!(store.set_strict("key", json!("value2"), 1).await.unwrap(), 2);

        // Reusing sequence 1 after the key advanced to 2 must fail.
        let stale = store.set_strict("key", json!("value3"), 1).await;
        assert!(matches!(stale, Err(KvStoreError::WrongLastRevision)));
    }
}
|
||||
120
harmony_agent/src/store/mod.rs
Normal file
120
harmony_agent/src/store/mod.rs
Normal file
@@ -0,0 +1,120 @@
|
||||
use async_trait::async_trait;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::Value;
|
||||
use thiserror::Error;
|
||||
|
||||
/// Handle for managing active subscriptions
// NOTE(review): nothing in this module uses this type — `subscribe` returns
// `()` rather than a handle. Presumably intended for future cancellation
// support; confirm before building on it.
#[derive(Debug, Clone)]
pub struct SubscriptionHandle {
    // Identifier of the subscription within its store.
    id: usize,
    // NOTE(review): PhantomData<()> is a no-op marker with no effect on
    // layout or variance — possibly a placeholder for a future type
    // parameter; confirm intent.
    _phantom: std::marker::PhantomData<()>,
}
|
||||
|
||||
/// Metadata returned by the KV store for all operations
/// Contains timing and ordering information set by the store
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
pub struct KvMetadata {
    /// Timestamp set by the store (milliseconds since UNIX epoch)
    pub timestamp: u64,
    /// Sequence number for strict ordering guarantees
    /// (in the in-memory store this is the per-key history length;
    /// in NATS it is the entry revision).
    pub sequence: u64,
}
|
||||
|
||||
/// Result returned by KV store operations
/// Contains both the value (if any) and store metadata
#[derive(Debug, Clone)]
pub struct KvResult {
    /// The value from the store (None if key doesn't exist)
    pub value: Option<Value>,
    /// Store-provided metadata (timestamp, sequence)
    pub metadata: KvMetadata,
}
|
||||
|
||||
/// Callback type for subscription updates
/// Callback receives: key, new value (None if deleted), and metadata
/// Boxed so implementations can store and invoke it across threads
/// (`Send + Sync`).
pub type SubscriptionCallback = Box<dyn Fn(String, Option<Value>, KvMetadata) + Send + Sync>;
|
||||
|
||||
/// Errors returned by [`KvStore`] implementations.
#[derive(Error, Debug)]
pub enum KvStoreError {
    /// Lost contact with the backing store; wraps the underlying I/O error.
    #[error("data store disconnected")]
    Disconnect(#[from] std::io::Error),
    /// The key is not valid for the backing store.
    #[error("invalid key")]
    InvalidKey,
    /// The operation did not complete in time.
    #[error("operation timed out")]
    Timeout,
    /// The key (or the requested revision of it) does not exist.
    #[error("the data for key `{0}` is not available")]
    KeyNotAvailable(String),
    /// A stored value could not be serialized to / deserialized from JSON.
    #[error("Failed to deserialize value to json. Error {0} , value: {1}", .deserialization_error, .value)]
    DeserializationFailed {
        // Message from serde_json describing the failure.
        deserialization_error: String,
        // The offending raw value, lossily decoded for diagnostics.
        value: String,
    },
    /// Compare-and-set failed: the caller's expected sequence number did not
    /// match the store's current sequence for the key.
    #[error("Strict ordering violation, wrong last sequence number")]
    WrongLastRevision,
    /// Catch-all for errors that fit no other variant.
    #[error("unknown data store error {0}")]
    Unknown(String),
}
|
||||
|
||||
#[async_trait]
pub trait KvStore {
    /// Get a value from the store
    ///
    /// # Returns
    /// - `Ok(KvResult)`: Contains the value and metadata (timestamp, sequence)
    /// - `Err(KeyNotAvailable)`: If the key doesn't exist
    async fn get(&self, key: &str) -> Result<KvResult, KvStoreError>;

    /// Get the value stored at a specific revision of `key`
    ///
    /// # Returns
    /// - `Ok(KvResult)`: The value and metadata recorded at revision `expected_seq`
    /// - `Err(KvStoreError)`: If the key or the requested revision doesn't exist
    async fn get_revision(&self, key: &str, expected_seq: u64) -> Result<KvResult, KvStoreError>;

    /// Strict set operation with compare-and-set semantics
    ///
    /// Sets the value only if the current sequence number matches `expected_sequence`.
    /// This provides strict ordering guarantees needed for the failover algorithm.
    ///
    /// # Parameters
    /// - `key`: The key to set
    /// - `value`: The value to store
    /// - `expected_sequence`: The sequence number we expect the key to currently have.
    ///   Use 0 for the first write to a new key.
    ///
    /// # Returns
    /// - `Ok(u64)`: Returns the new sequence number
    /// - `Err(KvStoreError)`: If another write happened (current != expected)
    ///
    /// # Example Use Case
    /// For NATS JetStream, this maps to the conditional update operation that ensures
    /// only one agent can successfully promote to primary.
    async fn set_strict(
        &self,
        key: &str,
        value: Value,
        expected_sequence: u64,
    ) -> Result<u64, KvStoreError>;

    /// Subscribe to updates for a key
    ///
    /// # Parameters
    /// - `key`: The key to subscribe to
    /// - `callback`: Function to call on each update with key, value, and metadata
    ///
    /// # Returns
    /// - `Ok(())`: Subscription established successfully
    /// - `Err(KvStoreError)`: Subscription failed
    ///
    /// Note: For JetStream, this should use watch() API. Updates will invoke the callback
    /// asynchronously in the background.
    async fn subscribe(
        &self,
        key: &str,
        callback: SubscriptionCallback, // TODO this should return an iterator instead of taking a
        // callback
    ) -> Result<(), KvStoreError>;
}
|
||||
|
||||
mod chaos;
|
||||
mod memory;
|
||||
mod nats;
|
||||
|
||||
pub use chaos::ChaosKvStore;
|
||||
pub use memory::InMemoryKvStore;
|
||||
pub use nats::NatsKvStore;
|
||||
179
harmony_agent/src/store/nats.rs
Normal file
179
harmony_agent/src/store/nats.rs
Normal file
@@ -0,0 +1,179 @@
|
||||
use async_nats::jetstream::kv::{Store, UpdateError};
|
||||
use async_trait::async_trait;
|
||||
use log::{debug, error, trace};
|
||||
use serde_json::Value;
|
||||
|
||||
use crate::store::SubscriptionCallback;
|
||||
|
||||
use super::{KvMetadata, KvResult, KvStore, KvStoreError};
|
||||
|
||||
/// NATS JetStream-backed KV store
pub struct NatsKvStore {
    // Handle to the underlying JetStream KV bucket; every trait operation
    // below delegates to it.
    store: Store,
}
|
||||
|
||||
impl NatsKvStore {
|
||||
pub fn new(store: Store) -> Self {
|
||||
Self { store }
|
||||
}
|
||||
|
||||
pub async fn create(
|
||||
client: async_nats::Client,
|
||||
bucket_name: &str,
|
||||
history_size: i64,
|
||||
) -> Result<Self, Box<dyn std::error::Error>> {
|
||||
let jetstream = async_nats::jetstream::new(client);
|
||||
|
||||
debug!("Creating NATS KV bucket: {}", bucket_name);
|
||||
let store = jetstream
|
||||
.create_key_value(async_nats::jetstream::kv::Config {
|
||||
bucket: bucket_name.to_string(),
|
||||
history: history_size,
|
||||
..Default::default()
|
||||
})
|
||||
.await
|
||||
.map_err(|e| {
|
||||
error!(
|
||||
"Failed to initialize NATS KV bucket '{}': {}",
|
||||
bucket_name, e
|
||||
);
|
||||
e
|
||||
})?;
|
||||
|
||||
Ok(Self::new(store))
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl KvStore for NatsKvStore {
|
||||
async fn get_revision(&self, key: &str, expected_seq: u64) -> Result<KvResult, KvStoreError> {
|
||||
let entry = self
|
||||
.store
|
||||
.entry_for_revision(key, expected_seq)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
error!("NATS get failed for key '{}': {}", key, e);
|
||||
KvStoreError::Disconnect(std::io::Error::new(
|
||||
std::io::ErrorKind::Other,
|
||||
e.to_string(),
|
||||
))
|
||||
})?;
|
||||
|
||||
if entry.is_none() {
|
||||
return Err(KvStoreError::KeyNotAvailable(key.to_string()));
|
||||
}
|
||||
|
||||
let entry = entry.unwrap();
|
||||
let value: Value = serde_json::from_slice(&entry.value).map_err(|e| {
|
||||
KvStoreError::DeserializationFailed {
|
||||
deserialization_error: e.to_string(),
|
||||
value: String::from_utf8_lossy(&entry.value).to_string(),
|
||||
}
|
||||
})?;
|
||||
|
||||
// Extract metadata from NATS entry
|
||||
// Using unix_timestamp_nanos / 1_000_000 to get milliseconds from OffsetDateTime
|
||||
let timestamp = (entry.created.unix_timestamp_nanos() / 1_000_000) as u64;
|
||||
|
||||
let metadata = KvMetadata {
|
||||
timestamp,
|
||||
sequence: entry.revision,
|
||||
};
|
||||
|
||||
Ok(KvResult {
|
||||
value: Some(value),
|
||||
metadata,
|
||||
})
|
||||
}
|
||||
|
||||
async fn get(&self, key: &str) -> Result<KvResult, KvStoreError> {
|
||||
let entry = self.store.entry(key).await.map_err(|e| {
|
||||
error!("NATS get failed for key '{}': {}", key, e);
|
||||
KvStoreError::Disconnect(std::io::Error::new(
|
||||
std::io::ErrorKind::Other,
|
||||
e.to_string(),
|
||||
))
|
||||
})?;
|
||||
|
||||
if entry.is_none() {
|
||||
return Err(KvStoreError::KeyNotAvailable(key.to_string()));
|
||||
}
|
||||
|
||||
let entry = entry.unwrap();
|
||||
let value: Value = serde_json::from_slice(&entry.value).map_err(|e| {
|
||||
KvStoreError::DeserializationFailed {
|
||||
deserialization_error: e.to_string(),
|
||||
value: String::from_utf8_lossy(&entry.value).to_string(),
|
||||
}
|
||||
})?;
|
||||
|
||||
// Extract metadata from NATS entry
|
||||
// Using unix_timestamp_nanos / 1_000_000 to get milliseconds from OffsetDateTime
|
||||
let timestamp = (entry.created.unix_timestamp_nanos() / 1_000_000) as u64;
|
||||
|
||||
let metadata = KvMetadata {
|
||||
timestamp,
|
||||
sequence: entry.revision,
|
||||
};
|
||||
|
||||
Ok(KvResult {
|
||||
value: Some(value),
|
||||
metadata,
|
||||
})
|
||||
}
|
||||
|
||||
async fn set_strict(
|
||||
&self,
|
||||
key: &str,
|
||||
value: Value,
|
||||
expected_sequence: u64,
|
||||
) -> Result<u64, KvStoreError> {
|
||||
trace!(
|
||||
"Nats set strict {key} (#{expected_sequence}) : {}",
|
||||
value.to_string()
|
||||
);
|
||||
let bytes =
|
||||
serde_json::to_vec(&value).map_err(|e| KvStoreError::DeserializationFailed {
|
||||
deserialization_error: e.to_string(),
|
||||
value: value.to_string(),
|
||||
})?;
|
||||
|
||||
// Use update() for CAS semantics (Compare-And-Set)
|
||||
// This ensures we only write if the revision matches expected_sequence
|
||||
let revision = self
|
||||
.store
|
||||
.update(&key, bytes.into(), expected_sequence)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
// FIXME this is ugly, we should have a clean KvStoreError containing
|
||||
// proper information from nats instead
|
||||
error!("NATS update failed for key '{}': {}", key, e);
|
||||
e
|
||||
})?;
|
||||
|
||||
Ok(revision)
|
||||
}
|
||||
|
||||
async fn subscribe(
|
||||
&self,
|
||||
key: &str,
|
||||
callback: SubscriptionCallback, // TODO this should return an iterator instead of taking a
|
||||
) -> Result<(), KvStoreError> {
|
||||
todo!()
|
||||
}
|
||||
}
|
||||
|
||||
impl From<UpdateError> for KvStoreError {
|
||||
fn from(value: UpdateError) -> Self {
|
||||
match value.kind() {
|
||||
async_nats::jetstream::kv::UpdateErrorKind::InvalidKey => KvStoreError::InvalidKey,
|
||||
async_nats::jetstream::kv::UpdateErrorKind::TimedOut => KvStoreError::Timeout,
|
||||
async_nats::jetstream::kv::UpdateErrorKind::WrongLastRevision => {
|
||||
KvStoreError::WrongLastRevision
|
||||
}
|
||||
async_nats::jetstream::kv::UpdateErrorKind::Other => KvStoreError::Disconnect(
|
||||
std::io::Error::new(std::io::ErrorKind::Other, "NATS update error"),
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
39
harmony_agent/src/workflow/mod.rs
Normal file
39
harmony_agent/src/workflow/mod.rs
Normal file
@@ -0,0 +1,39 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use crate::agent::AgentConfig;
|
||||
use async_trait::async_trait;
|
||||
|
||||
pub mod primary;
|
||||
pub mod replica;
|
||||
|
||||
/// Trait that defines how a workflow (Primary or Replica) handles heartbeat events
#[async_trait]
pub trait HeartbeatWorkflow: Send + Sync {
    /// Handle a successful heartbeat
    ///
    /// Returns `Some(new_state)` when the workflow wants the caller to
    /// publish an updated cluster state, `None` otherwise.
    async fn handle_heartbeat_success(
        &mut self,
        cluster_state: Option<&crate::agent::ClusterStateData>,
        agent_config: &AgentConfig,
    ) -> Option<crate::agent::ClusterStateData>;

    /// Handle a failed heartbeat
    async fn handle_heartbeat_failure(
        &mut self,
        cluster_state: Option<&crate::agent::ClusterStateData>,
    );

    /// Called once when the agent starts, with the last known cluster state
    /// (if any), so the workflow can reconcile before the heartbeat loop runs.
    // NOTE(review): this method names the type as
    // `crate::agent::heartbeat::ClusterStateData` while the two methods above
    // use `crate::agent::ClusterStateData` — presumably the latter is a
    // re-export of the former; confirm they are the same type.
    async fn on_startup(
        &self,
        cluster_state: Option<&crate::agent::heartbeat::ClusterStateData>,
        agent_config: &AgentConfig,
    );

    /// Get the current state name for logging (also used for heartbeat status)
    fn state_name(&self) -> &'static str;

    /// Get current consecutive successes
    fn consecutive_successes(&self) -> usize;

    /// Get current consecutive failures
    fn consecutive_failures(&self) -> usize;
}
|
||||
330
harmony_agent/src/workflow/primary.rs
Normal file
330
harmony_agent/src/workflow/primary.rs
Normal file
@@ -0,0 +1,330 @@
|
||||
use async_trait::async_trait;
|
||||
use log::{debug, info, trace, warn};
|
||||
|
||||
use crate::{
|
||||
agent::{AgentConfig, DeploymentConfig},
|
||||
workflow::HeartbeatWorkflow,
|
||||
};
|
||||
|
||||
/// Lifecycle states for an agent running the primary workflow.
#[derive(Debug, Clone, PartialEq)]
pub enum PrimaryState {
    Initializing,
    Healthy,
    Failed,
    Fenced,
    Yielding,
}

impl PrimaryState {
    /// Human-readable state label used in logs and heartbeat status.
    pub fn name(&self) -> &'static str {
        match self {
            Self::Initializing => "Primary:Initializing",
            Self::Healthy => "Primary:Healthy",
            Self::Failed => "Primary:Failed",
            Self::Fenced => "Primary:Fenced",
            Self::Yielding => "Primary:Yielding",
        }
    }
}
|
||||
|
||||
/// State machine driving an agent that is the cluster's desired primary.
pub struct PrimaryWorkflow {
    // Current position in the primary state machine.
    state: PrimaryState,
    // Heartbeat streak counters; each success resets the failure count and
    // vice versa (see handle_heartbeat_success / handle_heartbeat_failure).
    consecutive_successes: usize,
    consecutive_failures: usize,

    // TODO these thresholds should not be copied into the workflow struct. They are configuration
    // level and should always be read from the context passed to the workflow functions
    success_threshold: usize,
    failure_threshold: usize,

    // TODO not sure if this should be known by the workflow or passed in the context to function
    // calls or just completely handled by the agent ?
    deployment_config: DeploymentConfig,
}
|
||||
|
||||
impl PrimaryWorkflow {
|
||||
pub fn new(
|
||||
success_threshold: usize,
|
||||
failure_threshold: usize,
|
||||
deployment_config: DeploymentConfig,
|
||||
) -> Self {
|
||||
Self {
|
||||
state: PrimaryState::Initializing,
|
||||
consecutive_successes: 0,
|
||||
consecutive_failures: 0,
|
||||
success_threshold,
|
||||
failure_threshold,
|
||||
deployment_config,
|
||||
}
|
||||
}
|
||||
|
||||
fn transition_to(&mut self, new_state: PrimaryState) {
|
||||
if self.state != new_state {
|
||||
info!(
|
||||
"State transition: {} -> {}",
|
||||
self.state.name(),
|
||||
new_state.name()
|
||||
);
|
||||
self.state = new_state;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
impl HeartbeatWorkflow for PrimaryWorkflow {
    /// Log the last known cluster state at startup; deliberately does NOT
    /// fast-track any state transition (see comment in the body).
    // NOTE(review): `agent_config` is currently unused here.
    async fn on_startup(
        &self,
        cluster_state: Option<&crate::agent::ClusterStateData>,
        agent_config: &AgentConfig,
    ) {
        if let Some(state) = cluster_state {
            info!(
                "Startup reconciliation: current primary is {:?}, desired primary is {:?}",
                state.cluster_info.current_primary, state.cluster_info.desired_primary
            );

            // No automatic fast-tracking - agent must earn healthy status
            // through successful heartbeats. This prevents duplicate agents
            // or crashloop agents from incorrectly claiming primary.
        } else {
            debug!("No cluster state on startup, starting from Initializing");
        }
    }
    /// Record a successful heartbeat and advance the state machine.
    ///
    /// Returns `Some(updated_state)` when this agent has just become healthy
    /// and should claim `current_primary` in the cluster state; `None`
    /// otherwise.
    async fn handle_heartbeat_success(
        &mut self,
        cluster_state: Option<&crate::agent::ClusterStateData>,
        agent_config: &AgentConfig,
    ) -> Option<crate::agent::ClusterStateData> {
        trace!(
            "Handling heartbeat success, current counters success {} failures {}",
            self.consecutive_successes, self.consecutive_failures
        );
        // A success extends the success streak and resets the failure streak.
        self.consecutive_successes += 1;
        self.consecutive_failures = 0;

        match self.state {
            PrimaryState::Initializing => {
                if self.consecutive_successes >= self.success_threshold {
                    self.transition_to(PrimaryState::Healthy);
                    // Trigger on_active callback
                    // (spawned so the heartbeat loop isn't blocked by it)
                    let config = self.deployment_config.clone();
                    tokio::spawn(async move {
                        config.on_active().await;
                    });
                    // Claim current_primary only when the cluster state says
                    // we are the desired primary.
                    if let Some(state) = cluster_state
                        && state.cluster_info.desired_primary == agent_config.desired_primary_id
                    {
                        debug!("state {:#?}", state);
                        let mut new_state = state.clone();
                        new_state.cluster_info.current_primary =
                            Some(agent_config.agent_id.clone());
                        return Some(new_state);
                    } else {
                        // NOTE(review): panics at runtime whenever the
                        // threshold is reached without a matching cluster
                        // state — reachable in production as written.
                        todo!(
                            "I cluster_state should not be an option, and we should throw an error when we are running a primary workflow but we are not the desired primary in the cluster state data"
                        );
                    }
                }
                None
            }
            PrimaryState::Failed => {
                if self.consecutive_successes >= self.success_threshold {
                    self.transition_to(PrimaryState::Healthy);
                    let config = self.deployment_config.clone();
                    tokio::spawn(async move {
                        config.on_active().await;
                    });
                }
                // NOTE(review): this arm ALWAYS hits todo!() — even after a
                // successful transition to Healthy above — so recovery from
                // Failed currently panics.
                todo!()
            }
            PrimaryState::Healthy => {
                // Stay healthy
                debug!("Primary staying healthy");
                None
            }
            PrimaryState::Fenced => {
                // Recovery from fenced state
                if self.consecutive_successes >= self.success_threshold {
                    // TODO: Check NATS for current_primary status before recovering
                    info!("Recovered from fenced state, transitioning to yielding");
                    self.transition_to(PrimaryState::Yielding);
                }
                // NOTE(review): unconditional todo!() — same problem as the
                // Failed arm.
                todo!()
            }
            PrimaryState::Yielding => {
                // TODO: Check NATS to see if we can resume as primary
                trace!("Yielding, waiting for demotion handshake");
                todo!()
            }
        }
    }

    /// Record a failed heartbeat; at the failure threshold a Healthy primary
    /// is marked Failed and then immediately Fenced, triggering failover.
    // NOTE(review): `cluster_state` is currently unused here.
    async fn handle_heartbeat_failure(
        &mut self,
        cluster_state: Option<&crate::agent::ClusterStateData>,
    ) {
        // A failure extends the failure streak and resets the success streak.
        self.consecutive_failures += 1;
        self.consecutive_successes = 0;

        match self.state {
            PrimaryState::Healthy => {
                if self.consecutive_failures >= self.failure_threshold {
                    warn!(
                        "Failure threshold reached ({}/{}), transitioning to Failed",
                        self.consecutive_failures, self.failure_threshold
                    );
                    self.transition_to(PrimaryState::Failed);

                    // Immediately fence
                    self.transition_to(PrimaryState::Fenced);
                    // on_failover runs detached so the heartbeat loop
                    // continues while fencing happens.
                    let config = self.deployment_config.clone();
                    tokio::spawn(async move {
                        config.on_failover().await;
                    });
                }
            }
            PrimaryState::Initializing => {
                // Stay in initializing, just accumulate failures
                trace!("Heartbeat failed during initialization");
            }
            PrimaryState::Failed | PrimaryState::Fenced | PrimaryState::Yielding => {
                // Already in a degraded state
                trace!("Heartbeat failed in degraded state: {}", self.state.name());
            }
        }
    }

    /// Current state label for logging / heartbeat status.
    fn state_name(&self) -> &'static str {
        self.state.name()
    }

    /// Current success streak length.
    fn consecutive_successes(&self) -> usize {
        self.consecutive_successes
    }

    /// Current failure streak length.
    fn consecutive_failures(&self) -> usize {
        self.consecutive_failures
    }
}
|
||||
|
||||
#[cfg(test)]
mod test {
    use harmony_types::id::Id;
    use std::time::Duration;

    use crate::agent::{AgentRole, FailoverCNPGConfig};

    use pretty_assertions::assert_eq;

    use super::*;

    /// Below the success threshold, a heartbeat success must not publish a
    /// new cluster state.
    #[tokio::test]
    async fn primary_does_nothing_when_on_heartbeat_success_below_threshold() {
        let (mut primary, cluster_state, agent_config) = default_test_state(2, 2);

        assert!(
            primary
                .handle_heartbeat_success(Some(&cluster_state), &agent_config)
                .await
                .is_none()
        );
    }

    /// At the success threshold the primary claims current_primary in the
    /// returned cluster state.
    #[tokio::test]
    async fn primary_transitions_cluster_state_when_consecutive_success_threshold_reached() {
        let (mut primary, cluster_state, agent_config) = default_test_state(2, 2);

        // The test agent's id is Id::empty(), so that is what it should claim.
        let mut expected_state = cluster_state.clone();
        expected_state.cluster_info.current_primary = Some(Id::empty());

        assert_eq!(
            primary
                .handle_heartbeat_success(Some(&cluster_state), &agent_config)
                .await,
            None
        );
        assert_eq!(
            primary
                .handle_heartbeat_success(Some(&cluster_state), &agent_config)
                .await,
            Some(expected_state)
        );
    }

    /// One failure below the threshold leaves a healthy primary healthy.
    #[tokio::test]
    async fn primary_stays_healthy_below_failure_threshold() {
        let (mut primary, cluster_state, agent_config) = default_test_state(1, 2);

        // Reach healthy (success threshold is 1)
        let _ = primary
            .handle_heartbeat_success(Some(&cluster_state), &agent_config)
            .await;
        assert_eq!(primary.state, PrimaryState::Healthy);

        // One failure below threshold
        primary.handle_heartbeat_failure(Some(&cluster_state)).await;
        assert_eq!(primary.state, PrimaryState::Healthy);
        assert_eq!(primary.consecutive_failures(), 1);
        assert_eq!(primary.consecutive_successes(), 0);
    }

    /// At the failure threshold the primary leaves Healthy and ends up Fenced.
    #[tokio::test]
    async fn primary_transitions_to_failed_at_failure_threshold() {
        let (mut primary, cluster_state, agent_config) = default_test_state(1, 2);

        // Reach healthy
        let _ = primary
            .handle_heartbeat_success(Some(&cluster_state), &agent_config)
            .await;
        assert_eq!(primary.state, PrimaryState::Healthy);

        // First failure, still healthy
        primary.handle_heartbeat_failure(Some(&cluster_state)).await;
        assert_eq!(primary.state, PrimaryState::Healthy);
        assert_eq!(primary.consecutive_failures(), 1);

        // Second failure reaches the threshold: the workflow transitions
        // through Failed and is immediately fenced, so the observable end
        // state is Fenced (not Failed).
        primary.handle_heartbeat_failure(Some(&cluster_state)).await;
        assert_eq!(primary.state, PrimaryState::Fenced);
        assert_eq!(primary.consecutive_failures(), 2);
        assert_eq!(primary.consecutive_successes(), 0);
    }

    /// Build a fresh workflow plus matching cluster state and agent config.
    /// All ids are Id::empty(), so this agent IS the desired primary.
    fn default_test_state(
        success_threshold: usize,
        failure_threshold: usize,
    ) -> (PrimaryWorkflow, crate::agent::ClusterStateData, AgentConfig) {
        let cluster_state = crate::agent::ClusterStateData {
            cluster_info: crate::agent::heartbeat::ClusterState {
                cluster_id: Id::empty(),
                current_primary: None,
                desired_primary: Id::empty(),
            },
            metadata: None,
        };

        let agent_config = AgentConfig {
            success_threshold,
            failure_threshold,
            heartbeat_interval: Duration::from_nanos(0),
            failover_timeout: Duration::from_nanos(0),
            deployment_config_unstable: DeploymentConfig::FailoverPostgreSQL(FailoverCNPGConfig {
                cnpg_cluster_name: "test".to_string(),
            }),
            nats_url: String::new(),
            nats_creds_path: None,
            agent_id: Id::empty(),
            cluster_id: Id::empty(),
            desired_primary_id: Id::empty(),
            role: AgentRole::Primary,
        };

        let primary = PrimaryWorkflow::new(
            agent_config.success_threshold,
            agent_config.failure_threshold,
            agent_config.deployment_config_unstable.clone(),
        );

        (primary, cluster_state, agent_config)
    }
}
|
||||
279
harmony_agent/src/workflow/replica.rs
Normal file
279
harmony_agent/src/workflow/replica.rs
Normal file
@@ -0,0 +1,279 @@
|
||||
use async_trait::async_trait;
|
||||
use harmony_types::id::Id;
|
||||
use log::{debug, error, info, trace, warn};
|
||||
use std::time::Duration;
|
||||
use tokio::sync::RwLock;
|
||||
|
||||
use crate::agent::{AgentConfig, AgentHeartbeat};
|
||||
use crate::workflow::HeartbeatWorkflow;
|
||||
|
||||
/// Tracks the last heartbeat observed for a single agent.
#[derive(Debug, Clone)]
pub struct HeartbeatState {
    /// The agent whose heartbeats are being watched.
    pub agent_id: Id,
    /// Sequence number of the last heartbeat seen; `None` until the first
    /// heartbeat is observed.
    pub last_seq: Option<u64>,
}
|
||||
|
||||
impl HeartbeatState {
|
||||
pub fn watch(agent_id: Id) -> Self {
|
||||
Self {
|
||||
agent_id,
|
||||
last_seq: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// The replica's local view of the cluster it participates in.
#[derive(Debug, Clone)]
pub struct ClusterState {
    /// The cluster being watched.
    pub cluster_id: Id,
    /// The agent currently acting as primary, if known; `None` until
    /// observed.
    pub current_primary: Option<Id>,
}
|
||||
|
||||
impl ClusterState {
|
||||
pub fn watch(cluster_id: Id) -> Self {
|
||||
Self {
|
||||
cluster_id,
|
||||
current_primary: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Lifecycle states for an agent running the replica workflow.
#[derive(Debug, Clone, PartialEq)]
pub enum ReplicaState {
    Initializing,
    Watching,
    Promoting,
    PromotionFailed,
    Leader,
    Demoting,
    Failed,
}

impl ReplicaState {
    /// Human-readable state label used in logs and heartbeat status.
    pub fn name(&self) -> &'static str {
        match self {
            Self::Initializing => "Replica:Initializing",
            Self::Watching => "Replica:Watching",
            Self::Promoting => "Replica:Promoting",
            Self::PromotionFailed => "Replica:PromotionFailed",
            Self::Leader => "Replica:Leader",
            Self::Demoting => "Replica:Demoting",
            Self::Failed => "Replica:Failed",
        }
    }
}
|
||||
|
||||
/// State machine driving an agent that replicates the primary and may be
/// promoted when the primary's heartbeats go stale.
pub struct ReplicaWorkflow {
    // Current position in the replica state machine.
    state: ReplicaState,
    // Watch state for this agent's own heartbeats.
    heartbeat_state: HeartbeatState,
    // Watch state for the primary's heartbeats.
    primary_state: HeartbeatState,
    // This agent's view of the cluster (id + observed primary).
    cluster_state: ClusterState,
    // Heartbeat streak counters.
    consecutive_successes: usize,
    consecutive_failures: usize,
    // Thresholds for state transitions (copied from AgentConfig).
    success_threshold: usize,
    failure_threshold: usize,
    // Staleness window used by is_primary_stale (ADR-017-3).
    failover_timeout: Duration,
    /// Our own last heartbeat (for timestamp comparison against primary)
    last_my_heartbeat: Option<AgentHeartbeat>,
    /// Last observed primary heartbeat (metadata only, for staleness detection)
    // NOTE(review): an RwLock nested inside a field of a struct that is
    // already mutated through &mut self is unusual — presumably meant to be
    // shared with a watcher task; confirm the intended ownership model.
    last_primary_heartbeat: Option<RwLock<AgentHeartbeat>>,
}
|
||||
|
||||
impl ReplicaWorkflow {
|
||||
pub fn new(
|
||||
success_threshold: usize,
|
||||
failure_threshold: usize,
|
||||
cluster_id: Id,
|
||||
primary_id: Id,
|
||||
my_id: Id,
|
||||
failover_timeout: Duration,
|
||||
) -> Self {
|
||||
Self {
|
||||
state: ReplicaState::Initializing,
|
||||
consecutive_successes: 0,
|
||||
consecutive_failures: 0,
|
||||
success_threshold,
|
||||
failure_threshold,
|
||||
failover_timeout,
|
||||
cluster_state: ClusterState::watch(cluster_id),
|
||||
primary_state: HeartbeatState::watch(primary_id),
|
||||
heartbeat_state: HeartbeatState::watch(my_id),
|
||||
last_my_heartbeat: None,
|
||||
last_primary_heartbeat: None,
|
||||
}
|
||||
}
|
||||
|
||||
fn transition_to(&mut self, new_state: ReplicaState) {
|
||||
if self.state != new_state {
|
||||
info!(
|
||||
"State transition: {} -> {}",
|
||||
self.state.name(),
|
||||
new_state.name()
|
||||
);
|
||||
self.state = new_state;
|
||||
}
|
||||
}
|
||||
|
||||
/// Check if the primary heartbeat is stale compared to our own
|
||||
/// Per ADR-017-3: primary is stale if (replica_timestamp - primary_timestamp) > failover_timeout
|
||||
async fn is_primary_stale(&mut self) -> bool {
|
||||
if let Some(my_hb) = &self.last_my_heartbeat {
|
||||
if let Some(my_metadata) = &my_hb.metadata {
|
||||
if let Some(primary_hb_ref) = self.last_primary_heartbeat.as_ref() {
|
||||
let primary_hb = primary_hb_ref.read().await;
|
||||
if let Some(primary_metadata) = &primary_hb.metadata {
|
||||
// Calculate time difference: replica_timestamp - primary_timestamp
|
||||
let time_diff_ms = my_metadata
|
||||
.timestamp
|
||||
.saturating_sub(primary_metadata.timestamp);
|
||||
let failover_timeout_ms = self.failover_timeout.as_millis() as u64;
|
||||
|
||||
trace!(
|
||||
"Staleness check: my_ts={}, primary_ts={}, diff={}ms, timeout={}ms",
|
||||
my_metadata.timestamp,
|
||||
primary_metadata.timestamp,
|
||||
time_diff_ms,
|
||||
failover_timeout_ms
|
||||
);
|
||||
|
||||
if time_diff_ms > failover_timeout_ms {
|
||||
info!(
|
||||
"Primary heartbeat stale ({}ms > {}ms), attempting promotion",
|
||||
time_diff_ms, failover_timeout_ms
|
||||
);
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl HeartbeatWorkflow for ReplicaWorkflow {
|
||||
async fn on_startup(
|
||||
&self,
|
||||
cluster_state: Option<&crate::agent::ClusterStateData>,
|
||||
agent_config: &AgentConfig,
|
||||
) {
|
||||
// todo!("not sure if the replica should do anything on startup")
|
||||
}
|
||||
|
||||
async fn handle_heartbeat_success(
|
||||
&mut self,
|
||||
cluster_state: Option<&crate::agent::ClusterStateData>,
|
||||
agent_config: &AgentConfig,
|
||||
) -> Option<crate::agent::ClusterStateData> {
|
||||
trace!(
|
||||
"Handling heartbeat success, current counters success {} failures {}",
|
||||
self.consecutive_successes, self.consecutive_failures
|
||||
);
|
||||
self.consecutive_successes += 1;
|
||||
self.consecutive_failures = 0;
|
||||
|
||||
match self.state {
|
||||
ReplicaState::Initializing => {
|
||||
if self.consecutive_successes >= self.success_threshold {
|
||||
self.transition_to(ReplicaState::Watching);
|
||||
}
|
||||
None
|
||||
}
|
||||
ReplicaState::Watching => {
|
||||
// TODO: Check primary staleness from NATS
|
||||
trace!("Replica watching primary");
|
||||
if self.is_primary_stale().await {
|
||||
panic!("Found stale primary, launching promotion");
|
||||
}
|
||||
debug!("perform the replica watch actions :
|
||||
- if a primary exists in the cluster (cluster_state.current_primary == expected_primary)
|
||||
- check the last primary heartbeat kv timestamp
|
||||
- compare it with our latest kv heartbeat
|
||||
- if longer than failover timeout, launch promotion (we assume that primary has already fenced itself)
|
||||
- launching promotion will change the status of the replica
|
||||
");
|
||||
|
||||
None
|
||||
}
|
||||
ReplicaState::Promoting => {
|
||||
// TODO: Complete promotion attempt
|
||||
trace!("Replica promotion in progress");
|
||||
todo!(
|
||||
"When promoting, a heartbeat failure does not affect promotion unless failure_threshold is reached, a heartbeat success does nothing either"
|
||||
);
|
||||
}
|
||||
ReplicaState::PromotionFailed => {
|
||||
if self.consecutive_successes >= self.success_threshold {
|
||||
self.transition_to(ReplicaState::Watching);
|
||||
}
|
||||
todo!()
|
||||
}
|
||||
ReplicaState::Leader => {
|
||||
// TODO: Check for original primary recovery
|
||||
trace!("Replica acting as leader");
|
||||
todo!()
|
||||
}
|
||||
ReplicaState::Failed => {
|
||||
if self.consecutive_successes >= self.success_threshold {
|
||||
info!("Replica recovered from Failed state, transitioning to Watching");
|
||||
self.transition_to(ReplicaState::Watching);
|
||||
}
|
||||
todo!()
|
||||
}
|
||||
ReplicaState::Demoting => {
|
||||
// TODO: Complete demotion back to watching
|
||||
trace!("Replica demotion in progress");
|
||||
todo!()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn handle_heartbeat_failure(
|
||||
&mut self,
|
||||
cluster_state: Option<&crate::agent::ClusterStateData>,
|
||||
) {
|
||||
self.consecutive_failures += 1;
|
||||
self.consecutive_successes = 0;
|
||||
|
||||
// TODO revisit this. I think we should handle the agent healthiness (checking
|
||||
// consecutive_failures against failure_threshold) separately from handling the cluster
|
||||
// state.
|
||||
//
|
||||
// That said, there might be funny stuff we have to do when the agent reaches the failure
|
||||
// threshold, especially in promoting and demoting statuses.
|
||||
|
||||
match self.state {
|
||||
ReplicaState::Watching | ReplicaState::Initializing => {
|
||||
if self.consecutive_failures >= self.failure_threshold {
|
||||
info!(
|
||||
"Replica exceeded failure threshold ({}/{}), transitioning to Failed",
|
||||
self.consecutive_failures, self.failure_threshold
|
||||
);
|
||||
self.transition_to(ReplicaState::Failed);
|
||||
} else {
|
||||
trace!("Replica heartbeat failed, but below threshold");
|
||||
}
|
||||
}
|
||||
ReplicaState::Promoting
|
||||
| ReplicaState::PromotionFailed
|
||||
| ReplicaState::Leader
|
||||
| ReplicaState::Demoting
|
||||
| ReplicaState::Failed => {
|
||||
trace!("Replica heartbeat failed in state: {}", self.state.name());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn state_name(&self) -> &'static str {
|
||||
self.state.name()
|
||||
}
|
||||
|
||||
fn consecutive_successes(&self) -> usize {
|
||||
self.consecutive_successes
|
||||
}
|
||||
|
||||
fn consecutive_failures(&self) -> usize {
|
||||
self.consecutive_failures
|
||||
}
|
||||
}
|
||||
12
harmony_execution/Cargo.toml
Normal file
12
harmony_execution/Cargo.toml
Normal file
@@ -0,0 +1,12 @@
|
||||
[package]
|
||||
name = "harmony_execution"
|
||||
edition = "2024"
|
||||
version.workspace = true
|
||||
readme.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
thiserror.workspace = true
|
||||
lazy_static.workspace = true
|
||||
directories.workspace = true
|
||||
log.workspace = true
|
||||
470
harmony_execution/src/command.rs
Normal file
470
harmony_execution/src/command.rs
Normal file
@@ -0,0 +1,470 @@
|
||||
use std::io::{BufRead, BufReader};
|
||||
use std::process::{Child, Command, Stdio};
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
|
||||
/// Captured output from a command execution
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct CommandOutput {
|
||||
/// Captured stdout content
|
||||
pub stdout: String,
|
||||
/// Captured stderr content
|
||||
pub stderr: String,
|
||||
/// Exit status of the command
|
||||
pub status: CommandStatus,
|
||||
}
|
||||
|
||||
impl CommandOutput {
|
||||
/// Returns true if the command succeeded
|
||||
pub fn is_success(&self) -> bool {
|
||||
self.status.is_success()
|
||||
}
|
||||
|
||||
/// Formats the complete output for display
|
||||
pub fn format_output(&self) -> String {
|
||||
format!(
|
||||
"Stdout:\n{}\n\nStderr:\n{}",
|
||||
if self.stdout.is_empty() {
|
||||
"<empty>"
|
||||
} else {
|
||||
&self.stdout
|
||||
},
|
||||
if self.stderr.is_empty() {
|
||||
"<empty>"
|
||||
} else {
|
||||
&self.stderr
|
||||
}
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/// Result status of a command execution
|
||||
/// Result status of a command execution
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CommandStatus {
    /// Command executed successfully (exit code 0)
    Success,
    /// Command failed with an exit code
    Failed(i32),
    /// Command was terminated by a signal
    Terminated(i32),
    /// Command execution could not be started
    Error(String),
}

impl CommandStatus {
    /// True only for [`CommandStatus::Success`]; every failure, termination,
    /// or spawn error counts as unsuccessful.
    pub fn is_success(&self) -> bool {
        match self {
            CommandStatus::Success => true,
            CommandStatus::Failed(_)
            | CommandStatus::Terminated(_)
            | CommandStatus::Error(_) => false,
        }
    }
}
|
||||
|
||||
impl From<std::process::ExitStatus> for CommandStatus {
|
||||
fn from(status: std::process::ExitStatus) -> Self {
|
||||
if status.success() {
|
||||
CommandStatus::Success
|
||||
} else if let Some(code) = status.code() {
|
||||
CommandStatus::Failed(code)
|
||||
} else {
|
||||
CommandStatus::Terminated(0) // Signal codes are platform-specific
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Per-line observer invoked for every line read from a stream. `Arc` + `Send`
// + `Sync` so the same callback can be cloned into the reader threads.
type Callback = Arc<dyn Fn(&str) + Send + Sync>;

/// Options for configuring command execution
#[derive(Clone)]
pub struct RunnerOptions {
    /// Whether to print stdout to console in real-time
    pub print_stdout: bool,
    /// Whether to print stderr to console in real-time
    pub print_stderr: bool,
    /// Optional callback for each stdout line
    pub stdout_callback: Callback,
    /// Optional callback for each stderr line
    pub stderr_callback: Callback,
}
|
||||
|
||||
impl RunnerOptions {
|
||||
fn empty_callback() -> Callback {
|
||||
Arc::new(|_| {})
|
||||
}
|
||||
/// Create default options with real-time printing enabled
|
||||
pub fn print_to_console() -> Self {
|
||||
Self {
|
||||
print_stdout: true,
|
||||
print_stderr: true,
|
||||
..Default::default()
|
||||
}
|
||||
}
|
||||
|
||||
/// Create options that capture output silently
|
||||
pub fn silent() -> Self {
|
||||
Self {
|
||||
print_stdout: false,
|
||||
print_stderr: false,
|
||||
..Default::default()
|
||||
}
|
||||
}
|
||||
|
||||
/// Set custom callbacks for stdout and stderr lines
|
||||
pub fn with_callbacks<F1, F2>(mut self, stdout_callback: F1, stderr_callback: F2) -> Self
|
||||
where
|
||||
F1: Fn(&str) + Send + Sync + 'static,
|
||||
F2: Fn(&str) + Send + Sync + 'static,
|
||||
{
|
||||
self.stdout_callback = Arc::new(stdout_callback);
|
||||
self.stderr_callback = Arc::new(stderr_callback);
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for RunnerOptions {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
print_stdout: true,
|
||||
print_stderr: true,
|
||||
stdout_callback: Self::empty_callback(),
|
||||
stderr_callback: Self::empty_callback(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Error type for command execution failures
|
||||
#[derive(Debug)]
|
||||
pub struct CommandError {
|
||||
/// Human-readable error description
|
||||
pub message: String,
|
||||
/// Captured output if execution started
|
||||
pub output: Option<CommandOutput>,
|
||||
}
|
||||
|
||||
impl std::fmt::Display for CommandError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}", self.message)?;
|
||||
if let Some(output) = &self.output {
|
||||
write!(f, "\n{}", output.format_output())?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl std::error::Error for CommandError {}
|
||||
|
||||
/// Runs a command and captures its output while streaming to console
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```
|
||||
/// use harmony_execution::command::{run_command, RunnerOptions};
|
||||
/// use std::process::Command;
|
||||
///
|
||||
/// let output = run_command(
|
||||
/// Command::new("echo").arg("hello"),
|
||||
/// RunnerOptions::print_to_console()
|
||||
/// ).unwrap();
|
||||
/// assert!(output.is_success());
|
||||
/// assert_eq!(output.stdout, "hello\n");
|
||||
/// ```
|
||||
pub fn run_command(
|
||||
command: &mut Command,
|
||||
options: RunnerOptions,
|
||||
) -> Result<CommandOutput, CommandError> {
|
||||
let mut child = command
|
||||
.stdout(Stdio::piped())
|
||||
.stderr(Stdio::piped())
|
||||
.spawn()
|
||||
.map_err(|e| CommandError {
|
||||
message: format!("Failed to spawn command: {}", e),
|
||||
output: None,
|
||||
})?;
|
||||
|
||||
let stdout = child.stdout.take().ok_or_else(|| CommandError {
|
||||
message: "Failed to capture stdout".to_string(),
|
||||
output: None,
|
||||
})?;
|
||||
|
||||
let stderr = child.stderr.take().ok_or_else(|| CommandError {
|
||||
message: "Failed to capture stderr".to_string(),
|
||||
output: None,
|
||||
})?;
|
||||
|
||||
let stdout_reader = BufReader::new(stdout);
|
||||
let stderr_reader = BufReader::new(stderr);
|
||||
|
||||
let (stdout_sender, stdout_receiver) = std::sync::mpsc::channel();
|
||||
let (stderr_sender, stderr_receiver) = std::sync::mpsc::channel();
|
||||
|
||||
// Spawn thread to handle stdout
|
||||
let stdout_handle = thread::spawn(move || {
|
||||
let mut output = String::new();
|
||||
for line in stdout_reader.lines() {
|
||||
match line {
|
||||
Ok(line_content) => {
|
||||
if options.print_stdout {
|
||||
println!("{}", line_content);
|
||||
}
|
||||
(options.stdout_callback)(&line_content);
|
||||
output.push_str(&line_content);
|
||||
output.push('\n');
|
||||
}
|
||||
Err(e) => {
|
||||
// Silently handle read errors - corrupted data at end is common
|
||||
log::trace!("Error reading stdout line: {}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
let _ = stdout_sender.send(output);
|
||||
});
|
||||
|
||||
// Spawn thread to handle stderr
|
||||
let stderr_handle = thread::spawn(move || {
|
||||
let mut output = String::new();
|
||||
for line in stderr_reader.lines() {
|
||||
match line {
|
||||
Ok(line_content) => {
|
||||
if options.print_stderr {
|
||||
eprintln!("{}", line_content);
|
||||
}
|
||||
(options.stderr_callback)(&line_content);
|
||||
output.push_str(&line_content);
|
||||
output.push('\n');
|
||||
}
|
||||
Err(e) => {
|
||||
log::trace!("Error reading stderr line: {}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
let _ = stderr_sender.send(output);
|
||||
});
|
||||
|
||||
let status = child.wait().map_err(|e| CommandError {
|
||||
message: format!("Failed to wait for command process: {}", e),
|
||||
output: None,
|
||||
})?;
|
||||
|
||||
let stdout_lines = stdout_handle
|
||||
.join()
|
||||
.map_err(|e| CommandError {
|
||||
message: format!("Stdout thread panicked: {:?}", e),
|
||||
output: None,
|
||||
})
|
||||
.and_then(|_| {
|
||||
stdout_receiver.recv().map_err(|e| CommandError {
|
||||
message: format!("Failed to receive stdout: {}", e),
|
||||
output: None,
|
||||
})
|
||||
})?;
|
||||
|
||||
let stderr_lines = stderr_handle
|
||||
.join()
|
||||
.map_err(|e| CommandError {
|
||||
message: format!("Stderr thread panicked: {:?}", e),
|
||||
output: None,
|
||||
})
|
||||
.and_then(|_| {
|
||||
stderr_receiver.recv().map_err(|e| CommandError {
|
||||
message: format!("Failed to receive stderr: {}", e),
|
||||
output: None,
|
||||
})
|
||||
})?;
|
||||
|
||||
Ok(CommandOutput {
|
||||
stdout: stdout_lines,
|
||||
stderr: stderr_lines,
|
||||
status: status.into(),
|
||||
})
|
||||
}
|
||||
|
||||
/// Convenience function to run a command with default options (print to console)
|
||||
pub fn run(command: &mut Command) -> Result<CommandOutput, CommandError> {
|
||||
run_command(command, RunnerOptions::print_to_console())
|
||||
}
|
||||
|
||||
/// Convenience function to run a command silently (capture output only)
|
||||
pub fn run_silent(command: &mut Command) -> Result<CommandOutput, CommandError> {
|
||||
run_command(command, RunnerOptions::silent())
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    //! Integration-style tests: these spawn real processes (`echo`, `sh`,
    //! `printf`, `true`, `false`), so they assume a Unix-like environment —
    //! TODO confirm they are gated/skipped on Windows CI.

    use super::*;
    use std::process::Command;

    #[test]
    fn test_simple_echo_command() {
        let output = run_silent(Command::new("echo").arg("hello world")).unwrap();
        assert!(output.is_success());
        // `echo` appends a trailing newline, hence the trim.
        assert_eq!(output.stdout.trim(), "hello world");
        assert!(output.stderr.is_empty());
    }

    #[test]
    fn test_command_failure() {
        let output = run_silent(Command::new("sh").args(["-c", "exit 42"])).unwrap();
        assert!(!output.is_success());
        assert_eq!(output.status, CommandStatus::Failed(42));
    }

    #[test]
    fn test_command_output_format() {
        let output = run_silent(Command::new("echo").arg("test")).unwrap();
        let formatted = output.format_output();
        assert!(formatted.contains("Stdout:"));
        assert!(formatted.contains("test"));
    }

    #[test]
    fn test_runner_options() {
        let opts = RunnerOptions::print_to_console();
        assert!(opts.print_stdout);
        assert!(opts.print_stderr);

        let opts = RunnerOptions::silent();
        assert!(!opts.print_stdout);
        assert!(!opts.print_stderr);
    }

    #[test]
    fn test_command_status_from_exit_status() {
        let output = run_silent(&mut Command::new("true")).unwrap();
        assert_eq!(output.status, CommandStatus::Success);

        let output = run_silent(&mut Command::new("false")).unwrap();
        assert_eq!(output.status, CommandStatus::Failed(1));
    }

    #[test]
    fn test_stdout_callback_receives_lines() {
        use std::sync::{Arc, Mutex};

        // Collect every callback invocation so we can assert on it after the
        // reader thread has been joined by run_command.
        let captured = Arc::new(Mutex::new(Vec::new()));
        let captured_clone = Arc::clone(&captured);

        let opts = RunnerOptions::silent().with_callbacks(
            move |line| captured_clone.lock().unwrap().push(line.to_string()),
            |_| {},
        );

        run_command(Command::new("echo").arg("hello world"), opts).unwrap();

        let lines = captured.lock().unwrap();
        assert_eq!(lines.len(), 1);
        assert_eq!(lines[0], "hello world");
    }

    #[test]
    fn test_stderr_callback_receives_lines() {
        use std::sync::{Arc, Mutex};

        let captured = Arc::new(Mutex::new(Vec::new()));
        let captured_clone = Arc::clone(&captured);

        let opts = RunnerOptions::silent().with_callbacks(
            |_| {},
            move |line| captured_clone.lock().unwrap().push(line.to_string()),
        );

        run_command(Command::new("sh").args(["-c", "echo error >&2"]), opts).unwrap();

        let lines = captured.lock().unwrap();
        assert_eq!(lines.len(), 1);
        assert_eq!(lines[0], "error");
    }

    #[test]
    fn test_callback_and_capture_both_work() {
        use std::sync::{Arc, Mutex};

        let callback_lines = Arc::new(Mutex::new(Vec::new()));
        let callback_clone = Arc::clone(&callback_lines);

        let opts = RunnerOptions::silent().with_callbacks(
            move |line| callback_clone.lock().unwrap().push(line.to_string()),
            |_| {},
        );

        let output =
            run_command(Command::new("printf").args(["line1\nline2\nline3\n"]), opts).unwrap();

        // Verify captured output
        assert_eq!(output.stdout, "line1\nline2\nline3\n");

        // Verify callback received all lines
        let lines = callback_lines.lock().unwrap();
        assert_eq!(lines.len(), 3);
        assert_eq!(lines[0], "line1");
        assert_eq!(lines[1], "line2");
        assert_eq!(lines[2], "line3");
    }

    #[test]
    fn test_multiline_output_capture() {
        let output = run_silent(Command::new("printf").args(["line1\nline2\nline3\n"])).unwrap();

        assert_eq!(output.stdout, "line1\nline2\nline3\n");
        assert!(output.stderr.trim().is_empty());
    }

    #[test]
    fn test_mixed_stdout_stderr_capture() {
        // Uses contains() rather than exact equality: the relative ordering
        // between the two pipes is not guaranteed.
        let output = run_silent(Command::new("sh").args([
            "-c",
            "echo stdout1 && echo stderr1 >&2 && echo stdout2 && echo stderr2 >&2",
        ]))
        .unwrap();

        assert!(output.stdout.contains("stdout1"));
        assert!(output.stdout.contains("stdout2"));
        assert!(output.stderr.contains("stderr1"));
        assert!(output.stderr.contains("stderr2"));
    }

    #[test]
    fn test_empty_output_command() {
        let output = run_silent(&mut Command::new("true")).unwrap();

        assert!(output.stdout.is_empty());
        assert!(output.stderr.is_empty());
        assert!(output.is_success());
    }

    #[test]
    fn test_command_output_format_with_empty_streams() {
        let output = run_silent(&mut Command::new("true")).unwrap();
        let formatted = output.format_output();

        assert!(formatted.contains("Stdout:"));
        assert!(formatted.contains("<empty>"));
        assert!(formatted.contains("Stderr:"));
    }

    #[test]
    fn test_error_contains_message_and_output() {
        let error = CommandError {
            message: "Test error".to_string(),
            output: Some(CommandOutput {
                stdout: "captured stdout".to_string(),
                stderr: "captured stderr".to_string(),
                status: CommandStatus::Success,
            }),
        };

        let display = format!("{}", error);
        assert!(display.contains("Test error"));
        assert!(display.contains("captured stdout"));
        assert!(display.contains("captured stderr"));
    }

    #[test]
    fn test_error_without_output() {
        let error = CommandError {
            message: "Spawn failed".to_string(),
            output: None,
        };

        let display = format!("{}", error);
        assert!(display.contains("Spawn failed"));
        assert!(!display.contains("Stdout:"));
        assert!(!display.contains("Stderr:"));
    }
}
|
||||
5
harmony_execution/src/lib.rs
Normal file
5
harmony_execution/src/lib.rs
Normal file
@@ -0,0 +1,5 @@
|
||||
pub mod command;
|
||||
|
||||
pub use command::{
|
||||
CommandError, CommandOutput, CommandStatus, RunnerOptions, run, run_command, run_silent,
|
||||
};
|
||||
@@ -32,6 +32,14 @@ impl Id {
|
||||
}
|
||||
}
|
||||
|
||||
impl Into<Id> for &str {
|
||||
fn into(self) -> Id {
|
||||
Id {
|
||||
value: self.to_string(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl FromStr for Id {
|
||||
type Err = ();
|
||||
|
||||
|
||||
Reference in New Issue
Block a user