feat/harmony_agent #220

Merged
wjro merged 21 commits from feat/harmony_agent into master 2026-02-04 21:05:35 +00:00
40 changed files with 4744 additions and 125 deletions

View File

@@ -1,2 +1,6 @@
target/ target/
Dockerfile Dockerfile
.git
data
target
demos

2
.gitignore vendored
View File

@@ -24,3 +24,5 @@ Cargo.lock
# MSVC Windows builds of rustc generate these, which store debugging information # MSVC Windows builds of rustc generate these, which store debugging information
*.pdb *.pdb
.harmony_generated

218
Cargo.lock generated
View File

@@ -243,7 +243,7 @@ checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75"
dependencies = [ dependencies = [
"cfg-if", "cfg-if",
"const-random", "const-random",
"getrandom 0.3.3", "getrandom 0.3.4",
"once_cell", "once_cell",
"version_check", "version_check",
"zerocopy", "zerocopy",
@@ -450,6 +450,43 @@ dependencies = [
"pin-project-lite", "pin-project-lite",
] ]
[[package]]
name = "async-nats"
version = "0.45.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "86dde77d8a733a9dbaf865a9eb65c72e09c88f3d14d3dd0d2aecf511920ee4fe"
dependencies = [
"base64 0.22.1",
"bytes",
"futures-util",
"memchr",
"nkeys",
"nuid",
"once_cell",
"pin-project",
"portable-atomic",
"rand 0.8.5",
"regex",
"ring",
"rustls-native-certs 0.7.3",
"rustls-pemfile 2.2.0",
"rustls-webpki 0.102.8",
"serde",
"serde_json",
"serde_nanos",
"serde_repr",
"thiserror 1.0.69",
"time",
"tokio",
"tokio-rustls 0.26.2",
"tokio-stream",
"tokio-util",
"tokio-websockets",
"tracing",
"tryhard",
"url",
]
[[package]] [[package]]
name = "async-stream" name = "async-stream"
version = "0.3.6" version = "0.3.6"
@@ -775,6 +812,9 @@ name = "bytes"
version = "1.10.1" version = "1.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a" checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a"
dependencies = [
"serde",
]
[[package]] [[package]]
name = "bytestring" name = "bytestring"
@@ -1583,6 +1623,7 @@ dependencies = [
"rand_core 0.6.4", "rand_core 0.6.4",
"serde", "serde",
"sha2", "sha2",
"signature",
"subtle", "subtle",
"zeroize", "zeroize",
] ]
@@ -2456,21 +2497,21 @@ dependencies = [
"cfg-if", "cfg-if",
"js-sys", "js-sys",
"libc", "libc",
"wasi 0.11.1+wasi-snapshot-preview1", "wasi",
"wasm-bindgen", "wasm-bindgen",
] ]
[[package]] [[package]]
name = "getrandom" name = "getrandom"
version = "0.3.3" version = "0.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4" checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd"
dependencies = [ dependencies = [
"cfg-if", "cfg-if",
"js-sys", "js-sys",
"libc", "libc",
"r-efi", "r-efi",
"wasi 0.14.3+wasi-0.2.4", "wasip2",
"wasm-bindgen", "wasm-bindgen",
] ]
@@ -2572,6 +2613,7 @@ dependencies = [
"env_logger", "env_logger",
"fqdn", "fqdn",
"futures-util", "futures-util",
"harmony_execution",
"harmony_inventory_agent", "harmony_inventory_agent",
"harmony_macros", "harmony_macros",
"harmony_secret", "harmony_secret",
@@ -2619,6 +2661,43 @@ dependencies = [
"walkdir", "walkdir",
] ]
[[package]]
name = "harmony_agent"
version = "0.1.0"
dependencies = [
"async-nats",
"async-trait",
"cidr",
"env_logger",
"getrandom 0.3.4",
"harmony",
"harmony_macros",
"harmony_types",
"log",
"pretty_assertions",
"serde",
"serde_json",
"thiserror 2.0.16",
"tokio",
]
[[package]]
name = "harmony_agent_deploy"
version = "0.1.0"
dependencies = [
"cidr",
"env_logger",
"harmony",
"harmony_cli",
"harmony_macros",
"harmony_types",
"log",
"serde",
"serde_json",
"tokio",
"url",
]
[[package]] [[package]]
name = "harmony_cli" name = "harmony_cli"
version = "0.1.0" version = "0.1.0"
@@ -2659,6 +2738,16 @@ dependencies = [
"tokio", "tokio",
] ]
[[package]]
name = "harmony_execution"
version = "0.1.0"
dependencies = [
"directories",
"lazy_static",
"log",
"thiserror 2.0.16",
]
[[package]] [[package]]
name = "harmony_inventory_agent" name = "harmony_inventory_agent"
version = "0.1.0" version = "0.1.0"
@@ -3523,7 +3612,7 @@ version = "0.1.34"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33" checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33"
dependencies = [ dependencies = [
"getrandom 0.3.3", "getrandom 0.3.4",
"libc", "libc",
] ]
@@ -3963,7 +4052,7 @@ checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c"
dependencies = [ dependencies = [
"libc", "libc",
"log", "log",
"wasi 0.11.1+wasi-snapshot-preview1", "wasi",
"windows-sys 0.48.0", "windows-sys 0.48.0",
] ]
@@ -3975,7 +4064,7 @@ checksum = "78bed444cc8a2160f01cbcf811ef18cac863ad68ae8ca62092e8db51d51c761c"
dependencies = [ dependencies = [
"libc", "libc",
"log", "log",
"wasi 0.11.1+wasi-snapshot-preview1", "wasi",
"windows-sys 0.59.0", "windows-sys 0.59.0",
] ]
@@ -4022,6 +4111,21 @@ dependencies = [
"unicode-segmentation", "unicode-segmentation",
] ]
[[package]]
name = "nkeys"
version = "0.4.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "879011babc47a1c7fdf5a935ae3cfe94f34645ca0cac1c7f6424b36fc743d1bf"
dependencies = [
"data-encoding",
"ed25519",
"ed25519-dalek",
"getrandom 0.2.16",
"log",
"rand 0.8.5",
"signatory",
]
[[package]] [[package]]
name = "non-blank-string-rs" name = "non-blank-string-rs"
version = "1.0.4" version = "1.0.4"
@@ -4040,6 +4144,15 @@ dependencies = [
"winapi 0.3.9", "winapi 0.3.9",
] ]
[[package]]
name = "nuid"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fc895af95856f929163a0aa20c26a78d26bfdc839f51b9d5aa7a5b79e52b7e83"
dependencies = [
"rand 0.8.5",
]
[[package]] [[package]]
name = "num-bigint" name = "num-bigint"
version = "0.4.6" version = "0.4.6"
@@ -4660,7 +4773,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f1906b49b0c3bc04b5fe5d86a77925ae6524a19b816ae38ce1e426255f1d8a31" checksum = "f1906b49b0c3bc04b5fe5d86a77925ae6524a19b816ae38ce1e426255f1d8a31"
dependencies = [ dependencies = [
"bytes", "bytes",
"getrandom 0.3.3", "getrandom 0.3.4",
"lru-slab", "lru-slab",
"rand 0.9.2", "rand 0.9.2",
"ring", "ring",
@@ -4765,7 +4878,7 @@ version = "0.9.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38" checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38"
dependencies = [ dependencies = [
"getrandom 0.3.3", "getrandom 0.3.4",
] ]
[[package]] [[package]]
@@ -5301,6 +5414,16 @@ dependencies = [
"untrusted", "untrusted",
] ]
[[package]]
name = "rustls-webpki"
version = "0.102.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "64ca1bc8749bd4cf37b5ce386cc146580777b4e8572c7b97baf22c83f444bee9"
dependencies = [
"rustls-pki-types",
"untrusted",
]
[[package]] [[package]]
name = "rustls-webpki" name = "rustls-webpki"
version = "0.103.4" version = "0.103.4"
@@ -5564,6 +5687,15 @@ dependencies = [
"serde", "serde",
] ]
[[package]]
name = "serde_nanos"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a93142f0367a4cc53ae0fead1bcda39e85beccfad3dcd717656cacab94b12985"
dependencies = [
"serde",
]
[[package]] [[package]]
name = "serde_path_to_error" name = "serde_path_to_error"
version = "0.1.17" version = "0.1.17"
@@ -5731,6 +5863,18 @@ dependencies = [
"libc", "libc",
] ]
[[package]]
name = "signatory"
version = "0.27.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c1e303f8205714074f6068773f0e29527e0453937fe837c9717d066635b65f31"
dependencies = [
"pkcs8",
"rand_core 0.6.4",
"signature",
"zeroize",
]
[[package]] [[package]]
name = "signature" name = "signature"
version = "2.2.0" version = "2.2.0"
@@ -6314,7 +6458,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "15b61f8f20e3a6f7e0649d825294eaf317edce30f82cf6026e7e4cb9222a7d1e" checksum = "15b61f8f20e3a6f7e0649d825294eaf317edce30f82cf6026e7e4cb9222a7d1e"
dependencies = [ dependencies = [
"fastrand", "fastrand",
"getrandom 0.3.3", "getrandom 0.3.4",
"once_cell", "once_cell",
"rustix 1.0.8", "rustix 1.0.8",
"windows-sys 0.60.2", "windows-sys 0.60.2",
@@ -6538,6 +6682,27 @@ dependencies = [
"tokio", "tokio",
] ]
[[package]]
name = "tokio-websockets"
version = "0.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f591660438b3038dd04d16c938271c79e7e06260ad2ea2885a4861bfb238605d"
dependencies = [
"base64 0.22.1",
"bytes",
"futures-core",
"futures-sink",
"http 1.3.1",
"httparse",
"rand 0.8.5",
"ring",
"rustls-pki-types",
"tokio",
"tokio-rustls 0.26.2",
"tokio-util",
"webpki-roots 0.26.11",
]
[[package]] [[package]]
name = "toml" name = "toml"
version = "0.8.23" version = "0.8.23"
@@ -6689,6 +6854,16 @@ version = "0.2.5"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b"
[[package]]
name = "tryhard"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9fe58ebd5edd976e0fe0f8a14d2a04b7c81ef153ea9a54eebc42e67c2c23b4e5"
dependencies = [
"pin-project-lite",
"tokio",
]
[[package]] [[package]]
name = "tui-logger" name = "tui-logger"
version = "0.14.5" version = "0.14.5"
@@ -6865,7 +7040,7 @@ version = "1.18.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2f87b8aa10b915a06587d0dec516c282ff295b475d94abf425d62b57710070a2" checksum = "2f87b8aa10b915a06587d0dec516c282ff295b475d94abf425d62b57710070a2"
dependencies = [ dependencies = [
"getrandom 0.3.3", "getrandom 0.3.4",
"js-sys", "js-sys",
"rand 0.9.2", "rand 0.9.2",
"uuid-macro-internal", "uuid-macro-internal",
@@ -6936,10 +7111,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b"
[[package]] [[package]]
name = "wasi" name = "wasip2"
version = "0.14.3+wasi-0.2.4" version = "1.0.2+wasi-0.2.9"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6a51ae83037bdd272a9e28ce236db8c07016dd0d50c27038b3f407533c030c95" checksum = "9517f9239f02c069db75e65f174b3da828fe5f5b945c4dd26bd25d89c03ebcf5"
dependencies = [ dependencies = [
"wit-bindgen", "wit-bindgen",
] ]
@@ -7061,6 +7236,15 @@ version = "0.25.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5f20c57d8d7db6d3b86154206ae5d8fba62dd39573114de97c2cb0578251f8e1" checksum = "5f20c57d8d7db6d3b86154206ae5d8fba62dd39573114de97c2cb0578251f8e1"
[[package]]
name = "webpki-roots"
version = "0.26.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "521bc38abb08001b01866da9f51eb7c5d647a19260e00054a8c7fd5f9e57f7a9"
dependencies = [
"webpki-roots 1.0.2",
]
[[package]] [[package]]
name = "webpki-roots" name = "webpki-roots"
version = "1.0.2" version = "1.0.2"
@@ -7438,9 +7622,9 @@ dependencies = [
[[package]] [[package]]
name = "wit-bindgen" name = "wit-bindgen"
version = "0.45.0" version = "0.51.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "052283831dbae3d879dc7f51f3d92703a316ca49f91540417d38591826127814" checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5"
[[package]] [[package]]
name = "writeable" name = "writeable"

View File

@@ -7,6 +7,7 @@ members = [
"harmony_types", "harmony_types",
"harmony_macros", "harmony_macros",
"harmony_tui", "harmony_tui",
"harmony_execution",
"opnsense-config", "opnsense-config",
"opnsense-config-xml", "opnsense-config-xml",
"harmony_cli", "harmony_cli",
@@ -17,6 +18,8 @@ members = [
"harmony_secret", "harmony_secret",
"adr/agent_discovery/mdns", "adr/agent_discovery/mdns",
"brocade", "brocade",
"harmony_agent",
"harmony_agent/deploy",
] ]
[workspace.package] [workspace.package]

View File

@@ -1,5 +1,7 @@
# Harmony : Open-source infrastructure orchestration that treats your platform like first-class code # Harmony : Open-source infrastructure orchestration that treats your platform like first-class code
In other words, Harmony is a **next-generation platform engineering framework**.
_By [NationTech](https://nationtech.io)_ _By [NationTech](https://nationtech.io)_
[![Build](https://git.nationtech.io/NationTech/harmony/actions/workflows/check.yml/badge.svg)](https://git.nationtech.io/nationtech/harmony) [![Build](https://git.nationtech.io/NationTech/harmony/actions/workflows/check.yml/badge.svg)](https://git.nationtech.io/nationtech/harmony)

View File

@@ -0,0 +1,141 @@
# Architecture Decision Record: Template Hydration for Kubernetes Manifest Generation
Initial Author: Jean-Gabriel Gill-Couture & Sylvain Tremblay
Initial Date: 2025-01-23
Last Updated Date: 2025-01-23
## Status
Implemented
## Context
Harmony's philosophy is built on three guiding principles: Infrastructure as Resilient Code, Prove It Works — Before You Deploy, and One Unified Model. Our goal is to shift validation and verification as left as possible—ideally to compile time—rather than discovering errors at deploy time.
After investigating a few approaches such as compile-checked Askama templates to generate Kubernetes manifests for Helm charts, we found again that this approach suffered from several fundamental limitations:
* **Late Validation:** Typos in template syntax or field names are only discovered at deployment time, not during compilation. A mistyped `metadata.name` won't surface until Helm attempts to render the template.
* **Brittle Maintenance:** Templates are string-based with limited IDE support. Refactoring requires grep-and-replace across YAML-like template files, risking subtle breakage.
* **Hard-to-Test Logic:** Testing template output requires mocking the template engine and comparing serialized strings rather than asserting against typed data structures.
* **No Type Safety:** There is no guarantee that the generated YAML will be valid Kubernetes resources without runtime validation.
We also faced a strategic choice around Helm: use it as both *templating engine* and *packaging mechanism*, or decouple these concerns. While Helm's ecosystem integration (Harbor, ArgoCD, OCI registry support) is valuable, the Jinja-like templating is at odds with Harmony's "code-first" ethos.
## Decision
We will adopt the **Template Hydration Pattern**—constructing Kubernetes manifests programmatically using strongly-typed `kube-rs` objects, then serializing them to YAML files for packaging into Helm charts.
Specifically:
* **Write strongly typed `k8s_openapi` Structs:** All Kubernetes resources (Deployment, Service, ConfigMap, etc.) will be constructed using the typed structs generated by `k8s_openapi`.
* **Direct Serialization to YAML:** Rather than rendering templates, we use `serde_yaml::to_string()` to serialize typed objects directly into YAML manifests. This way, YAML is only used as a data-transfer format and not a templating/programming language - which it is not.
* **Helm as Packaging-Only:** Helm's role is reduced to packaging pre-rendered templates into a tarball and pushing to OCI registries. No template rendering logic resides within Helm.
* **Ecosystem Preservation:** The generated Helm charts remain fully compatible with Harbor, ArgoCD, and any Helm-compatible tool—the only difference is that the `templates/` directory contains static YAML files.
The implementation in `backend_app.rs` demonstrates this pattern:
```rust
let deployment = Deployment {
metadata: ObjectMeta {
name: Some(self.name.clone()),
labels: Some([("app.kubernetes.io/name".to_string(), self.name.clone())].into()),
..Default::default()
},
spec: Some(DeploymentSpec { /* ... */ }),
..Default::default()
};
let deployment_yaml = serde_yaml::to_string(&deployment)?;
fs::write(templates_dir.join("deployment.yaml"), deployment_yaml)?;
```
## Rationale
**Aligns with "Infrastructure as Resilient Code"**
Harmony's first principle states that infrastructure should be treated like application code. By expressing Kubernetes manifests as Rust structs, we gain:
* **Refactorability:** Rename a label and the compiler catches all usages.
* **IDE Support:** Autocomplete for all Kubernetes API fields; documentation inline.
* **Code Navigation:** Jump to definition shows exactly where a value comes from.
**Achieves "Prove It Works — Before You Deploy"**
The compiler now validates that:
* All required fields are populated (Rust's `Option` type prevents missing fields).
* Field types match expectations (ports are integers, not strings).
* Enums contain valid values (e.g., `ServiceType::ClusterIP`).
This moves what was runtime validation into compile-time checks, fulfilling the "shift left" promise.
**Enables True Unit Testing**
Developers can now write unit tests that assert directly against typed objects:
```rust
let deployment = create_deployment(&app);
assert_eq!(deployment.spec.unwrap().replicas.unwrap(), 3);
assert_eq!(deployment.metadata.name.unwrap(), "my-app");
```
No string parsing, no YAML serialization, no fragile assertions against rendered output.
**Preserves Ecosystem Benefits**
By generating standard Helm chart structures, Harmony retains compatibility with:
* **OCI Registries (Harbor, GHCR):** `helm push` works exactly as before.
* **ArgoCD:** Syncs and manages releases using the generated charts.
* **Existing Workflows:** Teams already consuming Helm charts see no change.
The Helm tarball becomes a "dumb pipe" for transport, which is arguably its ideal role.
## Consequences
### Positive
* **Compile-Time Safety:** A broad class of errors (typos, missing fields, type mismatches) is now caught at build time.
* **Better Developer Experience:** IDE autocomplete, inline documentation, and refactor support significantly reduce the learning curve for Kubernetes manifests.
* **Testability:** Unit tests can validate manifest structure without integration or runtime checks.
* **Auditability:** The source-of-truth for manifests is now pure Rust—easier to review in pull requests than template logic scattered across files.
* **Future-Extensibility:** CustomResources (CRDs) can be supported via `kopium`-generated Rust types, maintaining the same strong typing.
### Negative
* **API Schema Drift:** Kubernetes API changes require regenerating `k8s_openapi` types and updating code. A change in a struct field will cause the build to fail—intentionally, but still requiring the pipeline to be updated.
* **Verbosity:** Typed construction is more verbose than the equivalent template. Builder patterns or helper functions will be needed to keep code readable.
* **Learning Curve:** Contributors must understand both the Kubernetes resource spec *and* the Rust type system, rather than just YAML.
* **Debugging Shift:** When debugging generated YAML, you now trace through Rust code rather than template files—more precise but different mental model.
## Alternatives Considered
### 1. Enhance Askama with Compile-Time Validation
*Pros:* Stay within familiar templating paradigm; minimal code changes.
*Cons:* Rust's type system cannot fully express Kubernetes schema validation without significant macro boilerplate. Errors would still surface at template evaluation time, not compilation.
### 2. Use Helm SDK Programmatically (Go)
*Pros:* Direct access to Helm's template engine; no YAML serialization step.
*Cons:* Would introduce a second language (Go) into a Rust codebase, increasing cognitive load and compilation complexity. No improvement in compile-time safety.
### 3. Raw YAML String Templating (Manual)
*Pros:* Maximum control; no external dependencies.
*Cons:* Even more error-prone than Askama; no structure validation; string concatenation errors abound.
### 4. Use Kustomize for All Manifests
*Pros:* Declarative overlays; standard tool.
*Cons:* Kustomize is itself a layer over YAML templates with its own DSL. It does not provide compile-time type safety and would require externalizing manifest management outside Harmony's codebase.
__Note that this template hydration architecture still allows to override templates with tools like kustomize when required__
## Additional Notes
**Scalability to Future Topologies**
The Template Hydration pattern enables future Harmony architectures to generate manifests dynamically based on topology context. For example, a `CostTopology` might adjust resource requests based on cluster pricing, manipulating the typed `Deployment::spec` directly before serialization.
**Implementation Status**
As of this writing, the pattern is implemented for `BackendApp` deployments (`backend_app.rs`). The next phase is to extend this pattern across all application modules (`webapp.rs`, etc.) and to standardize on this approach for any new implementations.

View File

@@ -1,7 +1,7 @@
use std::net::{IpAddr, Ipv4Addr}; use std::net::{IpAddr, Ipv4Addr};
use brocade::{BrocadeOptions, ssh}; use brocade::{BrocadeOptions, ssh};
use harmony_secret::{Secret, SecretManager}; use harmony_secret::Secret;
use harmony_types::switch::PortLocation; use harmony_types::switch::PortLocation;
use schemars::JsonSchema; use schemars::JsonSchema;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};

View File

@@ -56,6 +56,8 @@ async fn main() {
)), )),
}; };
// TODO exec pod commands to initialize secret store if not already done
harmony_cli::run( harmony_cli::run(
Inventory::autoload(), Inventory::autoload(),
K8sAnywhereTopology::from_env(), K8sAnywhereTopology::from_env(),

View File

@@ -30,6 +30,7 @@ opnsense-config = { path = "../opnsense-config" }
opnsense-config-xml = { path = "../opnsense-config-xml" } opnsense-config-xml = { path = "../opnsense-config-xml" }
harmony_macros = { path = "../harmony_macros" } harmony_macros = { path = "../harmony_macros" }
harmony_types = { path = "../harmony_types" } harmony_types = { path = "../harmony_types" }
harmony_execution = { path = "../harmony_execution" }
uuid.workspace = true uuid.workspace = true
url.workspace = true url.workspace = true
kube = { workspace = true, features = ["derive"] } kube = { workspace = true, features = ["derive"] }

View File

@@ -0,0 +1,801 @@
use async_trait::async_trait;
use log::{debug, info, trace};
use serde::Serialize;
use std::path::PathBuf;
use crate::{
config::{REGISTRY_PROJECT, REGISTRY_URL},
modules::application::{
Application, HelmPackage, OCICompliant,
config::ApplicationNetworkPort,
helm::{self, DeploymentBuilder, HelmChart, HelmResourceKind},
},
};
use harmony_execution::{RunnerOptions, run_command};
#[derive(Debug, Clone, Serialize)]
pub struct BuildCommand {
pub program: String,
pub args: Vec<String>,
}
impl BuildCommand {
pub fn new(program: impl Into<String>, args: Vec<impl Into<String>>) -> Self {
Self {
program: program.into(),
args: args.into_iter().map(|s| s.into()).collect(),
}
}
pub fn to_std_command(&self) -> std::process::Command {
let mut cmd = std::process::Command::new(&self.program);
cmd.args(&self.args);
cmd
}
}
#[derive(Debug, Clone, Serialize)]
pub struct BackendApp {
pub name: String,
pub project_root: std::path::PathBuf,
pub network_ports: Vec<ApplicationNetworkPort>,
pub env_vars: Vec<(String, String)>,
pub build_cmd: BuildCommand,
pub dockerfile: Option<PathBuf>,
}
impl BackendApp {
fn get_dockerfile(&self) -> Result<PathBuf, String> {
debug!(
"Looking for dockerfile, currently set to {:?}",
self.dockerfile
);
if let Some(dockerfile) = &self.dockerfile {
return match dockerfile.exists() {
true => {
info!(
"Found dockerfile as intended at {}",
dockerfile.to_string_lossy()
);
Ok(dockerfile.clone())
}
false => Err(format!(
"Dockerfile explicitely set to {dockerfile} does not exist",
dockerfile = dockerfile.to_string_lossy()
)),
};
}
let existing_dockerfile = self.project_root.join("Dockerfile");
debug!("project_root = {:?}", self.project_root);
debug!("checking = {:?}", existing_dockerfile);
if existing_dockerfile.exists() {
debug!(
"Checking path {:#?} for existing Dockerfile",
self.project_root.clone()
);
return Ok(existing_dockerfile);
}
Err(format!(
"Could not find a dockerfile in {project_root} folder. Tried {existing_dockerfile}",
project_root = self.project_root.to_string_lossy(),
existing_dockerfile = existing_dockerfile.to_string_lossy(),
))
}
}
impl Application for BackendApp {
fn name(&self) -> String {
self.name.clone()
}
}
#[async_trait]
impl OCICompliant for BackendApp {
async fn build_push_oci_image(&self) -> Result<String, String> {
let dockerfile = self.get_dockerfile()?;
let image_tag = self.image_name();
// Run docker build command, streaming output to console and capturing it
let output = run_command(
std::process::Command::new("docker").args([
"build",
"-t",
&image_tag,
"-f",
&dockerfile.to_string_lossy(),
&self.project_root.to_string_lossy(),
]),
RunnerOptions::print_to_console(),
)
.map_err(|e| format!("Failed to spawn docker build process: {}", e))?;
if output.is_success() {
info!("Docker image build succeeded");
Ok(image_tag)
} else {
Err(format!(
"Docker image build FAILED:\n{}",
output.format_output()
))
}
}
fn local_image_name(&self) -> String {
self.name.clone()
}
fn image_name(&self) -> String {
format!(
"{}/{}/{}",
*REGISTRY_URL,
*REGISTRY_PROJECT,
&self.local_image_name()
)
}
}
#[async_trait]
impl HelmPackage for BackendApp {
fn project_root(&self) -> PathBuf {
self.project_root.clone()
}
fn chart_name(&self) -> String {
self.name.clone()
}
async fn build_push_helm_package(&self, image_url: &str) -> Result<String, String> {
let mut helm_chart = HelmChart::new(self.name.clone(), "1.0.0".to_string());
// Build the typed Deployment object using the builder with initial options
helm_chart.add_resource(HelmResourceKind::Deployment(
DeploymentBuilder::with_options(
&self.name,
image_url,
Some(self.network_ports.clone()),
Some(self.env_vars.clone()),
None,
)
.build(),
));
// Build the typed Service object using the helper function
if let Some(service) =
helm::create_service_from_ports(self.name.clone(), &self.network_ports)
{
helm_chart.add_resource(HelmResourceKind::Service(service));
}
// Write the Helm chart metadata to the project root
let chart_dir = helm_chart
.write_to(&self.project_root.join(".harmony_generated/helm/"))
.map_err(|e| format!("Failed to write Helm chart: {}", e))?;
info!("Helm chart for '{}' written to: {:?}", self.name, chart_dir);
Ok(chart_dir.to_string_lossy().to_string())
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::modules::application::config::ApplicationNetworkPort;
use crate::modules::application::config::NetworkProtocol;
use k8s_openapi::api::apps::v1::Deployment;
use k8s_openapi::api::core::v1::{Container, EnvVar, Service as K8sService, ServicePort};
use k8s_openapi::apimachinery::pkg::util::intstr::IntOrString;
use serde_yaml::from_str;
use std::fs;
use std::path::Path;
use tempfile::tempdir;
// Test Helpers
fn read_service_yaml(project_root: &Path, chart_name: &str) -> K8sService {
let path = project_root.join(format!(
".harmony_generated/helm/{chart_name}/templates/service.yaml"
));
let content = fs::read_to_string(&path)
.unwrap_or_else(|e| panic!("Failed to read service.yaml at {:?}: {}", path, e));
from_str(&content)
.unwrap_or_else(|e| panic!("Failed to parse service.yaml as K8s Service: {}", e))
}
fn read_deployment_yaml(project_root: &Path, chart_name: &str) -> Deployment {
let path = project_root.join(format!(
".harmony_generated/helm/{chart_name}/templates/deployment.yaml"
));
let content = fs::read_to_string(&path)
.unwrap_or_else(|e| panic!("Failed to read deployment.yaml at {:?}: {}", path, e));
from_str(&content)
.unwrap_or_else(|e| panic!("Failed to parse deployment.yaml as K8s Deployment: {}", e))
}
fn service_yaml_exists(project_root: &Path, chart_name: &str) -> bool {
let path = project_root.join(format!(
".harmony_generated/helm/{chart_name}/templates/service.yaml"
));
path.exists()
}
// Service Assertions
fn assert_service_metadata(service: &K8sService, expected_name: &str) {
assert_eq!(
service.metadata.name.as_deref(),
Some(expected_name),
"Service name should be '{expected_name}'"
);
}
fn assert_service_type(service: &K8sService, expected_type: &str) {
assert_eq!(
service.spec.as_ref().and_then(|s| s.type_.as_deref()),
Some(expected_type),
"Service type should be '{expected_type}'"
);
}
fn assert_service_port_count(service: &K8sService, expected_count: usize) {
let ports = service
.spec
.as_ref()
.and_then(|s| s.ports.as_ref())
.unwrap_or_else(|| panic!("Service should have ports"));
assert_eq!(
ports.len(),
expected_count,
"Service should have {expected_count} ports"
);
}
fn assert_service_port(
port: &ServicePort,
expected_name: &str,
expected_protocol: &str,
expected_number: i32,
) {
assert_eq!(
port.name.as_deref(),
Some(expected_name),
"Port name should be '{expected_name}'"
);
assert_eq!(
port.protocol.as_deref(),
Some(expected_protocol),
"Port '{expected_name}' protocol should be '{expected_protocol}'"
);
assert_eq!(
port.port, expected_number,
"Port '{expected_name}' number should be {expected_number}"
);
}
fn assert_target_port_matches_service_port(port: &ServicePort) {
match &port.target_port {
Some(IntOrString::Int(target)) => {
assert_eq!(
*target,
port.port,
"Target port should match service port for '{}'",
port.name.as_deref().unwrap_or("unknown")
);
}
_ => panic!(
"Target port should be Int for '{}'",
port.name.as_deref().unwrap_or("unknown")
),
}
}
// Deployment Assertions
fn assert_deployment_metadata(deployment: &Deployment, expected_name: &str) {
assert_eq!(
deployment.metadata.name.as_deref(),
Some(expected_name),
"Deployment name should be '{expected_name}'"
);
}
fn assert_deployment_replicas(deployment: &Deployment, expected_replicas: i32) {
let spec = deployment
.spec
.as_ref()
.unwrap_or_else(|| panic!("Deployment should have spec"));
assert_eq!(
spec.replicas,
Some(expected_replicas),
"Deployment should have {expected_replicas} replicas"
);
}
fn assert_selector_match_label(deployment: &Deployment, expected_label_value: &str) {
let spec = deployment
.spec
.as_ref()
.unwrap_or_else(|| panic!("Deployment should have spec"));
assert_eq!(
spec.selector
.match_labels
.as_ref()
.and_then(|m| m.get("app.kubernetes.io/name")),
Some(&expected_label_value.to_string()),
"Selector should match app name '{expected_label_value}'"
);
}
fn assert_pod_labels(deployment: &Deployment, expected_name: &str) {
let spec = deployment
.spec
.as_ref()
.unwrap_or_else(|| panic!("Deployment should have spec"));
let metadata = spec
.template
.metadata
.as_ref()
.unwrap_or_else(|| panic!("Pod template should have metadata"));
let labels = metadata
.labels
.as_ref()
.unwrap_or_else(|| panic!("Pod should have labels"));
assert_eq!(
labels.get("app.kubernetes.io/name"),
Some(&expected_name.to_string()),
"Pod label app.kubernetes.io/name should be '{expected_name}'"
);
assert_eq!(
labels.get("app.kubernetes.io/instance"),
Some(&expected_name.to_string()),
"Pod label app.kubernetes.io/instance should be '{expected_name}'"
);
}
// Container Assertions
fn assert_container_metadata(
container: &Container,
expected_name: &str,
expected_image: &str,
expected_pull_policy: &str,
) {
assert_eq!(
container.name, expected_name,
"Container name should be '{expected_name}'"
);
assert_eq!(
container.image.as_deref(),
Some(expected_image),
"Container image should be '{expected_image}'"
);
assert_eq!(
container.image_pull_policy.as_deref(),
Some(expected_pull_policy),
"Image pull policy should be '{expected_pull_policy}'"
);
}
fn assert_container_ports_count(container: &Container, expected_count: usize) {
let ports = container
.ports
.as_ref()
.unwrap_or_else(|| panic!("Container should have ports"));
assert_eq!(
ports.len(),
expected_count,
"Container should have {expected_count} ports"
);
}
fn assert_container_port(
port: &k8s_openapi::api::core::v1::ContainerPort,
expected_name: &str,
expected_protocol: &str,
expected_number: i32,
) {
assert_eq!(
port.name.as_deref(),
Some(expected_name),
"Container port name should be '{expected_name}'"
);
assert_eq!(
port.protocol.as_deref(),
Some(expected_protocol),
"Container port '{expected_name}' protocol should be '{expected_protocol}'"
);
assert_eq!(
port.container_port, expected_number,
"Container port '{expected_name}' number should be {expected_number}"
);
}
fn assert_container_env_vars_count(container: &Container, expected_count: usize) {
let env_vars = container
.env
.as_ref()
.unwrap_or_else(|| panic!("Container should have env vars"));
assert_eq!(
env_vars.len(),
expected_count,
"Container should have {expected_count} env vars"
);
}
fn assert_container_env_var(env_var: &EnvVar, expected_name: &str, expected_value: &str) {
assert_eq!(
env_var.name, expected_name,
"Env var name should be '{expected_name}'"
);
assert_eq!(
env_var.value.as_deref(),
Some(expected_value),
"Env var '{expected_name}' value should be '{expected_value}'"
);
}
fn get_container(deployment: &Deployment) -> Container {
let spec = deployment
.spec
.as_ref()
.unwrap_or_else(|| panic!("Deployment should have spec"));
let pod_spec = spec
.template
.spec
.as_ref()
.unwrap_or_else(|| panic!("Pod template should have spec"));
pod_spec
.containers
.first()
.unwrap_or_else(|| panic!("Should have exactly one container"))
.clone()
}
// Test Fixtures
fn standard_test_ports() -> Vec<ApplicationNetworkPort> {
vec![
ApplicationNetworkPort {
number: 8080,
protocol: NetworkProtocol::TCP,
name: "http".to_string(),
},
ApplicationNetworkPort {
number: 9000,
protocol: NetworkProtocol::TCP,
name: "metrics".to_string(),
},
ApplicationNetworkPort {
number: 50051,
protocol: NetworkProtocol::TCP,
name: "grpc".to_string(),
},
]
}
fn standard_test_env_vars() -> Vec<(String, String)> {
vec![
("ENV_VAR_1".to_string(), "value1".to_string()),
("ENV_VAR_2".to_string(), "value2".to_string()),
]
}
fn udp_test_ports() -> Vec<ApplicationNetworkPort> {
vec![
ApplicationNetworkPort {
number: 53,
protocol: NetworkProtocol::UDP,
name: "dns".to_string(),
},
ApplicationNetworkPort {
number: 8080,
protocol: NetworkProtocol::TCP,
name: "http".to_string(),
},
]
}
// Test Builder
struct BackendAppTestBuilder {
name: Option<String>,
network_ports: Option<Vec<ApplicationNetworkPort>>,
env_vars: Option<Vec<(String, String)>>,
}
impl BackendAppTestBuilder {
fn new() -> Self {
Self {
name: None,
network_ports: None,
env_vars: None,
}
}
fn with_name(mut self, name: impl Into<String>) -> Self {
self.name = Some(name.into());
self
}
fn with_standard_ports(mut self) -> Self {
self.network_ports = Some(standard_test_ports());
self
}
fn with_udp_ports(mut self) -> Self {
self.network_ports = Some(udp_test_ports());
self
}
fn with_standard_env_vars(mut self) -> Self {
self.env_vars = Some(standard_test_env_vars());
self
}
fn with_no_ports(mut self) -> Self {
self.network_ports = Some(vec![]);
self
}
fn build(self, project_root: PathBuf) -> BackendApp {
BackendApp {
name: self.name.unwrap_or_else(|| "test-app".to_string()),
project_root,
network_ports: self.network_ports.unwrap_or_default(),
env_vars: self.env_vars.unwrap_or_default(),
build_cmd: BuildCommand::new("cargo", vec!["build"]),
dockerfile: None,
}
}
}
impl Default for BackendAppTestBuilder {
fn default() -> Self {
Self::new()
}
}
// Helper function for test setup
async fn build_helm_chart_for_test(app: &BackendApp, image_url: &str) {
let result = app.build_push_helm_package(image_url).await;
assert!(
result.is_ok(),
"build_push_helm_package should succeed: {:?}",
result
);
}
// ===== SERVICE TESTS =====
#[tokio::test]
async fn service_is_created_with_application_name() {
let temp_dir = tempdir().expect("Failed to create temp directory");
let app = BackendAppTestBuilder::new()
.with_name("test-app")
.with_standard_ports()
.build(temp_dir.path().to_path_buf());
build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;
let service = read_service_yaml(&app.project_root, "test-app");
assert_service_metadata(&service, "test-app");
}
#[tokio::test]
async fn service_has_default_clusterip_type() {
let temp_dir = tempdir().expect("Failed to create temp directory");
let app = BackendAppTestBuilder::new()
.with_name("test-app")
.with_standard_ports()
.build(temp_dir.path().to_path_buf());
build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;
let service = read_service_yaml(&app.project_root, "test-app");
assert_service_type(&service, "ClusterIP");
}
#[tokio::test]
async fn service_exposes_all_network_ports() {
let temp_dir = tempdir().expect("Failed to create temp directory");
let app = BackendAppTestBuilder::new()
.with_name("test-app")
.with_standard_ports()
.build(temp_dir.path().to_path_buf());
build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;
let service = read_service_yaml(&app.project_root, "test-app");
assert_service_port_count(&service, 3);
let ports = service.spec.unwrap().ports.unwrap();
assert_service_port(&ports[0], "http", "TCP", 8080);
assert_service_port(&ports[1], "metrics", "TCP", 9000);
assert_service_port(&ports[2], "grpc", "TCP", 50051);
}
#[tokio::test]
async fn service_target_ports_match_service_ports() {
let temp_dir = tempdir().expect("Failed to create temp directory");
let app = BackendAppTestBuilder::new()
.with_name("test-app")
.with_standard_ports()
.build(temp_dir.path().to_path_buf());
build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;
let service = read_service_yaml(&app.project_root, "test-app");
let ports = service.spec.unwrap().ports.unwrap();
for port in &ports {
assert_target_port_matches_service_port(port);
}
}
#[tokio::test]
async fn service_not_created_when_application_has_no_ports() {
let temp_dir = tempdir().expect("Failed to create temp directory");
let app = BackendAppTestBuilder::new()
.with_name("test-app-no-ports")
.with_no_ports()
.build(temp_dir.path().to_path_buf());
build_helm_chart_for_test(&app, "registry.example.com/test/test-app-no-ports:1.0.0").await;
assert!(
!service_yaml_exists(&app.project_root, "test-app-no-ports"),
"service.yaml should not exist when there are no network ports"
);
}
#[tokio::test]
async fn service_respects_port_protocol_type() {
let temp_dir = tempdir().expect("Failed to create temp directory");
let app = BackendAppTestBuilder::new()
.with_name("udp-app")
.with_udp_ports()
.build(temp_dir.path().to_path_buf());
build_helm_chart_for_test(&app, "registry.example.com/test/udp-app:1.0.0").await;
let service = read_service_yaml(&app.project_root, "udp-app");
let ports = service.spec.unwrap().ports.unwrap();
assert_service_port(&ports[0], "dns", "UDP", 53);
assert_service_port(&ports[1], "http", "TCP", 8080);
}
// ===== DEPLOYMENT METADATA TESTS =====
#[tokio::test]
async fn deployment_has_application_name() {
let temp_dir = tempdir().expect("Failed to create temp directory");
let app = BackendAppTestBuilder::new()
.with_name("test-app")
.with_standard_ports()
.build(temp_dir.path().to_path_buf());
build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;
let deployment = read_deployment_yaml(&app.project_root, "test-app");
assert_deployment_metadata(&deployment, "test-app");
}
#[tokio::test]
async fn deployment_has_single_replica_by_default() {
let temp_dir = tempdir().expect("Failed to create temp directory");
let app = BackendAppTestBuilder::new()
.with_name("test-app")
.with_standard_ports()
.build(temp_dir.path().to_path_buf());
build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;
let deployment = read_deployment_yaml(&app.project_root, "test-app");
assert_deployment_replicas(&deployment, 1);
}
#[tokio::test]
async fn deployment_selector_matches_application_name() {
let temp_dir = tempdir().expect("Failed to create temp directory");
let app = BackendAppTestBuilder::new()
.with_name("test-app")
.with_standard_ports()
.build(temp_dir.path().to_path_buf());
build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;
let deployment = read_deployment_yaml(&app.project_root, "test-app");
assert_selector_match_label(&deployment, "test-app");
}
#[tokio::test]
async fn pod_has_standard_kubernetes_labels() {
let temp_dir = tempdir().expect("Failed to create temp directory");
let app = BackendAppTestBuilder::new()
.with_name("test-app")
.with_standard_ports()
.build(temp_dir.path().to_path_buf());
build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;
let deployment = read_deployment_yaml(&app.project_root, "test-app");
assert_pod_labels(&deployment, "test-app");
}
// ===== CONTAINER CONFIGURATION TESTS =====
#[tokio::test]
async fn container_has_correct_name_and_image() {
let temp_dir = tempdir().expect("Failed to create temp directory");
let app = BackendAppTestBuilder::new()
.with_name("test-app")
.with_standard_ports()
.build(temp_dir.path().to_path_buf());
let image_url = "registry.example.com/test/test-app:1.0.0";
build_helm_chart_for_test(&app, image_url).await;
let deployment = read_deployment_yaml(&app.project_root, "test-app");
let container = get_container(&deployment);
assert_container_metadata(&container, "test-app", image_url, "IfNotPresent");
}
#[tokio::test]
async fn container_exposes_all_application_ports() {
let temp_dir = tempdir().expect("Failed to create temp directory");
let app = BackendAppTestBuilder::new()
.with_name("test-app")
.with_standard_ports()
.build(temp_dir.path().to_path_buf());
build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;
let deployment = read_deployment_yaml(&app.project_root, "test-app");
let container = get_container(&deployment);
assert_container_ports_count(&container, 3);
let ports = container.ports.unwrap();
assert_container_port(&ports[0], "http", "TCP", 8080);
assert_container_port(&ports[1], "metrics", "TCP", 9000);
assert_container_port(&ports[2], "grpc", "TCP", 50051);
}
#[tokio::test]
async fn container_has_all_environment_variables() {
let temp_dir = tempdir().expect("Failed to create temp directory");
let app = BackendAppTestBuilder::new()
.with_name("test-app")
.with_standard_ports()
.with_standard_env_vars()
.build(temp_dir.path().to_path_buf());
build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;
let deployment = read_deployment_yaml(&app.project_root, "test-app");
let container = get_container(&deployment);
assert_container_env_vars_count(&container, 2);
let env_vars = container.env.unwrap();
assert_container_env_var(&env_vars[0], "ENV_VAR_1", "value1");
assert_container_env_var(&env_vars[1], "ENV_VAR_2", "value2");
}
// ===== BUILD COMMAND UNIT TESTS =====
#[test]
fn build_command_creation_sets_program_and_args() {
let cmd = BuildCommand::new("docker", vec!["build", "-t", "myimage"]);
assert_eq!(cmd.program, "docker");
assert_eq!(cmd.args, vec!["build", "-t", "myimage"]);
}
#[test]
fn build_command_clone_copies_all_fields() {
let cmd1 = BuildCommand::new("cargo", vec!["build", "--release"]);
let cmd2 = cmd1.clone();
assert_eq!(cmd1.program, cmd2.program);
assert_eq!(cmd1.args, cmd2.args);
}
}

View File

@@ -0,0 +1,29 @@
use serde::Serialize;
#[derive(Debug, Clone, Serialize)]
pub enum NetworkProtocol {
TCP,
UDP,
}
impl NetworkProtocol {
pub fn as_str(&self) -> &str {
match self {
NetworkProtocol::TCP => "TCP",
NetworkProtocol::UDP => "UDP",
}
}
}
impl std::fmt::Display for NetworkProtocol {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.write_str(self.as_str())
}
}
#[derive(Debug, Clone, Serialize)]
pub struct ApplicationNetworkPort {
pub number: u16,
pub protocol: NetworkProtocol,
pub name: String,
}

View File

@@ -48,11 +48,11 @@ use crate::{
/// - ArgoCD to install/upgrade/rollback/inspect k8s resources /// - ArgoCD to install/upgrade/rollback/inspect k8s resources
/// - Kubernetes for runtime orchestration /// - Kubernetes for runtime orchestration
#[derive(Debug, Default, Clone)] #[derive(Debug, Default, Clone)]
pub struct PackagingDeployment<A: OCICompliant + HelmPackage + Webapp> { pub struct PackagingDeployment<A: OCICompliant + HelmPackage> {
pub application: Arc<A>, pub application: Arc<A>,
} }
impl<A: OCICompliant + HelmPackage + Webapp> PackagingDeployment<A> { impl<A: OCICompliant + HelmPackage> PackagingDeployment<A> {
async fn deploy_to_local_k3d( async fn deploy_to_local_k3d(
&self, &self,
app_name: String, app_name: String,
@@ -138,7 +138,7 @@ impl<A: OCICompliant + HelmPackage + Webapp> PackagingDeployment<A> {
#[async_trait] #[async_trait]
impl< impl<
A: OCICompliant + HelmPackage + Webapp + Clone + 'static, A: OCICompliant + HelmPackage + Clone + 'static,
T: Topology + HelmCommand + MultiTargetTopology + K8sclient + Ingress + 'static, T: Topology + HelmCommand + MultiTargetTopology + K8sclient + Ingress + 'static,
> ApplicationFeature<T> for PackagingDeployment<A> > ApplicationFeature<T> for PackagingDeployment<A>
{ {
@@ -148,24 +148,12 @@ impl<
) -> Result<InstallationOutcome, InstallationError> { ) -> Result<InstallationOutcome, InstallationError> {
let image = self.application.image_name(); let image = self.application.image_name();
let domain = if topology.current_target() == DeploymentTarget::Production {
self.application.dns()
} else {
topology
.get_domain(&self.application.name())
.await
.map_err(|e| e.to_string())?
};
// TODO Write CI/CD workflow files // TODO Write CI/CD workflow files
// we can autotedect the CI type using the remote url (default to github action for github // we can autotedect the CI type using the remote url (default to github action for github
// url, etc..) // url, etc..)
// Or ask for it when unknown // Or ask for it when unknown
let helm_chart = self let helm_chart = self.application.build_push_helm_package(&image).await?;
.application
.build_push_helm_package(&image, &domain)
.await?;
// TODO: Make building image configurable/skippable if image already exists (prompt)") // TODO: Make building image configurable/skippable if image already exists (prompt)")
// https://git.nationtech.io/NationTech/harmony/issues/104 // https://git.nationtech.io/NationTech/harmony/issues/104
@@ -215,12 +203,12 @@ impl<
}; };
Ok(InstallationOutcome::success_with_details(vec![format!( Ok(InstallationOutcome::success_with_details(vec![format!(
"{}: http://{domain}", "{}",
self.application.name() self.application.name()
)])) )]))
} }
fn name(&self) -> String { fn name(&self) -> String {
"ContinuousDelivery".to_string() "PackagingDeployment".to_string()
} }
} }

View File

@@ -0,0 +1,446 @@
// Re-export common Kubernetes types for convenience
pub use k8s_openapi::api::{
apps::v1::{Deployment, DeploymentSpec},
core::v1::{
Container, ContainerPort, EnvVar, PodSpec, PodTemplateSpec, Service as K8sService,
ServicePort, ServiceSpec,
},
};
use k8s_openapi::apimachinery::pkg::util::intstr::IntOrString;
use kube::core::ObjectMeta;
// Import domain types for the deployment builder
use crate::modules::application::config::{ApplicationNetworkPort, NetworkProtocol};
use std::fs;
use std::path::{Path, PathBuf};
/// Enum representing all supported Kubernetes resource types for Helm charts.
/// Supports built-in typed resources and custom CRDs via YAML strings.
pub enum HelmResourceKind {
/// Built-in typed Service resource
Service(K8sService),
/// Built-in typed Deployment resource
Deployment(Deployment),
/// Custom resource as pre-serialized YAML (e.g., CRDs, custom types)
CustomYaml { filename: String, content: String },
// Can add more typed variants as needed: ConfigMap, Secret, Ingress, etc.
}
impl HelmResourceKind {
pub fn filename(&self) -> String {
match self {
HelmResourceKind::Service(_) => "service.yaml".to_string(),
HelmResourceKind::Deployment(_) => "deployment.yaml".to_string(),
HelmResourceKind::CustomYaml { filename, .. } => filename.clone(),
}
}
pub fn serialize_to_yaml(&self) -> Result<String, serde_yaml::Error> {
match self {
HelmResourceKind::Service(s) => serde_yaml::to_string(s),
HelmResourceKind::Deployment(d) => serde_yaml::to_string(d),
HelmResourceKind::CustomYaml { content, .. } => Ok(content.clone()),
}
}
pub fn as_service(&self) -> Option<&K8sService> {
match self {
HelmResourceKind::Service(s) => Some(s),
_ => None,
}
}
pub fn as_deployment(&self) -> Option<&Deployment> {
match self {
HelmResourceKind::Deployment(d) => Some(d),
_ => None,
}
}
/// Add a custom resource from any serializable type (e.g., CRDs, custom types)
pub fn from_yaml(filename: impl Into<String>, content: impl Into<String>) -> Self {
HelmResourceKind::CustomYaml {
filename: filename.into(),
content: content.into(),
}
}
/// Add a custom resource from any type that implements Serialize
pub fn from_serializable<T: serde::Serialize>(
filename: impl Into<String>,
resource: &T,
) -> Result<Self, serde_yaml::Error> {
Ok(HelmResourceKind::CustomYaml {
filename: filename.into(),
content: serde_yaml::to_string(resource)?,
})
}
}
/// The main orchestrator for building a Helm chart.
pub struct HelmChart {
pub name: String,
pub version: String,
pub app_version: String,
pub description: String,
pub resources: Vec<HelmResourceKind>,
pub values: Vec<String>,
}
impl HelmChart {
pub fn new(name: String, app_version: String) -> Self {
Self {
name: name.clone(),
version: "0.1.0".to_string(),
app_version,
description: format!("A Helm chart for {}", name),
resources: Vec::new(),
values: Vec::new(),
}
}
pub fn add_resource(&mut self, resource: HelmResourceKind) {
self.resources.push(resource);
}
pub fn add_value(&mut self, key: &str, value: &str) {
self.values.push(format!("{}: {}", key, value));
}
pub fn write_to(&self, base_path: &Path) -> Result<PathBuf, Box<dyn std::error::Error>> {
let chart_dir = base_path.join(&self.name);
let templates_dir = chart_dir.join("templates");
fs::create_dir_all(&templates_dir)?;
// 1. Render and write Chart.yaml
let chart_yaml = ChartYaml {
name: &self.name,
description: &self.description,
version: &self.version,
app_version: &self.app_version,
};
fs::write(chart_dir.join("Chart.yaml"), chart_yaml.render()?)?;
// 2. Write values.yaml (Constructed dynamically)
let values_content = self.values.join("\n");
fs::write(chart_dir.join("values.yaml"), values_content)?;
// 3. Serialize and write all added resources (Deployment, Service, etc.)
for resource in &self.resources {
let filename = resource.filename();
let content = resource
.serialize_to_yaml()
.map_err(|e| format!("Failed to serialize resource {}: {}", filename, e))?;
fs::write(templates_dir.join(filename), content)?;
}
Ok(chart_dir)
}
}
use askama::Template;
#[derive(Template)]
#[template(path = "helm/Chart.yaml.j2")]
struct ChartYaml<'a> {
name: &'a str,
description: &'a str,
version: &'a str,
app_version: &'a str,
}
/// Builder for creating a Kubernetes Service with proper labels and selectors.
pub struct ServiceBuilder {
name: String,
service_type: String,
ports: Vec<ServicePort>,
selector_label: String,
}
impl ServiceBuilder {
pub fn new(name: impl Into<String>) -> Self {
Self {
name: name.into(),
service_type: "ClusterIP".to_string(),
ports: Vec::new(),
selector_label: String::new(),
}
}
pub fn service_type(mut self, service_type: impl Into<String>) -> Self {
self.service_type = service_type.into();
self
}
pub fn with_port(
mut self,
name: impl Into<String>,
port: i32,
protocol: impl Into<String>,
) -> Self {
use k8s_openapi::apimachinery::pkg::util::intstr::IntOrString;
self.ports.push(ServicePort {
name: Some(name.into()),
protocol: Some(protocol.into()),
port,
target_port: Some(IntOrString::Int(port)),
..Default::default()
});
self
}
pub fn selector_label(mut self, label: impl Into<String>) -> Self {
self.selector_label = label.into();
self
}
pub fn build(self) -> K8sService {
K8sService {
metadata: ObjectMeta {
name: Some(self.name.clone()),
labels: Some(
[
("app.kubernetes.io/name".to_string(), self.name.clone()),
(
"app.kubernetes.io/component".to_string(),
"service".to_string(),
),
(
"app.kubernetes.io/managed-by".to_string(),
"harmony".to_string(),
),
]
.into(),
),
..Default::default()
},
spec: Some(ServiceSpec {
type_: Some(self.service_type),
selector: Some(
[("app.kubernetes.io/name".to_string(), self.selector_label)].into(),
),
ports: if self.ports.is_empty() {
None
} else {
Some(self.ports)
},
..Default::default()
}),
..Default::default()
}
}
}
/// Builder for creating a Kubernetes Deployment with pod template and container spec.
pub struct DeploymentBuilder {
name: String,
image: String,
replicas: i32,
container_ports: Vec<ContainerPort>,
env_vars: Vec<EnvVar>,
image_pull_policy: Option<String>,
}
impl DeploymentBuilder {
/// Create a new DeploymentBuilder with minimal required fields.
pub fn new(name: impl Into<String>, image: impl Into<String>) -> Self {
Self::with_options(name, image, None, None, None)
}
/// Create a new DeploymentBuilder with optional initial configuration.
///
/// Arguments:
/// - `name`: The deployment name
/// - `image`: The container image to use
/// - `ports`: Optional vector of initial application network ports
/// - `env_vars`: Optional vector of initial environment variable key-value pairs
/// - `replicas`: Optional number of replicas (defaults to 1)
pub fn with_options(
name: impl Into<String>,
image: impl Into<String>,
ports: Option<Vec<ApplicationNetworkPort>>,
env_vars: Option<Vec<(String, String)>>,
replicas: Option<i32>,
) -> Self {
let container_ports: Vec<ContainerPort> = ports
.unwrap_or_default()
.into_iter()
.map(|port| ContainerPort {
container_port: port.number as i32,
name: Some(port.name),
protocol: Some(port.protocol.to_string()),
..Default::default()
})
.collect();
let k8s_env_vars: Vec<EnvVar> = env_vars
.unwrap_or_default()
.into_iter()
.map(|(key, value)| EnvVar {
name: key,
value: Some(value),
..Default::default()
})
.collect();
Self {
name: name.into(),
image: image.into(),
replicas: replicas.unwrap_or(1),
container_ports,
env_vars: k8s_env_vars,
image_pull_policy: Some("IfNotPresent".to_string()),
}
}
pub fn replicas(mut self, replicas: i32) -> Self {
self.replicas = replicas;
self
}
pub fn with_container_port(
mut self,
number: i32,
name: impl Into<String>,
protocol: impl Into<String>,
) -> Self {
self.container_ports.push(ContainerPort {
container_port: number,
name: Some(name.into()),
protocol: Some(protocol.into()),
..Default::default()
});
self
}
pub fn with_env_var(mut self, name: impl Into<String>, value: impl Into<String>) -> Self {
self.env_vars.push(EnvVar {
name: name.into(),
value: Some(value.into()),
..Default::default()
});
self
}
pub fn image_pull_policy(mut self, policy: impl Into<String>) -> Self {
self.image_pull_policy = Some(policy.into());
self
}
pub fn build(self) -> Deployment {
let name = self.name.clone();
Deployment {
metadata: ObjectMeta {
name: Some(name.clone()),
labels: Some(
[
("app.kubernetes.io/name".to_string(), name.clone()),
(
"app.kubernetes.io/component".to_string(),
"deployment".to_string(),
),
(
"app.kubernetes.io/managed-by".to_string(),
"harmony".to_string(),
),
("app.kubernetes.io/version".to_string(), "1.0.0".to_string()),
]
.into(),
),
..Default::default()
},
spec: Some(DeploymentSpec {
replicas: Some(self.replicas),
selector: k8s_openapi::apimachinery::pkg::apis::meta::v1::LabelSelector {
match_labels: Some(
[("app.kubernetes.io/name".to_string(), name.clone())].into(),
),
..Default::default()
},
template: PodTemplateSpec {
metadata: Some(ObjectMeta {
labels: Some(
[
("app.kubernetes.io/name".to_string(), name.clone()),
("app.kubernetes.io/instance".to_string(), name.clone()),
]
.into(),
),
..Default::default()
}),
spec: Some(PodSpec {
containers: vec![Container {
name: name.clone(),
image: Some(self.image),
image_pull_policy: self.image_pull_policy,
ports: if self.container_ports.is_empty() {
None
} else {
Some(self.container_ports)
},
env: if self.env_vars.is_empty() {
None
} else {
Some(self.env_vars)
},
..Default::default()
}],
..Default::default()
}),
},
..Default::default()
}),
..Default::default()
}
}
}
/// Helper function to create a Service from network port configuration.
/// Returns `None` if no ports are provided.
pub fn create_service_from_ports(
name: String,
network_ports: &[ApplicationNetworkPort],
) -> Option<K8sService> {
if network_ports.is_empty() {
return None;
}
let ports: Vec<ServicePort> = network_ports
.into_iter()
.map(|port| ServicePort {
name: Some(port.name.clone()),
protocol: Some(port.protocol.to_string()),
port: port.number as i32,
target_port: Some(IntOrString::Int(port.number as i32)),
..Default::default()
})
.collect();
Some(K8sService {
metadata: ObjectMeta {
name: Some(name.clone()),
labels: Some(
[
("app.kubernetes.io/name".to_string(), name.clone()),
(
"app.kubernetes.io/component".to_string(),
"service".to_string(),
),
(
"app.kubernetes.io/managed-by".to_string(),
"harmony".to_string(),
),
]
.into(),
),
..Default::default()
},
spec: Some(ServiceSpec {
type_: Some("ClusterIP".to_string()),
selector: Some([("app.kubernetes.io/name".to_string(), name)].into()),
ports: Some(ports),
..Default::default()
}),
..Default::default()
})
}
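// Usage sketch (assuming `ports: Vec<ApplicationNetworkPort>` was built elsewhere):
//
// let service = create_service_from_ports("my-app".to_string(), &ports);
// // -> None when `ports` is empty, Some(K8sService) otherwise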

View File

@@ -1,5 +1,8 @@
pub mod backend_app;
pub mod config;
mod feature;
pub mod features;
pub mod helm;
pub mod oci;
mod rust;
mod webapp;
@@ -124,3 +127,15 @@ impl Serialize for dyn Application {
todo!()
}
}
/// Checks the output of a process command for success.
fn check_output(
output: &std::process::Output,
msg: &str,
) -> Result<(), Box<dyn std::error::Error>> {
if !output.status.success() {
let error_message = format!("{}: {}", msg, String::from_utf8_lossy(&output.stderr));
return Err(error_message.into());
}
Ok(())
}

View File

@@ -1,5 +1,13 @@
use std::path::{Path, PathBuf};
use crate::{
config::{REGISTRY_PROJECT, REGISTRY_URL},
modules::application::check_output,
};
use super::Application;
use async_trait::async_trait;
use log::debug;
#[async_trait]
pub trait OCICompliant: Application {
@@ -17,9 +25,74 @@ pub trait HelmPackage: Application {
/// # Arguments
/// * `image_url` - The full URL of the OCI container image to be used in the Deployment.
/// * `domain` - The domain where the application is hosted.
async fn build_push_helm_package(&self, image_url: &str) -> Result<String, String>;
fn project_root(&self) -> PathBuf;
fn chart_name(&self) -> String;
/// Packages a Helm chart directory into a .tgz file.
fn package_helm_chart(&self, chart_dir: &Path) -> Result<PathBuf, Box<dyn std::error::Error>> {
let chart_dirname = chart_dir.file_name().expect("Should find a chart dirname");
debug!(
"Launching `helm package {}` cli with CWD {}",
chart_dirname.to_string_lossy(),
&self
.project_root()
.join(".harmony_generated")
.join("helm")
.to_string_lossy()
);
let output = std::process::Command::new("helm")
.args(["package", chart_dirname.to_str().unwrap()])
.current_dir(self.project_root().join(".harmony_generated").join("helm")) // Run package from the parent dir
.output()?;
check_output(&output, "Failed to package Helm chart")?;
// Helm prints the path of the created chart to stdout.
let tgz_name = String::from_utf8(output.stdout)?
.split_whitespace()
.last()
.unwrap_or_default()
.to_string();
if tgz_name.is_empty() {
return Err("Could not determine packaged chart filename.".into());
}
// The output from helm is relative, so we join it with the execution directory.
Ok(self
.project_root()
.join(".harmony_generated")
.join("helm")
.join(tgz_name))
}
/// Pushes a packaged Helm chart to an OCI registry.
fn push_helm_chart(
&self,
packaged_chart_path: &Path,
) -> Result<String, Box<dyn std::error::Error>> {
// The chart name is the file stem of the .tgz file
let chart_file_name = packaged_chart_path.file_stem().unwrap().to_str().unwrap();
let oci_push_url = format!("oci://{}/{}", *REGISTRY_URL, *REGISTRY_PROJECT);
let oci_pull_url = format!("{oci_push_url}/{}-chart", self.chart_name());
debug!(
"Pushing Helm chart {} to {}",
packaged_chart_path.to_string_lossy(),
oci_push_url
);
let output = std::process::Command::new("helm")
.args(["push", packaged_chart_path.to_str().unwrap(), &oci_push_url])
.output()?;
check_output(&output, "Pushing Helm chart failed")?;
// The final URL includes the version tag, which is part of the file name
let version = chart_file_name.rsplit_once('-').unwrap().1;
debug!("pull url {oci_pull_url}");
debug!("push url {oci_push_url}");
Ok(format!("{}:{}", oci_pull_url, version))
}
} }

View File

@@ -81,16 +81,21 @@ impl Webapp for RustWebapp {
#[async_trait]
impl HelmPackage for RustWebapp {
fn project_root(&self) -> PathBuf {
self.project_root.clone()
}
fn chart_name(&self) -> String {
self.name.clone()
}
async fn build_push_helm_package(&self, image_url: &str) -> Result<String, String> {
let domain = self.dns();
info!("Starting Helm chart build and push for '{}'", self.name);
// 1. Create the Helm chart files on disk.
let chart_dir = self
.create_helm_chart_files(image_url, &domain)
.await
.map_err(|e| format!("Failed to create Helm chart files: {}", e))?;
info!("Successfully created Helm chart files in {:?}", chart_dir);
@@ -327,19 +332,6 @@ impl RustWebapp {
Ok(image_tag.to_string())
}
/// Checks the output of a process command for success.
fn check_output(
&self,
output: &process::Output,
msg: &str,
) -> Result<(), Box<dyn std::error::Error>> {
if !output.status.success() {
let error_message = format!("{}: {}", msg, String::from_utf8_lossy(&output.stderr));
return Err(error_message.into());
}
Ok(())
}
fn build_builder_image(&self, dockerfile: &mut Dockerfile) {
match self.framework {
Some(RustWebFramework::Leptos) => {
@@ -640,71 +632,6 @@ spec:
Ok(chart_dir)
}
/// Packages a Helm chart directory into a .tgz file.
fn package_helm_chart(&self, chart_dir: &Path) -> Result<PathBuf, Box<dyn std::error::Error>> {
let chart_dirname = chart_dir.file_name().expect("Should find a chart dirname");
debug!(
"Launching `helm package {}` cli with CWD {}",
chart_dirname.to_string_lossy(),
&self
.project_root
.join(".harmony_generated")
.join("helm")
.to_string_lossy()
);
let output = process::Command::new("helm")
.args(["package", chart_dirname.to_str().unwrap()])
.current_dir(self.project_root.join(".harmony_generated").join("helm")) // Run package from the parent dir
.output()?;
self.check_output(&output, "Failed to package Helm chart")?;
// Helm prints the path of the created chart to stdout.
let tgz_name = String::from_utf8(output.stdout)?
.split_whitespace()
.last()
.unwrap_or_default()
.to_string();
if tgz_name.is_empty() {
return Err("Could not determine packaged chart filename.".into());
}
// The output from helm is relative, so we join it with the execution directory.
Ok(self
.project_root
.join(".harmony_generated")
.join("helm")
.join(tgz_name))
}
/// Pushes a packaged Helm chart to an OCI registry.
fn push_helm_chart(
&self,
packaged_chart_path: &Path,
) -> Result<String, Box<dyn std::error::Error>> {
// The chart name is the file stem of the .tgz file
let chart_file_name = packaged_chart_path.file_stem().unwrap().to_str().unwrap();
let oci_push_url = format!("oci://{}/{}", *REGISTRY_URL, *REGISTRY_PROJECT);
let oci_pull_url = format!("{oci_push_url}/{}-chart", self.name);
debug!(
"Pushing Helm chart {} to {}",
packaged_chart_path.to_string_lossy(),
oci_push_url
);
let output = process::Command::new("helm")
.args(["push", packaged_chart_path.to_str().unwrap(), &oci_push_url])
.output()?;
self.check_output(&output, "Pushing Helm chart failed")?;
// The final URL includes the version tag, which is part of the file name
let version = chart_file_name.rsplit_once('-').unwrap().1;
debug!("pull url {oci_pull_url}");
debug!("push url {oci_push_url}");
Ok(format!("{}:{}", oci_pull_url, version))
}
fn get_or_build_dockerfile(&self) -> Result<PathBuf, Box<dyn std::error::Error>> {
let existing_dockerfile = self.project_root.join("Dockerfile");

View File

@@ -0,0 +1,6 @@
apiVersion: v2
name: {{ name }}
description: {{ description }}
type: application
version: {{ version }}
appVersion: "{{ app_version }}"

View File

@@ -0,0 +1,4 @@
.git
data
target
demos

26
harmony_agent/Cargo.toml Normal file
View File

@@ -0,0 +1,26 @@
[package]
name = "harmony_agent"
edition = "2024"
version.workspace = true
readme.workspace = true
license.workspace = true
[dependencies]
harmony = { path = "../harmony" }
# harmony_cli = { path = "../harmony_cli" }
harmony_types = { path = "../harmony_types" }
harmony_macros = { path = "../harmony_macros" }
cidr = { workspace = true }
tokio = { workspace = true }
log = { workspace = true }
env_logger = { workspace = true }
async-nats = "0.45.0"
async-trait = "0.1"
# url = { workspace = true }
serde.workspace = true
serde_json.workspace = true
getrandom = "0.3.4"
thiserror.workspace = true
pretty_assertions.workspace = true

44
harmony_agent/Dockerfile Normal file
View File

@@ -0,0 +1,44 @@
# Build stage
FROM rust:slim AS builder
# Install build dependencies
RUN apt-get update && apt-get install -y pkg-config && rm -rf /var/lib/apt/lists/*
WORKDIR /app
# Copy all required packages
COPY . .
RUN ls -la1
# Build the application in release mode
RUN cargo build --release -p harmony_agent
# Runtime stage
FROM debian:bookworm-slim
# Install runtime dependencies
RUN apt-get update && apt-get install -y ca-certificates && rm -rf /var/lib/apt/lists/*
WORKDIR /app
# Copy the binary from the builder stage
COPY --from=builder /app/target/release/harmony_agent ./harmony_agent
# Declare environment variables used by the Harmony Agent
# These will be set from build-time environment variables if present
# NATS_URL: URL of the NATS server (default: nats://localhost:4222)
ARG NATS_URL=nats://localhost:4222
ENV NATS_URL=${NATS_URL}
# NATS_CREDS_PATH: Optional path to NATS credentials file
ARG NATS_CREDS_PATH
ENV NATS_CREDS_PATH=${NATS_CREDS_PATH}
# MY_CLUSTER_ID: This cluster's unique identifier (required)
ARG MY_CLUSTER_ID
ENV MY_CLUSTER_ID=${MY_CLUSTER_ID}
# DESIRED_PRIMARY: The ID of the desired primary cluster (required)
ARG DESIRED_PRIMARY
ENV DESIRED_PRIMARY=${DESIRED_PRIMARY}
# Run the application
ENTRYPOINT ["./harmony_agent"]
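# Example build invocation (illustrative values only):
#   docker build \
#     --build-arg MY_CLUSTER_ID=site-1 \
#     --build-arg DESIRED_PRIMARY=site-1 \
#     --build-arg NATS_URL=nats://nats:4222 \
#     -t harmony_agent .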

248
harmony_agent/README.md Normal file
View File

@@ -0,0 +1,248 @@
TODO
DONE:
1. ✅ store trait subscribe definition missing callback - Fixed with SubscriptionCallback type
2. ✅ BUG: data integrity issue: nats store now using jetstream metadata (entry.created, entry.revision)
3. ✅ fix replica workflow not transitioning to "failed" when failure_threshold is exceeded
4. ✅ fix replica workflow to hold copy of cluster state - cluster_state field added to HarmonyAgent
5. ✅ heartbeat metadata now passed to workflow via on_heartbeat_stored() callback
6. ✅ failover_timeout added to AgentConfig
7. ✅ NATS store properly detects SequenceMismatch and returns SequenceMismatch error
8. ✅ startup reconciliation implemented via on_startup() method
REMAINING:
- review all code and list implementation issues
- review both workflows for each state transition
- Complete replica workflow staleness detection (needs implementation in Watching state)
- Implement state recovery from Failed state for both workflows
- Implement subscribe in NATS store with watch() API
- Implement config validation for failover_timeout constraints
TODO
1. store trait subscribe definition missing callback
2. BUG, data integrity issue: nats store not actually using jetstream metadata
3. review all code and list implementation issues
4. review both workflows for each state transition
5. fix replica workflow not transitioning to "failed" when failure_threshold is exceeded
6. fix replica workflow to also hold a copy of the cluster state (the agent itself
should probably hold it; every agent should be subscribed to the cluster_state object and
keep it in memory so workflows can process against it efficiently)
## CRITICAL - Data Integrity Issues
1. **NATS Store `set_strict` doesn't enforce CAS** (`store/nats.rs`)
- Currently uses `put()` which overwrites unconditionally
- Must use `update()` with revision parameter for proper compare-and-set
- Without this, concurrent promotion attempts can cause split brain
2. **NATS Store uses local clock instead of JetStream metadata** (`store/nats.rs`)
- Lines 55, 68: Using `SystemTime::now()` violates ADR-017-3
- NATS Entry has `.revision` and `.created` fields that must be used
   - This defeats the entire purpose of store-provided timestamps (a sketch addressing issues 1 and 2 follows this list)
3. **Heartbeat metadata not passed to ReplicaWorkflow** (`agent_loop.rs::run_heartbeat_loop`)
- Line ~156: TODO comment confirms missing metadata passing
- Replica cannot calculate staleness without metadata.timestamp
- Failover logic is broken
4. **No actual cluster state watching exists**
- Replica workflow declares `ClusterState` but never updates it
- No subscription to primary heartbeat or cluster_state key
- Replica cannot detect primary liveness
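A minimal sketch of what fixes for issues 1 and 2 above could look like, assuming the `async-nats` JetStream KV API (`Store::update` for compare-and-set, entry `revision`/`created` for store metadata) and the `KvMetadata` type this PR defines in `store/mod.rs`; `to_kv_metadata` and `set_strict_cas` are illustrative names, not existing code:

```rust
use async_nats::jetstream::kv::{Entry, Store};
use bytes::Bytes;

use crate::store::KvMetadata;

// Derive KvMetadata from JetStream's own entry metadata
// (entry.created / entry.revision) instead of SystemTime::now().
fn to_kv_metadata(entry: &Entry) -> KvMetadata {
    KvMetadata {
        // `created` is the server-side write time; convert to unix millis
        timestamp: (entry.created.unix_timestamp_nanos() / 1_000_000) as u64,
        sequence: entry.revision,
    }
}

// A real compare-and-set: `update` fails if the key's current revision
// differs from `expected_revision`, unlike `put` which overwrites blindly.
async fn set_strict_cas(
    store: &Store,
    key: &str,
    value: Bytes,
    expected_revision: u64,
) -> Result<u64, Box<dyn std::error::Error>> {
    Ok(store.update(key, value, expected_revision).await?)
}
```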
## HIGH - Missing Core Functionality
5. **Replica Workflow incomplete** - All key logic is TODO:
- Watching primary staleness (line 114)
- Promotion attempt (line 118)
- Original primary recovery detection (line 127)
- Demotion/handshake (line 131)
6. **Missing replica "Failed" state**
- `ReplicaState` enum has no `Failed` variant
- User's TODO #5 correctly identifies this gap
- What happens if replica's own heartbeats fail repeatedly?
7. **Primary Workflow incomplete** - Key logic missing:
- No NATS check before recovering from `Fenced` state (line 95)
- No NATS check in `Yielding` state for demotion handshake (line 101)
- No actual fencing failure handling
8. **Store `subscribe` not implemented** (`store/mod.rs`)
- Returns `todo!()` in NATS implementation
- No callback mechanism defined in trait
- Without this, agents cannot react to state changes
9. **Cluster state not tracked centrally**
- User's TODO #6 correctly identifies this
- Each agent should maintain a local copy of cluster_state
- No subscription mechanism to update this local copy
10. **No validation of configuration constraints**
- Should validate: `failover_timeout > heartbeat_timeout * failure_threshold + safety_margin`
    - Invalid config could cause split brain (a validation sketch follows this list)
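A hedged sketch of that constraint check, using the field names from `AgentConfig` in this PR; `safety_margin` is an assumed knob that does not exist in the config yet:

```rust
use std::time::Duration;

// Illustrative validation: failover_timeout must exceed the worst-case time
// needed to declare the primary failed, plus a safety margin.
fn validate_failover_config(
    heartbeat_interval: Duration,
    failure_threshold: u32, // AgentConfig stores usize; cast for Duration math
    failover_timeout: Duration,
    safety_margin: Duration, // assumed parameter, not yet in AgentConfig
) -> Result<(), String> {
    let worst_case_detection = heartbeat_interval * failure_threshold + safety_margin;
    if failover_timeout <= worst_case_detection {
        return Err(format!(
            "failover_timeout ({failover_timeout:?}) must be greater than \
             heartbeat_interval * failure_threshold + safety_margin ({worst_case_detection:?})"
        ));
    }
    Ok(())
}
```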
## MEDIUM - Incorrect State Transitions
11. **Primary immediately transitions `Failed -> Fenced`** (`workflow/primary.rs:120-121`)
- Two state transitions happen in one heartbeat cycle
- Should stay in `Failed` until fencing actually completes
- What if fencing fails? State machine won't reflect it
12. **No fencing failure handling**
- If `on_failover()` fails, node thinks it's fenced but DB is still accepting writes
- ADR mentions escalating to radical measures, but no callback for failure
13. **Replica `Watching` state does nothing**
- Line 115: Just logs, checks nothing
- Should be checking staleness of primary heartbeat
14. **Demotion handshake not implemented**
- ADR section 4 details this but code doesn't implement it
- How does original primary know it should yield?
## LOW - Observability & Reliability
15. **No graceful shutdown mechanism**
- `run_heartbeat_loop` runs forever
- No signal handling (SIGTERM, SIGINT)
16. **Async task errors silently ignored**
- `tokio::spawn` at lines 74, 83, 123
- No `JoinHandle` retention or error handling
17. **No metrics/observability**
- Only log output
- No Prometheus metrics for state transitions, failure counts, etc.
18. **Hardcoded main() function** (`agent_loop.rs::main`)
- Not production-ready entry point
- Should load config from environment or file
19. **Store factory pattern missing**
- TODO comment at line 54 confirms this
- Can't switch between stores via config
20. **No backoff/retry logic for NATS operations**
- Transient failures could trigger unnecessary fencing
21. **`AgentInfo` status is hardcoded to "HEALTHY"**
- Line 137 in `store_heartbeat`
    - Should reflect actual workflow state
22. **Unused fields in structs**
- `HeartbeatState.last_seq` set but never read
- `ClusterState.current_primary` set but never read
## ADR-017-3 Compliance Issues
23. **ADR violation: Clock skew not avoided**
- While ADR says use store metadata, code uses local time
24. **Failover timeout not configurable**
- Defined in ADR but not in `AgentConfig`
- Needed for replica staleness calculation
25. **Safety margin concept exists in ADR but not in code**
- Configuration should include this margin
26. **No handling of Case 3 (Replica Network Lag)**
- ADR describes NATS rejection prevention
- But `set_strict` implementation accepts any write
## Code Quality Issues
27. **Inconsistent error handling**
- Some paths return `Err`, others `todo!()`, others ignore
28. **Unnecessary `Clone` bounds**
- `DeploymentConfig.clone()` used frequently
- Could be optimized with `Arc`
29. **Missing lifetime annotations**
- `KvStore::get` returns `String` key in error - inefficient
30. **No integration points mentioned**
- PostgreSQL lifecycle control implementation missing
- Fencing via CNPG not connected
## Production Readiness Checklist Summary
For battle testing preparation, you need:
**Immediate (blockers):**
- Fix NATS store metadata usage (issues #1, #2)
- Implement strict set_strict with actual CAS (#1)
- Implement replica primary watching (#4, #5)
- Add failover_timeout config + staleness logic (#3, #24)
- Implement subscribe mechanism with callbacks (#8)
**High priority:**
- Complete all workflow transitions (#5, #7, #11-14)
- Add cluster state tracking (#6, #9)
- Add configuration validation (#10)
- Add Replica Failed state (#6)
**Before deployment:**
- Implement graceful shutdown (#15)
- Add error handling for spawned tasks (#16)
- Remove hardcoded main function (#18)
- Implement store factory (#19)
- Add Prometheus metrics (#17)
**Documentation:**
- Document all configuration parameters and their trade-offs
- Add runbooks for each failure mode
- Document battle test scenarios to cover
### Addendum: Missing Critical Issues
#### 1. CRITICAL: Heartbeat "Lying" (Data Integrity)
* **Location:** `agent_loop.rs` line 137 inside `store_heartbeat`.
* **The Bug:** `status: "HEALTHY".to_string()` is hardcoded.
* **The Impact:** The agent loop runs regardless of the workflow state. If the Primary transitions to `Fenced` or `Failed`, it continues to write a heartbeat saying "I am HEALTHY".
* **The Fix:** The `store_heartbeat` function must accept the current status from the `workflow` (e.g., `self.workflow.status()`) to serialize into the JSON. A fenced agent must broadcast "FENCED" or stop writing entirely.
#### 2. CRITICAL: Async Task Race Conditions (State Machine Corruption)
* **Location:** `workflow/primary.rs` lines 74, 83, 123 (`tokio::spawn`).
* **The Bug:** The callbacks (`on_active`, `on_failover`) are spawned as fire-and-forget background tasks.
* **Scenario:**
1. Primary fails -> transitions to `Fenced` -> spawns `on_failover` (takes 5s).
2. Network recovers immediately -> transitions to `Healthy` -> spawns `on_active` (takes 1s).
3. `on_active` finishes *before* `on_failover`.
4. `on_failover` finishes last, killing the DB *after* the agent decided it was healthy.
* **The Fix:** You need a `JoinHandle` or a cancellation token. When transitioning states, any pending conflicting background tasks must be aborted before starting the new one.
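A minimal sketch of the cancellation idea using tokio's `JoinHandle::abort`; the `CallbackRunner` type is illustrative, not part of this PR:

```rust
use tokio::task::JoinHandle;

// At most one lifecycle callback (on_active / on_failover) may run at a time.
struct CallbackRunner {
    pending_task: Option<JoinHandle<()>>,
}

impl CallbackRunner {
    // Abort any in-flight callback before spawning the next one, so a slow
    // on_failover can never complete after a newer on_active.
    fn spawn_exclusive<F>(&mut self, fut: F)
    where
        F: std::future::Future<Output = ()> + Send + 'static,
    {
        if let Some(handle) = self.pending_task.take() {
            handle.abort();
        }
        self.pending_task = Some(tokio::spawn(fut));
    }
}
```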
#### 3. CRITICAL: Zombie Leader Prevention (Split Brain Risk)
* **Location:** `agent_loop.rs` loop logic.
* **The Bug:** There is no "Stop the World" gate.
* **Scenario:** If `store_heartbeat` fails (NATS unreachable), the code returns `Err`, triggers `handle_heartbeat_failure`, and the loop *continues*.
* **The Risk:** If the NATS write fails because of a CAS error (meaning a Replica has already promoted), this Primary is now a Zombie. It *must* immediately cease all operations. The current loop just sleeps and tries again.
* **The Fix:** If `store_heartbeat` returns a `SequenceMismatch` error, the agent must treat this as a fatal demotion event, immediately fencing itself, rather than just incrementing a failure counter.
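A sketch of that gate, reusing the `KvStoreError::WrongLastRevision` variant this PR defines; the fencing action itself is left abstract:

```rust
use crate::store::KvStoreError;

/// Illustrative check: a CAS failure on our own heartbeat key means another
/// writer advanced it -- treat it as "I have been replaced", not a transient error.
fn is_fatal_demotion(err: &KvStoreError) -> bool {
    matches!(err, KvStoreError::WrongLastRevision)
}

// In run_heartbeat_loop (sketch):
// if let Err(e) = agent.store_heartbeat().await {
//     if is_fatal_demotion(&e) {
//         // fence immediately instead of incrementing the failure counter
//     }
// }
```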
#### 4. HIGH: NATS Bucket Name Collision
* **Location:** `agent_loop.rs` (Config) vs `store/nats.rs`.
* **The Bug:** `FailoverCNPGConfig` has `cnpg_cluster_name`, and `AgentConfig` has `cluster_id`.
* **The Impact:** If you run two different Harmony clusters on the same NATS server, and they use the same bucket name logic (or hardcoded names), they will overwrite each other's state.
* **The Fix:** The NATS KV bucket name must be namespaced dynamically, e.g., `format!("harmony_{}", config.cluster_id)`.
#### 5. HIGH: Startup State Reconciliation
* **Location:** `HarmonyAgent::new`.
* **The Bug:** Agents always start in `Initializing`.
* **Scenario:** The process crashes while it is the `Leader`. It restarts. It enters `Initializing`. It doesn't know it *should* be the leader.
* **The Impact:** The cluster might be leaderless until the `failover_timeout` expires, causing unnecessary downtime.
* **The Fix:** On startup, the agent must fetch the `ClusterState` from NATS. If `current_primary == my_id`, it should jump directly to `Healthy`/`Leader` state (possibly after a sanity check).
### Summary of Tasks to Add
Please add these to your master list before starting implementation:
28. **Dynamic Heartbeat Status:** Pass workflow state to `store_heartbeat` to prevent Fenced nodes from reporting "HEALTHY".
29. **Async Task Cancellation:** Implement `AbortHandle` for `on_active`/`on_failover` tasks to prevent race conditions during rapid state flapping.
30. **Fatal CAS Handling:** Treat `SequenceMismatch` in `store_heartbeat` as an immediate "I have been replaced" signal (Zombie detection).
31. **NATS Namespace Isolation:** Ensure KV bucket names include `cluster_id`.
32. **Startup Reconciliation:** Check NATS on boot to restore previous state if valid.
* **Think about vacuum / stop-the-world operations**

View File

@@ -0,0 +1,20 @@
[package]
name = "harmony_agent_deploy"
edition = "2024"
version.workspace = true
readme.workspace = true
license.workspace = true
[dependencies]
harmony = { path = "../../harmony" }
harmony_cli = { path = "../../harmony_cli" }
harmony_types = { path = "../../harmony_types" }
harmony_macros = { path = "../../harmony_macros" }
cidr = { workspace = true }
tokio = { workspace = true }
log = { workspace = true }
env_logger = { workspace = true }
url = { workspace = true }
serde.workspace = true
serde_json.workspace = true

View File

@@ -0,0 +1,63 @@
use harmony::{
inventory::Inventory,
modules::{
application::{
ApplicationScore,
backend_app::{BackendApp, BuildCommand},
features::{Monitoring, PackagingDeployment},
},
monitoring::alert_channel::discord_alert_channel::DiscordWebhook,
},
topology::K8sAnywhereTopology,
};
use harmony_macros::hurl;
use harmony_types::k8s_name::K8sName;
use std::{path::PathBuf, sync::Arc};
#[tokio::main]
async fn main() {
let application = Arc::new(BackendApp {
name: "harmony-agent".to_string(),
// Since harmony_agent is part of the harmony workspace, the actual "project root"
// is not harmony_agent folder but the workspace root.
//
// So using ../ here means we MUST run this deployment script from the harmony_agent
// folder
project_root: PathBuf::from("../"),
network_ports: vec![],
env_vars: vec![
("NATS_URL".to_string(), "nats://nats".to_string()),
("DESIRED_PRIMARY".to_string(), "site-1".to_string()),
("MY_CLUSTER_ID".to_string(), "site-1".to_string()),
("NATS_CREDS_PATH".to_string(), "".to_string()),
],
build_cmd: BuildCommand::new("cargo", vec!["build", "--release", "-p", "harmony_agent"]),
dockerfile: Some(PathBuf::from("Dockerfile")),
});
let app = ApplicationScore {
features: vec![
Box::new(PackagingDeployment {
application: application.clone(),
}),
Box::new(Monitoring {
application: application.clone(),
alert_receiver: vec![Box::new(DiscordWebhook {
name: K8sName("test-discord".to_string()),
url: hurl!("https://discord.doesnt.exist.com"),
selectors: vec![],
})],
}),
],
application,
};
harmony_cli::run(
Inventory::autoload(),
K8sAnywhereTopology::from_env(), // <== Deploy to local automatically provisioned k3d by default or connect to any kubernetes cluster
vec![Box::new(app)],
None,
)
.await
.unwrap();
}

View File

@@ -0,0 +1,79 @@
use std::time::Duration;
use harmony_types::id::Id;
use log::info;
use super::heartbeat::HeartbeatFailure;
use super::role::AgentRole;
#[derive(Debug, Clone)]
pub struct AgentConfig {
/// Number of consecutive successful heartbeats required before the service transitions from
/// failed to healthy.
pub success_threshold: usize,
/// Number of consecutive failed heartbeats required before the service transitions from
/// healthy to failed.
pub failure_threshold: usize,
/// Time between each heartbeat. If a heartbeat takes longer than this, it will be
/// considered failed.
pub heartbeat_interval: Duration,
/// Time since last observed primary heartbeat before replica considers primary stale.
/// This must be configured such that failover_timeout > heartbeat_interval * failure_threshold + safety_margin
/// to avoid split brain during network partitions.
pub failover_timeout: Duration,
/// **UNSTABLE FIELD**
///
/// For now, an agent instance only serves one deployment. This is probably fine as an agent's
/// footprint is low, but managing multiple deployments in a single instance would be a
/// significant resource usage reduction.
///
/// Decoupling the deployment of the agent with the application's deployment could make things
/// more complicated though, where we would have to be careful about version compatibility
/// between all components managed by the agent instance. So for now it is a 1-1 map.
///
/// But I have a feeling this could change so I am marking this field unstable to warn you, the
/// reader.
pub deployment_config_unstable: DeploymentConfig,
pub nats_url: String,
pub nats_creds_path: Option<String>,
pub agent_id: Id,
pub cluster_id: Id,
pub desired_primary_id: Id,
/// The role this agent plays (Primary or Replica)
pub role: AgentRole,
}
#[derive(Debug, Clone)]
pub enum DeploymentConfig {
FailoverPostgreSQL(FailoverCNPGConfig),
}
#[derive(Debug, Clone)]
pub struct FailoverCNPGConfig {
pub cnpg_cluster_name: String,
}
impl DeploymentConfig {
/// The actual "work" of the heartbeat (e.g., write to NATS, check Postgres)
pub async fn perform_heartbeat(&self) -> Result<(), HeartbeatFailure> {
match self {
DeploymentConfig::FailoverPostgreSQL(cfg) => {
info!("Checking PG Cluster: {}", cfg.cnpg_cluster_name);
// TODO: Implement actual PG check / NATS write here
Ok(())
}
}
}
/// Callback: Transitioned from Unhealthy -> Healthy
pub async fn on_active(&self) {
info!("Service is now ACTIVE (Healthy)");
// e.g., Remove fencing lock
}
/// Callback: Transitioned from Healthy -> Unhealthy
pub async fn on_failover(&self) {
info!("Service is now FAILED (Unhealthy)");
// e.g., Initiate self-fencing, stop accepting traffic
}
}

View File

@@ -0,0 +1,35 @@
use harmony_types::id::Id;
use serde::{Deserialize, Serialize};
use crate::store::KvMetadata;
/// Agent-provided heartbeat information (no timestamps - those come from the store)
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct AgentInfo {
pub agent_id: Id,
pub cluster_id: Id,
pub status: String,
}
/// Complete heartbeat with both agent data and store metadata
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct AgentHeartbeat {
pub agent_info: AgentInfo,
pub metadata: Option<KvMetadata>,
}
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
pub struct ClusterStateData {
pub cluster_info: ClusterState,
pub metadata: Option<KvMetadata>,
}
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
pub struct ClusterState {
pub cluster_id: Id,
pub current_primary: Option<Id>,
pub desired_primary: Id,
}
#[derive(Debug)]
pub struct HeartbeatFailure {}

View File

@@ -0,0 +1,507 @@
use std::time::{SystemTime, UNIX_EPOCH};
use std::{str::FromStr, sync::Arc, time::Duration};
use harmony_types::id::Id;
use log::{debug, error, info, trace, warn};
use tokio::sync::RwLock;
use tokio::time::{Instant, sleep};
use crate::agent::heartbeat::ClusterState;
use crate::store::{KvMetadata, KvStore, KvStoreError};
use crate::workflow::HeartbeatWorkflow;
use crate::workflow::primary::PrimaryWorkflow;
use crate::workflow::replica::ReplicaWorkflow;
// Submodules
mod config;
pub mod heartbeat;
mod role;
// Re-exports for backwards compatibility
pub use config::{AgentConfig, DeploymentConfig, FailoverCNPGConfig};
pub use heartbeat::{AgentHeartbeat, AgentInfo, ClusterStateData, HeartbeatFailure};
pub use role::AgentRole;
pub async fn launch_agent<S>(
role: AgentRole,
health_kv: Arc<S>,
cluster_kv: Arc<S>,
heartbeat_interval: Duration,
failover_timeout: Duration,
) -> Result<(), Box<dyn std::error::Error>>
where
S: KvStore + Send + Sync + 'static,
{
// Cheap-ass fix: when we boot two agents at the same time and the store does not exist,
// delay one so they don't crash because of the race
match role {
AgentRole::Primary => {}
AgentRole::Replica => {
sleep(Duration::from_millis(100)).await;
}
}
let my_agent_name = format!("agent-{}", role);
let my_agent_id = Id::from_str(&my_agent_name).unwrap();
let config = AgentConfig {
role,
success_threshold: 2,
failure_threshold: 2,
heartbeat_interval,
failover_timeout,
deployment_config_unstable: DeploymentConfig::FailoverPostgreSQL(FailoverCNPGConfig {
cnpg_cluster_name: String::from("cnpg_cluster_name"),
}),
nats_url: String::new(),
nats_creds_path: None,
agent_id: my_agent_id,
cluster_id: "cluster_test_id".into(),
desired_primary_id: "primary_id".into(),
};
log::info!("Harmony Agent Initialized");
log::info!("Initializing Harmony Agent Id : {}", config.agent_id);
log::info!("Full config : {:?}", config);
// TODO load store based on config, default to nats
// probably a good use case for a factory pattern
let mut agent = HarmonyAgent::new(config, health_kv, cluster_kv);
agent.reconcile_startup().await?;
// Run the heartbeat loop
agent.run_heartbeat_loop().await;
Ok(())
}
pub struct HarmonyAgent<S: KvStore> {
pub config: AgentConfig,
workflow: Box<dyn HeartbeatWorkflow>,
health_kv: Arc<S>,
cluster_kv: Arc<S>,
/// Last successful heartbeat, used to track sequence number for next write
/// This avoids doing a GET before every SET, reducing network round-trips
last_heartbeat: Arc<RwLock<Option<AgentHeartbeat>>>,
/// Local copy of cluster state, updated via subscription
/// This allows workflows to make decisions without querying NATS each time
cluster_state: Arc<RwLock<Option<ClusterStateData>>>,
}
impl<S: KvStore + Send + Sync + 'static> HarmonyAgent<S> {
pub fn new(config: AgentConfig, health_kv: Arc<S>, cluster_kv: Arc<S>) -> Self {
let workflow: Box<dyn HeartbeatWorkflow> = match config.role {
AgentRole::Primary => {
info!("Initializing agent as PRIMARY");
Box::new(PrimaryWorkflow::new(
config.success_threshold,
config.failure_threshold,
config.deployment_config_unstable.clone(),
))
}
AgentRole::Replica => {
info!("Initializing agent as REPLICA");
Box::new(ReplicaWorkflow::new(
config.success_threshold,
config.failure_threshold,
config.cluster_id.clone(),
config.desired_primary_id.clone(),
config.agent_id.clone(),
config.failover_timeout,
))
}
};
Self {
config,
workflow,
health_kv,
cluster_kv,
last_heartbeat: Arc::new(RwLock::new(None)),
cluster_state: Arc::new(RwLock::new(None)),
}
}
/// Generic helper to fetch and deserialize data from KV store
/// Returns Ok(Some(data)) if key exists and deserializes successfully
/// Returns Ok(None) if key doesn't exist
/// Returns Err if deserialization fails or other errors occur
async fn fetch_from_store<D>(
&self,
store: &Arc<S>,
key: &str,
) -> Result<Option<(D, KvMetadata)>, KvStoreError>
where
D: serde::de::DeserializeOwned,
{
debug!("Fetching data from key: {}", key);
let result = store.get(key).await;
debug!("Got result from store: {:#?}", result);
match result {
Ok(kv_result) => {
if let Some(value) = kv_result.value {
match serde_json::from_value::<D>(value.clone()) {
Ok(data) => Ok(Some((data, kv_result.metadata))),
Err(e) => {
log::warn!("Failed to deserialize data from key {}: {}", key, e);
Err(KvStoreError::DeserializationFailed {
deserialization_error: format!(
"Key exists but deserialization failed for {key}: {e}"
),
value: value.to_string(),
})
}
}
} else {
Err(KvStoreError::Unknown(format!(
"Key exists but value is empty for {key}, this should not happen"
)))
}
}
Err(KvStoreError::KeyNotAvailable(_)) => {
debug!("Key {} not found in store", key);
Ok(None)
}
Err(e) => {
log::warn!("Failed to fetch data from key {}: {}", key, e);
Err(e)
}
}
}
/// Reconcile startup state by fetching cluster state and heartbeat from the store
/// This allows the workflow to determine if it should resume as Primary/Replica
/// based on the persisted cluster state
pub async fn reconcile_startup(&mut self) -> Result<(), KvStoreError> {
let cluster_key = format!("cluster.{}", self.config.cluster_id);
debug!(
"Fetching cluster state for startup reconciliation from key: {}",
cluster_key
);
let cluster_state_option = match self
.fetch_from_store::<ClusterState>(&self.cluster_kv, &cluster_key)
.await?
{
Some((data, metadata)) => Some(ClusterStateData {
cluster_info: data,
metadata: Some(metadata),
}),
None => {
debug!(
"Cluster state key not found, this is a fresh cluster, initializing cluster state"
);
Some(self.store_cluster_state(None).await?)
}
};
debug!("Found cluster state {cluster_state_option:#?}");
self.workflow
.on_startup(cluster_state_option.as_ref(), &self.config)
.await;
// Cache the cluster state locally
*self.cluster_state.write().await = cluster_state_option;
// Fetch last heartbeat if it exists to avoid sequence conflicts
let heartbeat_key = format!("heartbeat.{}", self.config.agent_id);
debug!("Fetching last heartbeat from key: {}", heartbeat_key);
let last_heartbeat_option = self.health_kv.get(&heartbeat_key).await;
let last_heartbeat = match last_heartbeat_option {
Ok(kv_result) => {
let value = kv_result
.value
.expect("When key exist it should always contain data");
Some(AgentHeartbeat {
agent_info: serde_json::from_value::<AgentInfo>(value.clone()).map_err(
|e| KvStoreError::DeserializationFailed {
deserialization_error: e.to_string(),
value: value.to_string(),
},
)?,
metadata: Some(kv_result.metadata),
})
}
Err(e) => match e {
KvStoreError::KeyNotAvailable(_) => None,
_ => return Err(e),
},
};
if let Some(heartbeat) = &last_heartbeat {
debug!(
"Found existing heartbeat with sequence: {}",
heartbeat.metadata.as_ref().map(|m| m.sequence).unwrap_or(0)
);
} else {
debug!("No existing heartbeat found, starting fresh");
}
// Cache the last heartbeat for sequence tracking
*self.last_heartbeat.write().await = last_heartbeat;
Ok(())
}
async fn store_cluster_state(
&self,
cluster_data: Option<ClusterStateData>,
) -> Result<ClusterStateData, KvStoreError> {
let key = format!("cluster.{}", self.config.cluster_id);
match cluster_data {
Some(cluster_data) => {
debug!("found some cluster state {:#?}", cluster_data);
let value = serde_json::to_value(&cluster_data.cluster_info).map_err(|e| {
KvStoreError::DeserializationFailed {
deserialization_error: e.to_string(),
value: format!("{:?}", cluster_data),
}
})?;
let expected_sequence = {
let last = self.cluster_state.read().await;
last.as_ref()
.and_then(|hb| hb.metadata.as_ref())
.map(|m| m.sequence)
.unwrap_or(0)
};
debug!("expected sequence {:#?}", expected_sequence);
let new_seq = self
.cluster_kv
.set_strict(&key, value, expected_sequence)
.await?;
let cluster_kv_result = self.cluster_kv.get_revision(&key, new_seq).await?;
debug!("cluster kv {:#?}", cluster_kv_result);
let cluster_data_new = ClusterStateData {
cluster_info: cluster_data.cluster_info.clone(),
metadata: Some(cluster_kv_result.metadata),
};
*self.cluster_state.write().await = Some(cluster_data_new.clone());
Ok(cluster_data_new)
}
None => {
let cluster_info = ClusterState {
cluster_id: self.config.cluster_id.clone(),
current_primary: None,
desired_primary: self.config.desired_primary_id.clone(),
};
let value = serde_json::to_value(&cluster_info).map_err(|e| {
KvStoreError::DeserializationFailed {
deserialization_error: e.to_string(),
value: format!("{:?}", cluster_info),
}
})?;
let cluster_data = ClusterStateData {
cluster_info,
metadata: None,
};
let new_seq = self.cluster_kv.set_strict(&key, value, 0).await?;
let cluster_kv_result = self.cluster_kv.get_revision(&key, new_seq).await?;
debug!("cluster kv {:#?}", cluster_kv_result);
let cluster_data_new = ClusterStateData {
cluster_info: cluster_data.cluster_info.clone(),
metadata: Some(cluster_kv_result.metadata),
};
*self.cluster_state.write().await = Some(cluster_data_new.clone());
Ok(cluster_data_new)
}
}
}
/// Sends agent heartbeat to the KV store
///
/// Note: We only send AgentInfo. The store will add HeartbeatMetadata (timestamp, sequence)
/// to avoid clock skew issues. This follows the ADR-017-3 principle that all timestamp
/// comparisons use the store's clock, not agent clocks.
///
/// This method uses the last successful heartbeat's sequence number to avoid an extra
/// GET call before each SET, reducing network round-trips and latency exposure.
async fn store_heartbeat(&self) -> Result<AgentHeartbeat, KvStoreError> {
let key = format!("heartbeat.{}", self.config.agent_id);
// Create agent info WITHOUT timestamp - the store will add metadata
// Use workflow state to report actual status (e.g. Primary:Fenced, Replica:Watching)
let agent_info = AgentInfo {
agent_id: self.config.agent_id.clone(),
cluster_id: self.config.cluster_id.clone(),
status: self.workflow.state_name().to_string(),
};
debug!("Storing heartbeat for agent: {}", self.config.agent_id);
let value =
serde_json::to_value(&agent_info).map_err(|e| KvStoreError::DeserializationFailed {
deserialization_error: e.to_string(),
value: format!("{:?}", agent_info),
})?;
let expected_sequence = {
let last = self.last_heartbeat.read().await;
last.as_ref()
.and_then(|hb| hb.metadata.as_ref())
.map(|m| m.sequence)
.unwrap_or(0)
};
trace!("Writing new heartbeat {key} (#{expected_sequence}), value: {value:?}");
let new_seq = self
.health_kv
.set_strict(&key, value, expected_sequence)
.await?;
trace!("Got new sequence {new_seq}");
let kv_result = self.health_kv.get_revision(&key, new_seq).await?;
debug!("Heartbeat stored succsssfully with sequence: {}", new_seq);
// Construct complete heartbeat with metadata from store
let heartbeat = AgentHeartbeat {
agent_info,
metadata: Some(kv_result.metadata),
};
// Cache this successful heartbeat for next iteration
*self.last_heartbeat.write().await = Some(heartbeat.clone());
Ok(heartbeat)
}
pub async fn run_heartbeat_loop(&mut self) {
let mut next_heartbeat_start;
loop {
let this_heartbeat_start = Instant::now();
next_heartbeat_start = this_heartbeat_start + self.config.heartbeat_interval;
// Perform the check via the config/strategy with a timeout
//
// FIXME There is too much stuff happening inside the timeout. There are some things like a
// promotion, that we don't want to cancel within a single heartbeat interval timeout
// I think that the timeout should only apply to the store_heartbeat().await call.
// Logic happening after should not be affected in the exact same manner. There can be
// other timeouts or other stuff to consider here.
// However, the system does rely on heartbeats happening regularly, so we do not want
// to delay the next heartbeat either. This is tricky.
// An idea right now is to keep the heartbeat running but, when a processing event
// occurs, set a flag on the local agent that there is a process running (promotion,
// demotion, etc) and take no other decision until this process is done. There is
// one exception we can think of right now:
// - a healthy primary starts running a process such as "calling mom"
// - the primary keeps sending its heartbeat to prove to the rest of the cluster that
// it is still healthy
// - then the primary heartbeat fails up to failure_threshold
// - at this moment the "calling mom" process must not prevent the primary from fencing itself. Otherwise the replica that promotes itself when it realises that the primary is dead will cause a split brain.
// - Another solution would be to register the running process "calling mom" in the primary
// heartbeat store, and prevent the replica from promoting when there is a running
// task on the primary.
let result = tokio::time::timeout(self.config.heartbeat_interval, async {
// Store heartbeat and perform deployment-specific health check
match &self.store_heartbeat().await {
Ok(heartbeat) => {
// Heartbeat stored successfully, already cached by store_heartbeat
debug!(
"Heartbeat stored: seq={}",
heartbeat.metadata.as_ref().map(|m| m.sequence).unwrap_or(0)
);
}
Err(KvStoreError::WrongLastRevision) => {
todo!("fetch and update correct last sequence number")
// CAS failure could indicate:
// 1. Network latency: our previous timeout heartbeat actually succeeded
// 2. Agent ID conflict: another agent with same ID exists
// 3. Clock/bucket corruption (unlikely)
// log::warn!(
// "CAS mismatch for agent {}: expected sequence {}, got {}. Possible causes: network latency, agent ID conflict, or clock issue. Updating local sequence to {}",
// self.config.agent_id, expected, current, current
// );
// // Update cached heartbeat sequence to prevent repeated failures
// if let Some(hb) = self.last_heartbeat.write().await.as_mut() {
// if let Some(metadata) = hb.metadata.as_mut() {
// metadata.sequence = *current;
// }
// }
}
Err(e) => {
// Actual storage failure - treat as heartbeat failure
log::error!("Heartbeat storage error: {}", e);
return Err(HeartbeatFailure {});
}
}
self.config
.deployment_config_unstable
.perform_heartbeat()
.await?;
// TODO: Pass the heartbeat with metadata to the workflow for staleness checks
// The workflow needs access to metadata.timestamp for failover timeout calculations
Ok::<(), HeartbeatFailure>(())
})
.await;
// Update Counters & Handle State Transitions
// Timeout is also treated as a failure
let heartbeat_result = match result {
Ok(inner_result) => inner_result,
Err(_) => Err(HeartbeatFailure {}),
};
trace!("Got heartbeat_result : {heartbeat_result:?}");
match heartbeat_result {
Ok(_) => {
let new_state = self
.workflow
.handle_heartbeat_success(
self.cluster_state.read().await.as_ref(),
&self.config,
)
.await;
if let Some(new_state) = new_state {
warn!("Got new cluster state : {new_state:#?}");
self.store_cluster_state(Some(new_state))
.await
.expect("cluster state could not be stored");
}
}
Err(_) => {
self.workflow
.handle_heartbeat_failure(self.cluster_state.read().await.as_ref())
.await;
}
}
info!(
"Heartbeat : success={heartbeat_emoji} state={state}, successes={consecutive_successes}/{success_threshold}, fails={consecutive_failures}/{failure_threshold} took={heartbeat_duration}ms",
success_threshold = self.config.success_threshold,
failure_threshold = self.config.failure_threshold,
state = self.workflow.state_name(),
consecutive_successes = self.workflow.consecutive_successes(),
consecutive_failures = self.workflow.consecutive_failures(),
heartbeat_emoji = if heartbeat_result.is_ok() {
""
} else {
""
},
heartbeat_duration = (Instant::now() - this_heartbeat_start).as_millis(),
);
debug!(
"Sleeping for {} ms before next heartbeat",
(next_heartbeat_start - Instant::now()).as_millis()
);
tokio::time::sleep_until(next_heartbeat_start).await;
}
}
}

View File

@@ -0,0 +1,17 @@
use std::fmt;
/// The role of this agent instance
#[derive(Debug, Clone, PartialEq)]
pub enum AgentRole {
Primary,
Replica,
}
impl fmt::Display for AgentRole {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
AgentRole::Primary => write!(f, "primary"),
AgentRole::Replica => write!(f, "replica"),
}
}
}

View File

@@ -0,0 +1,90 @@
use harmony_types::id::Id;
use log::debug;
use std::env;
use std::path::Path;
use std::time::Duration;
/// Configuration for the Harmony Agent
#[derive(Debug, Clone)]
pub struct AgentConfig {
pub nats_url: String,
pub nats_creds_path: Option<String>,
pub my_cluster_id: Id,
pub desired_primary: Id,
pub heartbeat_interval: Duration,
}
pub const NATS_URL: &str = "NATS_URL";
pub const DESIRED_PRIMARY: &str = "DESIRED_PRIMARY";
pub const MY_CLUSTER_ID: &str = "MY_CLUSTER_ID";
pub const NATS_CREDS_PATH: &str = "NATS_CREDS_PATH";
impl AgentConfig {
pub fn load_from_env() -> Result<Self, String> {
let nats_url = env::var(NATS_URL).unwrap_or_else(|_| "nats://localhost:4222".to_string());
// Validate NATS URL is not empty
if nats_url.is_empty() {
return Err(format!("{NATS_URL} cannot be empty"));
}
// Validate NATS URL format
if !nats_url.starts_with("nats://") && !nats_url.starts_with("tls://") {
return Err(format!(
"Invalid NATS URL format: {}. Must start with 'nats://' or 'tls://'",
nats_url
));
}
let nats_creds_path = env::var(NATS_CREDS_PATH)
.ok()
.filter(|creds_path| !creds_path.is_empty());
// Validate NATS creds path if provided
if let Some(creds_path) = &nats_creds_path {
debug!("Validating nats creds path from env var {NATS_CREDS_PATH} : {nats_creds_path:?}");
let path = Path::new(creds_path);
if !path.exists() {
return Err(format!(
"NATS credentials file does not exist: {}",
creds_path
));
}
if !path.is_file() {
return Err(format!(
"NATS credentials path is not a file: {}",
creds_path
));
}
// Check if file is readable by attempting to read metadata
if std::fs::metadata(path).is_err() {
return Err(format!(
"NATS credentials file is not readable: {}",
creds_path
));
}
}
let my_cluster_id_str = env::var(MY_CLUSTER_ID)
.map_err(|_| "Environment variable {MY_CLUSTER_ID} is required".to_string())?;
if my_cluster_id_str.is_empty() {
return Err(format!("{MY_CLUSTER_ID} cannot be empty"));
}
let desired_primary_str = env::var(DESIRED_PRIMARY)
.map_err(|_| "Environment variable {DESIRED_PRIMARY} is required".to_string())?;
if desired_primary_str.is_empty() {
return Err(format!("{DESIRED_PRIMARY} cannot be empty"));
}
Ok(Self {
nats_url,
nats_creds_path,
my_cluster_id: my_cluster_id_str.into(),
desired_primary: desired_primary_str.into(),
heartbeat_interval: Duration::from_millis(1000),
})
}
}

82
harmony_agent/src/main.rs Normal file
View File

@@ -0,0 +1,82 @@
use std::{sync::Arc, time::Duration};
use crate::{
agent::AgentRole,
store::{ChaosKvStore, InMemoryKvStore, NatsKvStore},
};
// mod agent_loop;
mod agent;
pub mod store;
mod workflow;
#[tokio::main]
async fn main() {
env_logger::init();
let heartbeat_interval = Duration::from_millis(2000);
let failover_timeout = Duration::from_secs(10);
// let (health_kv, cluster_kv) = get_chaos_store(&heartbeat_interval, &failover_timeout);
let nats_store = get_local_nats_store().await;
let health_kv = nats_store.clone();
let cluster_kv = nats_store.clone();
let _ = tokio::join!(
agent::launch_agent(
AgentRole::Primary,
health_kv.clone(),
cluster_kv.clone(),
heartbeat_interval,
failover_timeout
),
agent::launch_agent(
AgentRole::Replica,
health_kv,
cluster_kv,
heartbeat_interval,
failover_timeout
),
);
}
fn get_chaos_store(
heartbeat_interval: &Duration,
failover_timeout: &Duration,
) -> (
Arc<ChaosKvStore<InMemoryKvStore>>,
Arc<ChaosKvStore<InMemoryKvStore>>,
) {
let health_kv = Arc::new(ChaosKvStore::new(
InMemoryKvStore::new(),
10,
10,
heartbeat_interval.as_millis().try_into().unwrap(),
));
let cluster_kv = Arc::new(ChaosKvStore::new(
InMemoryKvStore::new(),
5,
5,
failover_timeout.as_millis().try_into().unwrap(),
));
(health_kv, cluster_kv)
}
async fn get_local_nats_store() -> Arc<NatsKvStore> {
let client = async_nats::connect("localhost").await.unwrap();
let jetstream = async_nats::jetstream::new(client);
let kv = jetstream
.create_key_value(async_nats::jetstream::kv::Config {
bucket: "kv".to_string(),
history: 10,
..Default::default()
})
.await
.unwrap();
let status = kv.status().await.unwrap();
println!("status: {:?}", status);
Arc::new(NatsKvStore::new(kv))
}

View File

@@ -0,0 +1,142 @@
use async_trait::async_trait;
use log::{debug, trace, warn};
use serde_json::Value;
use std::sync::Arc;
use tokio::time::Duration;
use crate::store::SubscriptionCallback;
use super::{KvStore, KvStoreError};
/// A chaos testing KV store that randomly times out or fails
/// Wraps another KvStore implementation and adds random failures
#[derive(Clone)]
pub struct ChaosKvStore<T: KvStore> {
inner: Arc<T>,
timeout_probability_percent: u32,
failure_probability_percent: u32,
max_delay_ms: u64,
}
impl<T: KvStore> ChaosKvStore<T> {
pub fn new(
inner: T,
timeout_probability_percent: u32,
failure_probability_percent: u32,
max_delay_ms: u64,
) -> Self {
Self {
inner: Arc::new(inner),
timeout_probability_percent,
failure_probability_percent,
max_delay_ms,
}
}
async fn maybe_chaos(&self) -> Result<(), KvStoreError> {
trace!("Calculating chaos");
// Random delay
let delay = getrandom::u64().unwrap() % self.max_delay_ms;
let delay = Duration::from_millis(delay);
trace!("Sleeping until chaos maybe happens {delay:?}");
tokio::time::sleep(delay).await;
// Random failure
let failure_random = getrandom::u32().unwrap() % 100;
if failure_random < self.failure_probability_percent {
warn!(
"Chaos causes an error : {failure_random} < {}",
self.failure_probability_percent
);
return Err(KvStoreError::Unknown(format!(
"Randomly failed thanks to chaos store with {}% chances, got {}",
self.failure_probability_percent, failure_random
)));
}
// Random timeout (simulated as a very long delay)
let timeout_random = getrandom::u32().unwrap() % 100;
if timeout_random < self.timeout_probability_percent {
warn!(
"Chaos caused a timeout : {timeout_random} < {}",
self.timeout_probability_percent
);
tokio::time::sleep(Duration::from_secs(189754678456784560)).await;
}
Ok(())
}
}
#[async_trait]
impl<T: KvStore + Send + Sync> KvStore for ChaosKvStore<T> {
async fn get(&self, key: &str) -> Result<super::KvResult, KvStoreError> {
self.maybe_chaos().await?;
self.inner.get(key).await
}
async fn get_revision(
&self,
key: &str,
expected_seq: u64,
) -> Result<super::KvResult, KvStoreError> {
self.maybe_chaos().await?;
self.inner.get_revision(key, expected_seq).await
}
async fn set_strict(
&self,
key: &str,
value: Value,
expected_sequence: u64,
) -> Result<u64, KvStoreError> {
self.maybe_chaos().await?;
self.inner.set_strict(key, value, expected_sequence).await
}
async fn subscribe(
&self,
key: &str,
callback: SubscriptionCallback,
) -> Result<(), KvStoreError> {
self.maybe_chaos().await?;
self.inner.subscribe(key, callback).await
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::store::InMemoryKvStore;
use serde_json::json;
#[tokio::test]
async fn test_chaos_store_with_no_chaos() {
let inner = InMemoryKvStore::new();
let chaos = ChaosKvStore::new(inner, 0, 0, 1);
let value = json!({"test": "value"});
let result = chaos.set_strict("key", value.clone(), 0).await.unwrap();
assert_eq!(result, 1);
let retrieved = chaos.get("key").await.unwrap();
assert_eq!(retrieved.value, Some(value));
}
#[tokio::test]
async fn test_chaos_store_with_delay() {
let inner = InMemoryKvStore::new();
let chaos = ChaosKvStore::new(inner, 0, 0, 100);
let start = tokio::time::Instant::now();
let value = json!({"test": "value"});
chaos.set_strict("key", value, 0).await.unwrap();
let elapsed = start.elapsed();
// The random delay is bounded by max_delay_ms (100 ms here), so the call
// should still complete within a reasonable time
assert!(
elapsed.as_millis() < 150,
"Should complete within reasonable time"
);
}
}

View File

@@ -0,0 +1,196 @@
use async_trait::async_trait;
use log::{debug, trace};
use serde_json::Value;
use std::collections::HashMap;
use std::sync::Arc;
use std::time::{SystemTime, UNIX_EPOCH};
use tokio::sync::RwLock;
use crate::store::SubscriptionCallback;
use super::{KvMetadata, KvResult, KvStore, KvStoreError};
/// An in-memory KV store that guarantees ordering like NATS JetStream
/// Each key maintains a full history of all writes, where the sequence number
/// is the length of the history (1-indexed)
#[derive(Clone)]
pub struct InMemoryKvStore {
data: Arc<RwLock<HashMap<String, Vec<(Value, u64)>>>>,
}
impl InMemoryKvStore {
pub fn new() -> Self {
Self {
data: Arc::new(RwLock::new(HashMap::new())),
}
}
/// Get the latest sequence number for a key (length of history)
pub async fn get_seq(&self, key: &str) -> Option<u64> {
self.data.read().await.get(key).map(|vec| vec.len() as u64)
}
/// Get the value at a specific revision for a key
pub async fn get_revision(&self, key: &str, seq: u64) -> Result<KvResult, KvStoreError> {
let data = self.data.read().await;
let entries = data
.get(key)
.ok_or_else(|| KvStoreError::KeyNotAvailable(key.to_string()))?;
// Sequence numbers are 1-indexed, so seq must be >= 1 and <= len()
if seq == 0 || seq > entries.len() as u64 {
return Err(KvStoreError::KeyNotAvailable(key.to_string()));
}
let (value, timestamp) = entries[seq as usize - 1].clone();
Ok(KvResult {
value: Some(value.clone()),
metadata: KvMetadata {
timestamp,
sequence: seq,
},
})
}
}
impl Default for InMemoryKvStore {
fn default() -> Self {
Self::new()
}
}
#[async_trait]
impl KvStore for InMemoryKvStore {
async fn get_revision(&self, key: &str, expected_seq: u64) -> Result<KvResult, KvStoreError> {
self.get_revision(key, expected_seq).await
}
async fn get(&self, key: &str) -> Result<KvResult, KvStoreError> {
let data = self.data.read().await;
let entries = data
.get(key)
.ok_or_else(|| KvStoreError::KeyNotAvailable(key.to_string()))?;
let (value, timestamp) = entries.last().unwrap();
Ok(KvResult {
value: Some(value.clone()),
metadata: KvMetadata {
timestamp: *timestamp,
sequence: entries.len() as u64,
},
})
}
async fn set_strict(
&self,
key: &str,
value: Value,
expected_sequence: u64,
) -> Result<u64, KvStoreError> {
// Check current sequence (length of history for this key)
let data = self.data.read().await;
// NOTE: this implementation does not match NATS semantics exactly: NATS
// advances one sequence counter per bucket, while this impl keeps a counter per key
let current_sequence = data.get(key).map(|vec| vec.len() as u64).unwrap_or(0);
drop(data);
// Verify expected sequence matches
if current_sequence != expected_sequence {
trace!("{current_sequence} != {expected_sequence}");
return Err(KvStoreError::WrongLastRevision);
}
// Get current timestamp
let timestamp = SystemTime::now()
.duration_since(UNIX_EPOCH)
.expect("Time went backwards")
.as_millis() as u64;
// Append to the history
let mut data = self.data.write().await;
data.entry(key.to_string())
.or_insert_with(Vec::new)
.push((value.clone(), timestamp));
let new_seq = data.get(key).map(|vec| vec.len() as u64).unwrap_or(0);
debug!(
"Successfully inserted {key}(rev#{new_seq}) : {value}",
value = value.to_string()
);
Ok(new_seq)
}
async fn subscribe(
&self,
key: &str,
callback: SubscriptionCallback,
) -> Result<(), KvStoreError> {
// For now, subscribe just returns the current value
// In a real implementation, this would return a stream of updates
let _ = self.get(key).await;
todo!() // register callback and call it when key is set ?
}
}
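// Sketch of one possible subscribe implementation (not built here): keep a
// `HashMap<String, Vec<SubscriptionCallback>>` behind the same RwLock, register
// the callback in `subscribe`, and have `set_strict` invoke every callback
// registered for the key after a successful append, passing the new value and
// its KvMetadata. A watch-channel variant would map better onto the NATS watch() API.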
#[cfg(test)]
mod tests {
use super::*;
use serde_json::json;
#[tokio::test]
async fn test_memory_store_basic() {
let store = InMemoryKvStore::new();
// Set a value
let value = json!({"status": "healthy"});
let result = store
.set_strict("test_key", value.clone(), 0)
.await
.unwrap();
assert_eq!(result, 1);
// Get the value
let retrieved = store.get("test_key").await.unwrap();
assert_eq!(retrieved.value, Some(value));
assert_eq!(retrieved.metadata.sequence, 1);
}
#[tokio::test]
async fn test_memory_store_sequence_numbers() {
let store = InMemoryKvStore::new();
let seq1 = store.set_strict("key1", json!("value1"), 0).await.unwrap();
let seq2 = store.set_strict("key1", json!("value2"), 1).await.unwrap();
assert!(seq2 > seq1, "Sequence numbers should increment");
}
#[tokio::test]
async fn test_memory_store_key_not_found() {
let store = InMemoryKvStore::new();
let result = store.get("nonexistent").await;
assert!(matches!(result, Err(KvStoreError::KeyNotAvailable(_))));
}
#[tokio::test]
async fn test_memory_store_strict_ordering() {
let store = InMemoryKvStore::new();
// First write with sequence 0
let result1 = store.set_strict("key", json!("value1"), 0).await.unwrap();
assert_eq!(result1, 1);
// Second write with correct sequence
let result2 = store.set_strict("key", json!("value2"), 1).await.unwrap();
assert_eq!(result2, 2);
// Third write with wrong sequence should fail
let result3 = store.set_strict("key", json!("value3"), 1).await;
assert!(matches!(result3, Err(KvStoreError::WrongLastRevision)));
}
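    // Illustrative sketch (not part of the original PR): with an atomic
    // compare-and-set, two racing writers that both observed sequence 0
    // must not both succeed; exactly one should win.
    #[tokio::test]
    async fn test_memory_store_concurrent_cas_single_winner() {
        let store = std::sync::Arc::new(InMemoryKvStore::new());
        let (a, b) = (store.clone(), store.clone());
        let r1 = tokio::spawn(async move { a.set_strict("race", json!("a"), 0).await });
        let r2 = tokio::spawn(async move { b.set_strict("race", json!("b"), 0).await });
        let results = [r1.await.unwrap(), r2.await.unwrap()];
        let winners = results.iter().filter(|r| r.is_ok()).count();
        assert_eq!(winners, 1, "exactly one CAS write should win");
    }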
}

View File

@@ -0,0 +1,120 @@
use async_trait::async_trait;
use serde::{Deserialize, Serialize};
use serde_json::Value;
use thiserror::Error;
/// Handle for managing active subscriptions
#[derive(Debug, Clone)]
pub struct SubscriptionHandle {
id: usize,
_phantom: std::marker::PhantomData<()>,
}
/// Metadata returned by the KV store for all operations
/// Contains timing and ordering information set by the store
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
pub struct KvMetadata {
/// Timestamp set by the store (milliseconds since UNIX epoch)
pub timestamp: u64,
/// Sequence number for strict ordering guarantees
pub sequence: u64,
}
/// Result returned by KV store operations
/// Contains both the value (if any) and store metadata
#[derive(Debug, Clone)]
pub struct KvResult {
/// The value from the store (None if key doesn't exist)
pub value: Option<Value>,
/// Store-provided metadata (timestamp, sequence)
pub metadata: KvMetadata,
}
/// Callback type for subscription updates
/// Callback receives: key, new value (None if deleted), and metadata
pub type SubscriptionCallback = Box<dyn Fn(String, Option<Value>, KvMetadata) + Send + Sync>;
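// A minimal illustration (not part of the original PR) of constructing a
// SubscriptionCallback; any `Fn(String, Option<Value>, KvMetadata)` closure works:
//
// let cb: SubscriptionCallback =
//     Box::new(|key, value, meta| println!("{key} -> {value:?} (seq {})", meta.sequence));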
#[derive(Error, Debug)]
pub enum KvStoreError {
#[error("data store disconnected")]
Disconnect(#[from] std::io::Error),
#[error("invalid key")]
InvalidKey,
#[error("operation timed out")]
Timeout,
#[error("the data for key `{0}` is not available")]
KeyNotAvailable(String),
#[error("Failed to deserialize value to json. Error {0} , value: {1}", .deserialization_error, .value)]
DeserializationFailed {
deserialization_error: String,
value: String,
},
#[error("Strict ordering violation, wrong last sequence number")]
WrongLastRevision,
#[error("unknown data store error {0}")]
Unknown(String),
}
#[async_trait]
pub trait KvStore {
/// Get a value from the store
///
/// # Returns
/// - `Ok(KvResult)`: Contains the value and metadata (timestamp, sequence)
/// - `Err(KeyNotAvailable)`: If the key doesn't exist
async fn get(&self, key: &str) -> Result<KvResult, KvStoreError>;
    /// Get the value stored at a specific revision (sequence number) for a key
    async fn get_revision(&self, key: &str, expected_seq: u64) -> Result<KvResult, KvStoreError>;
/// Strict set operation with compare-and-set semantics
///
/// Sets the value only if the current sequence number matches `expected_sequence`.
/// This provides strict ordering guarantees needed for the failover algorithm.
///
/// # Parameters
/// - `key`: The key to set
/// - `value`: The value to store
/// - `expected_sequence`: The sequence number we expect the key to currently have.
/// Use 0 for the first write to a new key.
///
/// # Returns
/// - `Ok(u64)`: Returns the new sequence number
/// - `Err(KvStoreError)`: If another write happened (current != expected)
///
/// # Example Use Case
/// For NATS JetStream, this maps to the conditional update operation that ensures
/// only one agent can successfully promote to primary.
async fn set_strict(
&self,
key: &str,
value: Value,
expected_sequence: u64,
) -> Result<u64, KvStoreError>;
/// Subscribe to updates for a key
///
/// # Parameters
/// - `key`: The key to subscribe to
/// - `callback`: Function to call on each update with key, value, and metadata
///
/// # Returns
/// - `Ok(())`: Subscription established successfully
/// - `Err(KvStoreError)`: Subscription failed
///
/// Note: For JetStream, this should use watch() API. Updates will invoke the callback
/// asynchronously in the background.
async fn subscribe(
&self,
key: &str,
        callback: SubscriptionCallback, // TODO: return an iterator instead of taking a callback
) -> Result<(), KvStoreError>;
}
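// Illustrative sketch (not part of the original PR): how a caller can use
// `set_strict` as a compare-and-set primitive. `try_claim` and the "primary"
// key are hypothetical names for illustration only: read the current
// revision, then write conditionally; `WrongLastRevision` means another
// writer won the race.
#[allow(dead_code)]
async fn try_claim(store: &dyn KvStore, agent_id: &str) -> Result<bool, KvStoreError> {
    let current = match store.get("primary").await {
        Ok(result) => result.metadata.sequence,
        Err(KvStoreError::KeyNotAvailable(_)) => 0, // first write to a new key uses 0
        Err(e) => return Err(e),
    };
    match store
        .set_strict("primary", serde_json::json!(agent_id), current)
        .await
    {
        Ok(_) => Ok(true),                                 // we won the CAS
        Err(KvStoreError::WrongLastRevision) => Ok(false), // someone else won
        Err(e) => Err(e),
    }
}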
mod chaos;
mod memory;
mod nats;
pub use chaos::ChaosKvStore;
pub use memory::InMemoryKvStore;
pub use nats::NatsKvStore;

View File

@@ -0,0 +1,179 @@
use async_nats::jetstream::kv::{Store, UpdateError};
use async_trait::async_trait;
use log::{debug, error, trace};
use serde_json::Value;
use crate::store::SubscriptionCallback;
use super::{KvMetadata, KvResult, KvStore, KvStoreError};
/// NATS JetStream-backed KV store
pub struct NatsKvStore {
store: Store,
}
impl NatsKvStore {
pub fn new(store: Store) -> Self {
Self { store }
}
pub async fn create(
client: async_nats::Client,
bucket_name: &str,
history_size: i64,
) -> Result<Self, Box<dyn std::error::Error>> {
let jetstream = async_nats::jetstream::new(client);
debug!("Creating NATS KV bucket: {}", bucket_name);
let store = jetstream
.create_key_value(async_nats::jetstream::kv::Config {
bucket: bucket_name.to_string(),
history: history_size,
..Default::default()
})
.await
.map_err(|e| {
error!(
"Failed to initialize NATS KV bucket '{}': {}",
bucket_name, e
);
e
})?;
Ok(Self::new(store))
}
}
#[async_trait]
impl KvStore for NatsKvStore {
async fn get_revision(&self, key: &str, expected_seq: u64) -> Result<KvResult, KvStoreError> {
let entry = self
.store
.entry_for_revision(key, expected_seq)
.await
.map_err(|e| {
error!("NATS get failed for key '{}': {}", key, e);
KvStoreError::Disconnect(std::io::Error::new(
std::io::ErrorKind::Other,
e.to_string(),
))
})?;
        let Some(entry) = entry else {
            return Err(KvStoreError::KeyNotAvailable(key.to_string()));
        };
let value: Value = serde_json::from_slice(&entry.value).map_err(|e| {
KvStoreError::DeserializationFailed {
deserialization_error: e.to_string(),
value: String::from_utf8_lossy(&entry.value).to_string(),
}
})?;
// Extract metadata from NATS entry
// Using unix_timestamp_nanos / 1_000_000 to get milliseconds from OffsetDateTime
let timestamp = (entry.created.unix_timestamp_nanos() / 1_000_000) as u64;
let metadata = KvMetadata {
timestamp,
sequence: entry.revision,
};
Ok(KvResult {
value: Some(value),
metadata,
})
}
async fn get(&self, key: &str) -> Result<KvResult, KvStoreError> {
let entry = self.store.entry(key).await.map_err(|e| {
error!("NATS get failed for key '{}': {}", key, e);
KvStoreError::Disconnect(std::io::Error::new(
std::io::ErrorKind::Other,
e.to_string(),
))
})?;
        let Some(entry) = entry else {
            return Err(KvStoreError::KeyNotAvailable(key.to_string()));
        };
let value: Value = serde_json::from_slice(&entry.value).map_err(|e| {
KvStoreError::DeserializationFailed {
deserialization_error: e.to_string(),
value: String::from_utf8_lossy(&entry.value).to_string(),
}
})?;
// Extract metadata from NATS entry
// Using unix_timestamp_nanos / 1_000_000 to get milliseconds from OffsetDateTime
let timestamp = (entry.created.unix_timestamp_nanos() / 1_000_000) as u64;
let metadata = KvMetadata {
timestamp,
sequence: entry.revision,
};
Ok(KvResult {
value: Some(value),
metadata,
})
}
async fn set_strict(
&self,
key: &str,
value: Value,
expected_sequence: u64,
) -> Result<u64, KvStoreError> {
trace!(
"Nats set strict {key} (#{expected_sequence}) : {}",
value.to_string()
);
let bytes =
serde_json::to_vec(&value).map_err(|e| KvStoreError::DeserializationFailed {
deserialization_error: e.to_string(),
value: value.to_string(),
})?;
// Use update() for CAS semantics (Compare-And-Set)
// This ensures we only write if the revision matches expected_sequence
let revision = self
.store
            .update(key, bytes.into(), expected_sequence)
.await
.map_err(|e| {
// FIXME this is ugly, we should have a clean KvStoreError containing
// proper information from nats instead
error!("NATS update failed for key '{}': {}", key, e);
e
})?;
Ok(revision)
}
    async fn subscribe(
        &self,
        _key: &str,
        _callback: SubscriptionCallback, // TODO: return an iterator instead of taking a callback
    ) -> Result<(), KvStoreError> {
        todo!("implement with the JetStream watch() API")
    }
}
impl From<UpdateError> for KvStoreError {
fn from(value: UpdateError) -> Self {
match value.kind() {
async_nats::jetstream::kv::UpdateErrorKind::InvalidKey => KvStoreError::InvalidKey,
async_nats::jetstream::kv::UpdateErrorKind::TimedOut => KvStoreError::Timeout,
async_nats::jetstream::kv::UpdateErrorKind::WrongLastRevision => {
KvStoreError::WrongLastRevision
}
async_nats::jetstream::kv::UpdateErrorKind::Other => KvStoreError::Disconnect(
std::io::Error::new(std::io::ErrorKind::Other, "NATS update error"),
),
}
}
}

View File

@@ -0,0 +1,39 @@
use crate::agent::AgentConfig;
use async_trait::async_trait;
pub mod primary;
pub mod replica;
/// Trait that defines how a workflow (Primary or Replica) handles heartbeat events
#[async_trait]
pub trait HeartbeatWorkflow: Send + Sync {
/// Handle a successful heartbeat
async fn handle_heartbeat_success(
&mut self,
cluster_state: Option<&crate::agent::ClusterStateData>,
agent_config: &AgentConfig,
) -> Option<crate::agent::ClusterStateData>;
/// Handle a failed heartbeat
async fn handle_heartbeat_failure(
&mut self,
cluster_state: Option<&crate::agent::ClusterStateData>,
);
async fn on_startup(
&self,
        cluster_state: Option<&crate::agent::ClusterStateData>,
agent_config: &AgentConfig,
);
/// Get the current state name for logging (also used for heartbeat status)
fn state_name(&self) -> &'static str;
/// Get current consecutive successes
fn consecutive_successes(&self) -> usize;
/// Get current consecutive failures
fn consecutive_failures(&self) -> usize;
}
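// Illustrative sketch (not part of the original PR): how an agent loop might
// drive a HeartbeatWorkflow. `send_heartbeat` is hypothetical; the real
// wiring lives in the agent module. `heartbeat_interval` comes from AgentConfig.
//
// async fn run_workflow<W: HeartbeatWorkflow>(mut workflow: W, config: AgentConfig) {
//     workflow.on_startup(None, &config).await;
//     loop {
//         if send_heartbeat().await {
//             if let Some(new_state) = workflow.handle_heartbeat_success(None, &config).await {
//                 // persist new_state with a strict (CAS) KV write
//             }
//         } else {
//             workflow.handle_heartbeat_failure(None).await;
//         }
//         log::trace!("workflow state: {}", workflow.state_name());
//         tokio::time::sleep(config.heartbeat_interval).await;
//     }
// }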

View File

@@ -0,0 +1,330 @@
use async_trait::async_trait;
use log::{debug, info, trace, warn};
use crate::{
agent::{AgentConfig, DeploymentConfig},
workflow::HeartbeatWorkflow,
};
#[derive(Debug, Clone, PartialEq)]
pub enum PrimaryState {
Initializing,
Healthy,
Failed,
Fenced,
Yielding,
}
impl PrimaryState {
pub fn name(&self) -> &'static str {
match self {
PrimaryState::Initializing => "Primary:Initializing",
PrimaryState::Healthy => "Primary:Healthy",
PrimaryState::Failed => "Primary:Failed",
PrimaryState::Fenced => "Primary:Fenced",
PrimaryState::Yielding => "Primary:Yielding",
}
}
}
pub struct PrimaryWorkflow {
state: PrimaryState,
consecutive_successes: usize,
consecutive_failures: usize,
// TODO these thresholds should not be copied into the workflow struct. They are configuration
// level and should always be read from the context passed to the workflow functions
success_threshold: usize,
failure_threshold: usize,
    // TODO: not sure whether this should be known by the workflow, passed in the
    // context to function calls, or handled entirely by the agent
deployment_config: DeploymentConfig,
}
impl PrimaryWorkflow {
pub fn new(
success_threshold: usize,
failure_threshold: usize,
deployment_config: DeploymentConfig,
) -> Self {
Self {
state: PrimaryState::Initializing,
consecutive_successes: 0,
consecutive_failures: 0,
success_threshold,
failure_threshold,
deployment_config,
}
}
fn transition_to(&mut self, new_state: PrimaryState) {
if self.state != new_state {
info!(
"State transition: {} -> {}",
self.state.name(),
new_state.name()
);
self.state = new_state;
}
}
}
#[async_trait]
impl HeartbeatWorkflow for PrimaryWorkflow {
async fn on_startup(
&self,
cluster_state: Option<&crate::agent::ClusterStateData>,
        _agent_config: &AgentConfig,
) {
if let Some(state) = cluster_state {
info!(
"Startup reconciliation: current primary is {:?}, desired primary is {:?}",
state.cluster_info.current_primary, state.cluster_info.desired_primary
);
// No automatic fast-tracking - agent must earn healthy status
// through successful heartbeats. This prevents duplicate agents
// or crashloop agents from incorrectly claiming primary.
} else {
debug!("No cluster state on startup, starting from Initializing");
}
}
async fn handle_heartbeat_success(
&mut self,
cluster_state: Option<&crate::agent::ClusterStateData>,
agent_config: &AgentConfig,
) -> Option<crate::agent::ClusterStateData> {
trace!(
"Handling heartbeat success, current counters success {} failures {}",
self.consecutive_successes, self.consecutive_failures
);
self.consecutive_successes += 1;
self.consecutive_failures = 0;
match self.state {
PrimaryState::Initializing => {
if self.consecutive_successes >= self.success_threshold {
self.transition_to(PrimaryState::Healthy);
// Trigger on_active callback
let config = self.deployment_config.clone();
tokio::spawn(async move {
config.on_active().await;
});
if let Some(state) = cluster_state
&& state.cluster_info.desired_primary == agent_config.desired_primary_id
{
debug!("state {:#?}", state);
let mut new_state = state.clone();
new_state.cluster_info.current_primary =
Some(agent_config.agent_id.clone());
return Some(new_state);
                    } else {
                        todo!(
                            "cluster_state should not be an Option; return an error when a primary workflow is running but this agent is not the desired primary in the cluster state"
                        );
                    }
}
None
}
PrimaryState::Failed => {
if self.consecutive_successes >= self.success_threshold {
self.transition_to(PrimaryState::Healthy);
let config = self.deployment_config.clone();
tokio::spawn(async move {
config.on_active().await;
});
}
todo!()
}
PrimaryState::Healthy => {
// Stay healthy
debug!("Primary staying healthy");
None
}
PrimaryState::Fenced => {
// Recovery from fenced state
if self.consecutive_successes >= self.success_threshold {
// TODO: Check NATS for current_primary status before recovering
info!("Recovered from fenced state, transitioning to yielding");
self.transition_to(PrimaryState::Yielding);
}
todo!()
}
PrimaryState::Yielding => {
// TODO: Check NATS to see if we can resume as primary
trace!("Yielding, waiting for demotion handshake");
todo!()
}
}
}
async fn handle_heartbeat_failure(
&mut self,
        _cluster_state: Option<&crate::agent::ClusterStateData>,
) {
self.consecutive_failures += 1;
self.consecutive_successes = 0;
match self.state {
PrimaryState::Healthy => {
if self.consecutive_failures >= self.failure_threshold {
warn!(
"Failure threshold reached ({}/{}), transitioning to Failed",
self.consecutive_failures, self.failure_threshold
);
self.transition_to(PrimaryState::Failed);
// Immediately fence
self.transition_to(PrimaryState::Fenced);
let config = self.deployment_config.clone();
tokio::spawn(async move {
config.on_failover().await;
});
}
}
PrimaryState::Initializing => {
// Stay in initializing, just accumulate failures
trace!("Heartbeat failed during initialization");
}
PrimaryState::Failed | PrimaryState::Fenced | PrimaryState::Yielding => {
// Already in a degraded state
trace!("Heartbeat failed in degraded state: {}", self.state.name());
}
}
}
fn state_name(&self) -> &'static str {
self.state.name()
}
fn consecutive_successes(&self) -> usize {
self.consecutive_successes
}
fn consecutive_failures(&self) -> usize {
self.consecutive_failures
}
}
#[cfg(test)]
mod test {
use harmony_types::id::Id;
use std::time::Duration;
use crate::agent::{AgentRole, FailoverCNPGConfig};
use pretty_assertions::assert_eq;
use super::*;
#[tokio::test]
async fn primary_does_nothing_when_on_heartbeat_success_below_threshold() {
let (mut primary, cluster_state, agent_config) = default_test_state(2, 2);
assert!(
primary
.handle_heartbeat_success(Some(&cluster_state), &agent_config)
.await
.is_none()
);
}
#[tokio::test]
async fn primary_transitions_cluster_state_when_consecutive_success_threshold_reached() {
let (mut primary, cluster_state, agent_config) = default_test_state(2, 2);
let mut expected_state = cluster_state.clone();
expected_state.cluster_info.current_primary = Some(Id::empty());
assert_eq!(
primary
.handle_heartbeat_success(Some(&cluster_state), &agent_config)
.await,
None
);
assert_eq!(
primary
.handle_heartbeat_success(Some(&cluster_state), &agent_config)
.await,
Some(expected_state)
);
}
#[tokio::test]
async fn primary_stays_healthy_below_failure_threshold() {
let (mut primary, cluster_state, agent_config) = default_test_state(1, 2);
// Reach healthy
let _ = primary
.handle_heartbeat_success(Some(&cluster_state), &agent_config)
.await;
assert_eq!(primary.state, PrimaryState::Healthy);
// One failure below threshold
primary.handle_heartbeat_failure(Some(&cluster_state)).await;
assert_eq!(primary.state, PrimaryState::Healthy);
assert_eq!(primary.consecutive_failures(), 1);
assert_eq!(primary.consecutive_successes(), 0);
}
#[tokio::test]
async fn primary_transitions_to_failed_at_failure_threshold() {
let (mut primary, cluster_state, agent_config) = default_test_state(1, 2);
// Reach healthy
let _ = primary
.handle_heartbeat_success(Some(&cluster_state), &agent_config)
.await;
assert_eq!(primary.state, PrimaryState::Healthy);
// First failure, still healthy
primary.handle_heartbeat_failure(Some(&cluster_state)).await;
assert_eq!(primary.state, PrimaryState::Healthy);
assert_eq!(primary.consecutive_failures(), 1);
        // Second failure reaches the threshold: transitions through Failed straight to Fenced
primary.handle_heartbeat_failure(Some(&cluster_state)).await;
assert_eq!(primary.state, PrimaryState::Fenced);
assert_eq!(primary.consecutive_failures(), 2);
assert_eq!(primary.consecutive_successes(), 0);
}
fn default_test_state(
success_threshold: usize,
failure_threshold: usize,
) -> (PrimaryWorkflow, crate::agent::ClusterStateData, AgentConfig) {
let cluster_state = crate::agent::ClusterStateData {
cluster_info: crate::agent::heartbeat::ClusterState {
cluster_id: Id::empty(),
current_primary: None,
desired_primary: Id::empty(),
},
metadata: None,
};
let agent_config = AgentConfig {
success_threshold,
failure_threshold,
heartbeat_interval: Duration::from_nanos(0),
failover_timeout: Duration::from_nanos(0),
deployment_config_unstable: DeploymentConfig::FailoverPostgreSQL(FailoverCNPGConfig {
cnpg_cluster_name: "test".to_string(),
}),
nats_url: String::new(),
nats_creds_path: None,
agent_id: Id::empty(),
cluster_id: Id::empty(),
desired_primary_id: Id::empty(),
role: AgentRole::Primary,
};
let primary = PrimaryWorkflow::new(
agent_config.success_threshold,
agent_config.failure_threshold,
agent_config.deployment_config_unstable.clone(),
);
(primary, cluster_state, agent_config)
}
}

View File

@@ -0,0 +1,279 @@
use async_trait::async_trait;
use harmony_types::id::Id;
use log::{debug, info, trace};
use std::time::Duration;
use tokio::sync::RwLock;
use crate::agent::{AgentConfig, AgentHeartbeat};
use crate::workflow::HeartbeatWorkflow;
#[derive(Debug, Clone)]
pub struct HeartbeatState {
pub agent_id: Id,
pub last_seq: Option<u64>,
}
impl HeartbeatState {
pub fn watch(agent_id: Id) -> Self {
Self {
agent_id,
last_seq: None,
}
}
}
#[derive(Debug, Clone)]
pub struct ClusterState {
pub cluster_id: Id,
pub current_primary: Option<Id>,
}
impl ClusterState {
pub fn watch(cluster_id: Id) -> Self {
Self {
cluster_id,
current_primary: None,
}
}
}
#[derive(Debug, Clone, PartialEq)]
pub enum ReplicaState {
Initializing,
Watching,
Promoting,
PromotionFailed,
Leader,
Demoting,
Failed,
}
impl ReplicaState {
pub fn name(&self) -> &'static str {
match self {
ReplicaState::Initializing => "Replica:Initializing",
ReplicaState::Watching => "Replica:Watching",
ReplicaState::Promoting => "Replica:Promoting",
ReplicaState::PromotionFailed => "Replica:PromotionFailed",
ReplicaState::Leader => "Replica:Leader",
ReplicaState::Demoting => "Replica:Demoting",
ReplicaState::Failed => "Replica:Failed",
}
}
}
pub struct ReplicaWorkflow {
state: ReplicaState,
heartbeat_state: HeartbeatState,
primary_state: HeartbeatState,
cluster_state: ClusterState,
consecutive_successes: usize,
consecutive_failures: usize,
success_threshold: usize,
failure_threshold: usize,
failover_timeout: Duration,
/// Our own last heartbeat (for timestamp comparison against primary)
last_my_heartbeat: Option<AgentHeartbeat>,
/// Last observed primary heartbeat (metadata only, for staleness detection)
last_primary_heartbeat: Option<RwLock<AgentHeartbeat>>,
}
impl ReplicaWorkflow {
pub fn new(
success_threshold: usize,
failure_threshold: usize,
cluster_id: Id,
primary_id: Id,
my_id: Id,
failover_timeout: Duration,
) -> Self {
Self {
state: ReplicaState::Initializing,
consecutive_successes: 0,
consecutive_failures: 0,
success_threshold,
failure_threshold,
failover_timeout,
cluster_state: ClusterState::watch(cluster_id),
primary_state: HeartbeatState::watch(primary_id),
heartbeat_state: HeartbeatState::watch(my_id),
last_my_heartbeat: None,
last_primary_heartbeat: None,
}
}
fn transition_to(&mut self, new_state: ReplicaState) {
if self.state != new_state {
info!(
"State transition: {} -> {}",
self.state.name(),
new_state.name()
);
self.state = new_state;
}
}
/// Check if the primary heartbeat is stale compared to our own
/// Per ADR-017-3: primary is stale if (replica_timestamp - primary_timestamp) > failover_timeout
async fn is_primary_stale(&mut self) -> bool {
if let Some(my_hb) = &self.last_my_heartbeat {
if let Some(my_metadata) = &my_hb.metadata {
if let Some(primary_hb_ref) = self.last_primary_heartbeat.as_ref() {
let primary_hb = primary_hb_ref.read().await;
if let Some(primary_metadata) = &primary_hb.metadata {
// Calculate time difference: replica_timestamp - primary_timestamp
let time_diff_ms = my_metadata
.timestamp
.saturating_sub(primary_metadata.timestamp);
let failover_timeout_ms = self.failover_timeout.as_millis() as u64;
trace!(
"Staleness check: my_ts={}, primary_ts={}, diff={}ms, timeout={}ms",
my_metadata.timestamp,
primary_metadata.timestamp,
time_diff_ms,
failover_timeout_ms
);
if time_diff_ms > failover_timeout_ms {
info!(
"Primary heartbeat stale ({}ms > {}ms), attempting promotion",
time_diff_ms, failover_timeout_ms
);
return true;
}
}
}
}
}
false
}
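    // Worked example of the staleness rule above (illustrative numbers):
    // with failover_timeout = 5s, my_ts = 12_000ms and primary_ts = 6_000ms
    // give diff = 6_000ms > 5_000ms, so the primary is considered stale and
    // the replica attempts promotion.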
}
#[async_trait]
impl HeartbeatWorkflow for ReplicaWorkflow {
    async fn on_startup(
        &self,
        _cluster_state: Option<&crate::agent::ClusterStateData>,
        _agent_config: &AgentConfig,
    ) {
        // TODO: decide whether the replica should do anything on startup
    }
    async fn handle_heartbeat_success(
        &mut self,
        _cluster_state: Option<&crate::agent::ClusterStateData>,
        _agent_config: &AgentConfig,
    ) -> Option<crate::agent::ClusterStateData> {
trace!(
"Handling heartbeat success, current counters success {} failures {}",
self.consecutive_successes, self.consecutive_failures
);
self.consecutive_successes += 1;
self.consecutive_failures = 0;
match self.state {
ReplicaState::Initializing => {
if self.consecutive_successes >= self.success_threshold {
self.transition_to(ReplicaState::Watching);
}
None
}
ReplicaState::Watching => {
// TODO: Check primary staleness from NATS
trace!("Replica watching primary");
                if self.is_primary_stale().await {
                    todo!("stale primary detected, launch promotion");
                }
                // TODO: perform the remaining replica watch actions:
                // - if a primary exists in the cluster (cluster_state.current_primary == expected_primary)
                //   - check the last primary heartbeat kv timestamp
                //   - compare it with our latest kv heartbeat
                //   - if the gap exceeds the failover timeout, launch promotion
                //     (we assume the primary has already fenced itself)
                // - launching promotion will change the status of the replica
                debug!("replica watch actions beyond staleness detection are not implemented yet");
None
}
ReplicaState::Promoting => {
// TODO: Complete promotion attempt
trace!("Replica promotion in progress");
                todo!(
                    "While promoting, a heartbeat success is a no-op, and a heartbeat failure only matters once failure_threshold is reached"
                );
}
ReplicaState::PromotionFailed => {
if self.consecutive_successes >= self.success_threshold {
self.transition_to(ReplicaState::Watching);
}
todo!()
}
ReplicaState::Leader => {
// TODO: Check for original primary recovery
trace!("Replica acting as leader");
todo!()
}
ReplicaState::Failed => {
if self.consecutive_successes >= self.success_threshold {
info!("Replica recovered from Failed state, transitioning to Watching");
self.transition_to(ReplicaState::Watching);
}
todo!()
}
ReplicaState::Demoting => {
// TODO: Complete demotion back to watching
trace!("Replica demotion in progress");
todo!()
}
}
}
async fn handle_heartbeat_failure(
&mut self,
        _cluster_state: Option<&crate::agent::ClusterStateData>,
) {
self.consecutive_failures += 1;
self.consecutive_successes = 0;
// TODO revisit this. I think we should handle the agent healthiness (checking
// consecutive_failures against failure_threshold) separately from handling the cluster
// state.
//
// That said, there might be funny stuff we have to do when the agent reaches the failure
// threshold, especially in promoting and demoting statuses.
match self.state {
ReplicaState::Watching | ReplicaState::Initializing => {
if self.consecutive_failures >= self.failure_threshold {
info!(
"Replica exceeded failure threshold ({}/{}), transitioning to Failed",
self.consecutive_failures, self.failure_threshold
);
self.transition_to(ReplicaState::Failed);
} else {
trace!("Replica heartbeat failed, but below threshold");
}
}
ReplicaState::Promoting
| ReplicaState::PromotionFailed
| ReplicaState::Leader
| ReplicaState::Demoting
| ReplicaState::Failed => {
trace!("Replica heartbeat failed in state: {}", self.state.name());
}
}
}
fn state_name(&self) -> &'static str {
self.state.name()
}
fn consecutive_successes(&self) -> usize {
self.consecutive_successes
}
fn consecutive_failures(&self) -> usize {
self.consecutive_failures
}
}
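// Illustrative test sketch (not part of the original PR), mirroring the
// primary workflow tests; the constructor arguments follow the definitions above.
#[cfg(test)]
mod test {
    use super::*;
    use harmony_types::id::Id;
    use std::time::Duration;
    #[tokio::test]
    async fn replica_transitions_to_failed_at_failure_threshold() {
        let mut replica = ReplicaWorkflow::new(
            1,                      // success_threshold
            2,                      // failure_threshold
            Id::empty(),            // cluster_id
            Id::empty(),            // primary_id
            Id::empty(),            // my_id
            Duration::from_secs(5), // failover_timeout
        );
        replica.handle_heartbeat_failure(None).await;
        assert_eq!(replica.consecutive_failures(), 1);
        replica.handle_heartbeat_failure(None).await;
        assert_eq!(replica.state_name(), "Replica:Failed");
    }
}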

View File

@@ -0,0 +1,12 @@
[package]
name = "harmony_execution"
edition = "2024"
version.workspace = true
readme.workspace = true
license.workspace = true
[dependencies]
thiserror.workspace = true
lazy_static.workspace = true
directories.workspace = true
log.workspace = true

View File

@@ -0,0 +1,470 @@
use std::io::{BufRead, BufReader};
use std::process::{Command, Stdio};
use std::sync::Arc;
use std::thread;
/// Captured output from a command execution
#[derive(Debug, Clone)]
pub struct CommandOutput {
/// Captured stdout content
pub stdout: String,
/// Captured stderr content
pub stderr: String,
/// Exit status of the command
pub status: CommandStatus,
}
impl CommandOutput {
/// Returns true if the command succeeded
pub fn is_success(&self) -> bool {
self.status.is_success()
}
/// Formats the complete output for display
pub fn format_output(&self) -> String {
format!(
"Stdout:\n{}\n\nStderr:\n{}",
if self.stdout.is_empty() {
"<empty>"
} else {
&self.stdout
},
if self.stderr.is_empty() {
"<empty>"
} else {
&self.stderr
}
)
}
}
/// Result status of a command execution
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CommandStatus {
/// Command executed successfully (exit code 0)
Success,
/// Command failed with an exit code
Failed(i32),
/// Command was terminated by a signal
Terminated(i32),
/// Command execution could not be started
Error(String),
}
impl CommandStatus {
pub fn is_success(&self) -> bool {
matches!(self, CommandStatus::Success)
}
}
impl From<std::process::ExitStatus> for CommandStatus {
    fn from(status: std::process::ExitStatus) -> Self {
        if status.success() {
            CommandStatus::Success
        } else if let Some(code) = status.code() {
            CommandStatus::Failed(code)
        } else {
            // No exit code means the process was terminated by a signal.
            // Report the signal number on Unix; other platforms have no
            // portable equivalent, so fall back to 0.
            #[cfg(unix)]
            let signal = {
                use std::os::unix::process::ExitStatusExt;
                status.signal().unwrap_or(0)
            };
            #[cfg(not(unix))]
            let signal = 0;
            CommandStatus::Terminated(signal)
        }
    }
}
type Callback = Arc<dyn Fn(&str) + Send + Sync>;
/// Options for configuring command execution
#[derive(Clone)]
pub struct RunnerOptions {
/// Whether to print stdout to console in real-time
pub print_stdout: bool,
/// Whether to print stderr to console in real-time
pub print_stderr: bool,
/// Optional callback for each stdout line
pub stdout_callback: Callback,
/// Optional callback for each stderr line
pub stderr_callback: Callback,
}
impl RunnerOptions {
fn empty_callback() -> Callback {
Arc::new(|_| {})
}
/// Create default options with real-time printing enabled
pub fn print_to_console() -> Self {
Self {
print_stdout: true,
print_stderr: true,
..Default::default()
}
}
/// Create options that capture output silently
pub fn silent() -> Self {
Self {
print_stdout: false,
print_stderr: false,
..Default::default()
}
}
/// Set custom callbacks for stdout and stderr lines
pub fn with_callbacks<F1, F2>(mut self, stdout_callback: F1, stderr_callback: F2) -> Self
where
F1: Fn(&str) + Send + Sync + 'static,
F2: Fn(&str) + Send + Sync + 'static,
{
self.stdout_callback = Arc::new(stdout_callback);
self.stderr_callback = Arc::new(stderr_callback);
self
}
}
impl Default for RunnerOptions {
fn default() -> Self {
Self {
print_stdout: true,
print_stderr: true,
stdout_callback: Self::empty_callback(),
stderr_callback: Self::empty_callback(),
}
}
}
/// Error type for command execution failures
#[derive(Debug)]
pub struct CommandError {
/// Human-readable error description
pub message: String,
/// Captured output if execution started
pub output: Option<CommandOutput>,
}
impl std::fmt::Display for CommandError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.message)?;
if let Some(output) = &self.output {
write!(f, "\n{}", output.format_output())?;
}
Ok(())
}
}
impl std::error::Error for CommandError {}
/// Runs a command and captures its output while streaming to console
///
/// # Example
///
/// ```
/// use harmony_execution::command::{run_command, RunnerOptions};
/// use std::process::Command;
///
/// let output = run_command(
/// Command::new("echo").arg("hello"),
/// RunnerOptions::print_to_console()
/// ).unwrap();
/// assert!(output.is_success());
/// assert_eq!(output.stdout, "hello\n");
/// ```
pub fn run_command(
command: &mut Command,
options: RunnerOptions,
) -> Result<CommandOutput, CommandError> {
let mut child = command
.stdout(Stdio::piped())
.stderr(Stdio::piped())
.spawn()
.map_err(|e| CommandError {
message: format!("Failed to spawn command: {}", e),
output: None,
})?;
let stdout = child.stdout.take().ok_or_else(|| CommandError {
message: "Failed to capture stdout".to_string(),
output: None,
})?;
let stderr = child.stderr.take().ok_or_else(|| CommandError {
message: "Failed to capture stderr".to_string(),
output: None,
})?;
let stdout_reader = BufReader::new(stdout);
let stderr_reader = BufReader::new(stderr);
let (stdout_sender, stdout_receiver) = std::sync::mpsc::channel();
let (stderr_sender, stderr_receiver) = std::sync::mpsc::channel();
// Spawn thread to handle stdout
let stdout_handle = thread::spawn(move || {
let mut output = String::new();
for line in stdout_reader.lines() {
match line {
Ok(line_content) => {
if options.print_stdout {
println!("{}", line_content);
}
(options.stdout_callback)(&line_content);
output.push_str(&line_content);
output.push('\n');
}
Err(e) => {
// Silently handle read errors - corrupted data at end is common
log::trace!("Error reading stdout line: {}", e);
}
}
}
let _ = stdout_sender.send(output);
});
// Spawn thread to handle stderr
let stderr_handle = thread::spawn(move || {
let mut output = String::new();
for line in stderr_reader.lines() {
match line {
Ok(line_content) => {
if options.print_stderr {
eprintln!("{}", line_content);
}
(options.stderr_callback)(&line_content);
output.push_str(&line_content);
output.push('\n');
}
Err(e) => {
log::trace!("Error reading stderr line: {}", e);
}
}
}
let _ = stderr_sender.send(output);
});
let status = child.wait().map_err(|e| CommandError {
message: format!("Failed to wait for command process: {}", e),
output: None,
})?;
let stdout_lines = stdout_handle
.join()
.map_err(|e| CommandError {
message: format!("Stdout thread panicked: {:?}", e),
output: None,
})
.and_then(|_| {
stdout_receiver.recv().map_err(|e| CommandError {
message: format!("Failed to receive stdout: {}", e),
output: None,
})
})?;
let stderr_lines = stderr_handle
.join()
.map_err(|e| CommandError {
message: format!("Stderr thread panicked: {:?}", e),
output: None,
})
.and_then(|_| {
stderr_receiver.recv().map_err(|e| CommandError {
message: format!("Failed to receive stderr: {}", e),
output: None,
})
})?;
Ok(CommandOutput {
stdout: stdout_lines,
stderr: stderr_lines,
status: status.into(),
})
}
/// Convenience function to run a command with default options (print to console)
pub fn run(command: &mut Command) -> Result<CommandOutput, CommandError> {
run_command(command, RunnerOptions::print_to_console())
}
/// Convenience function to run a command silently (capture output only)
pub fn run_silent(command: &mut Command) -> Result<CommandOutput, CommandError> {
run_command(command, RunnerOptions::silent())
}
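// Illustrative usage (not part of the original PR): stream a long-running
// command's output into the `log` crate while still capturing it for later
// inspection. The command shown is arbitrary.
//
// let opts = RunnerOptions::silent()
//     .with_callbacks(|l| log::info!("out: {l}"), |l| log::warn!("err: {l}"));
// let output = run_command(Command::new("ls").arg("-la"), opts)?;
// assert!(output.is_success());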
#[cfg(test)]
mod tests {
use super::*;
use std::process::Command;
#[test]
fn test_simple_echo_command() {
let output = run_silent(Command::new("echo").arg("hello world")).unwrap();
assert!(output.is_success());
assert_eq!(output.stdout.trim(), "hello world");
assert!(output.stderr.is_empty());
}
#[test]
fn test_command_failure() {
let output = run_silent(Command::new("sh").args(["-c", "exit 42"])).unwrap();
assert!(!output.is_success());
assert_eq!(output.status, CommandStatus::Failed(42));
}
#[test]
fn test_command_output_format() {
let output = run_silent(Command::new("echo").arg("test")).unwrap();
let formatted = output.format_output();
assert!(formatted.contains("Stdout:"));
assert!(formatted.contains("test"));
}
#[test]
fn test_runner_options() {
let opts = RunnerOptions::print_to_console();
assert!(opts.print_stdout);
assert!(opts.print_stderr);
let opts = RunnerOptions::silent();
assert!(!opts.print_stdout);
assert!(!opts.print_stderr);
}
#[test]
fn test_command_status_from_exit_status() {
let output = run_silent(&mut Command::new("true")).unwrap();
assert_eq!(output.status, CommandStatus::Success);
let output = run_silent(&mut Command::new("false")).unwrap();
assert_eq!(output.status, CommandStatus::Failed(1));
}
#[test]
fn test_stdout_callback_receives_lines() {
use std::sync::{Arc, Mutex};
let captured = Arc::new(Mutex::new(Vec::new()));
let captured_clone = Arc::clone(&captured);
let opts = RunnerOptions::silent().with_callbacks(
move |line| captured_clone.lock().unwrap().push(line.to_string()),
|_| {},
);
run_command(Command::new("echo").arg("hello world"), opts).unwrap();
let lines = captured.lock().unwrap();
assert_eq!(lines.len(), 1);
assert_eq!(lines[0], "hello world");
}
#[test]
fn test_stderr_callback_receives_lines() {
use std::sync::{Arc, Mutex};
let captured = Arc::new(Mutex::new(Vec::new()));
let captured_clone = Arc::clone(&captured);
let opts = RunnerOptions::silent().with_callbacks(
|_| {},
move |line| captured_clone.lock().unwrap().push(line.to_string()),
);
run_command(Command::new("sh").args(["-c", "echo error >&2"]), opts).unwrap();
let lines = captured.lock().unwrap();
assert_eq!(lines.len(), 1);
assert_eq!(lines[0], "error");
}
#[test]
fn test_callback_and_capture_both_work() {
use std::sync::{Arc, Mutex};
let callback_lines = Arc::new(Mutex::new(Vec::new()));
let callback_clone = Arc::clone(&callback_lines);
let opts = RunnerOptions::silent().with_callbacks(
move |line| callback_clone.lock().unwrap().push(line.to_string()),
|_| {},
);
let output =
run_command(Command::new("printf").args(["line1\nline2\nline3\n"]), opts).unwrap();
// Verify captured output
assert_eq!(output.stdout, "line1\nline2\nline3\n");
// Verify callback received all lines
let lines = callback_lines.lock().unwrap();
assert_eq!(lines.len(), 3);
assert_eq!(lines[0], "line1");
assert_eq!(lines[1], "line2");
assert_eq!(lines[2], "line3");
}
#[test]
fn test_multiline_output_capture() {
let output = run_silent(Command::new("printf").args(["line1\nline2\nline3\n"])).unwrap();
assert_eq!(output.stdout, "line1\nline2\nline3\n");
assert!(output.stderr.trim().is_empty());
}
#[test]
fn test_mixed_stdout_stderr_capture() {
let output = run_silent(Command::new("sh").args([
"-c",
"echo stdout1 && echo stderr1 >&2 && echo stdout2 && echo stderr2 >&2",
]))
.unwrap();
assert!(output.stdout.contains("stdout1"));
assert!(output.stdout.contains("stdout2"));
assert!(output.stderr.contains("stderr1"));
assert!(output.stderr.contains("stderr2"));
}
#[test]
fn test_empty_output_command() {
let output = run_silent(&mut Command::new("true")).unwrap();
assert!(output.stdout.is_empty());
assert!(output.stderr.is_empty());
assert!(output.is_success());
}
#[test]
fn test_command_output_format_with_empty_streams() {
let output = run_silent(&mut Command::new("true")).unwrap();
let formatted = output.format_output();
assert!(formatted.contains("Stdout:"));
assert!(formatted.contains("<empty>"));
assert!(formatted.contains("Stderr:"));
}
#[test]
fn test_error_contains_message_and_output() {
let error = CommandError {
message: "Test error".to_string(),
output: Some(CommandOutput {
stdout: "captured stdout".to_string(),
stderr: "captured stderr".to_string(),
status: CommandStatus::Success,
}),
};
let display = format!("{}", error);
assert!(display.contains("Test error"));
assert!(display.contains("captured stdout"));
assert!(display.contains("captured stderr"));
}
#[test]
fn test_error_without_output() {
let error = CommandError {
message: "Spawn failed".to_string(),
output: None,
};
let display = format!("{}", error);
assert!(display.contains("Spawn failed"));
assert!(!display.contains("Stdout:"));
assert!(!display.contains("Stderr:"));
}
}

View File

@@ -0,0 +1,5 @@
pub mod command;
pub use command::{
CommandError, CommandOutput, CommandStatus, RunnerOptions, run, run_command, run_silent,
};

View File

@@ -32,6 +32,14 @@ impl Id {
} }
} }
impl From<&str> for Id {
    fn from(value: &str) -> Id {
        Id {
            value: value.to_string(),
        }
    }
}
impl FromStr for Id { impl FromStr for Id {
type Err = (); type Err = ();